lightvector · ChinChangYang · May 23, 2026 · May 26, 2026
diff --git a/Compiling.md b/Compiling.md
@@ -133,14 +133,15 @@ As also mentioned in the instructions below but repeated here for visibility, if
       * AppleClang and Swift compilers: `xcode-select --install`.
       * If using the Metal backend, [Ninja](https://ninja-build.org): `brew install ninja`
       * If using the Metal backend, protobuf and abseil: `brew install protobuf abseil`
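+      * If using the MLX backend (Apple Silicon only): `brew install mlx` (≥0.18), plus [Ninja](https://ninja-build.org) (`brew install ninja`) because the MLX build also requires the Ninja generator. Requires CMake ≥3.27. KataGo finds MLX via CMake's default search (Homebrew installs MLX's CMake package at `/opt/homebrew/share/cmake/MLX/`); override with `-DMLX_ROOT=/path/to/mlx/cmake` if needed.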
       * libzip: `brew install libzip`.
       * If you want to do self-play training and research, probably Google perftools `brew install gperftools` for TCMalloc or some other better malloc implementation. For unknown reasons, the allocation pattern in self-play with large numbers of threads and parallel games causes a lot of memory fragmentation under glibc malloc that will eventually run your machine out of memory, but better mallocs handle it fine.
       * If compiling to contribute to public distributed training runs, OpenSSL is required (`brew install openssl`).
    * Clone this repo:
       * `git clone https://github.com/lightvector/KataGo.git`
    * Compile using CMake and make in the cpp directory:
       * `cd KataGo/cpp`
-      * `cmake . -G Ninja -DUSE_BACKEND=METAL` or `cmake . -DUSE_BACKEND=OPENCL` or `cmake . -DUSE_BACKEND=EIGEN` depending on which backend you want.
+      * `cmake . -G Ninja -DUSE_BACKEND=METAL` or `cmake . -G Ninja -DUSE_BACKEND=MLX` or `cmake . -DUSE_BACKEND=OPENCL` or `cmake . -DUSE_BACKEND=EIGEN` depending on which backend you want.
          * Specify also `-DUSE_TCMALLOC=1` if using TCMalloc.
          * Compiling will also call git commands to embed the git hash into the compiled executable, specify also `-DNO_GIT_REVISION=1` to disable it if this is causing issues for you.
          * Specify `-DUSE_AVX2=1` to also compile Eigen with AVX2 and FMA support, which will make it incompatible with old CPUs but much faster. Intel-based Macs with new processors support AVX2, but Apple Silicon Macs do not support AVX2 natively. (If you want to go further, you can also add `-DCMAKE_CXX_FLAGS='-march=native'` which will specialize to precisely your machine's CPU, but the exe might not run on other machines at all).

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -1,5 +1,24 @@
 cmake_minimum_required(VERSION 3.18.2)
-if(USE_BACKEND STREQUAL "METAL")
+
+# USE_BACKEND is only upper-cased further down, after project(), so normalize
+# a copy here; otherwise -DUSE_BACKEND=mlx would skip these pre-project steps.
+string(TOUPPER "${USE_BACKEND}" KATAGO_PRE_PROJECT_BACKEND)
+
+# MLX builds enforce CMake 3.27 to match cmake_policy(VERSION 3.27) below (MLX
+# itself only needs 3.25); cmake_minimum_required stays at 3.18.2 so other
+# backends keep building on older CMake. The OSX deployment target is
+# deliberately NOT pinned here: Homebrew's prebuilt libmlx.dylib, bottled with
+# its own minos, sets the real minimum macOS. Pinning lower only stamps a
+# misleading minos on the executable and triggers a "linking with dylib built
+# for newer version" warning; defaulting to the build host keeps minos honest.
+if(KATAGO_PRE_PROJECT_BACKEND STREQUAL "MLX")
+  if(CMAKE_VERSION VERSION_LESS 3.27)
+    message(FATAL_ERROR "KataGo's USE_BACKEND=MLX path requires CMake 3.27 or newer. You have ${CMAKE_VERSION}. Install via: brew install cmake")
+  endif()
+  cmake_policy(VERSION 3.27)
+endif()
+
+if(KATAGO_PRE_PROJECT_BACKEND STREQUAL "METAL" OR KATAGO_PRE_PROJECT_BACKEND STREQUAL "MLX")
   project(katago LANGUAGES CXX Swift)
 else()
   project(katago)
@@ -44,7 +63,7 @@ endif()
 set(BUILD_DISTRIBUTED 0 CACHE BOOL "Build with http support for contributing to distributed training")
 set(USE_BACKEND CACHE STRING "Neural net backend")
 string(TOUPPER "${USE_BACKEND}" USE_BACKEND)
-set_property(CACHE USE_BACKEND PROPERTY STRINGS "" CUDA TENSORRT OPENCL EIGEN METAL)
+set_property(CACHE USE_BACKEND PROPERTY STRINGS "" CUDA TENSORRT OPENCL EIGEN MLX METAL)
 
 set(USE_TCMALLOC 0 CACHE BOOL "Use TCMalloc")
 set(NO_GIT_REVISION 0 CACHE BOOL "Disable embedding the git revision into the compiled exe")
@@ -158,8 +177,73 @@ elseif(USE_BACKEND STREQUAL "EIGEN")
   set(NEURALNET_BACKEND_SOURCES
     neuralnet/eigenbackend.cpp
     )
+elseif(USE_BACKEND STREQUAL "MLX")
+  message(STATUS "-DUSE_BACKEND=MLX, using MLX backend (with CoreML/ANE MUX) for Apple Silicon.")
+
+  if(NOT APPLE)
+    message(FATAL_ERROR "USE_BACKEND=MLX is only supported on macOS. Detected: ${CMAKE_SYSTEM_NAME}")
+  endif()
+  if(CMAKE_OSX_ARCHITECTURES)
+    if(NOT CMAKE_OSX_ARCHITECTURES STREQUAL "arm64")
+      message(FATAL_ERROR "USE_BACKEND=MLX requires arm64. Got: ${CMAKE_OSX_ARCHITECTURES}")
+    endif()
+  elseif(NOT CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
+    message(FATAL_ERROR "USE_BACKEND=MLX requires Apple Silicon (arm64). Detected: ${CMAKE_SYSTEM_PROCESSOR}")
+  endif()
+
+  # CoreML/ANE MUX prerequisites — same constraints the METAL branch above
+  # enforces (same wording for grep parity).
+  if(NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja")
+    message(FATAL_ERROR "Bidirectional C++ Interop requires Ninja generator. Have ${CMAKE_GENERATOR}")
+  endif()
+  if("${CMAKE_Swift_COMPILER_VERSION}" VERSION_LESS 5.9)
+    message(FATAL_ERROR "Bidirectional C++ Interop requires Swift 5.9 or greater. Have ${CMAKE_Swift_COMPILER_VERSION}")
+  endif()
+  if(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
+    message(FATAL_ERROR "Project requires building with AppleClang. Have ${CMAKE_CXX_COMPILER_ID}")
+  endif()
+
+  # katagocoreml provides the native CoreML conversion C++ library used by the ANE mux.
+  add_subdirectory(external/katagocoreml)
+  list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/external/macos/cmake/modules")
+
+  if (NOT CMAKE_OSX_SYSROOT)
+    execute_process(COMMAND xcrun --show-sdk-path OUTPUT_VARIABLE CMAKE_OSX_SYSROOT OUTPUT_STRIP_TRAILING_WHITESPACE)
+  endif()
+
+  include(InitializeSwift)
+  include(AddSwift)
+  # Do not pin CMAKE_OSX_DEPLOYMENT_TARGET here: Homebrew's libmlx.dylib sets the real minimum macOS (see the pre-project comment at the top of this file).
+
+  set(MLX_MIN_VERSION "0.18")
+  set(MLX_ROOT "" CACHE PATH "Optional path to MLX's CMake package; leave empty to use CMake's default search (e.g. Homebrew's /opt/homebrew/share/cmake/MLX/)")
+
+  # Homebrew installs MLX's CMake config to /opt/homebrew/share/cmake/MLX/, which is
+  # on CMake's default search path. MLX_ROOT, when set, is added as an extra hint.
+  find_package(MLX ${MLX_MIN_VERSION} CONFIG REQUIRED HINTS "${MLX_ROOT}")
+  message(STATUS "Found MLX ${MLX_VERSION} at ${MLX_LIBRARY}")
+
+  set(NEURALNET_BACKEND_SOURCES
+    neuralnet/mlxbackend.cpp
+    neuralnet/mlxwinotuner.cpp
+    neuralnet/mlxtests.cpp
+    )
+
+  # Build the KataGoSwift static library. Same lines as the METAL branch above,
+  # kept inline to leave the Metal branch untouched. The library exposes
+  # CoreMLComputeHandle to C++ via the generated KataGoSwift-swift.h.
+  add_library(KataGoSwift STATIC
+    neuralnet/metalbackend.swift
+    neuralnet/metallayers.swift)
+  _swift_generate_cxx_header(
+    KataGoSwift
+    "${CMAKE_CURRENT_BINARY_DIR}/include/KataGoSwift/KataGoSwift-swift.h")
+  target_include_directories(KataGoSwift PUBLIC "${CMAKE_CURRENT_BINARY_DIR}/include")
+  set_target_properties(KataGoSwift PROPERTIES Swift_MODULE_NAME "KataGoSwift")
+  target_compile_options(KataGoSwift PUBLIC
+    "$<$<COMPILE_LANGUAGE:Swift>:-cxx-interoperability-mode=default>")
 elseif(USE_BACKEND STREQUAL "")
-  message(WARNING "${ColorBoldRed}WARNING: Using dummy neural net backend, intended for non-neural-net testing only, will fail on any code path requiring a neural net. To use neural net, specify -DUSE_BACKEND=CUDA or -DUSE_BACKEND=TENSORRT or -DUSE_BACKEND=OPENCL or -DUSE_BACKEND=EIGEN to compile with the respective backend.${ColorReset}")
+  message(WARNING "${ColorBoldRed}WARNING: Using dummy neural net backend, intended for non-neural-net testing only, will fail on any code path requiring a neural net. To use neural net, specify -DUSE_BACKEND=CUDA or -DUSE_BACKEND=TENSORRT or -DUSE_BACKEND=OPENCL or -DUSE_BACKEND=EIGEN or -DUSE_BACKEND=MLX or -DUSE_BACKEND=METAL to compile with the respective backend.${ColorReset}")
   set(NEURALNET_BACKEND_SOURCES neuralnet/dummybackend.cpp)
 else()
   message(FATAL_ERROR "Unrecognized backend: " ${USE_BACKEND})
@@ -496,6 +580,10 @@ elseif(USE_BACKEND STREQUAL "EIGEN")
       message(STATUS "Found Eigen3 at ${EIGEN3_INCLUDE_DIRS}")
     endif()
   endif()
+elseif(USE_BACKEND STREQUAL "MLX")
+  # MLX build: tag katago with USE_MLX_BACKEND, then link MLX and the Swift/CoreML pieces.
+  target_compile_definitions(katago PRIVATE USE_MLX_BACKEND)
+  target_link_libraries(katago mlx KataGoSwift katagocoreml ${KATAGOCOREML_DEP_LDFLAGS})
 endif()
 
 if(USE_BIGGER_BOARDS_EXPENSIVE)

diff --git a/cpp/command/benchmark.cpp b/cpp/command/benchmark.cpp
@@ -267,6 +267,9 @@ int MainCmds::benchmark(const vector<string>& args) {
 #endif
 #ifdef USE_EIGEN_BACKEND
   cout << "You are currently using the Eigen (CPU) version of KataGo. Due to having no GPU, it may be slow." << endl;
+#endif
+#ifdef USE_MLX_BACKEND
+  cout << "Your GTP config is currently set to mlxUseFP16 = " << nnEval->getUsingFP16Mode().toString() << endl;
 #endif
   cout << endl;
   cout << "Your GTP config is currently set to use numSearchThreads = " << params.numThreads << endl;

diff --git a/cpp/configs/analysis_example.cfg b/cpp/configs/analysis_example.cfg
@@ -298,6 +298,47 @@ nnRandomize = true
 # It defaults to min(numAnalysisThreads * numSearchThreadsPerAnalysisThread, numCPUCores).
 # numEigenThreadsPerModel = X
 
+# ------------------------------
+# MLX-specific settings
+# ------------------------------
+# These only apply when using the MLX backend (Apple Silicon).
+
+# MLX backend dispatch is configured via numNNServerThreadsPerModel and mlxDeviceToUseThread<N>.
+# Device index values (same convention as the Metal backend):
+#   0   = GPU only (MLX) - default
+#   100 = ANE only (CoreML, runs on CPU + Apple Neural Engine)
+# Any other value is rejected at startup. The backend-agnostic key
+# `deviceToUseThread<N>` is also accepted.
+#
+# Mux mode: pipeline GPU and ANE server threads to overlap their forward
+# passes. Set nnMaxBatchSize to roughly half of numSearchThreads.
+#
+# Example: mux mode (2x GPU + 2x ANE)
+# numNNServerThreadsPerModel = 4
+# mlxDeviceToUseThread0 = 0
+# mlxDeviceToUseThread1 = 0
+# mlxDeviceToUseThread2 = 100
+# mlxDeviceToUseThread3 = 100
+#
+# Example: GPU-only mode (default)
+# numNNServerThreadsPerModel = 1
+# mlxDeviceToUseThread0 = 0
+#
+# Example: ANE-only mode (CoreML on CPU+ANE)
+# numNNServerThreadsPerModel = 1
+# mlxDeviceToUseThread0 = 100
+#
+# Default (no config): 1 server thread, GPU-only mode.
+
+# Whether to use FP16 (half precision) for neural net evaluation on MLX.
+# FP16 is faster than FP32 on Apple Silicon via the MLX Winograd path.
+# The ANE is FP16-only: on an ANE thread (device index 100) with mlxUseFP16 = false,
+# CoreML falls back to CPU FP32 - correct but much slower than the GPU path.
+# Set `false` only for bit-exact FP32 reproducibility.
+#
+# Default: auto (resolves to fp16 on MLX).
+# mlxUseFP16 = auto
+
 
 # Misc Behavior --------------------
 

diff --git a/cpp/configs/contribute_example.cfg b/cpp/configs/contribute_example.cfg
@@ -139,3 +139,31 @@ watchOngoingGameInFileName = watchgame.txt
 # This is the number of CPU threads for evaluating the neural net on the Eigen backend.
 # It defaults to numSearchThreads.
 # numEigenThreadsPerModel = X
+
+# ------------------------------
+# MLX-specific settings
+# ------------------------------
+# These only apply when using the MLX backend (Apple Silicon).
+
+# Per-server-thread dispatch (same convention as the Metal backend):
+#   0   = GPU via MLX (default)
+#   100 = ANE via CoreML (CPU + Apple Neural Engine)
+# Mix in one config to pipeline GPU and ANE work. The backend-agnostic key
+# `deviceToUseThread<N>` is also accepted.
+#
+# Example: mux mode (2x GPU + 2x ANE) - also set numNNServerThreadsPerModel = 4 above
+# mlxDeviceToUseThread0 = 0
+# mlxDeviceToUseThread1 = 0
+# mlxDeviceToUseThread2 = 100
+# mlxDeviceToUseThread3 = 100
+#
+# Example: ANE-only single instance
+# mlxDeviceToUseThread0 = 100
+
+# Whether to use FP16 (half precision) for neural net evaluation on MLX.
+# FP16 is faster than FP32 on Apple Silicon via the MLX Winograd path.
+# The ANE is FP16-only: on an ANE thread (device index 100) with mlxUseFP16 = false,
+# CoreML falls back to CPU FP32 - correct but much slower than the GPU path.
+#
+# Default: auto (resolves to fp16 on MLX).
+# mlxUseFP16 = auto
diff --git a/cpp/configs/gtp_example.cfg b/cpp/configs/gtp_example.cfg
@@ -539,6 +539,52 @@ searchFactorWhenWinningThreshold = 0.95
 # Default: numSearchThreads
 # numEigenThreadsPerModel = X
 
+# ------------------------------
+# MLX-specific settings
+# ------------------------------
+# These only apply when using the MLX backend (Apple Silicon).
+
+# MLX backend dispatch is configured via numNNServerThreadsPerModel and mlxDeviceToUseThread<N>.
+# Device index values (same convention as the Metal backend):
+#   0   = GPU only (MLX) - default
+#   100 = ANE only (CoreML, runs on CPU + Apple Neural Engine)
+# Any other value is rejected at startup. The backend-agnostic key
+# `deviceToUseThread<N>` is also accepted if you prefer not to commit to a
+# backend-specific prefix.
+#
+# Mux mode: pipeline GPU and ANE server threads to overlap their forward
+# passes. Set nnMaxBatchSize to roughly half of numSearchThreads for best
+# pipelining.
+#
+# Example: mux mode (2x GPU + 2x ANE)
+# numNNServerThreadsPerModel = 4
+# mlxDeviceToUseThread0 = 0
+# mlxDeviceToUseThread1 = 0
+# mlxDeviceToUseThread2 = 100
+# mlxDeviceToUseThread3 = 100
+#
+# Example: GPU-only mode (default)
+# numNNServerThreadsPerModel = 1
+# mlxDeviceToUseThread0 = 0
+#
+# Example: ANE-only mode (CoreML on CPU+ANE; ~3 search threads is the
+# observed throughput sweet spot since a single CoreML call serializes
+# per batch)
+# numNNServerThreadsPerModel = 1
+# mlxDeviceToUseThread0 = 100
+#
+# Default (no config): 1 server thread, GPU-only mode.
+
+# Whether to use FP16 (half precision) for neural net evaluation on MLX.
+# FP16 is faster than FP32 on Apple Silicon via the MLX Winograd path.
+# The ANE is FP16-only hardware: on an ANE thread (device index 100) with
+# mlxUseFP16 = false, CoreML falls back to CPU FP32 - correct but much
+# slower than the GPU path. Set `false` only for bit-exact FP32
+# reproducibility.
+#
+# Default: auto (resolves to fp16 on MLX).
+# mlxUseFP16 = auto
+
 # ===========================================================================
 # Root move selection and biases
 # ===========================================================================

diff --git a/cpp/configs/match_example.cfg b/cpp/configs/match_example.cfg
@@ -197,6 +197,34 @@ numNNServerThreadsPerModel = 1
 # It defaults to numSearchThreads.
 # numEigenThreadsPerModel = X
 
+# ------------------------------
+# MLX-specific settings
+# ------------------------------
+# These only apply when using the MLX backend (Apple Silicon).
+
+# Per-server-thread dispatch (same convention as the Metal backend):
+#   0   = GPU via MLX (default)
+#   100 = ANE via CoreML (CPU + Apple Neural Engine)
+# Mix in one config to pipeline GPU and ANE work. The backend-agnostic key
+# `deviceToUseThread<N>` is also accepted.
+#
+# Example: mux mode (2x GPU + 2x ANE) - also set numNNServerThreadsPerModel = 4 above
+# mlxDeviceToUseThread0 = 0
+# mlxDeviceToUseThread1 = 0
+# mlxDeviceToUseThread2 = 100
+# mlxDeviceToUseThread3 = 100
+#
+# Example: ANE-only single instance
+# mlxDeviceToUseThread0 = 100
+
+# Whether to use FP16 (half precision) for neural net evaluation on MLX.
+# FP16 is faster than FP32 on Apple Silicon via the MLX Winograd path.
+# The ANE is FP16-only: on an ANE thread (device index 100) with mlxUseFP16 = false,
+# CoreML falls back to CPU FP32 - correct but much slower than the GPU path.
+#
+# Default: auto (resolves to fp16 on MLX).
+# mlxUseFP16 = auto
+
 
 # Root move selection and biases------------------------------------------------------------------------------
 # Uncomment and edit any of the below values to change them from their default.

diff --git a/cpp/main.cpp b/cpp/main.cpp
@@ -246,6 +246,8 @@ string Version::getKataGoVersionFullInfo() {
   out << "Using OpenCL backend" << endl;
 #elif defined(USE_EIGEN_BACKEND)
   out << "Using Eigen(CPU) backend" << endl;
+#elif defined(USE_MLX_BACKEND)
+  out << "Using MLX backend" << endl;
 #else
   out << "Using dummy backend" << endl;
 #endif
@@ -282,6 +284,8 @@ string Version::getGitRevisionWithBackend() {
   s += "-opencl";
 #elif defined(USE_EIGEN_BACKEND)
   s += "-eigen";
+#elif defined(USE_MLX_BACKEND)
+  s += "-mlx";
 #else
   s += "-dummy";
 #endif