Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Compiling.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,14 +133,15 @@ As also mentioned in the instructions below but repeated here for visibility, if
* AppleClang and Swift compilers: `xcode-select --install`.
* If using the Metal backend, [Ninja](https://ninja-build.org): `brew install ninja`
* If using the Metal backend, protobuf and abseil: `brew install protobuf abseil`
* If using the MLX backend (Apple Silicon only): `brew install mlx` (≥0.18). Requires CMake ≥3.27. KataGo finds MLX via CMake's default search (Homebrew installs it at `/opt/homebrew/share/cmake/MLX/`); override with `-DMLX_ROOT=/path/to/mlx/cmake` if needed.
* libzip: `brew install libzip`.
* If you want to do self-play training and research, probably Google perftools `brew install gperftools` for TCMalloc or some other better malloc implementation. For unknown reasons, the allocation pattern in self-play with large numbers of threads and parallel games causes a lot of memory fragmentation under glibc malloc that will eventually run your machine out of memory, but better mallocs handle it fine.
* If compiling to contribute to public distributed training runs, OpenSSL is required (`brew install openssl`).
* Clone this repo:
* `git clone https://github.com/lightvector/KataGo.git`
* Compile using CMake and make in the cpp directory:
* `cd KataGo/cpp`
* `cmake . -G Ninja -DUSE_BACKEND=METAL` or `cmake . -DUSE_BACKEND=OPENCL` or `cmake . -DUSE_BACKEND=EIGEN` depending on which backend you want.
* `cmake . -G Ninja -DUSE_BACKEND=METAL` or `cmake . -DUSE_BACKEND=MLX` or `cmake . -DUSE_BACKEND=OPENCL` or `cmake . -DUSE_BACKEND=EIGEN` depending on which backend you want.
* Specify also `-DUSE_TCMALLOC=1` if using TCMalloc.
* Compiling will also call git commands to embed the git hash into the compiled executable, specify also `-DNO_GIT_REVISION=1` to disable it if this is causing issues for you.
* Specify `-DUSE_AVX2=1` to also compile Eigen with AVX2 and FMA support, which will make it incompatible with old CPUs but much faster. Intel-based Macs with new processors support AVX2, but Apple Silicon Macs do not support AVX2 natively. (If you want to go further, you can also add `-DCMAKE_CXX_FLAGS='-march=native'` which will specialize to precisely your machine's CPU, but the exe might not run on other machines at all).
Expand Down
94 changes: 91 additions & 3 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,24 @@
cmake_minimum_required(VERSION 3.18.2)
if(USE_BACKEND STREQUAL "METAL")

# Pre-project MLX setup. KataGo's MLX path enforces CMake 3.27 via the guard
# below (MLX itself requires only 3.25 - 3.27 is chosen to match
# cmake_policy(VERSION 3.27)); the global cmake_minimum_required stays at
# 3.18.2 so non-MLX backends keep building on older CMake.
#
# The OSX deployment target is deliberately NOT pinned here. KataGo links
# Homebrew's prebuilt libmlx.dylib, whose minos reflects the macOS it was
# bottled on - that dylib, not this build, sets the real minimum macOS.
# Pinning a lower value only stamps a misleading minos on the executable and
# triggers a "linking with dylib built for newer version" linker warning;
# letting CMake default the target to the build host keeps minos honest.
if(USE_BACKEND STREQUAL "MLX")
if(CMAKE_VERSION VERSION_LESS 3.27)
message(FATAL_ERROR "KataGo's USE_BACKEND=MLX path requires CMake 3.27 or newer. You have ${CMAKE_VERSION}. Install via: brew install cmake")
endif()
cmake_policy(VERSION 3.27)
endif()

if(USE_BACKEND STREQUAL "METAL" OR USE_BACKEND STREQUAL "MLX")
project(katago LANGUAGES CXX Swift)
else()
project(katago)
Expand Down Expand Up @@ -44,7 +63,7 @@ endif()
set(BUILD_DISTRIBUTED 0 CACHE BOOL "Build with http support for contributing to distributed training")
set(USE_BACKEND CACHE STRING "Neural net backend")
string(TOUPPER "${USE_BACKEND}" USE_BACKEND)
set_property(CACHE USE_BACKEND PROPERTY STRINGS "" CUDA TENSORRT OPENCL EIGEN METAL)
set_property(CACHE USE_BACKEND PROPERTY STRINGS "" CUDA TENSORRT OPENCL EIGEN MLX METAL)

set(USE_TCMALLOC 0 CACHE BOOL "Use TCMalloc")
set(NO_GIT_REVISION 0 CACHE BOOL "Disable embedding the git revision into the compiled exe")
Expand Down Expand Up @@ -158,8 +177,73 @@ elseif(USE_BACKEND STREQUAL "EIGEN")
set(NEURALNET_BACKEND_SOURCES
neuralnet/eigenbackend.cpp
)
elseif(USE_BACKEND STREQUAL "MLX")
message(STATUS "-DUSE_BACKEND=MLX, using MLX backend (with CoreML/ANE MUX) for Apple Silicon.")

if(NOT APPLE)
message(FATAL_ERROR "USE_BACKEND=MLX is only supported on macOS. Detected: ${CMAKE_SYSTEM_NAME}")
endif()
if(CMAKE_OSX_ARCHITECTURES)
if(NOT CMAKE_OSX_ARCHITECTURES STREQUAL "arm64")
message(FATAL_ERROR "USE_BACKEND=MLX requires arm64. Got: ${CMAKE_OSX_ARCHITECTURES}")
endif()
elseif(NOT CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
message(FATAL_ERROR "USE_BACKEND=MLX requires Apple Silicon (arm64). Detected: ${CMAKE_SYSTEM_PROCESSOR}")
endif()

# CoreML/ANE MUX prerequisites — same constraints the METAL branch above
# enforces (same wording for grep parity).
if(NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja")
message(FATAL_ERROR "Bidirectional C++ Interop requires Ninja generator. Have ${CMAKE_GENERATOR}")
endif()
if("${CMAKE_Swift_COMPILER_VERSION}" VERSION_LESS 5.9)
message(FATAL_ERROR "Bidirectional C++ Interop requires Swift 5.9 or greater. Have ${CMAKE_Swift_COMPILER_VERSION}")
endif()
if(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
message(FATAL_ERROR "Project requires building with AppleClang. Have ${CMAKE_CXX_COMPILER_ID}")
endif()

# katagocoreml provides the native CoreML conversion C++ library used by the ANE mux.
add_subdirectory(external/katagocoreml)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/external/macos/cmake/modules")

if (NOT CMAKE_OSX_SYSROOT)
execute_process(COMMAND xcrun --show-sdk-path OUTPUT_VARIABLE CMAKE_OSX_SYSROOT OUTPUT_STRIP_TRAILING_WHITESPACE)
endif()

include(InitializeSwift)
include(AddSwift)
set(CMAKE_OSX_DEPLOYMENT_TARGET 13.0)

set(MLX_MIN_VERSION "0.18")
set(MLX_ROOT "" CACHE PATH "Optional path to MLX's CMake package; leave empty to use CMake's default search (e.g. Homebrew's /opt/homebrew/share/cmake/MLX/)")

# Homebrew installs MLX's CMake config to /opt/homebrew/share/cmake/MLX/, which is
# on CMake's default search path. MLX_ROOT, when set, is added as an extra hint.
find_package(MLX ${MLX_MIN_VERSION} CONFIG REQUIRED HINTS "${MLX_ROOT}")
message(STATUS "Found MLX ${MLX_VERSION} at ${MLX_LIBRARY}")

set(NEURALNET_BACKEND_SOURCES
neuralnet/mlxbackend.cpp
neuralnet/mlxwinotuner.cpp
neuralnet/mlxtests.cpp
)

# Build the KataGoSwift static library. Same lines as the METAL branch above,
# kept inline to leave the Metal branch untouched. The library exposes
# CoreMLComputeHandle to C++ via the generated KataGoSwift-swift.h.
add_library(KataGoSwift STATIC
neuralnet/metalbackend.swift
neuralnet/metallayers.swift)
_swift_generate_cxx_header(
KataGoSwift
"${CMAKE_CURRENT_BINARY_DIR}/include/KataGoSwift/KataGoSwift-swift.h")
target_include_directories(KataGoSwift PUBLIC "${CMAKE_CURRENT_BINARY_DIR}/include")
set_target_properties(KataGoSwift PROPERTIES Swift_MODULE_NAME "KataGoSwift")
target_compile_options(KataGoSwift PUBLIC
"$<$<COMPILE_LANGUAGE:Swift>:-cxx-interoperability-mode=default>")
elseif(USE_BACKEND STREQUAL "")
message(WARNING "${ColorBoldRed}WARNING: Using dummy neural net backend, intended for non-neural-net testing only, will fail on any code path requiring a neural net. To use neural net, specify -DUSE_BACKEND=CUDA or -DUSE_BACKEND=TENSORRT or -DUSE_BACKEND=OPENCL or -DUSE_BACKEND=EIGEN to compile with the respective backend.${ColorReset}")
message(WARNING "${ColorBoldRed}WARNING: Using dummy neural net backend, intended for non-neural-net testing only, will fail on any code path requiring a neural net. To use neural net, specify -DUSE_BACKEND=CUDA or -DUSE_BACKEND=TENSORRT or -DUSE_BACKEND=OPENCL or -DUSE_BACKEND=EIGEN or -DUSE_BACKEND=MLX or -DUSE_BACKEND=METAL to compile with the respective backend.${ColorReset}")
set(NEURALNET_BACKEND_SOURCES neuralnet/dummybackend.cpp)
else()
message(FATAL_ERROR "Unrecognized backend: " ${USE_BACKEND})
Expand Down Expand Up @@ -496,6 +580,10 @@ elseif(USE_BACKEND STREQUAL "EIGEN")
message(STATUS "Found Eigen3 at ${EIGEN3_INCLUDE_DIRS}")
endif()
endif()
elseif(USE_BACKEND STREQUAL "MLX")
target_compile_definitions(katago PRIVATE USE_MLX_BACKEND)
target_link_libraries(katago mlx KataGoSwift katagocoreml
${KATAGOCOREML_DEP_LDFLAGS})
endif()

if(USE_BIGGER_BOARDS_EXPENSIVE)
Expand Down
3 changes: 3 additions & 0 deletions cpp/command/benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,9 @@ int MainCmds::benchmark(const vector<string>& args) {
#endif
#ifdef USE_EIGEN_BACKEND
cout << "You are currently using the Eigen (CPU) version of KataGo. Due to having no GPU, it may be slow." << endl;
#endif
#ifdef USE_MLX_BACKEND
cout << "Your GTP config is currently set to mlxUseFP16 = " << nnEval->getUsingFP16Mode().toString() << endl;
#endif
cout << endl;
cout << "Your GTP config is currently set to use numSearchThreads = " << params.numThreads << endl;
Expand Down
41 changes: 41 additions & 0 deletions cpp/configs/analysis_example.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,47 @@ nnRandomize = true
# It defaults to min(numAnalysisThreads * numSearchThreadsPerAnalysisThread, numCPUCores).
# numEigenThreadsPerModel = X

# ------------------------------
# MLX-specific settings
# ------------------------------
# These only apply when using the MLX backend (Apple Silicon).

# MLX backend dispatch is configured via numNNServerThreadsPerModel and mlxDeviceToUseThread<N>.
# Device index values (same convention as the Metal backend):
# 0 = GPU only (MLX) - default
# 100 = ANE only (CoreML, runs on CPU + Apple Neural Engine)
# Any other value is rejected at startup. The backend-agnostic key
# `deviceToUseThread<N>` is also accepted.
#
# Mux mode: pipeline GPU and ANE server threads to overlap their forward
# passes. Set nnMaxBatchSize to roughly half of numSearchThreads.
#
# Example: mux mode (2x GPU + 2x ANE)
# numNNServerThreadsPerModel = 4
# mlxDeviceToUseThread0 = 0
# mlxDeviceToUseThread1 = 0
# mlxDeviceToUseThread2 = 100
# mlxDeviceToUseThread3 = 100
#
# Example: GPU-only mode (default)
# numNNServerThreadsPerModel = 1
# mlxDeviceToUseThread0 = 0
#
# Example: ANE-only mode (CoreML on CPU+ANE)
# numNNServerThreadsPerModel = 1
# mlxDeviceToUseThread0 = 100
#
# Default (no config): 1 server thread, GPU-only mode.

# Whether to use FP16 (half precision) for neural net evaluation on MLX.
# FP16 is faster than FP32 on Apple Silicon via the MLX Winograd path.
# The ANE is FP16-only: on an ANE thread (gpuIdx = 100) with mlxUseFP16 = false,
# CoreML falls back to CPU FP32 - correct but much slower than the GPU path.
# Set `false` only for bit-exact FP32 reproducibility.
#
# Default: auto (resolves to fp16 on MLX).
# mlxUseFP16 = auto


# Misc Behavior --------------------

Expand Down
28 changes: 28 additions & 0 deletions cpp/configs/contribute_example.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,31 @@ watchOngoingGameInFileName = watchgame.txt
# This is the number of CPU threads for evaluating the neural net on the Eigen backend.
# It defaults to numSearchThreads.
# numEigenThreadsPerModel = X

# ------------------------------
# MLX-specific settings
# ------------------------------
# These only apply when using the MLX backend (Apple Silicon).

# Per-server-thread dispatch (same convention as the Metal backend):
# 0 = GPU via MLX (default)
# 100 = ANE via CoreML (CPU + Apple Neural Engine)
# Mix in one config to pipeline GPU and ANE work. The backend-agnostic key
# `deviceToUseThread<N>` is also accepted.
#
# Example: mux mode (2x GPU + 2x ANE) - also set numNNServerThreadsPerModel = 4 above
# mlxDeviceToUseThread0 = 0
# mlxDeviceToUseThread1 = 0
# mlxDeviceToUseThread2 = 100
# mlxDeviceToUseThread3 = 100
#
# Example: ANE-only single instance
# mlxDeviceToUseThread0 = 100

# Whether to use FP16 (half precision) for neural net evaluation on MLX.
# FP16 is faster than FP32 on Apple Silicon via the MLX Winograd path.
# The ANE is FP16-only: on an ANE thread (gpuIdx = 100) with mlxUseFP16 = false,
# CoreML falls back to CPU FP32 - correct but much slower than the GPU path.
#
# Default: auto (resolves to fp16 on MLX).
# mlxUseFP16 = auto
46 changes: 46 additions & 0 deletions cpp/configs/gtp_example.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,52 @@ searchFactorWhenWinningThreshold = 0.95
# Default: numSearchThreads
# numEigenThreadsPerModel = X

# ------------------------------
# MLX-specific settings
# ------------------------------
# These only apply when using the MLX backend (Apple Silicon).

# MLX backend dispatch is configured via numNNServerThreadsPerModel and mlxDeviceToUseThread<N>.
# Device index values (same convention as the Metal backend):
# 0 = GPU only (MLX) - default
# 100 = ANE only (CoreML, runs on CPU + Apple Neural Engine)
# Any other value is rejected at startup. The backend-agnostic key
# `deviceToUseThread<N>` is also accepted if you prefer not to commit to a
# backend-specific prefix.
#
# Mux mode: pipeline GPU and ANE server threads to overlap their forward
# passes. Set nnMaxBatchSize to roughly half of numSearchThreads for best
# pipelining.
#
# Example: mux mode (2x GPU + 2x ANE)
# numNNServerThreadsPerModel = 4
# mlxDeviceToUseThread0 = 0
# mlxDeviceToUseThread1 = 0
# mlxDeviceToUseThread2 = 100
# mlxDeviceToUseThread3 = 100
#
# Example: GPU-only mode (default)
# numNNServerThreadsPerModel = 1
# mlxDeviceToUseThread0 = 0
#
# Example: ANE-only mode (CoreML on CPU+ANE; ~3 search threads is the
# observed throughput sweet spot since a single CoreML call serializes
# per batch)
# numNNServerThreadsPerModel = 1
# mlxDeviceToUseThread0 = 100
#
# Default (no config): 1 server thread, GPU-only mode.

# Whether to use FP16 (half precision) for neural net evaluation on MLX.
# FP16 is faster than FP32 on Apple Silicon via the MLX Winograd path.
# The ANE is FP16-only hardware: on an ANE thread (gpuIdx = 100) with
# mlxUseFP16 = false, CoreML falls back to CPU FP32 - correct but much
# slower than the GPU path. Set `false` only for bit-exact FP32
# reproducibility.
#
# Default: auto (resolves to fp16 on MLX).
# mlxUseFP16 = auto

# ===========================================================================
# Root move selection and biases
# ===========================================================================
Expand Down
28 changes: 28 additions & 0 deletions cpp/configs/match_example.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,34 @@ numNNServerThreadsPerModel = 1
# It defaults to numSearchThreads.
# numEigenThreadsPerModel = X

# ------------------------------
# MLX-specific settings
# ------------------------------
# These only apply when using the MLX backend (Apple Silicon).

# Per-server-thread dispatch (same convention as the Metal backend):
# 0 = GPU via MLX (default)
# 100 = ANE via CoreML (CPU + Apple Neural Engine)
# Mix in one config to pipeline GPU and ANE work. The backend-agnostic key
# `deviceToUseThread<N>` is also accepted.
#
# Example: mux mode (2x GPU + 2x ANE) - also set numNNServerThreadsPerModel = 4 above
# mlxDeviceToUseThread0 = 0
# mlxDeviceToUseThread1 = 0
# mlxDeviceToUseThread2 = 100
# mlxDeviceToUseThread3 = 100
#
# Example: ANE-only single instance
# mlxDeviceToUseThread0 = 100

# Whether to use FP16 (half precision) for neural net evaluation on MLX.
# FP16 is faster than FP32 on Apple Silicon via the MLX Winograd path.
# The ANE is FP16-only: on an ANE thread (gpuIdx = 100) with mlxUseFP16 = false,
# CoreML falls back to CPU FP32 - correct but much slower than the GPU path.
#
# Default: auto (resolves to fp16 on MLX).
# mlxUseFP16 = auto


# Root move selection and biases------------------------------------------------------------------------------
# Uncomment and edit any of the below values to change them from their default.
Expand Down
4 changes: 4 additions & 0 deletions cpp/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,8 @@ string Version::getKataGoVersionFullInfo() {
out << "Using OpenCL backend" << endl;
#elif defined(USE_EIGEN_BACKEND)
out << "Using Eigen(CPU) backend" << endl;
#elif defined(USE_MLX_BACKEND)
out << "Using MLX backend" << endl;
#else
out << "Using dummy backend" << endl;
#endif
Expand Down Expand Up @@ -282,6 +284,8 @@ string Version::getGitRevisionWithBackend() {
s += "-opencl";
#elif defined(USE_EIGEN_BACKEND)
s += "-eigen";
#elif defined(USE_MLX_BACKEND)
s += "-mlx";
#else
s += "-dummy";
#endif
Expand Down
Loading
Loading