flexflow · elliottslaughter · Mar 3, 2026 · Mar 4, 2026 · Mar 4, 2026 · Mar 4, 2026
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
@@ -0,0 +1,26 @@
+name: deploy
+on: [push, pull_request, workflow_dispatch]
+jobs:
+  deploy:
+    name: Test Deployment (GCC ${{ matrix.gcc }})
+    runs-on: ubuntu-latest
+    container: nvidia/cuda:12.9.1-cudnn-devel-ubuntu24.04
+
+    steps:
+      - name: Checkout Git Repository
+        uses: actions/checkout@v3
+
+      - name: Install Dependencies
+        run: |
+          apt-get update -qq
+          DEBIAN_FRONTEND=noninteractive apt-get install -y build-essential cmake curl gcc-${{ matrix.gcc }} g++-${{ matrix.gcc }} git libibverbs-dev libmpich-dev mpich python3 python3-venv zlib1g-dev
+
+      - name: Run Deploy Script
+        run: ./deploy/sapling.sh
+        env:
+          CC: gcc-${{ matrix.gcc }}
+          CXX: g++-${{ matrix.gcc }}
+
+    strategy:
+      matrix:
+        gcc: [10, 11]
diff --git a/.gitmodules b/.gitmodules
@@ -1,24 +0,0 @@
-[submodule "deps/nccl"]
-	path = deps/nccl
-	url = https://github.com/NVIDIA/nccl.git
-[submodule "deps/json"]
-	path = deps/json
-	url = https://github.com/nlohmann/json.git
-[submodule "deps/spdlog"]
-	path = deps/spdlog
-	url = https://github.com/gabime/spdlog.git
-[submodule "deps/rapidcheck"]
-	path = deps/rapidcheck
-	url = https://github.com/emil-e/rapidcheck.git
-[submodule "deps/doctest"]
-	path = deps/doctest
-	url = https://github.com/doctest/doctest.git
-[submodule "deps/visit_struct"]
-	path = deps/visit_struct
-	url = https://github.com/cbeck88/visit_struct.git
-[submodule "deps/expected"]
-	path = deps/expected
-	url = https://github.com/TartanLlama/expected.git
-[submodule "deps/fmt"]
-	path = deps/fmt
-	url = https://github.com/fmtlib/fmt.git

diff --git a/.proj.toml b/.proj.toml
@@ -111,6 +111,10 @@ cuda = false
 type = "bin"
 cuda = false
 
+[targets.run-model]
+type = "bin"
+cuda = false
+
 # default_build_targets = [
 #   "utils",
 # ]
@@ -124,5 +128,4 @@ cuda = false
 # ]
 
 [cmake_flags_extra]
-FF_CUDA_ARCH = "60"
 CMAKE_CUDA_ARCHITECTURES = "60"
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,25 +1,13 @@
-cmake_minimum_required(VERSION 3.10)
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
 project(FlexFlow)
 
-set(
-  CMAKE_MODULE_PATH 
-  ${CMAKE_MODULE_PATH} 
-  ${CMAKE_CURRENT_LIST_DIR}/cmake 
+list(
+  APPEND
+  CMAKE_MODULE_PATH
+  ${CMAKE_CURRENT_LIST_DIR}/cmake
   ${CMAKE_CURRENT_LIST_DIR}/cmake/Modules
 )
 
-# Detect OS type and Linux version (if it applies)
-set(LINUX_VERSION "")
-if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
-  find_program(LSB_RELEASE_EXEC lsb_release)
-  if(LSB_RELEASE_EXEC)
-    execute_process(COMMAND ${LSB_RELEASE_EXEC} -r --short 
-                    OUTPUT_VARIABLE LINUX_VERSION 
-                    OUTPUT_STRIP_TRAILING_WHITESPACE)
-    message(STATUS "Linux Version: ${LINUX_VERSION}")
-  endif()
-endif()
-
 set(FF_MAX_DIM "5" CACHE STRING "Maximum tensor order")
 set(FF_MAX_OPNAME "128" CACHE STRING "Maximum op name length")
 set(FF_MAX_NUM_OUTPUTS "256" CACHE STRING "Maximum number of outputs (per operator)")
@@ -28,13 +16,12 @@ set(FF_MAX_NUM_WEIGHTS "64" CACHE STRING "Maximum number of weights (per operato
 set(FF_MAX_NUM_FUSED_OPERATORS "64" CACHE STRING "Maximum number of fused tensors")
 set(FF_MAX_NUM_FUSED_TENSORS "64" CACHE STRING "Maximum number of input and output tensors per fused op")
 set(FF_MAX_NUM_WORKERS "1024" CACHE STRING "Maximum number of GPUs")
-set(FF_MAX_NUM_TASK_REGIONS "20" CACHE STRING 
+set(FF_MAX_NUM_TASK_REGIONS "20" CACHE STRING
   "Maximum number of regions that can be passed to a task through the TaskSpec interface")
 set(FF_MAX_NUM_TASK_ARGUMENTS "5" CACHE STRING
   "Maximum number of arguments that can be declared in a TaskSignature")
 option(FF_USE_NCCL "Run FlexFlow with NCCL" OFF)
 option(FF_USE_PREALM "Build with PRealm profiling interface" ON)
-option(FF_USE_ALL_PREBUILT_LIBRARIES "Enable use of all pre-compiled libraries, if available" OFF)
 option(FF_USE_PYTHON "Enable Python" ON)
 option(FF_BUILD_FROM_PYPI "Build from pypi" OFF)
 option(FF_USE_CODE_COVERAGE "Enable code coverage" OFF)
@@ -43,16 +30,6 @@ set(FF_GPU_BACKENDS cuda hip_cuda hip_rocm intel)
 set(FF_GPU_BACKEND "cuda" CACHE STRING "Select GPU Backend ${FF_GPU_BACKENDS}")
 set_property(CACHE FF_GPU_BACKEND PROPERTY STRINGS ${FF_GPU_BACKENDS})
 
-option(FF_USE_EXTERNAL_NCCL "Use pre-installed NCCL" OFF)
-option(FF_USE_EXTERNAL_JSON "Use pre-installed nlohmann::json" OFF)
-option(FF_USE_EXTERNAL_FMT "Use pre-installed fmt" OFF)
-option(FF_USE_EXTERNAL_SPDLOG "Use pre-installed spdlog" OFF)
-option(FF_USE_EXTERNAL_DOCTEST "Use pre-installed doctest" OFF)
-option(FF_USE_EXTERNAL_RAPIDCHECK "Use pre-installed rapidcheck" OFF)
-option(FF_USE_EXTERNAL_EXPECTED "Use pre-installed tl::expected" OFF)
-option(FF_USE_EXTERNAL_GBENCHMARK "Use pre-installed google benchmark" OFF)
-option(FF_USE_EXTERNAL_LIBASSERT "Use pre-installed libassert" OFF)
-
 option(FF_BUILD_RESNET "build resnet example" OFF)
 option(FF_BUILD_RESNEXT "build resnext example" OFF)
 option(FF_BUILD_ALEXNET "build alexnet example" OFF)
@@ -72,15 +49,7 @@ option(FF_BUILD_VISUALIZATION_TOOL "build substitution visualization tool" ON)
 option(FF_BUILD_SP_IZATION_BENCHMARKING "build sp-ization benchmarking" ON)
 option(FF_BUILD_ARG_PARSER "build command line argument parser" OFF)
 option(FF_BUILD_BIN_EXPORT_MODEL_ARCH "build export-model-arch utility" ON)
-
-set(FF_CUDA_ARCH "autodetect" CACHE STRING "Target CUDA Arch")
-if (FF_CUDA_ARCH STREQUAL "")
-  message(FATAL_ERROR "FF_CUDA_ARCH cannot be an empty string. Set it to `autodetect`, `all`, or pass one or multiple valid CUDA archs.")
-endif()
-
-if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
-  set(LIBEXT ".so")
-endif()
+option(FF_BUILD_BIN_RUN_MODEL "build run-model binary" ON)
 
 include(cuda)
 include(cudnn)

diff --git a/bin/CMakeLists.txt b/bin/CMakeLists.txt
@@ -13,3 +13,7 @@ endif()
 if(FF_BUILD_BIN_EXPORT_MODEL_ARCH)
   add_subdirectory(export-model-arch)
 endif()
+
+if(FF_BUILD_BIN_RUN_MODEL)
+  add_subdirectory(run-model)
+endif()
diff --git a/bin/run-model/CMakeLists.txt b/bin/run-model/CMakeLists.txt
@@ -0,0 +1,10 @@
+ff_add_executable(
+  NAME
+    run-model
+  SRC_PATTERNS
+    src/*.cc
+  PRIVATE_INCLUDE
+    include/
+  DEPS
+    realm-execution
+)
diff --git a/bin/run-model/src/run-model/main.cc b/bin/run-model/src/run-model/main.cc
@@ -0,0 +1,117 @@
+#include "pcg/file_format/v1/v1_mapped_parallel_computation_graph.h"
+#include "pcg/mapped_parallel_computation_graph/mapped_parallel_computation_graph.dtg.h"
+#include "realm-execution/distributed_ff_handle.h"
+#include "realm-execution/pcg_instance.h"
+#include "realm-execution/realm_context.h"
+#include "realm-execution/realm_manager.h"
+#include "utils/cli/cli_get_help_message.h"
+#include "utils/cli/cli_parse.h"
+#include "utils/cli/cli_parse_result.h"
+#include "utils/cli/cli_spec.h"
+#include "utils/nonnegative_int/nonnegative_int.h"
+#include "utils/positive_int/positive_int.h"
+#include <fstream>
+#include <string_view>
+
+using namespace FlexFlow;
+
+static char *leak_string_contents(std::string_view str) {
+  // Realm's command line wants mutable char*; hand back a NUL-terminated
+  // heap copy of `str` that is intentionally never freed
+  char *buffer = new char[str.size() + 1];
+  buffer[str.copy(buffer, str.size())] = '\0';
+  return buffer;
+}
+
+static std::vector<char *> make_realm_args(std::string_view executable_name) {
+  // argv[0] for Realm: a leaked, NUL-terminated copy of the executable name
+  std::vector<char *> realm_args{leak_string_contents(executable_name)};
+  return realm_args;
+}
+
+// Parses the CLI, loads the mapped PCG from JSON, and runs it under Realm.
+int main(int argc, char **argv) {
+  CLISpec cli = empty_cli_spec();
+  CLIArgumentKey arg_key_help = cli_add_help_flag(cli);
+  CLIArgumentKey key_mapped_pcg_json = cli_add_positional_argument(
+      cli,
+      CLIPositionalArgumentSpec{
+          "mapped_pcg_json",
+          std::nullopt,
+          "path to a file containing mapped PCG encoded as JSON"});
+
+  ASSERT(argc >= 1);
+  std::string prog_name = argv[0];
+
+  CLIParseResult parsed = ({
+    tl::expected<CLIParseResult, std::string> result =
+        cli_parse(cli, argc, argv);
+    if (!result.has_value()) {
+      std::string error_msg = result.error();
+      std::cerr << cli_get_help_message(prog_name, cli) << std::endl;
+      std::cerr << "error: " << error_msg << std::endl;
+      return 1;
+    }
+
+    result.value();
+  });
+
+  if (cli_get_flag(parsed, arg_key_help)) {
+    std::cerr << cli_get_help_message(prog_name, cli);
+    return 1;
+  }
+
+  std::string mapped_pcg_json = cli_get_argument(parsed, key_mapped_pcg_json);
+
+  // Realm only receives the executable name, not the arguments parsed above
+  std::vector<char *> realm_args = make_realm_args(prog_name);
+  int realm_argc = realm_args.size();
+  char **realm_argv = realm_args.data();
+  RealmManager manager(&realm_argc, &realm_argv);
+
+  ControllerTaskResult result = manager.start_controller([&](RealmContext
+                                                                 &ctx) {
+    MappedParallelComputationGraph mpcg = [&]() {
+      std::ifstream f(mapped_pcg_json);
+      // Fail with the path rather than an opaque JSON parse error
+      ASSERT(f.is_open(), "failed to open mapped PCG file", mapped_pcg_json);
+      nlohmann::json mpcg_json = nlohmann::json::parse(f);
+      return from_v1(mpcg_json.get<V1MappedParallelComputationGraph>());
+    }();
+
+    // instantiate computation graph
+    OptimizerAttrs optimizer_attrs =
+        OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001,
+                                         /*momentum=*/0.9,
+                                         /*nesterov=*/false,
+                                         /*weight_decay=*/0.001}};
+
+    std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> input_tensors;
+
+    DistributedFfHandle device_handle =
+        create_distributed_ff_handle(ctx,
+                                     /*workSpaceSize=*/1024 * 1024,
+                                     /*allowTensorOpMathConversion=*/true);
+
+    PCGInstance pcg_instance = create_pcg_instance(
+        /*ctx=*/ctx,
+        /*mpcg=*/mpcg,
+        /*optimizer=*/optimizer_attrs,
+        /*loss=*/std::nullopt,
+        /*input_tensors=*/input_tensors,
+        /*profiling_settings=*/ProfilingSettings{0, 0},
+        /*device_handle=*/device_handle);
+
+    // begin training loop
+    int num_epochs = 5;
+    for (int i = 0; i < num_epochs; i++) {
+      perform_all_passes_for_pcg_instance(
+          /*instance=*/pcg_instance,
+          /*profiling_settings=*/ProfilingSettings{0, 1},
+          /*device_handle=*/device_handle);
+    }
+  });
+  result.wait();
+
+  return 0;
+}
diff --git a/bin/sp-ization-benchmarking/CMakeLists.txt b/bin/sp-ization-benchmarking/CMakeLists.txt
@@ -5,6 +5,8 @@ ff_add_executable(
     include/
   SRC_PATTERNS
     *.cc
+  PRIVATE_INCLUDE
+    include/
   DEPS
     utils
     rapidcheck

diff --git a/cmake/Modules/CodeCoverage.cmake b/cmake/Modules/CodeCoverage.cmake
@@ -161,7 +161,7 @@ foreach(LANG ${LANGUAGES})
     endif()
   elseif(NOT "${CMAKE_${LANG}_COMPILER_ID}" MATCHES "GNU"
          AND NOT "${CMAKE_${LANG}_COMPILER_ID}" MATCHES "(LLVM)?[Ff]lang")
-    message(FATAL_ERROR "Compiler is not GNU or Flang! Aborting...")
+    message(FATAL_ERROR "Compiler ${CMAKE_${LANG}_COMPILER_ID} is not GNU or Flang! Aborting...")
   endif()
 endforeach()
 
@@ -748,4 +748,4 @@ function(append_coverage_compiler_flags_to_target name)
     if(CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
         target_link_libraries(${name} PRIVATE gcov)
     endif()
-endfunction()
+endfunction()