diff --git a/.flake/pkgs/legion.nix b/.flake/pkgs/legion.nix
deleted file mode 100644
index 361a66c4ff..0000000000
--- a/.flake/pkgs/legion.nix
+++ /dev/null
@@ -1,48 +0,0 @@
-{ lib
-, stdenv
-, fetchFromGitLab
-, cmake
-, cudaPackages ? { }
-, cudaCapabilities ? [ "60" "70" "80" "86" ]
-, maxDim ? 5
-}:
-
-# from https://codeberg.org/Uli/nix-things/src/commit/776519e382c81b136c1d0b10d8c7b52b4acb9192/overlays/cq/python/libclang-python.nix
-
-let 
-  cmakeFlag = x: if x then "1" else "0";
-
-  inherit (cudaPackages) cudatoolkit;
-in
-
-stdenv.mkDerivation rec {
-  pname = "legion";
-  version = "2025-01-06";
-
-  src = fetchFromGitLab {
-    owner = "StanfordLegion";
-    repo = "legion";
-    rev = "7be1abd0207eb1126c7629b16d1123fa6f58ce9d";
-    sha256 = "sha256-gTjnGYYTQwTsrV1WcY0qqpTrlwbzAPcndurRy6XnG8A=";
-  };
-
-  nativeBuildInputs = [
-    cmake
-  ];
-
-  cmakeFlags = [
-    "-DLegion_USE_CUDA=1"
-    "-DLegion_CUDA_ARCH=${lib.concatStringsSep "," cudaCapabilities}"
-    "-DLegion_MAX_DIM=${toString maxDim}"
-  ];
-
-  buildInputs = [ 
-    cudatoolkit
-  ];
-
-  meta = with lib; {
-    description = "Legion is a parallel programming model for distributed, heterogeneous machines";
-    homepage = "https://legion.stanford.edu/";
-    license = licenses.asl20;
-  };
-}
diff --git a/.flake/pkgs/realm.nix b/.flake/pkgs/realm.nix
new file mode 100644
index 0000000000..336b1c050c
--- /dev/null
+++ b/.flake/pkgs/realm.nix
@@ -0,0 +1,46 @@
+{ lib
+, stdenv
+, fetchFromGitHub
+, cmake
+, cudaPackages ? { }
+, zlib
+, maxDim ? 5
+}:
+
+let
+  inherit (cudaPackages) cudatoolkit;
+in
+
+stdenv.mkDerivation rec {
+  pname = "realm";
+  version = "2026-02-24";
+
+  src = fetchFromGitHub {
+    owner = "StanfordLegion";
+    repo = "realm";
+    rev = "42f7484a80e0bdacaf47d9a758822f5327348dd0";
+    sha256 = "sha256-IHiokPmTjEV5df3fr1Xubuyt2N1CFI2fA7Q2TsbxS3Y=";
+  };
+
+  nativeBuildInputs = [
+    cmake
+  ];
+
+  cmakeFlags = [
+    "-DBUILD_SHARED_LIBS=ON"
+    "-DREALM_ENABLE_CUDA=ON"
+    "-DREALM_ENABLE_PREALM=ON"
+    "-DREALM_MAX_DIM=${toString maxDim}"
+  ];
+
+  buildInputs = [
+    cudatoolkit
+    zlib
+  ];
+
+  meta = with lib; {
+    description = "Realm is a distributed, event–based tasking runtime for building high-performance applications that span clusters of CPUs, GPUs, and other accelerators";
+    homepage = "https://legion.stanford.edu/realm";
+    license = licenses.asl20;
+  };
+}
diff --git a/.proj.toml b/.proj.toml
index 38690f710b..5dbbfbcdd7 100644
--- a/.proj.toml
+++ b/.proj.toml
@@ -85,6 +85,13 @@ has-cpu-only-benchmarks = false
 has-cuda-tests = true
 has-cuda-benchmarks = false
 
+[targets.realm-execution]
+type = "lib"
+has-cpu-only-tests = true
+has-cpu-only-benchmarks = false
+has-cuda-tests = true
+has-cuda-benchmarks = false
+
 # [targets.local-pcg-execution]
 # type = "lib"
 # has-cpu-only-tests = true
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c2239cdcb0..df60e24d72 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,6 +33,7 @@ set(FF_MAX_NUM_TASK_REGIONS "20" CACHE STRING
 set(FF_MAX_NUM_TASK_ARGUMENTS "5" CACHE STRING
   "Maximum number of arguments that can be declared in a TaskSignature")
 option(FF_USE_NCCL "Run FlexFlow with NCCL" OFF)
+option(FF_USE_PREALM "Build with PRealm profiling interface" ON)
 option(FF_USE_ALL_PREBUILT_LIBRARIES "Enable use of all pre-compiled libraries, if available" OFF)
 option(FF_USE_PYTHON "Enable Python" ON)
 option(FF_BUILD_FROM_PYPI "Build from pypi" OFF)
diff --git a/cmake/flexflow-utils.cmake b/cmake/flexflow-utils.cmake
index ef5d6d9d11..795668e32a 100644
--- a/cmake/flexflow-utils.cmake
+++ b/cmake/flexflow-utils.cmake
@@ -17,6 +17,7 @@ function(define_ff_vars target)
     MAX_NUM_FUSED_TENSORS=${FF_MAX_NUM_FUSED_TENSORS}
     MAX_NUM_WORKERS=${FF_MAX_NUM_WORKERS}
     FF_USE_NCCL=${FF_USE_NCCL}
+    FF_USE_PREALM=${FF_USE_PREALM}
     MAX_TENSOR_DIM=${FF_MAX_DIM}
     MAX_NUM_TASK_REGIONS=${FF_MAX_NUM_TASK_REGIONS}
     MAX_NUM_TASK_ARGUMENTS=${FF_MAX_NUM_TASK_ARGUMENTS}
diff --git a/flake.nix b/flake.nix
index 6ccd5616cd..dad0e2fc32 100644
--- a/flake.nix
+++ b/flake.nix
@@ -30,8 +30,8 @@
     };
   };
 
-  outputs = { self, nixpkgs, flake-utils, proj-repo, nixGL, ... }: flake-utils.lib.eachSystem [ "x86_64-linux" ] (system: 
-    let 
+  outputs = { self, nixpkgs, flake-utils, proj-repo, nixGL, ... }: flake-utils.lib.eachSystem [ "x86_64-linux" ] (system:
+    let
       pkgs = import nixpkgs {
         inherit system;
         config.allowUnfree = true;
@@ -41,21 +41,21 @@
       mkShell = attrs: pkgs.mkShell.override {
         stdenv = pkgs.cudaPackages.backendStdenv;
       } (attrs // {
-        hardeningDisable = ["all"]; # disable nixpkgs default compiler arguments, otherwise ubsan doesn't catch 
-                                    # signed overflows due to the signedoverflow hardening setting. 
-                                    # for more details, see the following (long-running) nixpkgs github issues: 
+        hardeningDisable = ["all"]; # disable nixpkgs default compiler arguments, otherwise ubsan doesn't catch
+                                    # signed overflows due to the signedoverflow hardening setting.
+                                    # for more details, see the following (long-running) nixpkgs github issues:
                                     # - https://github.com/NixOS/nixpkgs/issues/18995
                                     # - https://github.com/NixOS/nixpkgs/issues/60919
       });
 
       proj = proj-repo.packages.${system}.proj;
-    in 
+    in
     {
       packages = rec {
         libdwarf-lite = pkgs.callPackage ./.flake/pkgs/libdwarf-lite.nix { };
         cpptrace = pkgs.callPackage ./.flake/pkgs/cpptrace.nix { inherit libdwarf-lite; };
         libassert = pkgs.callPackage ./.flake/pkgs/libassert.nix { inherit cpptrace; };
-        legion = pkgs.callPackage ./.flake/pkgs/legion.nix { };
+        realm = pkgs.callPackage ./.flake/pkgs/realm.nix { };
         bencher-cli = pkgs.callPackage ./.flake/pkgs/bencher-cli.nix { };
         ffdb = pkgs.callPackage ./.flake/pkgs/ffdb { inherit proj; };
         hpp2plantuml = pkgs.python3Packages.callPackage ./.flake/pkgs/hpp2plantuml.nix { };
@@ -83,8 +83,7 @@
           shellHook = ''
             export PATH="$HOME/ff/.scripts/:$PATH"
             export RC_PARAMS="max_discard_ratio=100"
-            export CMAKE_FLAGS="-DFF_USE_EXTERNAL_LEGION=ON \
-                                -DFF_USE_EXTERNAL_NCCL=ON \
+            export CMAKE_FLAGS="-DFF_USE_EXTERNAL_NCCL=ON \
                                 -DFF_USE_EXTERNAL_JSON=ON \
                                 -DFF_USE_EXTERNAL_FMT=ON \
                                 -DFF_USE_EXTERNAL_SPDLOG=ON \
@@ -94,7 +93,7 @@
                                 -DFF_USE_EXTERNAL_GBENCHMARK=ON \
                                 -DFF_USE_EXTERNAL_LIBASSERT=ON"
           '';
-          
+
           buildInputs = builtins.concatLists [
             (with pkgs; [
               zlib
@@ -125,7 +124,7 @@
             ])
             (with self.packages.${system}; [
               libassert
-              legion
+              realm
               rapidcheckFull
               doctest
             ])
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 2e71e577c0..cb3bd6d6ae 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -5,6 +5,7 @@ add_subdirectory(op-attrs)
 add_subdirectory(kernels)
 add_subdirectory(local-execution)
 add_subdirectory(local-pcg-execution)
+add_subdirectory(realm-execution)
 add_subdirectory(task-spec)
 add_subdirectory(utils)
 add_subdirectory(ffi)
diff --git a/lib/kernels/include/kernels/device_handle_t.h b/lib/kernels/include/kernels/device_handle_t.h
index 9b7769355e..0836503717 100644
--- a/lib/kernels/include/kernels/device_handle_t.h
+++ b/lib/kernels/include/kernels/device_handle_t.h
@@ -9,6 +9,9 @@ namespace FlexFlow {
 device_handle_t device_handle_t_from_managed_handle(
     std::optional<ManagedPerDeviceFFHandle> const &managed_handle);
 
+device_handle_t device_handle_t_from_managed_handle_ptr(
+    std::optional<ManagedPerDeviceFFHandle *> const &managed_handle);
+
 device_handle_t gpu_make_device_handle_t(PerDeviceFFHandle const &ff_handle);
 device_handle_t cpu_make_device_handle_t();
 
diff --git a/lib/kernels/src/kernels/device_handle_t.cc b/lib/kernels/src/kernels/device_handle_t.cc
index 85f9e2a388..0225ee8e94 100644
--- a/lib/kernels/src/kernels/device_handle_t.cc
+++ b/lib/kernels/src/kernels/device_handle_t.cc
@@ -11,6 +11,15 @@ device_handle_t device_handle_t_from_managed_handle(
   }
 }
 
+device_handle_t device_handle_t_from_managed_handle_ptr(
+    std::optional<ManagedPerDeviceFFHandle *> const &managed_handle) {
+  if (!managed_handle.has_value()) {
+    return cpu_make_device_handle_t();
+  }
+  // A handle pointer is present: wrap the raw handle it exposes.
+  return gpu_make_device_handle_t(managed_handle.value()->raw_handle());
+}
+
 device_handle_t gpu_make_device_handle_t(PerDeviceFFHandle const &ff_handle) {
   return device_handle_t{
       ff_handle,
diff --git a/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc b/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc
index e251fafe5f..40d9b187c4 100644
--- a/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc
+++ b/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc
@@ -81,7 +81,8 @@ ComputationGraphInstance create_computation_graph_instance(
     auto [loss_inserted_dg, label_v, logit_grad_v] = perform_loss_insertion(
         dg,
         assert_unwrap(loss_attrs),
-        dynamic_tensor_guid_t{assert_unwrap(logit_tensor)});
+        dynamic_tensor_guid_t{assert_unwrap(logit_tensor)},
+        std::nullopt);
     dg = loss_inserted_dg;
     logit_grad_value = logit_grad_v;
     inputs.insert(std::pair{label_v, assert_unwrap(label_tensor)});
diff --git a/lib/local-execution/test/src/local-execution/test_e2e.cc b/lib/local-execution/test/src/local-execution/test_e2e.cc
index a74d165a31..615ba204cf 100644
--- a/lib/local-execution/test/src/local-execution/test_e2e.cc
+++ b/lib/local-execution/test/src/local-execution/test_e2e.cc
@@ -21,8 +21,8 @@
 
 using namespace ::FlexFlow;
 
-bool did_loss_decrease(GenericTensorAccessorR const &first_epoch,
-                       GenericTensorAccessorR const &last_epoch) {
+static bool did_loss_decrease(GenericTensorAccessorR const &first_epoch,
+                              GenericTensorAccessorR const &last_epoch) {
   Allocator cpu_allocator = create_local_cpu_memory_allocator();
 
   return tensor_accessor_all(
diff --git a/lib/pcg/include/pcg/layer_guid_t.dtg.toml b/lib/pcg/include/pcg/layer_guid_t.dtg.toml
index d73cf547da..2f2f7694a0 100644
--- a/lib/pcg/include/pcg/layer_guid_t.dtg.toml
+++ b/lib/pcg/include/pcg/layer_guid_t.dtg.toml
@@ -6,6 +6,7 @@ features = [
   "ord",
   "hash",
   "fmt",
+  "json",
 ]
 
 includes = [
diff --git a/lib/pcg/include/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.h b/lib/pcg/include/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.h
index 5b1cad5e99..ebfdefa478 100644
--- a/lib/pcg/include/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.h
+++ b/lib/pcg/include/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.h
@@ -5,6 +5,7 @@
 #include "pcg/machine_space_coordinate.dtg.h"
 #include "pcg/mapped_parallel_computation_graph/operator_atomic_task_shard_binding.dtg.h"
 #include "utils/bidict/bidict.h"
+#include <nlohmann/json.hpp>
 
 namespace FlexFlow {
 
@@ -45,4 +46,15 @@ struct hash<::FlexFlow::MappedOperatorTaskGroup> {
 };
 
 } // namespace std
+
+namespace nlohmann {
+
+template <>
+struct adl_serializer<::FlexFlow::MappedOperatorTaskGroup> {
+  static ::FlexFlow::MappedOperatorTaskGroup from_json(json const &j);
+  static void to_json(json &j, ::FlexFlow::MappedOperatorTaskGroup const &t);
+};
+
+} // namespace nlohmann
+
 #endif
diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
index 25dc0721cd..21f33f6d3d 100644
--- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
+++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
@@ -32,6 +32,10 @@ ParallelLayerAddedResult add_parallel_layer(
 ParallelLayerAddedResult pcg_add_input_layer(ParallelComputationGraph &pcg,
                                              TensorShape const &tensor_shape);
 
+ParallelLayerAddedResult
+    pcg_add_input_layer_with_grad(ParallelComputationGraph &pcg,
+                                  TensorShape const &tensor_shape);
+
 OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg,
                                           parallel_layer_guid_t const &layer);
 
@@ -54,6 +58,9 @@ std::unordered_map<TensorSlotName, ParallelComputationGraphEdge>
 std::unordered_set<parallel_layer_guid_t>
     get_initial_layers(ParallelComputationGraph const &);
 
+std::unordered_map<TensorSlotName, parallel_tensor_guid_t>
+    get_outgoing_tensors(ParallelComputationGraph const &,
+                         parallel_layer_guid_t const &);
 std::unordered_map<TensorSlotName, parallel_tensor_guid_t>
     get_incoming_tensors(ParallelComputationGraph const &,
                          parallel_layer_guid_t const &);
@@ -107,6 +114,9 @@ ParallelTensorShape get_parallel_tensor_shape(ParallelComputationGraph const &,
 std::vector<parallel_layer_guid_t>
     topological_ordering(ParallelComputationGraph const &);
 
+std::unordered_map<parallel_layer_guid_t, ParallelLayerAttrs>
+    get_parallel_layer_attrs_mapping(ParallelComputationGraph const &pcg);
+
 parallel_layer_guid_t
     get_parallel_layer_by_name(ParallelComputationGraph const &pcg,
                                std::string const &name);
diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.toml b/lib/pcg/include/pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.toml
index 618bcb0dc4..292b361fc8 100644
--- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.toml
+++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.toml
@@ -6,6 +6,7 @@ features = [
   "ord",
   "hash",
   "fmt",
+  "json",
 ]
 
 includes = [
diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.toml b/lib/pcg/include/pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.toml
index 4494a31ac2..2710a15664 100644
--- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.toml
+++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.toml
@@ -6,6 +6,7 @@ features = [
   "ord",
   "hash",
   "fmt",
+  "json",
 ]
 
 includes = [
diff --git a/lib/pcg/include/pcg/tensor_guid_t.dtg.toml b/lib/pcg/include/pcg/tensor_guid_t.dtg.toml
index 151f7b1f0f..e8caf0021f 100644
--- a/lib/pcg/include/pcg/tensor_guid_t.dtg.toml
+++ b/lib/pcg/include/pcg/tensor_guid_t.dtg.toml
@@ -6,6 +6,7 @@ features = [
   "ord",
   "hash",
   "fmt",
+  "json",
 ]
 
 includes = [
diff --git a/lib/pcg/src/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.cc b/lib/pcg/src/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.cc
index b96a447383..4436efd727 100644
--- a/lib/pcg/src/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.cc
+++ b/lib/pcg/src/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.cc
@@ -90,3 +90,20 @@ size_t hash<::FlexFlow::MappedOperatorTaskGroup>::operator()(
 }
 
 } // namespace std
+
+namespace nlohmann {
+
+::FlexFlow::MappedOperatorTaskGroup
+    adl_serializer<::FlexFlow::MappedOperatorTaskGroup>::from_json(
+        json const &j) {
+  return ::FlexFlow::MappedOperatorTaskGroup{j.template get<
+      ::FlexFlow::bidict<::FlexFlow::MachineSpaceCoordinate,
+                         ::FlexFlow::OperatorAtomicTaskShardBinding>>()};
+}
+
+void adl_serializer<::FlexFlow::MappedOperatorTaskGroup>::to_json(
+    json &j, ::FlexFlow::MappedOperatorTaskGroup const &t) {
+  j = t.get_shard_bindings();
+}
+
+} // namespace nlohmann
diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc
index f83628b8e1..959747dbc7 100644
--- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc
+++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc
@@ -142,6 +142,27 @@ ParallelLayerAddedResult pcg_add_input_layer(ParallelComputationGraph &pcg,
                             });
 }
 
+ParallelLayerAddedResult
+    pcg_add_input_layer_with_grad(ParallelComputationGraph &pcg,
+                                  TensorShape const &tensor_shape) {
+  ParallelLayerAttrs layer_attrs = ParallelLayerAttrs{
+      /*op_attrs=*/PCGOperatorAttrs{InputAttrs{tensor_shape}},
+      /*name=*/std::nullopt,
+  };
+
+  return add_parallel_layer(/*pcg=*/pcg,
+                            /*layer_attrs=*/layer_attrs,
+                            /*inputs=*/{},
+                            /*weights=*/{},
+                            /*output_flags=*/
+                            std::unordered_map<TensorSlotName, CreateGrad>{
+                                {
+                                    TensorSlotName::OUTPUT,
+                                    CreateGrad::YES,
+                                },
+                            });
+}
+
 OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg,
                                           parallel_layer_guid_t const &layer) {
   PCGOperatorAttrs op_attrs = pcg_get_op_attrs(pcg, layer);
@@ -212,6 +233,16 @@ std::unordered_set<parallel_layer_guid_t>
                    [](Node const &n) { return parallel_layer_guid_t{n}; });
 }
 
+std::unordered_map<TensorSlotName, parallel_tensor_guid_t>
+    get_outgoing_tensors(ParallelComputationGraph const &pcg,
+                         parallel_layer_guid_t const &l) {
+  return map_values(get_outgoing_kwarg_dataflow_outputs_for_node(
+                        pcg.raw_graph, l.raw_graph_node),
+                    [](KwargDataflowOutput<TensorSlotName> const &o) {
+                      return parallel_tensor_guid_t{o};
+                    });
+}
+
 std::unordered_map<TensorSlotName, parallel_tensor_guid_t>
     get_incoming_tensors(ParallelComputationGraph const &pcg,
                          parallel_layer_guid_t const &l) {
@@ -378,6 +409,17 @@ std::vector<parallel_layer_guid_t>
                    [](Node const &n) { return parallel_layer_guid_t{n}; });
 }
 
+std::unordered_map<parallel_layer_guid_t, ParallelLayerAttrs>
+    get_parallel_layer_attrs_mapping(ParallelComputationGraph const &pcg) {
+  // Collects the attributes of every layer returned by get_parallel_layers,
+  // keyed by the layer's guid.
+  std::unordered_map<parallel_layer_guid_t, ParallelLayerAttrs> result;
+  for (parallel_layer_guid_t const &l : get_parallel_layers(pcg)) {
+    result.emplace(l, get_parallel_layer_attrs(pcg, l));
+  }
+  return result;
+}
+
 parallel_layer_guid_t
     get_parallel_layer_by_name(ParallelComputationGraph const &pcg,
                                std::string const &name) {
diff --git a/lib/pcg/test/src/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.cc b/lib/pcg/test/src/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.cc
new file mode 100644
index 0000000000..1c3667afc7
--- /dev/null
+++ b/lib/pcg/test/src/pcg/mapped_parallel_computation_graph/mapped_operator_task_group.cc
@@ -0,0 +1,42 @@
+#include "pcg/mapped_parallel_computation_graph/mapped_operator_task_group.h"
+#include "op-attrs/parallel_tensor_space_coordinate.dtg.h"
+#include "op-attrs/tensor_slot_name.dtg.h"
+#include "pcg/device_type.dtg.h"
+#include "pcg/machine_space_coordinate.dtg.h"
+#include "pcg/mapped_parallel_computation_graph/operator_atomic_task_shard_binding.dtg.h"
+#include <doctest/doctest.h>
+#include <nlohmann/json.hpp>
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("adl_serializer<MappedOperatorTaskGroup>") {
+    bidict<MachineSpaceCoordinate, OperatorAtomicTaskShardBinding>
+        shard_bindings{
+            {MachineSpaceCoordinate{0_n, 0_n, DeviceType::CPU},
+             OperatorAtomicTaskShardBinding{
+                 {
+                     {TensorSlotName::INPUT,
+                      ParallelTensorSpaceCoordinate{
+                          0_n, 0_n, FFOrdered{1_n, 2_n, 3_n}}},
+                 },
+             }},
+        };
+    MappedOperatorTaskGroup deserialized{shard_bindings};
+    nlohmann::json serialized = shard_bindings;
+
+    SUBCASE("to_json") {
+      nlohmann::json result = deserialized;
+      nlohmann::json correct = serialized;
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("from_json") {
+      MappedOperatorTaskGroup result = serialized;
+      MappedOperatorTaskGroup correct = deserialized;
+
+      CHECK(result == correct);
+    }
+  }
+}
diff --git a/lib/realm-execution/CMakeLists.txt b/lib/realm-execution/CMakeLists.txt
new file mode 100644
index 0000000000..08676525e1
--- /dev/null
+++ b/lib/realm-execution/CMakeLists.txt
@@ -0,0 +1,22 @@
+ff_add_library(
+  NAME
+    realm-execution
+  SRC_PATTERNS
+    src/*.cc
+  PUBLIC_INCLUDE
+    include/
+  PRIVATE_INCLUDE
+    src/
+  DEPS
+    compiler
+    kernels
+    local-execution
+    op-attrs
+    pcg
+    spdlog
+    task-spec
+    utils
+    Realm::Realm
+)
+
+add_subdirectory(test)
diff --git a/lib/realm-execution/README.md b/lib/realm-execution/README.md
new file mode 100644
index 0000000000..1454c7eac8
--- /dev/null
+++ b/lib/realm-execution/README.md
@@ -0,0 +1,32 @@
+The Realm backend for distributed execution.
+
+This is a single-controller implementation. That means the controller (the task that launches all other work) runs on a single node and remotely launches work onto other nodes. Aside from caveats mentioned below, this implementation is (mostly) capable of distributed execution.
+
+Major components:
+
+* `PCGInstance`: the main public interface for the Realm backend. It takes a mapped PCG and lowers it through the dynamic graph to get the fully-specified execution order of tasks to be executed. Besides the usual dynamic graph passes (pass expansion, update insertion, shard expansion), this class also tracks the allocation of Realm instances for tensors.
+* `RealmManager`: manages the initialization and shutdown of the Realm runtime. Provides the interface to launch the controller that runs the rest of the computation.
+* `RealmContext`: an interface that wraps the rest of Realm and protects against certain classes of bugs, such as shutdown bugs. **Do NOT call Realm directly unless you know what you are doing.**
+* `tasks/`: the Realm task implementations and their supporting infrastructure.
+  * `impl/`: the actual bodies of Realm tasks, along with interfaces to call them, and the serialization infrastructure for their arguments.
+  * `serializer/`: additional support for serializing Realm data types.
+  * `realm_task_registry.h`: manages the registration of Realm tasks. All Realm tasks go through this interface.
+  * `task_id_t.h` and `realm_task_id_t.h`: types to represent Realm tasks, along with an encoding to Realm's native task ID type.
+
+Other components used mainly within `PCGInstance`:
+
+ * `DistributedDeviceHandle`: represents a distributed device handle (i.e., device handles on all the GPUs on the system), for convenience.
+ * `DependencySet`: tracks dependencies during execution of tasks.
+ * `distributed_device_state_initialization.h`: performs device state initialization of dynamic graph nodes and returns the resulting `PerDeviceOpStateBacking`.
+ * `instance_allocation.h`: allocates instances for tensors in the dynamic graph and returns the resulting `TensorInstanceBacking`.
+
+TODO list:
+
+* external instances
+* copies
+* task fusion
+* parallel operator implementation (partition, reduce, gather, etc.)
+* fused parallel operators (e.g., reduce + broadcast = allreduce)
+* memory-optimizing compiler integration (tensor creation/destruction, tensor reuse)
+* control replication
+* Realm subgraphs
diff --git a/lib/realm-execution/include/realm-execution/atomic_dependency_set.h b/lib/realm-execution/include/realm-execution/atomic_dependency_set.h
new file mode 100644
index 0000000000..da6ba86638
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/atomic_dependency_set.h
@@ -0,0 +1,27 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_ATOMIC_DEPENDENCY_SET_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_ATOMIC_DEPENDENCY_SET_H
+
+#include "realm-execution/realm.h"
+#include <vector>
+
+namespace FlexFlow {
+
+struct AtomicDependencySet {
+public:
+  AtomicDependencySet() = delete;
+  explicit AtomicDependencySet(Realm::Event precondition);
+
+  void add_writer(Realm::Event writer);
+  void add_reader(Realm::Event reader);
+
+  Realm::Event get_dependency_for_writer() const;
+  Realm::Event get_dependency_for_reader() const;
+
+private:
+  Realm::Event writer;
+  std::vector<Realm::Event> readers;
+};
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/dependency_set.h b/lib/realm-execution/include/realm-execution/dependency_set.h
new file mode 100644
index 0000000000..629a40e2e7
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/dependency_set.h
@@ -0,0 +1,34 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DEPENDENCY_SET_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DEPENDENCY_SET_H
+
+#include "realm-execution/atomic_dependency_set.h"
+#include "realm-execution/realm.h"
+#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
+#include <unordered_map>
+
+namespace FlexFlow {
+
+struct DependencySet {
+public:
+  DependencySet() = delete;
+  explicit DependencySet(Realm::Event precondition);
+
+  void add_writer(DynamicValueAttrs const &value, Realm::Event writer);
+  void add_reader(DynamicValueAttrs const &value, Realm::Event reader);
+
+  Realm::Event get_dependency_for_writer(DynamicValueAttrs const &value) const;
+  Realm::Event get_dependency_for_reader(DynamicValueAttrs const &value) const;
+
+private:
+  AtomicDependencySet &
+      get_atomic_dependency_set(DynamicValueAttrs const &value);
+
+private:
+  Realm::Event precondition;
+  std::unordered_map<DynamicValueAttrs, AtomicDependencySet>
+      atomic_dependencies;
+};
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h b/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h
new file mode 100644
index 0000000000..9a42861fcd
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/device_specific_managed_per_device_ff_handle.h
@@ -0,0 +1,23 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DEVICE_SPECIFIC_MANAGED_PER_DEVICE_FF_HANDLE_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DEVICE_SPECIFIC_MANAGED_PER_DEVICE_FF_HANDLE_H
+
+#include "kernels/device_handle_t.dtg.h"
+#include "kernels/managed_per_device_ff_handle.h"
+#include "pcg/device_id_t.dtg.h"
+#include "realm-execution/device_specific_ptr.h"
+#include <optional>
+
+namespace FlexFlow {
+
+using DeviceSpecificManagedPerDeviceFFHandle =
+    DeviceSpecificPtr<ManagedPerDeviceFFHandle>;
+
+DeviceSpecificManagedPerDeviceFFHandle make_device_specific_managed_handle(
+    device_id_t const &, std::optional<ManagedPerDeviceFFHandle *> const &);
+
+device_handle_t device_handle_t_from_device_specific_managed_handle(
+    DeviceSpecificManagedPerDeviceFFHandle const &, device_id_t);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/device_specific_ptr.h b/lib/realm-execution/include/realm-execution/device_specific_ptr.h
new file mode 100644
index 0000000000..590b7dbc74
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/device_specific_ptr.h
@@ -0,0 +1,42 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DEVICE_SPECIFIC_PTR_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DEVICE_SPECIFIC_PTR_H
+
+#include "pcg/device_id_t.dtg.h"
+#include <libassert/assert.hpp>
+#include <optional>
+
+namespace FlexFlow {
+
+// Pairs an optional raw pointer with the device it was created for. The
+// struct defines no destructor and never deletes the pointer it stores.
+template <typename T>
+struct DeviceSpecificPtr {
+public:
+  DeviceSpecificPtr() = delete;
+  explicit DeviceSpecificPtr(device_id_t device_idx, std::optional<T *> ptr)
+      : device_idx(device_idx), ptr(ptr) {}
+
+  // Returns the stored pointer after asserting that the caller's device
+  // matches the device this object was constructed with.
+  std::optional<T *> get(device_id_t device_idx) const {
+    ASSERT(this->device_idx == device_idx);
+    return this->ptr;
+  }
+
+  device_id_t get_device_idx() const {
+    return this->device_idx;
+  }
+
+  // Returns the stored pointer without the device check performed by get().
+  std::optional<T *> get_unsafe_raw_ptr() const {
+    return this->ptr;
+  }
+
+private:
+  device_id_t device_idx;
+  std::optional<T *> ptr;
+};
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/distributed_device_handle.h b/lib/realm-execution/include/realm-execution/distributed_device_handle.h
new file mode 100644
index 0000000000..1173d75b27
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/distributed_device_handle.h
@@ -0,0 +1,35 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DISTRIBUTED_DEVICE_HANDLE_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DISTRIBUTED_DEVICE_HANDLE_H
+
+#include "realm-execution/device_specific_managed_per_device_ff_handle.h"
+#include "realm-execution/realm.h"
+#include "realm-execution/realm_context.h"
+#include <unordered_map>
+
+namespace FlexFlow {
+
+struct DistributedDeviceHandle {
+public:
+  DistributedDeviceHandle() = delete;
+  explicit DistributedDeviceHandle(
+      std::unordered_map<Realm::Processor,
+                         DeviceSpecificManagedPerDeviceFFHandle> const
+          &handles);
+
+  DeviceSpecificManagedPerDeviceFFHandle const &
+      at(Realm::Processor processor) const;
+
+private:
+  std::unordered_map<Realm::Processor, DeviceSpecificManagedPerDeviceFFHandle>
+      handles;
+};
+
+DistributedDeviceHandle create_distributed_device_handle(
+    RealmContext &ctx,
+    size_t workSpaceSize,
+    bool allowTensorOpMathConversion,
+    Realm::Event precondition = Realm::Event::NO_EVENT);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h b/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h
new file mode 100644
index 0000000000..b26a69078e
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/distributed_device_state_initialization.h
@@ -0,0 +1,31 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DISTRIBUTED_DEVICE_STATE_INITIALIZATION_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DISTRIBUTED_DEVICE_STATE_INITIALIZATION_H
+
+#include "kernels/profiling_settings.dtg.h"
+#include "pcg/optimizer_attrs.dtg.h"
+#include "realm-execution/distributed_device_handle.h"
+#include "realm-execution/per_device_op_state_backing.dtg.h"
+#include "realm-execution/realm_context.h"
+#include "realm-execution/tensor_instance_backing.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h"
+#include "task-spec/ff_iteration_config.dtg.h"
+
+namespace FlexFlow {
+
+// Performs device state initialization for the nodes of the dynamic graph
+// `dg` and returns the resulting PerDeviceOpStateBacking.
+// NOTE(review): `precondition` is presumably the event the initialization
+// work must wait on -- confirm against the definition.
+PerDeviceOpStateBacking perform_distributed_device_state_initialization(
+    RealmContext &ctx,
+    DynamicOpenDataflowGraph const &dg,
+    TensorInstanceBacking const &tensor_instance_backing,
+    ProfilingSettings const &profiling_settings,
+    DistributedDeviceHandle const &device_handle,
+    FFIterationConfig const &iteration_config,
+    OptimizerAttrs const &optimizer_attrs,
+    Realm::Event precondition);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h b/lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h
new file mode 100644
index 0000000000..8c8ccf6ac4
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/dynamic_tensor_accessor_from_instance.h
@@ -0,0 +1,28 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DYNAMIC_TENSOR_ACCESSOR_FROM_INSTANCE_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_DYNAMIC_TENSOR_ACCESSOR_FROM_INSTANCE_H
+
+#include "op-attrs/parallel_tensor_shape.dtg.h"
+#include "realm-execution/realm.h"
+#include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h"
+#include "task-spec/permissions.h"
+
+namespace FlexFlow {
+
+// Creates the DynamicTensorAccessor corresponding to the Realm region
+// instance `inst`, given the tensor's parallel shape and the requested
+// permissions.
+//
+// ready:         Realm event paired with `inst`; presumably signals when the
+//                instance may be used -- TODO confirm against the definition.
+// for_processor: presumably the processor the accessor will be used from --
+//                TODO confirm against the definition.
+DynamicTensorAccessor dynamic_tensor_accessor_from_instance(
+    Realm::RegionInstance inst,
+    Realm::Event ready,
+    ParallelTensorShape const &parallel_tensor_shape,
+    Permissions const &permissions,
+    Realm::Processor for_processor);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/fmt/realm_event.h b/lib/realm-execution/include/realm-execution/fmt/realm_event.h
new file mode 100644
index 0000000000..a245968f39
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/fmt/realm_event.h
@@ -0,0 +1,43 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_FMT_REALM_EVENT_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_FMT_REALM_EVENT_H
+
+#include "realm-execution/realm.h"
+#include "utils/check_fmtable.h"
+#include <fmt/format.h>
+#include <ostream>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+namespace fmt {
+
+// fmt formatter for Realm events: renders an event as "<Event ID>", where ID
+// is the event's `id` member. Only enabled when no format_as overload exists
+// for the type, so the two mechanisms cannot clash.
+template <typename Char>
+struct formatter<
+    ::FlexFlow::Realm::Event,
+    Char,
+    std::enable_if_t<!detail::has_format_as<::FlexFlow::Realm::Event>::value>>
+    : formatter<::std::string> {
+  // const-qualified so the formatter also works in contexts where fmt only
+  // has a const formatter object available.
+  template <typename FormatContext>
+  auto format(::FlexFlow::Realm::Event const &m, FormatContext &ctx) const
+      -> decltype(ctx.out()) {
+    std::string result = fmt::format("<Event {}>", m.id);
+
+    return formatter<std::string>::format(result, ctx);
+  }
+};
+
+} // namespace fmt
+
+namespace FlexFlow {
+
+// Stream insertion for Realm events (declaration only; defined elsewhere).
+std::ostream &operator<<(std::ostream &s, ::FlexFlow::Realm::Event const &m);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/fmt/realm_instance.h b/lib/realm-execution/include/realm-execution/fmt/realm_instance.h
new file mode 100644
index 0000000000..e6d2846c1f
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/fmt/realm_instance.h
@@ -0,0 +1,44 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_FMT_REALM_INSTANCE_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_FMT_REALM_INSTANCE_H
+
+#include "realm-execution/realm.h"
+#include "utils/check_fmtable.h"
+#include <fmt/format.h>
+#include <ostream>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+namespace fmt {
+
+// fmt formatter for Realm region instances: renders an instance as
+// "<RegionInstance ID>", where ID is the instance's `id` member. Only enabled
+// when no format_as overload exists for the type.
+template <typename Char>
+struct formatter<::FlexFlow::Realm::RegionInstance,
+                 Char,
+                 std::enable_if_t<!detail::has_format_as<
+                     ::FlexFlow::Realm::RegionInstance>::value>>
+    : formatter<::std::string> {
+  // const-qualified so the formatter also works in contexts where fmt only
+  // has a const formatter object available.
+  template <typename FormatContext>
+  auto format(::FlexFlow::Realm::RegionInstance const &m,
+              FormatContext &ctx) const -> decltype(ctx.out()) {
+    std::string result = fmt::format("<RegionInstance {}>", m.id);
+
+    return formatter<std::string>::format(result, ctx);
+  }
+};
+
+} // namespace fmt
+
+namespace FlexFlow {
+
+// Stream insertion for region instances (declaration only; defined elsewhere).
+std::ostream &operator<<(std::ostream &s,
+                         ::FlexFlow::Realm::RegionInstance const &m);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/instance_allocation.h b/lib/realm-execution/include/realm-execution/instance_allocation.h
new file mode 100644
index 0000000000..95530c0eee
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/instance_allocation.h
@@ -0,0 +1,38 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_INSTANCE_ALLOCATION_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_INSTANCE_ALLOCATION_H
+
+#include "realm-execution/realm_context.h"
+#include "realm-execution/tensor_instance_backing.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
+#include <unordered_map>
+#include <utility>
+
+namespace FlexFlow {
+
+// Allocates a Realm region instance for a single value of the graph.
+// Returns the instance paired with a Realm event (presumably signalling that
+// the instance is ready -- TODO confirm against the definition).
+std::pair<Realm::RegionInstance, Realm::Event>
+    perform_instance_allocation_for_value(DynamicNodeAttrs const &node,
+                                          DynamicValueAttrs const &value,
+                                          RealmContext &ctx);
+
+// Produces the TensorInstanceBacking for graph `g`. `preallocated` maps
+// values to accessors supplied by the caller; presumably those values are
+// not allocated again -- TODO confirm against the definition.
+TensorInstanceBacking perform_instance_allocation(
+    DynamicOpenDataflowGraph const &g,
+    std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
+        &preallocated,
+    RealmContext &ctx);
+
+// Destroys the instances held in `instances`; presumably deferred until
+// `precondition` has triggered -- TODO confirm against the definition.
+void destroy_instances(TensorInstanceBacking const &instances,
+                       Realm::Event precondition);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h b/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
new file mode 100644
index 0000000000..db338e4e4b
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/pcg_instance/pcg_instance.h
@@ -0,0 +1,125 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_PCG_INSTANCE_PCG_INSTANCE_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_PCG_INSTANCE_PCG_INSTANCE_H
+
+#include "kernels/accessor.h"
+#include "kernels/allocation.h"
+#include "kernels/device_handle_t.dtg.h"
+#include "kernels/profiling_settings.dtg.h"
+#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
+#include "pcg/device_id_t.dtg.h"
+#include "pcg/mapped_parallel_computation_graph/mapped_parallel_computation_graph.dtg.h"
+#include "pcg/optimizer_attrs.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h"
+#include "realm-execution/distributed_device_handle.h"
+#include "realm-execution/per_device_op_state_backing.dtg.h"
+#include "realm-execution/realm_context.h"
+#include "realm-execution/tensor_instance_backing.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
+#include "task-spec/ff_iteration_config.dtg.h"
+#include "utils/units/milliseconds_t.h"
+#include <optional>
+#include <unordered_map>
+#include <vector>
+
+namespace FlexFlow {
+
+// Bundles the state of one instantiated PCG: the execution order of its node
+// invocations, the tensor instance backing, the per-device op state backing,
+// the optimizer attributes and an optional logit-gradient region instance,
+// together with a reference to the RealmContext it was created with.
+//
+// Copy and move construction are deleted, so create_pcg_instance relies on
+// C++17 guaranteed copy elision to return one by value.
+struct PCGInstance {
+public:
+  PCGInstance() = delete;
+  PCGInstance(PCGInstance const &) = delete;
+  PCGInstance(PCGInstance &&) = delete;
+  explicit PCGInstance(
+      RealmContext &ctx,
+      std::vector<DynamicNodeInvocation> const &execution_order,
+      TensorInstanceBacking const &tensor_instance_backing,
+      PerDeviceOpStateBacking const &device_state_backing,
+      OptimizerAttrs const &optimizer_attrs,
+      std::optional<Realm::RegionInstance> logit_grad_tensor);
+  ~PCGInstance();
+  RealmContext &get_realm_context();
+  std::vector<DynamicNodeInvocation> const &get_execution_order() const;
+  TensorInstanceBacking const &get_tensor_instance_backing() const;
+  PerDeviceOpStateBacking const &get_device_state_backing() const;
+  OptimizerAttrs const &get_optimizer_attrs() const;
+  void update_optimizer_attrs_for_next_iter();
+  // NOTE(review): presumably returns logit_grad_tensor despite the differing
+  // name -- TODO confirm against the definition.
+  std::optional<Realm::RegionInstance> get_loss_tensor_instance() const;
+
+private:
+  // Non-owning: the referenced RealmContext must outlive this object.
+  RealmContext &ctx;
+  std::vector<DynamicNodeInvocation> execution_order;
+  TensorInstanceBacking tensor_instance_backing;
+  PerDeviceOpStateBacking device_state_backing;
+  OptimizerAttrs optimizer_attrs;
+  std::optional<Realm::RegionInstance> logit_grad_tensor;
+};
+
+// Creates a PCGInstance for the mapped PCG `mpcg`.
+//
+// The loss-related arguments (loss_attrs, label_tensor, logit_tensor,
+// loss_mapping) are all optional; presumably they must be supplied together
+// when a loss is wanted -- TODO confirm against the definition.
+// input_tensors maps values to accessors provided by the caller.
+PCGInstance create_pcg_instance(
+    RealmContext &ctx,
+    MappedParallelComputationGraph const &mpcg,
+    OptimizerAttrs const &optimizer_attrs,
+    std::optional<LossAttrs> const &loss_attrs,
+    std::optional<GenericTensorAccessorR> label_tensor,
+    std::optional<parallel_tensor_guid_t> logit_tensor,
+    std::optional<MappedOperatorTaskGroup> const &loss_mapping,
+    std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
+        &input_tensors,
+    ProfilingSettings const &profiling_settings,
+    DistributedDeviceHandle const &device_handle,
+    FFIterationConfig const &iteration_config);
+
+// Runs all passes for `pcg_instance`. Returns a Realm event per dynamic
+// layer (presumably the completion event of that layer's work -- TODO
+// confirm against the definition).
+std::unordered_map<dynamic_layer_guid_t, Realm::Event>
+    perform_all_passes_for_pcg_instance(
+        PCGInstance &pcg_instance,
+        ProfilingSettings const &profiling_settings,
+        DistributedDeviceHandle const &device_handle,
+        FFIterationConfig iteration_config);
+
+// Forward-pass counterpart of perform_all_passes_for_pcg_instance.
+std::unordered_map<dynamic_layer_guid_t, Realm::Event>
+    perform_forward_pass_for_pcg_instance(
+        PCGInstance &pcg_instance,
+        ProfilingSettings const &profiling_settings,
+        DistributedDeviceHandle const &device_handle,
+        FFIterationConfig iteration_config);
+
+// Backward-pass counterpart of perform_all_passes_for_pcg_instance.
+std::unordered_map<dynamic_layer_guid_t, Realm::Event>
+    perform_backward_pass_for_pcg_instance(
+        PCGInstance &pcg_instance,
+        ProfilingSettings const &profiling_settings,
+        DistributedDeviceHandle const &device_handle,
+        FFIterationConfig iteration_config);
+
+// Update-pass counterpart of perform_all_passes_for_pcg_instance.
+std::unordered_map<dynamic_layer_guid_t, Realm::Event>
+    perform_update_pass_for_pcg_instance(
+        PCGInstance &pcg_instance,
+        ProfilingSettings const &profiling_settings,
+        DistributedDeviceHandle const &device_handle,
+        FFIterationConfig iteration_config);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/per_device_op_state_backing.dtg.toml b/lib/realm-execution/include/realm-execution/per_device_op_state_backing.dtg.toml
new file mode 100644
index 0000000000..90a9d01e69
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/per_device_op_state_backing.dtg.toml
@@ -0,0 +1,18 @@
+# PerDeviceOpStateBacking: maps each DynamicNodeInvocation to the
+# DeviceSpecificPtr of its PerDeviceOpState.
+namespace = "FlexFlow"
+name = "PerDeviceOpStateBacking"
+type = "struct"
+# No optional dtg features (eq, fmt, hash, json, ...) are requested.
+features = []
+
+includes = [
+  "<unordered_map>",
+  "realm-execution/device_specific_ptr.h",
+  "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h",
+  "task-spec/per_device_op_state.dtg.h",
+]
+
+[[fields]]
+name = "backing"
+type = "std::unordered_map<::FlexFlow::DynamicNodeInvocation, ::FlexFlow::DeviceSpecificPtr<::FlexFlow::PerDeviceOpState>>"
diff --git a/lib/realm-execution/include/realm-execution/realm.h b/lib/realm-execution/include/realm-execution/realm.h
new file mode 100644
index 0000000000..814132d355
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/realm.h
@@ -0,0 +1,24 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_H
+
+// Selects the Realm implementation: PRealm when FF_USE_PREALM is defined,
+// plain Realm otherwise.
+#ifdef FF_USE_PREALM
+#include <realm/prealm/prealm.h>
+#else
+#include <realm.h>
+#endif
+
+namespace FlexFlow {
+
+// FlexFlow::Realm aliases whichever implementation was selected above, so
+// code inside namespace FlexFlow can write Realm::... unconditionally.
+#ifdef FF_USE_PREALM
+namespace Realm = ::PRealm;
+#else
+namespace Realm = ::Realm;
+#endif
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/realm_allocator.h b/lib/realm-execution/include/realm-execution/realm_allocator.h
new file mode 100644
index 0000000000..d716016676
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/realm_allocator.h
@@ -0,0 +1,41 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_ALLOCATOR_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_ALLOCATOR_H
+
+#include "kernels/allocation.h"
+#include "realm-execution/realm.h"
+#include <cstddef>
+#include <unordered_map>
+
+namespace FlexFlow {
+
+// IAllocator implementation parameterized by a Realm processor and a Realm
+// memory. Keeps a map from returned pointers to Realm region instances
+// (presumably one instance per allocation -- TODO confirm against the
+// definition). Neither copyable nor movable.
+struct RealmAllocator : public IAllocator {
+  RealmAllocator(Realm::Processor processor, Realm::Memory memory);
+
+  RealmAllocator() = delete;
+  RealmAllocator(RealmAllocator const &) = delete;
+  RealmAllocator(RealmAllocator &&) = delete;
+  ~RealmAllocator() override;
+
+  void *allocate(size_t) override;
+  void deallocate(void *) override;
+
+  DeviceType get_allocation_device_type() const override;
+
+private:
+  Realm::Processor processor;
+  Realm::Memory memory;
+  std::unordered_map<void *, Realm::RegionInstance> ptr_instances;
+};
+CHECK_RC_COPY_VIRTUAL_COMPLIANT(RealmAllocator);
+
+// Returns an Allocator for the given processor/memory pair (presumably
+// backed by a RealmAllocator -- TODO confirm against the definition).
+Allocator get_realm_allocator(Realm::Processor processor, Realm::Memory memory);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/realm_context.h b/lib/realm-execution/include/realm-execution/realm_context.h
new file mode 100644
index 0000000000..b018a04a87
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/realm_context.h
@@ -0,0 +1,96 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_CONTEXT_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_CONTEXT_H
+
+#include "kernels/allocation.h"
+#include "kernels/device_handle_t.dtg.h"
+#include "kernels/managed_per_device_ff_handle.h"
+#include "op-attrs/tensor_shape.dtg.h"
+#include "pcg/device_id_t.dtg.h"
+#include "pcg/machine_space_coordinate.dtg.h"
+#include "realm-execution/realm.h"
+#include "realm-execution/tasks/task_id_t.dtg.h"
+#include <cstddef>
+#include <optional>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+namespace FlexFlow {
+
+// Wraps the Realm runtime handle together with a processor, an allocator,
+// the list of outstanding events and a table of processors keyed by
+// (address space, processor kind). Offers helpers for mapping device
+// coordinates, spawning tasks and creating instances. Neither copyable nor
+// movable.
+struct RealmContext {
+public:
+  RealmContext(Realm::Processor processor);
+  virtual ~RealmContext();
+
+  RealmContext() = delete;
+  RealmContext(RealmContext const &) = delete;
+  RealmContext(RealmContext &&) = delete;
+
+  // Device mapping
+  Realm::Processor
+      map_device_coord_to_processor(MachineSpaceCoordinate const &);
+  static Realm::Memory get_nearest_memory(Realm::Processor);
+
+  // Current device context
+  Realm::Processor get_current_processor() const;
+  Allocator &get_current_device_allocator();
+  device_id_t get_current_device_idx() const;
+
+  // Task creation
+  Realm::Event spawn_task(Realm::Processor proc,
+                          task_id_t task_id,
+                          void const *args,
+                          size_t arglen,
+                          Realm::ProfilingRequestSet const &requests,
+                          Realm::Event wait_on = Realm::Event::NO_EVENT,
+                          int priority = 0);
+
+  Realm::Event
+      collective_spawn_task(Realm::Processor target_proc,
+                            task_id_t task_id,
+                            void const *args,
+                            size_t arglen,
+                            Realm::Event wait_on = Realm::Event::NO_EVENT,
+                            int priority = 0);
+
+  // Instance management
+  std::pair<Realm::RegionInstance, Realm::Event>
+      create_instance(Realm::Memory memory,
+                      TensorShape const &shape,
+                      Realm::ProfilingRequestSet const &prs,
+                      Realm::Event wait_on = Realm::Event::NO_EVENT);
+
+  // Get the current set of outstanding events
+  Realm::Event get_outstanding_events();
+
+protected:
+  // Compact AND CLEAR the outstanding event queue
+  // Important: USER MUST BLOCK on event or else use it, or it WILL BE LOST
+  [[nodiscard]] Realm::Event merge_outstanding_events();
+
+  void discover_machine_topology();
+
+  static std::optional<ManagedPerDeviceFFHandle>
+      make_device_handle_for_processor(Realm::Processor processor);
+
+protected:
+  Realm::Runtime runtime;
+  Realm::Processor processor;
+  Allocator allocator;
+  std::vector<Realm::Event> outstanding_events;
+  // Keyed by (address space, processor kind). NOTE(review): std::hash has no
+  // std::pair specialization, so this relies on one being provided by another
+  // header -- confirm it is reachable from here.
+  std::unordered_map<std::pair<Realm::AddressSpace, Realm::Processor::Kind>,
+                     std::vector<Realm::Processor>>
+      processors;
+};
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/realm_manager.h b/lib/realm-execution/include/realm-execution/realm_manager.h
new file mode 100644
index 0000000000..8a79476bcf
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/realm_manager.h
@@ -0,0 +1,35 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_MANAGER_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REALM_MANAGER_H
+
+#include "kernels/allocation.h"
+#include "kernels/device_handle_t.dtg.h"
+#include "pcg/device_id_t.dtg.h"
+#include "realm-execution/realm.h"
+#include "realm-execution/realm_context.h"
+#include <functional>
+
+namespace FlexFlow {
+
+// Entry-point object constructed from the program's argc/argv. Inherits
+// privately from RealmContext, so the context's interface is not exposed to
+// users of RealmManager. Neither copyable nor movable.
+struct RealmManager : private RealmContext {
+public:
+  RealmManager(int *argc, char ***argv);
+  virtual ~RealmManager();
+
+  RealmManager() = delete;
+  RealmManager(RealmManager const &) = delete;
+  RealmManager(RealmManager &&) = delete;
+
+  // Starts the controller, which is given as a callable taking the
+  // RealmContext. The returned event must not be discarded ([[nodiscard]]);
+  // presumably it triggers when the controller finishes -- TODO confirm.
+  [[nodiscard]] Realm::Event
+      start_controller(std::function<void(RealmContext &)>,
+                       Realm::Event wait_on = Realm::Event::NO_EVENT);
+};
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h
new file mode 100644
index 0000000000..7134973ead
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/controller_task.h
@@ -0,0 +1,32 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_CONTROLLER_TASK_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_CONTROLLER_TASK_H
+
+#include "realm-execution/realm.h"
+#include "realm-execution/realm_context.h"
+#include <cstddef>
+#include <functional>
+
+namespace FlexFlow {
+
+// Task body; the parameter list follows the shape of a Realm task function:
+// argument buffer and length, user-data buffer and length, and a processor.
+void controller_task_body(void const *args,
+                          size_t arglen,
+                          void const *userdata,
+                          size_t userlen,
+                          Realm::Processor proc);
+
+// Spawns the controller task running `thunk`, forwarding `precondition`;
+// returns the Realm event of the spawned task (presumably its completion
+// event -- TODO confirm against the definition).
+// NOTE(review): target_proc is a non-const reference while the other spawn
+// helpers take their processor by value -- confirm this is intentional.
+Realm::Event
+    collective_spawn_controller_task(RealmContext &ctx,
+                                     Realm::Processor &target_proc,
+                                     std::function<void(RealmContext &)> thunk,
+                                     Realm::Event precondition);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_return_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_return_task.h
new file mode 100644
index 0000000000..a87652b5ce
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_return_task.h
@@ -0,0 +1,31 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_HANDLE_INIT_RETURN_TASK_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_HANDLE_INIT_RETURN_TASK_H
+
+#include "realm-execution/device_specific_managed_per_device_ff_handle.h"
+#include "realm-execution/realm.h"
+#include "realm-execution/realm_context.h"
+
+namespace FlexFlow {
+
+// Task body; the parameter list follows the shape of a Realm task function:
+// argument buffer and length, user-data buffer and length, and a processor.
+void device_handle_init_return_task_body(void const *args,
+                                         size_t arglen,
+                                         void const *userdata,
+                                         size_t userlen,
+                                         Realm::Processor proc);
+
+// Spawns the device-handle-init return task on `origin_proc`, carrying
+// `result` and the address `origin_result_ptr`. Presumably the task stores
+// `result` through that pointer -- TODO confirm against the definition.
+// `precondition` is the Realm event forwarded to the spawn.
+Realm::Event spawn_device_handle_init_return_task(
+    RealmContext &ctx,
+    Realm::Processor origin_proc,
+    DeviceSpecificManagedPerDeviceFFHandle const &result,
+    DeviceSpecificManagedPerDeviceFFHandle *origin_result_ptr,
+    Realm::Event precondition);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task.h
new file mode 100644
index 0000000000..312ed26add
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task.h
@@ -0,0 +1,32 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_HANDLE_INIT_TASK_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_HANDLE_INIT_TASK_H
+
+#include "realm-execution/device_specific_managed_per_device_ff_handle.h"
+#include "realm-execution/realm.h"
+#include "realm-execution/realm_context.h"
+
+namespace FlexFlow {
+
+// Task body; the parameter list follows the shape of a Realm task function:
+// argument buffer and length, user-data buffer and length, and a processor.
+void device_handle_init_task_body(void const *args,
+                                  size_t arglen,
+                                  void const *userdata,
+                                  size_t userlen,
+                                  Realm::Processor proc);
+
+// Spawns the device-handle-init task on `target_proc` with the given
+// workspace size and tensor-op-math-conversion flag, and returns the Realm
+// event of the spawned task. `result_ptr` is handed to the task; presumably
+// the created handle is written back through it -- TODO confirm.
+Realm::Event spawn_device_handle_init_task(
+    RealmContext &ctx,
+    Realm::Processor target_proc,
+    size_t workSpaceSize,
+    bool allowTensorOpMathConversion,
+    DeviceSpecificManagedPerDeviceFFHandle *result_ptr,
+    Realm::Event precondition);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task_args.dtg.toml
new file mode 100644
index 0000000000..c0ba37bb5d
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_handle_init_task_args.dtg.toml
@@ -0,0 +1,30 @@
+# Arguments of the device-handle-init task (in-process form; the
+# Serializable* counterpart replaces the processor and pointer fields).
+namespace = "FlexFlow"
+name = "DeviceHandleInitTaskArgs"
+type = "struct"
+features = []
+
+includes = [
+  "realm-execution/device_specific_managed_per_device_ff_handle.h",
+  "realm-execution/realm.h",
+  "realm-execution/tasks/serializer/serializable_realm_processor.h",
+]
+
+[[fields]]
+name = "workSpaceSize"
+type = "size_t"
+
+[[fields]]
+name = "allowTensorOpMathConversion"
+type = "bool"
+
+[[fields]]
+name = "origin_proc"
+type = "::FlexFlow::Realm::Processor"
+
+# Raw pointer; presumably only meaningful on the processor identified by
+# origin_proc -- TODO confirm.
+[[fields]]
+name = "origin_result_ptr"
+type = "::FlexFlow::DeviceSpecificManagedPerDeviceFFHandle *"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_return_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_return_task.h
new file mode 100644
index 0000000000..4de7e5689f
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_return_task.h
@@ -0,0 +1,32 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_STATE_INIT_RETURN_TASK_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_STATE_INIT_RETURN_TASK_H
+
+#include "realm-execution/device_specific_ptr.h"
+#include "realm-execution/realm.h"
+#include "realm-execution/realm_context.h"
+#include "task-spec/per_device_op_state.dtg.h"
+
+namespace FlexFlow {
+
+// Task body; the parameter list follows the shape of a Realm task function:
+// argument buffer and length, user-data buffer and length, and a processor.
+void device_state_init_return_task_body(void const *args,
+                                        size_t arglen,
+                                        void const *userdata,
+                                        size_t userlen,
+                                        Realm::Processor proc);
+
+// Spawns the device-state-init return task on `origin_proc`, carrying
+// `result` and the address `origin_result_ptr`. Presumably the task stores
+// `result` through that pointer -- TODO confirm against the definition.
+// `precondition` is the Realm event forwarded to the spawn.
+Realm::Event spawn_device_state_init_return_task(
+    RealmContext &ctx,
+    Realm::Processor origin_proc,
+    DeviceSpecificPtr<PerDeviceOpState> const &result,
+    DeviceSpecificPtr<PerDeviceOpState> *origin_result_ptr,
+    Realm::Event precondition);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h
new file mode 100644
index 0000000000..657d2e8401
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task.h
@@ -0,0 +1,46 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_STATE_INIT_TASK_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_DEVICE_STATE_INIT_TASK_H
+
+#include "kernels/profiling_settings.dtg.h"
+#include "pcg/optimizer_attrs.dtg.h"
+#include "realm-execution/device_specific_managed_per_device_ff_handle.h"
+#include "realm-execution/device_specific_ptr.h"
+#include "realm-execution/realm.h"
+#include "realm-execution/realm_context.h"
+#include "realm-execution/tensor_instance_backing.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
+#include "task-spec/ff_iteration_config.dtg.h"
+#include "task-spec/per_device_op_state.dtg.h"
+#include <optional>
+
+namespace FlexFlow {
+
+// Task body; the parameter list follows the shape of a Realm task function:
+// argument buffer and length, user-data buffer and length, and a processor.
+void device_state_init_task_body(void const *args,
+                                 size_t arglen,
+                                 void const *userdata,
+                                 size_t userlen,
+                                 Realm::Processor proc);
+
+// Spawns the device-state-init task for `invocation` on `target_proc`.
+// Returns std::nullopt or the Realm event of the spawned task; presumably
+// nullopt means no task was needed for this invocation -- TODO confirm
+// against the definition.
+// result_ptr: address handed to the task; presumably where the resulting
+//             DeviceSpecificPtr is written back -- TODO confirm.
+std::optional<Realm::Event> spawn_device_state_init_task(
+    RealmContext &ctx,
+    Realm::Processor target_proc,
+    DynamicNodeInvocation const &invocation,
+    TensorInstanceBacking const &tensor_backing,
+    ProfilingSettings const &profiling_settings,
+    DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
+    FFIterationConfig const &iteration_config,
+    OptimizerAttrs const &optimizer_attrs,
+    DeviceSpecificPtr<PerDeviceOpState> *result_ptr,
+    Realm::Event precondition);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task_args.dtg.toml
new file mode 100644
index 0000000000..9a7c2781d2
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/device_state_init_task_args.dtg.toml
@@ -0,0 +1,51 @@
+# Arguments of the device-state-init task (in-process form; see
+# SerializableDeviceStateInitTaskArgs for the serializable counterpart).
+namespace = "FlexFlow"
+name = "DeviceStateInitTaskArgs"
+type = "struct"
+features = []
+
+includes = [
+  "kernels/profiling_settings.dtg.h",
+  "pcg/optimizer_attrs.dtg.h",
+  "realm-execution/device_specific_managed_per_device_ff_handle.h",
+  "realm-execution/device_specific_ptr.h",
+  "realm-execution/tensor_instance_backing.dtg.h",
+  "realm-execution/realm.h",
+  "task-spec/device_specific_per_device_op_state.dtg.h",
+  "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h",
+  "task-spec/ff_iteration_config.dtg.h",
+  "task-spec/per_device_op_state.dtg.h",
+]
+
+[[fields]]
+name = "invocation"
+type = "::FlexFlow::DynamicNodeInvocation"
+
+[[fields]]
+name = "tensor_backing"
+type = "::FlexFlow::TensorInstanceBacking"
+
+[[fields]]
+name = "profiling_settings"
+type = "::FlexFlow::ProfilingSettings"
+
+[[fields]]
+name = "device_handle"
+type = "::FlexFlow::DeviceSpecificManagedPerDeviceFFHandle"
+
+[[fields]]
+name = "iteration_config"
+type = "::FlexFlow::FFIterationConfig"
+
+[[fields]]
+name = "optimizer_attrs"
+type = "::FlexFlow::OptimizerAttrs"
+
+[[fields]]
+name = "origin_proc"
+type = "::FlexFlow::Realm::Processor"
+
+[[fields]]
+name = "origin_result_ptr"
+type = "::FlexFlow::DeviceSpecificPtr<::FlexFlow::PerDeviceOpState> *"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
new file mode 100644
index 0000000000..8399742424
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task.h
@@ -0,0 +1,45 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_OP_TASK_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_OP_TASK_H
+
+#include "kernels/profiling_settings.dtg.h"
+#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
+#include "pcg/optimizer_attrs.dtg.h"
+#include "realm-execution/device_specific_managed_per_device_ff_handle.h"
+#include "realm-execution/device_specific_ptr.h"
+#include "realm-execution/realm.h"
+#include "realm-execution/realm_context.h"
+#include "realm-execution/tensor_instance_backing.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
+#include "task-spec/ff_iteration_config.dtg.h"
+#include "task-spec/per_device_op_state.dtg.h"
+#include <optional>
+
+namespace FlexFlow {
+
+// Task body; the parameter list follows the shape of a Realm task function:
+// argument buffer and length, user-data buffer and length, and a processor.
+void op_task_body(void const *args,
+                  size_t arglen,
+                  void const *userdata,
+                  size_t userlen,
+                  Realm::Processor proc);
+
+// Spawns the op task for `invocation` on `target_proc` and returns the Realm
+// event of the spawned task. `device_state` and `optimizer_attrs` are
+// optional; presumably they are only needed by some invocations -- TODO
+// confirm against the definition.
+Realm::Event spawn_op_task(
+    RealmContext &ctx,
+    Realm::Processor target_proc,
+    DynamicNodeInvocation const &invocation,
+    TensorInstanceBacking const &tensor_backing,
+    std::optional<DeviceSpecificPtr<PerDeviceOpState>> const &device_state,
+    ProfilingSettings const &profiling_settings,
+    DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
+    FFIterationConfig const &iteration_config,
+    std::optional<OptimizerAttrs> const &optimizer_attrs,
+    Realm::Event precondition);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml
new file mode 100644
index 0000000000..f6bb83fbca
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/op_task_args.dtg.toml
@@ -0,0 +1,46 @@
+# Arguments of the op task (in-process form; see SerializableOpTaskArgs for
+# the serializable counterpart).
+namespace = "FlexFlow"
+name = "OpTaskArgs"
+type = "struct"
+features = []
+
+includes = [
+  "<optional>",
+  "kernels/profiling_settings.dtg.h",
+  "pcg/optimizer_attrs.dtg.h",
+  "realm-execution/device_specific_managed_per_device_ff_handle.h",
+  "realm-execution/device_specific_ptr.h",
+  "realm-execution/tensor_instance_backing.dtg.h",
+  "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h",
+  "task-spec/ff_iteration_config.dtg.h",
+  "task-spec/per_device_op_state.dtg.h",
+]
+
+[[fields]]
+name = "invocation"
+type = "::FlexFlow::DynamicNodeInvocation"
+
+[[fields]]
+name = "tensor_backing"
+type = "::FlexFlow::TensorInstanceBacking"
+
+[[fields]]
+name = "device_state"
+type = "std::optional<::FlexFlow::DeviceSpecificPtr<::FlexFlow::PerDeviceOpState>>"
+
+[[fields]]
+name = "profiling_settings"
+type = "::FlexFlow::ProfilingSettings"
+
+[[fields]]
+name = "device_handle"
+type = "::FlexFlow::DeviceSpecificManagedPerDeviceFFHandle"
+
+[[fields]]
+name = "iteration_config"
+type = "::FlexFlow::FFIterationConfig"
+
+[[fields]]
+name = "optimizer_attrs"
+type = "std::optional<::FlexFlow::OptimizerAttrs>"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.dtg.toml
new file mode 100644
index 0000000000..34f52880f8
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.dtg.toml
@@ -0,0 +1,34 @@
+# Serializable counterpart of DeviceHandleInitTaskArgs: the processor is a
+# SerializableRealmProcessor and the result pointer an integer.
+namespace = "FlexFlow"
+name = "SerializableDeviceHandleInitTaskArgs"
+type = "struct"
+features = [
+  "eq",
+  "fmt",
+  "hash",
+  "json",
+]
+
+includes = [
+  "<cstddef>",
+  "<cstdint>",
+  "realm-execution/tasks/serializer/serializable_realm_processor.dtg.h",
+]
+
+[[fields]]
+name = "workSpaceSize"
+type = "size_t"
+
+[[fields]]
+name = "allowTensorOpMathConversion"
+type = "bool"
+
+[[fields]]
+name = "origin_proc"
+type = "::FlexFlow::SerializableRealmProcessor"
+
+# Integer form of DeviceHandleInitTaskArgs::origin_result_ptr.
+[[fields]]
+name = "origin_result_ptr"
+type = "uintptr_t"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.h b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.h
new file mode 100644
index 0000000000..63d70fe10a
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_handle_init_task_args.h
@@ -0,0 +1,20 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_DEVICE_HANDLE_INIT_TASK_ARGS_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_DEVICE_HANDLE_INIT_TASK_ARGS_H
+
+#include "realm-execution/tasks/impl/device_handle_init_task_args.dtg.h"
+#include "realm-execution/tasks/impl/serializable_device_handle_init_task_args.dtg.h"
+
+namespace FlexFlow {
+
+// Converts the in-process task arguments to their serializable form.
+SerializableDeviceHandleInitTaskArgs
+    device_handle_init_task_args_to_serializable(
+        DeviceHandleInitTaskArgs const &args);
+
+// Converts serializable task arguments back to the in-process form.
+DeviceHandleInitTaskArgs device_handle_init_task_args_from_serializable(
+    SerializableDeviceHandleInitTaskArgs const &args);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.dtg.toml
new file mode 100644
index 0000000000..034132f9d1
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.dtg.toml
@@ -0,0 +1,56 @@
+# Serializable counterpart of DeviceStateInitTaskArgs: each field uses a
+# serializable type, and the result pointer is carried as an integer.
+namespace = "FlexFlow"
+name = "SerializableDeviceStateInitTaskArgs"
+type = "struct"
+features = [
+  "eq",
+  "fmt",
+  "hash",
+  "json",
+]
+
+includes = [
+  "<cstdint>",
+  "kernels/profiling_settings.dtg.h",
+  "pcg/optimizer_attrs.dtg.h",
+  "realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.h",
+  "realm-execution/tasks/serializer/serializable_realm_processor.dtg.h",
+  "realm-execution/tasks/serializer/serializable_tensor_instance_backing.dtg.h",
+  "task-spec/device_specific_per_device_op_state.dtg.h",
+  "task-spec/dynamic_graph/serializable_dynamic_node_invocation.dtg.h",
+  "task-spec/ff_iteration_config.dtg.h",
+]
+
+[[fields]]
+name = "invocation"
+type = "::FlexFlow::SerializableDynamicNodeInvocation"
+
+[[fields]]
+name = "tensor_backing"
+type = "::FlexFlow::SerializableTensorInstanceBacking"
+
+[[fields]]
+name = "profiling_settings"
+type = "::FlexFlow::ProfilingSettings"
+
+[[fields]]
+name = "device_handle"
+type = "::FlexFlow::SerializableDeviceSpecificPtr"
+
+[[fields]]
+name = "iteration_config"
+type = "::FlexFlow::FFIterationConfig"
+
+[[fields]]
+name = "optimizer_attrs"
+type = "::FlexFlow::OptimizerAttrs"
+
+[[fields]]
+name = "origin_proc"
+type = "::FlexFlow::SerializableRealmProcessor"
+
+# Integer form of DeviceStateInitTaskArgs::origin_result_ptr.
+[[fields]]
+name = "origin_result_ptr"
+type = "uintptr_t"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.h b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.h
new file mode 100644
index 0000000000..f028820974
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_device_state_init_task_args.h
@@ -0,0 +1,16 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_DEVICE_STATE_INIT_TASK_ARGS_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_DEVICE_STATE_INIT_TASK_ARGS_H
+
+#include "realm-execution/tasks/impl/device_state_init_task_args.dtg.h"
+#include "realm-execution/tasks/impl/serializable_device_state_init_task_args.dtg.h"
+
+namespace FlexFlow {
+// Convert DeviceStateInitTaskArgs to and from its serializable counterpart.
+SerializableDeviceStateInitTaskArgs device_state_init_task_args_to_serializable(
+    DeviceStateInitTaskArgs const &args);
+DeviceStateInitTaskArgs device_state_init_task_args_from_serializable(
+    SerializableDeviceStateInitTaskArgs const &args);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml
new file mode 100644
index 0000000000..adac6631ee
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.dtg.toml
@@ -0,0 +1,51 @@
+namespace = "FlexFlow"
+name = "SerializableOpTaskArgs"
+type = "struct"
+features = [
+  "eq",
+  "fmt",
+  "hash",
+  "json",
+]
+# Serializable counterpart of OpTaskArgs; the conversions are declared in serializable_op_task_args.h.
+includes = [
+  "kernels/profiling_settings.dtg.h",
+  "pcg/optimizer_attrs.dtg.h",
+  "realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.h",
+  "realm-execution/tasks/serializer/serializable_tensor_instance_backing.dtg.h",
+  "task-spec/dynamic_graph/serializable_dynamic_node_invocation.dtg.h",
+  "task-spec/ff_iteration_config.dtg.h",
+]
+# NOTE(review): std::optional is used by two fields below, but <optional> is not listed in the includes of this spec.
+src_includes = [
+  "utils/fmt/optional.h",
+  "utils/json/optional.h",
+]
+
+[[fields]]
+name = "invocation"
+type = "::FlexFlow::SerializableDynamicNodeInvocation"
+
+[[fields]]
+name = "tensor_backing"
+type = "::FlexFlow::SerializableTensorInstanceBacking"
+
+[[fields]]
+name = "device_state"
+type = "std::optional<::FlexFlow::SerializableDeviceSpecificPtr>"
+
+[[fields]]
+name = "profiling_settings"
+type = "::FlexFlow::ProfilingSettings"
+
+[[fields]]
+name = "device_handle"
+type = "::FlexFlow::SerializableDeviceSpecificPtr"
+
+[[fields]]
+name = "iteration_config"
+type = "::FlexFlow::FFIterationConfig"
+
+[[fields]]
+name = "optimizer_attrs"
+type = "std::optional<::FlexFlow::OptimizerAttrs>"
diff --git a/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.h b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.h
new file mode 100644
index 0000000000..3b2d05d0b6
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/impl/serializable_op_task_args.h
@@ -0,0 +1,14 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_OP_TASK_ARGS_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_IMPL_SERIALIZABLE_OP_TASK_ARGS_H
+
+#include "realm-execution/tasks/impl/op_task_args.dtg.h"
+#include "realm-execution/tasks/impl/serializable_op_task_args.dtg.h"
+
+namespace FlexFlow {
+// Convert OpTaskArgs to and from its serializable counterpart.
+SerializableOpTaskArgs op_task_args_to_serializable(OpTaskArgs const &args);
+OpTaskArgs op_task_args_from_serializable(SerializableOpTaskArgs const &args);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/realm_task_id_t.h b/lib/realm-execution/include/realm-execution/tasks/realm_task_id_t.h
new file mode 100644
index 0000000000..a3c6891fb0
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/realm_task_id_t.h
@@ -0,0 +1,13 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_REALM_TASK_ID_T_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_REALM_TASK_ID_T_H
+
+#include "realm-execution/realm.h"
+#include "realm-execution/tasks/task_id_t.dtg.h"
+
+namespace FlexFlow {
+// Returns the Realm::Processor::TaskFuncID corresponding to a task_id_t.
+Realm::Processor::TaskFuncID get_realm_task_id_for_task_id(task_id_t task_id);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h b/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h
new file mode 100644
index 0000000000..8114f1a82c
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/realm_task_registry.h
@@ -0,0 +1,21 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_REALM_TASK_REGISTRY_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_REALM_TASK_REGISTRY_H
+
+#include "realm-execution/realm.h"
+#include "realm-execution/tasks/task_id_t.dtg.h"
+
+namespace FlexFlow {
+// Presumably registers task_body under func_id for target_kind processors.
+[[nodiscard]] Realm::Event register_task(
+    Realm::Processor::Kind target_kind,
+    task_id_t func_id,
+    void (*task_body)(void const *args,
+                      size_t arglen,
+                      void const *userdata,
+                      size_t userlen,
+                      Realm::Processor proc));
+
+[[nodiscard]] Realm::Event register_all_tasks();
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.toml
new file mode 100644
index 0000000000..07cf61f7e1
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.toml
@@ -0,0 +1,28 @@
+namespace = "FlexFlow"
+name = "SerializableDeviceSpecificPtr"
+type = "struct"
+features = [
+  "eq",
+  "fmt",
+  "hash",
+  "json",
+]
+# Serializable form of a DeviceSpecificPtr<T>: the device index plus the raw pointer value as an integer (absent when there is no pointer).
+includes = [
+  "pcg/device_id_t.dtg.h",
+  "<cstdint>",
+  "<optional>",
+]
+
+src_includes = [
+  "utils/fmt/optional.h",
+  "utils/json/optional.h",
+]
+
+[[fields]]
+name = "device_idx"
+type = "::FlexFlow::device_id_t"
+
+[[fields]]
+name = "ptr"
+type = "std::optional<uintptr_t>"
diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_device_specific_ptr.h b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_device_specific_ptr.h
new file mode 100644
index 0000000000..726aef84ba
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_device_specific_ptr.h
@@ -0,0 +1,32 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_DEVICE_SPECIFIC_PTR_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_DEVICE_SPECIFIC_PTR_H
+
+#include "realm-execution/device_specific_ptr.h"
+#include "realm-execution/tasks/serializer/serializable_device_specific_ptr.dtg.h"
+
+namespace FlexFlow {
+
+// Flattens a DeviceSpecificPtr<T> into its device index plus the raw pointer
+// value (when one is present) stored as an integer.
+template <typename T>
+SerializableDeviceSpecificPtr device_specific_ptr_to_serializable(
+    DeviceSpecificPtr<T> const &device_specific) {
+  auto to_integer = [](T *raw) { return reinterpret_cast<uintptr_t>(raw); };
+  return SerializableDeviceSpecificPtr{
+      /*device_idx=*/device_specific.get_device_idx(),
+      /*ptr=*/transform(device_specific.get_unsafe_raw_ptr(), to_integer)};
+}
+
+// Rebuilds a DeviceSpecificPtr<T>, casting the stored integer back to T *.
+template <typename T>
+DeviceSpecificPtr<T> device_specific_ptr_from_serializable(
+    SerializableDeviceSpecificPtr const &device_specific) {
+  auto to_pointer = [](uintptr_t bits) { return reinterpret_cast<T *>(bits); };
+  return DeviceSpecificPtr<T>{
+      /*device_idx=*/device_specific.device_idx,
+      /*ptr=*/transform(device_specific.ptr, to_pointer)};
+}
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_event.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_event.dtg.toml
new file mode 100644
index 0000000000..3217d58608
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_event.dtg.toml
@@ -0,0 +1,17 @@
+namespace = "FlexFlow"
+name = "SerializableRealmEvent"
+type = "struct"
+features = [
+  "eq",
+  "fmt",
+  "hash",
+  "json",
+]
+# Single-field wrapper holding a Realm event id; conversions to and from Realm::Event are declared in serializable_realm_event.h.
+includes = [
+  "realm-execution/realm.h",
+]
+
+[[fields]]
+name = "id"
+type = "::FlexFlow::Realm::Event::id_t"
diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_event.h b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_event.h
new file mode 100644
index 0000000000..ae1f1e8265
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_event.h
@@ -0,0 +1,14 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_REALM_EVENT_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_REALM_EVENT_H
+
+#include "realm-execution/realm.h"
+#include "realm-execution/tasks/serializer/serializable_realm_event.dtg.h"
+
+namespace FlexFlow {
+// Convert a Realm::Event to and from its serializable counterpart.
+SerializableRealmEvent realm_event_to_serializable(Realm::Event const &ev);
+Realm::Event realm_event_from_serializable(SerializableRealmEvent const &ev);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_instance.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_instance.dtg.toml
new file mode 100644
index 0000000000..5b70c6888b
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_instance.dtg.toml
@@ -0,0 +1,23 @@
+namespace = "FlexFlow"
+name = "SerializableRealmInstance"
+type = "struct"
+features = [
+  "eq",
+  "fmt",
+  "hash",
+  "json",
+]
+# Serializable wrapper for a Realm::RegionInstance; conversions are declared in serializable_realm_instance.h.
+includes = [
+  "realm-execution/realm.h",
+]
+
+src_includes = [
+  "utils/fmt/vector.h",
+  "utils/hash/vector.h",
+]
+# NOTE(review): the field type uses std::vector and uint8_t, but <vector> / <cstdint> are not listed in includes above.
+[[fields]]
+name = "instance"
+# Realm::RegionInstance has hidden fields in PRealm, so the instance is encoded as raw bytes instead of field by field
+type = "std::vector<uint8_t>"
diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_instance.h b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_instance.h
new file mode 100644
index 0000000000..7262ec4f09
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_instance.h
@@ -0,0 +1,16 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_REALM_INSTANCE_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_REALM_INSTANCE_H
+
+#include "realm-execution/realm.h"
+#include "realm-execution/tasks/serializer/serializable_realm_instance.dtg.h"
+
+namespace FlexFlow {
+// Convert a Realm::RegionInstance to and from its serializable counterpart.
+SerializableRealmInstance
+    realm_instance_to_serializable(Realm::RegionInstance const &inst);
+Realm::RegionInstance
+    realm_instance_from_serializable(SerializableRealmInstance const &inst);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_processor.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_processor.dtg.toml
new file mode 100644
index 0000000000..3cb64d95c1
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_processor.dtg.toml
@@ -0,0 +1,17 @@
+namespace = "FlexFlow"
+name = "SerializableRealmProcessor"
+type = "struct"
+features = [
+  "eq",
+  "fmt",
+  "hash",
+  "json",
+]
+# Single-field wrapper holding a Realm processor id; conversions to and from Realm::Processor are declared in serializable_realm_processor.h.
+includes = [
+  "realm-execution/realm.h",
+]
+
+[[fields]]
+name = "id"
+type = "::FlexFlow::Realm::Processor::id_t"
diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_processor.h b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_processor.h
new file mode 100644
index 0000000000..6b29b6e223
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_realm_processor.h
@@ -0,0 +1,16 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_REALM_PROCESSOR_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_REALM_PROCESSOR_H
+
+#include "realm-execution/realm.h"
+#include "realm-execution/tasks/serializer/serializable_realm_processor.dtg.h"
+
+namespace FlexFlow {
+// Convert a Realm::Processor to and from its serializable counterpart.
+SerializableRealmProcessor
+    realm_processor_to_serializable(Realm::Processor const &proc);
+Realm::Processor
+    realm_processor_from_serializable(SerializableRealmProcessor const &proc);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_tensor_instance_backing.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_tensor_instance_backing.dtg.toml
new file mode 100644
index 0000000000..75a796b2ee
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_tensor_instance_backing.dtg.toml
@@ -0,0 +1,26 @@
+namespace = "FlexFlow"
+name = "SerializableTensorInstanceBacking"
+type = "struct"
+features = [
+  "eq",
+  "fmt",
+  "hash",
+  "json",
+]
+# Serializable counterpart of TensorInstanceBacking: the same map shape, with Serializable* types for the key, the instance and the event.
+includes = [
+  "<unordered_map>",
+  "realm-execution/tasks/serializer/serializable_realm_event.dtg.h",
+  "realm-execution/tasks/serializer/serializable_realm_instance.dtg.h",
+  "task-spec/dynamic_graph/serializable_dynamic_value_attrs.dtg.h",
+]
+
+src_includes = [
+  "utils/hash/unordered_map.h",
+  "utils/fmt/pair.h",
+  "utils/fmt/unordered_map.h",
+]
+# NOTE(review): std::pair is used below, but <utility> is not listed in includes above.
+[[fields]]
+name = "backing"
+type = "std::unordered_map<::FlexFlow::SerializableDynamicValueAttrs, std::pair<::FlexFlow::SerializableRealmInstance, ::FlexFlow::SerializableRealmEvent>>"
diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_tensor_instance_backing.h b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_tensor_instance_backing.h
new file mode 100644
index 0000000000..b536972b40
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/serializer/serializable_tensor_instance_backing.h
@@ -0,0 +1,16 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_TENSOR_INSTANCE_BACKING_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_SERIALIZABLE_TENSOR_INSTANCE_BACKING_H
+
+#include "realm-execution/tasks/serializer/serializable_tensor_instance_backing.dtg.h"
+#include "realm-execution/tensor_instance_backing.dtg.h"
+
+namespace FlexFlow {
+// Convert a TensorInstanceBacking to and from its serializable counterpart.
+SerializableTensorInstanceBacking tensor_instance_backing_to_serializable(
+    TensorInstanceBacking const &backing);
+TensorInstanceBacking tensor_instance_backing_from_serializable(
+    SerializableTensorInstanceBacking const &backing);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tasks/serializer/task_arg_serializer.h b/lib/realm-execution/include/realm-execution/tasks/serializer/task_arg_serializer.h
new file mode 100644
index 0000000000..3208368d2d
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/serializer/task_arg_serializer.h
@@ -0,0 +1,25 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_TASK_ARG_SERIALIZER_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_SERIALIZER_TASK_ARG_SERIALIZER_H
+
+#include <nlohmann/json.hpp>
+#include <string>
+#include <string_view>
+
+namespace FlexFlow {
+
+// Renders args as JSON text using nlohmann's conversion for T.
+template <typename T>
+std::string serialize_task_args(T const &args) {
+  return nlohmann::json(args).dump();
+}
+
+template <typename T>
+T deserialize_task_args(void const *args, size_t arglen) {
+  // Treat the arglen bytes at args as JSON text, parse it, and convert to T.
+  std::string_view text{static_cast<char const *>(args), arglen};
+  return nlohmann::json::parse(text).get<T>();
+}
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/task-spec/include/task-spec/task_id_t.dtg.toml b/lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml
similarity index 97%
rename from lib/task-spec/include/task-spec/task_id_t.dtg.toml
rename to lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml
index ce2de52d40..97b19b5f51 100644
--- a/lib/task-spec/include/task-spec/task_id_t.dtg.toml
+++ b/lib/realm-execution/include/realm-execution/tasks/task_id_t.dtg.toml
@@ -9,10 +9,16 @@ features = [
 ]
 
 [[values]]
-name = "TOP_LEVEL_TASK_ID"
+name = "CONTROLLER_TASK_ID"
 
 [[values]]
-name = "FF_INIT_TASK_ID"
+name = "DEVICE_HANDLE_INIT_TASK_ID"
+
+[[values]]
+name = "DEVICE_HANDLE_INIT_RETURN_TASK_ID"
+
+[[values]]
+name = "DEVICE_STATE_INIT_RETURN_TASK_ID"
 
 [[values]]
 name = "IMAGE_INIT_TASK_ID"
diff --git a/lib/realm-execution/include/realm-execution/tasks/task_id_t.h b/lib/realm-execution/include/realm-execution/tasks/task_id_t.h
new file mode 100644
index 0000000000..53945d2e5b
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tasks/task_id_t.h
@@ -0,0 +1,28 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_TASK_ID_T_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TASKS_TASK_ID_T_H
+
+#include "op-attrs/pcg_operator_attrs.dtg.h"
+#include "pcg/optimizer_attrs.dtg.h"
+#include "realm-execution/tasks/task_id_t.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_node_attrs.dtg.h"
+#include <optional>
+
+namespace FlexFlow {
+// Each lookup returns std::optional; presumably nullopt means "no such task".
+std::optional<task_id_t>
+    get_task_id_for_op(DynamicNodeAttrs const &node_attrs,
+                       std::optional<OptimizerAttrs> const &optimizer_attrs);
+
+std::optional<task_id_t>
+    get_init_task_id_for_op_attrs(PCGOperatorAttrs const &op_attrs);
+std::optional<task_id_t>
+    get_fwd_task_id_for_op_attrs(PCGOperatorAttrs const &op_attrs);
+std::optional<task_id_t>
+    get_bwd_task_id_for_op_attrs(PCGOperatorAttrs const &op_attrs);
+
+std::optional<task_id_t> get_update_task_id_for_optimizer_attrs(
+    OptimizerAttrs const &optimizer_attrs);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml b/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
new file mode 100644
index 0000000000..b8533dbcc9
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tensor_instance_backing.dtg.toml
@@ -0,0 +1,25 @@
+namespace = "FlexFlow"
+name = "TensorInstanceBacking"
+type = "struct"
+features = [
+  "eq",
+  "fmt",
+  #"hash",
+]
+# NOTE(review): "hash" is commented out above, yet utils/hash/unordered_map.h is still listed in src_includes below.
+includes = [
+  "<unordered_map>",
+  "realm-execution/realm.h",
+  "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h",
+]
+# NOTE(review): unlike serializable_tensor_instance_backing.dtg.toml, utils/fmt/pair.h is not listed here although the field holds a std::pair.
+src_includes = [
+  "realm-execution/fmt/realm_event.h",
+  "realm-execution/fmt/realm_instance.h",
+  "utils/fmt/unordered_map.h",
+  "utils/hash/unordered_map.h",
+]
+# Maps each DynamicValueAttrs to a (Realm::RegionInstance, Realm::Event) pair.
+[[fields]]
+name = "backing"
+type = "std::unordered_map<::FlexFlow::DynamicValueAttrs, std::pair<::FlexFlow::Realm::RegionInstance, ::FlexFlow::Realm::Event>>"
diff --git a/lib/realm-execution/include/realm-execution/tensor_instance_backing.h b/lib/realm-execution/include/realm-execution/tensor_instance_backing.h
new file mode 100644
index 0000000000..72a8bf439a
--- /dev/null
+++ b/lib/realm-execution/include/realm-execution/tensor_instance_backing.h
@@ -0,0 +1,16 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TENSOR_INSTANCE_BACKING_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_TENSOR_INSTANCE_BACKING_H
+
+#include "realm-execution/tensor_instance_backing.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
+
+namespace FlexFlow {
+
+TensorInstanceBacking make_empty_tensor_instance_backing();
+// Presumably keeps only the entries relevant to `invocation` (TODO confirm).
+TensorInstanceBacking subset_tensor_instance_backing_for_invocation(
+    TensorInstanceBacking const &backing,
+    DynamicNodeInvocation const &invocation);
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/src/realm-execution/atomic_dependency_set.cc b/lib/realm-execution/src/realm-execution/atomic_dependency_set.cc
new file mode 100644
index 0000000000..ba4fcc5a9f
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/atomic_dependency_set.cc
@@ -0,0 +1,27 @@
+#include "realm-execution/atomic_dependency_set.h"
+
+namespace FlexFlow {
+
+AtomicDependencySet::AtomicDependencySet(Realm::Event precondition)
+    : writer(precondition) {}
+// The stored writer event absorbs all earlier accesses; readers start over.
+void AtomicDependencySet::add_writer(Realm::Event writer) {
+  Realm::Event earlier_accesses = this->get_dependency_for_writer();
+  this->writer = Realm::Event::merge_events(writer, earlier_accesses);
+  this->readers.clear();
+}
+
+void AtomicDependencySet::add_reader(Realm::Event reader) {
+  this->readers.push_back(reader);
+}
+// A writer depends on the stored writer event and on every recorded reader.
+Realm::Event AtomicDependencySet::get_dependency_for_writer() const {
+  return Realm::Event::merge_events(
+      this->writer, Realm::Event::merge_events(this->readers));
+}
+// A reader depends only on the stored writer event.
+Realm::Event AtomicDependencySet::get_dependency_for_reader() const {
+  return this->writer;
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/dependency_set.cc b/lib/realm-execution/src/realm-execution/dependency_set.cc
new file mode 100644
index 0000000000..84412a125d
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/dependency_set.cc
@@ -0,0 +1,49 @@
+#include "realm-execution/dependency_set.h"
+#include "realm-execution/atomic_dependency_set.h"
+#include "utils/containers/contains_key.h"
+
+namespace FlexFlow {
+
+DependencySet::DependencySet(Realm::Event precondition)
+    : precondition(precondition) {}
+// Registers `writer` with the per-value dependency set for `value`.
+void DependencySet::add_writer(DynamicValueAttrs const &value,
+                               Realm::Event writer) {
+  this->get_atomic_dependency_set(value).add_writer(writer);
+}
+// Registers `reader` with the per-value dependency set for `value`.
+void DependencySet::add_reader(DynamicValueAttrs const &value,
+                               Realm::Event reader) {
+  this->get_atomic_dependency_set(value).add_reader(reader);
+}
+// Values with no entry yet depend only on the set-wide precondition.
+Realm::Event DependencySet::get_dependency_for_writer(
+    DynamicValueAttrs const &value) const {
+  auto found = this->atomic_dependencies.find(value);
+  if (found == this->atomic_dependencies.end()) {
+    return this->precondition;
+  }
+  return found->second.get_dependency_for_writer();
+}
+// Reader-side lookup with the same fallback to the precondition.
+Realm::Event DependencySet::get_dependency_for_reader(
+    DynamicValueAttrs const &value) const {
+  auto found = this->atomic_dependencies.find(value);
+  if (found == this->atomic_dependencies.end()) {
+    return this->precondition;
+  }
+  return found->second.get_dependency_for_reader();
+}
+// Returns the entry for `value`, first creating it from the precondition.
+AtomicDependencySet &
+    DependencySet::get_atomic_dependency_set(DynamicValueAttrs const &value) {
+  auto found = this->atomic_dependencies.find(value);
+  if (found == this->atomic_dependencies.end()) {
+    found = this->atomic_dependencies
+                .insert({value, AtomicDependencySet{this->precondition}})
+                .first;
+  }
+  return found->second;
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc b/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
new file mode 100644
index 0000000000..ae9fc669d3
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/device_specific_managed_per_device_ff_handle.cc
@@ -0,0 +1,22 @@
+#include "realm-execution/device_specific_managed_per_device_ff_handle.h"
+#include "kernels/device_handle_t.h"
+#include "utils/containers/transform.h"
+#include "utils/json/optional.h"
+#include <cstdint>
+
+namespace FlexFlow {
+// Pairs a device id with an optional ManagedPerDeviceFFHandle pointer.
+DeviceSpecificManagedPerDeviceFFHandle make_device_specific_managed_handle(
+    device_id_t const &device_id,
+    std::optional<ManagedPerDeviceFFHandle *> const &managed_handle) {
+  return DeviceSpecificManagedPerDeviceFFHandle{device_id, managed_handle};
+}
+// Fetches the handle for device_idx and converts it to a device_handle_t.
+device_handle_t device_handle_t_from_device_specific_managed_handle(
+    DeviceSpecificManagedPerDeviceFFHandle const &device_specific,
+    device_id_t device_idx) {
+  return device_handle_t_from_managed_handle_ptr(
+      device_specific.get(device_idx));
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/distributed_device_handle.cc b/lib/realm-execution/src/realm-execution/distributed_device_handle.cc
new file mode 100644
index 0000000000..87376be9b1
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/distributed_device_handle.cc
@@ -0,0 +1,51 @@
+#include "realm-execution/distributed_device_handle.h"
+#include "realm-execution/device_specific_managed_per_device_ff_handle.h"
+#include "realm-execution/tasks/impl/device_handle_init_task.h"
+#include "task-spec/device_specific.h"
+
+namespace FlexFlow {
+
+DistributedDeviceHandle::DistributedDeviceHandle(
+    std::unordered_map<Realm::Processor,
+                       DeviceSpecificManagedPerDeviceFFHandle> const &handles)
+    : handles(handles) {}
+
+DeviceSpecificManagedPerDeviceFFHandle const &
+    DistributedDeviceHandle::at(Realm::Processor processor) const {
+  return this->handles.at(processor);
+}
+// Spawns a device-handle init task per LOC_PROC/TOC_PROC processor and waits.
+DistributedDeviceHandle
+    create_distributed_device_handle(RealmContext &ctx,
+                                     size_t workSpaceSize,
+                                     bool allowTensorOpMathConversion,
+                                     Realm::Event precondition) {
+  std::unordered_map<Realm::Processor, DeviceSpecificManagedPerDeviceFFHandle>
+      handles;
+
+  // Allocate space for the result before launching any tasks
+  Realm::Machine::ProcessorQuery pq(Realm::Machine::get_machine());
+  for (Realm::Processor proc : pq) {
+    Realm::Processor::Kind kind = proc.kind();
+    if (kind != Realm::Processor::LOC_PROC &&
+        kind != Realm::Processor::TOC_PROC) {
+      continue;
+    }
+    handles.insert({proc,
+                    make_device_specific_managed_handle(
+                        ctx.get_current_device_idx(), std::nullopt)});
+  }
+  // Each task is handed the address of its own slot in `handles`.
+  for (auto &slot : handles) {
+    spawn_device_handle_init_task(ctx,
+                                  slot.first,
+                                  workSpaceSize,
+                                  allowTensorOpMathConversion,
+                                  &slot.second,
+                                  precondition);
+  }
+  ctx.get_outstanding_events().wait();
+  return DistributedDeviceHandle{handles};
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc b/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
new file mode 100644
index 0000000000..5c0aff00c2
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/distributed_device_state_initialization.cc
@@ -0,0 +1,84 @@
+#include "realm-execution/distributed_device_state_initialization.h"
+#include "local-execution/device_state_initialization.h"
+#include "realm-execution/tasks/impl/device_state_init_task.h"
+#include "realm-execution/tensor_instance_backing.dtg.h"
+#include "realm-execution/tensor_instance_backing.h"
+#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h"
+#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
+#include "utils/containers/map_values.h"
+#include "utils/containers/transform.h"
+#include "utils/containers/values.h"
+#include "utils/optional.h"
+#include <optional>
+#include <unordered_map>
+#include <utility>
+
+namespace FlexFlow {
+// Runs device-state init for each invocation in dg and gathers the results.
+PerDeviceOpStateBacking perform_distributed_device_state_initialization(
+    RealmContext &ctx,
+    DynamicOpenDataflowGraph const &dg,
+    TensorInstanceBacking const &tensor_instance_backing,
+    ProfilingSettings const &profiling_settings,
+    DistributedDeviceHandle const &device_handle,
+    FFIterationConfig const &iteration_config,
+    OptimizerAttrs const &optimizer_attrs,
+    Realm::Event precondition) {
+
+  // Initialize all operators and save the per-device op state
+  ASSERT(no_nodes_are_initialized(dg));
+  // NOTE(review): slots below use raw new/delete; a throw in between leaks.
+  std::unordered_map<DynamicNodeInvocation,
+                     DeviceSpecificPtr<PerDeviceOpState> *>
+      device_state_map;
+  for (DynamicNodeInvocation const &invocation : dg.invocations) {
+    Realm::Processor target_proc = ctx.map_device_coord_to_processor(
+        assert_unwrap(invocation.node_attrs.device_coord));
+
+    TensorInstanceBacking tensor_backing =
+        subset_tensor_instance_backing_for_invocation(tensor_instance_backing,
+                                                      invocation);
+    // Heap-allocated result slot; its address is handed to the spawned task.
+    DeviceSpecificPtr<PerDeviceOpState> *device_state_ptr =
+        new DeviceSpecificPtr<PerDeviceOpState>{ctx.get_current_device_idx(),
+                                                std::nullopt};
+
+    std::optional<Realm::Event> completion_event =
+        spawn_device_state_init_task(ctx,
+                                     target_proc,
+                                     invocation,
+                                     tensor_backing,
+                                     profiling_settings,
+                                     device_handle.at(target_proc),
+                                     iteration_config,
+                                     optimizer_attrs,
+                                     device_state_ptr,
+                                     precondition);
+
+    if (completion_event.has_value()) {
+      device_state_map.insert(std::pair{invocation, device_state_ptr});
+    } else {
+      // Task doesn't require initialization, clean up and don't store result
+      delete device_state_ptr;
+    }
+  }
+  // Block on the context's outstanding events before reading the slots.
+  ctx.get_outstanding_events().wait();
+
+  auto deref = [](DynamicNodeInvocation const &i,
+                  DeviceSpecificPtr<PerDeviceOpState> *const &p) {
+    return std::pair{i, *p};
+  };
+  std::unordered_map<DynamicNodeInvocation, DeviceSpecificPtr<PerDeviceOpState>>
+      result = transform(device_state_map, deref);
+  // `result` holds copies of the slot contents, so the slots can be freed.
+  for (DeviceSpecificPtr<PerDeviceOpState> *device_state_ptr :
+       values(device_state_map)) {
+    delete device_state_ptr;
+  }
+
+  return PerDeviceOpStateBacking{/*backing=*/result};
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/dynamic_tensor_accessor_from_instance.cc b/lib/realm-execution/src/realm-execution/dynamic_tensor_accessor_from_instance.cc
new file mode 100644
index 0000000000..a2a40e3752
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/dynamic_tensor_accessor_from_instance.cc
@@ -0,0 +1,64 @@
+#include "realm-execution/dynamic_tensor_accessor_from_instance.h"
+#include "op-attrs/parallel_tensor_shape.h"
+#include "pcg/device_type.dtg.h"
+#include "task-spec/permissions.h"
+#include "utils/exception.h"
+
+namespace FlexFlow {
+
+// Chooses the DeviceType from the memory kind; for Z_COPY_MEM the kind of the
+// accessing processor decides.
+static DeviceType infer_device_type_from_memory_and_processor(
+    Realm::Memory inst_memory, Realm::Processor for_processor) {
+  auto const memory_kind = inst_memory.kind();
+  DeviceType device_type;
+  if (memory_kind == Realm::Memory::SYSTEM_MEM) {
+    // Only accessible on CPU
+    device_type = DeviceType::CPU;
+  } else if (memory_kind == Realm::Memory::GPU_FB_MEM) {
+    // Only accessible on GPU
+    device_type = DeviceType::GPU;
+  } else if (memory_kind == Realm::Memory::Z_COPY_MEM) {
+    // Accessible on either CPU or GPU, so infer based on where we're trying
+    // to access from
+    auto const processor_kind = for_processor.kind();
+    if (processor_kind == Realm::Processor::LOC_PROC) {
+      device_type = DeviceType::CPU;
+    } else if (processor_kind == Realm::Processor::TOC_PROC) {
+      device_type = DeviceType::GPU;
+    } else {
+      PANIC("Unexpected Realm Processor kind", for_processor.kind());
+    }
+  } else {
+    PANIC("Unexpected Realm Memory kind", inst_memory.kind());
+  }
+  return device_type;
+}
+
+// Waits for `ready`, then wraps the instance's memory in a read-only accessor
+// for Permissions::RO and a writable one otherwise.
+DynamicTensorAccessor dynamic_tensor_accessor_from_instance(
+    Realm::RegionInstance inst,
+    Realm::Event ready,
+    ParallelTensorShape const &parallel_tensor_shape,
+    Permissions const &permissions,
+    Realm::Processor for_processor) {
+  ready.wait();
+
+  Realm::Memory inst_memory = inst.get_location();
+  DeviceType device_type =
+      infer_device_type_from_memory_and_processor(inst_memory, for_processor);
+
+  size_t expected_size =
+      int{get_piece_size_in_bytes(parallel_tensor_shape).unwrap_num_bytes()};
+  void *ptr = inst.pointer_untyped(/*offset=*/0, /*datalen=*/expected_size);
+
+  if (permissions == Permissions::RO) {
+    return DynamicTensorAccessor{GenericTensorAccessorR{
+        get_piece_shape(parallel_tensor_shape), ptr, device_type}};
+  }
+  return DynamicTensorAccessor{GenericTensorAccessorW{
+      get_piece_shape(parallel_tensor_shape), ptr, device_type}};
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/fmt/realm_event.cc b/lib/realm-execution/src/realm-execution/fmt/realm_event.cc
new file mode 100644
index 0000000000..a5aed9481d
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/fmt/realm_event.cc
@@ -0,0 +1,9 @@
+#include "realm-execution/fmt/realm_event.h"
+
+namespace FlexFlow {
+
+std::ostream &operator<<(std::ostream &s, ::FlexFlow::Realm::Event const &m) {
+  return s << fmt::to_string(m); // stream the fmt-rendered form of the event
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/fmt/realm_instance.cc b/lib/realm-execution/src/realm-execution/fmt/realm_instance.cc
new file mode 100644
index 0000000000..301954f824
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/fmt/realm_instance.cc
@@ -0,0 +1,10 @@
+#include "realm-execution/fmt/realm_instance.h"
+
+namespace FlexFlow {
+
+std::ostream &operator<<(std::ostream &s,
+                         ::FlexFlow::Realm::RegionInstance const &m) {
+  return s << fmt::to_string(m); // stream the fmt-rendered instance
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/instance_allocation.cc b/lib/realm-execution/src/realm-execution/instance_allocation.cc
new file mode 100644
index 0000000000..e003e5b71a
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/instance_allocation.cc
@@ -0,0 +1,82 @@
+#include "realm-execution/instance_allocation.h"
+#include "local-execution/tensor_allocation.h"
+#include "op-attrs/parallel_tensor_shape.h"
+#include "op-attrs/tensor_shape.dtg.h"
+#include "realm-execution/realm_context.h"
+#include "realm-execution/tensor_instance_backing.h"
+#include "task-spec/dynamic_graph/dynamic_node_attrs.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h"
+#include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
+#include "utils/bidict/generate_bidict.h"
+#include "utils/containers/all_are_true.h"
+#include "utils/containers/contains_key.h"
+#include "utils/containers/make.h"
+#include "utils/containers/map_values.h"
+#include "utils/containers/unordered_set_of.h"
+#include "utils/containers/values.h"
+#include "utils/exception.h"
+#include "utils/optional.h"
+
+namespace FlexFlow {
+
+std::pair<Realm::RegionInstance, Realm::Event>
+    perform_instance_allocation_for_value(DynamicNodeAttrs const &node,
+                                          DynamicValueAttrs const &value,
+                                          RealmContext &ctx) {
+  // Precondition: the value does not carry an accessor yet.
+  ASSERT(value.accessor == std::nullopt);
+  TensorShape piece = get_piece_shape(value.parallel_tensor_shape.value());
+  // Allocate in the memory reported as nearest to the node's processor.
+  Realm::Processor target = ctx.map_device_coord_to_processor(
+      assert_unwrap(node.device_coord));
+  Realm::Memory nearest = ctx.get_nearest_memory(target);
+  return ctx.create_instance(nearest, piece, Realm::ProfilingRequestSet());
+}
+
+TensorInstanceBacking perform_instance_allocation(
+    DynamicOpenDataflowGraph const &g,
+    std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
+        &preallocated,
+    RealmContext &ctx) {
+  ASSERT(no_tensors_are_allocated(g));
+  ASSERT(tensors_are_ready_for_allocation(g));
+  for (DynamicValueAttrs const &v : keys(preallocated)) {
+    ASSERT(v.accessor == std::nullopt);
+  }
+
+  TensorInstanceBacking result = make_empty_tensor_instance_backing();
+  // Allocate `v` on first sight; repeats reuse the entry stored in `result`.
+  auto allocate = [&](DynamicNodeAttrs const &n, DynamicValueAttrs const &v) {
+    if (contains_key(preallocated, v)) {
+      // FIXME: Attach external instance to existing allocation and use that
+      NOT_IMPLEMENTED();
+    }
+    if (!contains_key(result.backing, v)) {
+      result.backing.insert(
+          std::pair{v, perform_instance_allocation_for_value(n, v, ctx)});
+    }
+    return result.backing.at(v);
+  };
+
+  for (DynamicNodeInvocation const &invocation : g.invocations) {
+    for (DynamicValueAttrs const &input : values(invocation.inputs)) {
+      allocate(invocation.node_attrs, input);
+    }
+    for (DynamicValueAttrs const &output : values(invocation.outputs)) {
+      allocate(invocation.node_attrs, output);
+    }
+  }
+
+  return result;
+}
+
+void destroy_instances(TensorInstanceBacking const &instances,
+                       Realm::Event precondition) { // extra gate per destroy
+  for (auto const &[instance, ready] : values(instances.backing)) {
+    instance.destroy(Realm::Event::merge_events(precondition, ready));
+  }
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
new file mode 100644
index 0000000000..d78ed68988
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/pcg_instance/pcg_instance.cc
@@ -0,0 +1,324 @@
+#include "realm-execution/pcg_instance/pcg_instance.h"
+#include "op-attrs/tensor_slot_name.dtg.h"
+#include "pcg/optimizer_attrs.h"
+#include "realm-execution/dependency_set.h"
+#include "realm-execution/distributed_device_state_initialization.h"
+#include "realm-execution/instance_allocation.h"
+#include "realm-execution/realm_context.h"
+#include "realm-execution/tasks/impl/op_task.h"
+#include "realm-execution/tensor_instance_backing.h"
+#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h"
+#include "task-spec/dynamic_graph/dynamic_task_type.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
+#include "task-spec/dynamic_graph/loss_insertion.h"
+#include "task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.h"
+#include "task-spec/dynamic_graph/pass_expansion.h"
+#include "task-spec/dynamic_graph/shard_expansion.h"
+#include "task-spec/dynamic_graph/training_operation_attrs.dtg.h"
+#include "task-spec/dynamic_graph/update_insertion.h"
+#include "utils/containers/map_values.h"
+#include "utils/containers/transform.h"
+#include "utils/containers/try_at.h"
+#include "utils/containers/values.h"
+#include "utils/graph/digraph/algorithms/get_topological_ordering.h"
+#include "utils/optional.h"
+
+namespace FlexFlow {
+
+PCGInstance::PCGInstance( // member-wise initialization only; empty body
+    RealmContext &ctx,
+    std::vector<DynamicNodeInvocation> const &execution_order,
+    TensorInstanceBacking const &tensor_instance_backing,
+    PerDeviceOpStateBacking const &device_state_backing,
+    OptimizerAttrs const &optimizer_attrs,
+    std::optional<Realm::RegionInstance> logit_grad_tensor)
+    : ctx(ctx), execution_order(execution_order),
+      tensor_instance_backing(tensor_instance_backing),
+      device_state_backing(device_state_backing),
+      optimizer_attrs(optimizer_attrs), logit_grad_tensor(logit_grad_tensor) {}
+
+PCGInstance::~PCGInstance() { // instances are destroyed after pending ctx work
+  Realm::Event pending = this->ctx.get_outstanding_events();
+  destroy_instances(this->tensor_instance_backing, pending);
+}
+
+RealmContext &PCGInstance::get_realm_context() {
+  return this->ctx; // the context reference given at construction
+}
+std::vector<DynamicNodeInvocation> const &
+    PCGInstance::get_execution_order() const {
+  return this->execution_order;
+}
+TensorInstanceBacking const &PCGInstance::get_tensor_instance_backing() const {
+  return this->tensor_instance_backing;
+}
+PerDeviceOpStateBacking const &PCGInstance::get_device_state_backing() const {
+  return this->device_state_backing;
+}
+OptimizerAttrs const &PCGInstance::get_optimizer_attrs() const {
+  return this->optimizer_attrs;
+}
+void PCGInstance::update_optimizer_attrs_for_next_iter() {
+  this->optimizer_attrs = // overwritten in place with the next iteration's attrs
+      get_optimizer_attrs_for_next_iter(this->optimizer_attrs);
+}
+std::optional<Realm::RegionInstance>
+    PCGInstance::get_loss_tensor_instance() const {
+  return this->logit_grad_tensor; // note: this is the logit-gradient instance
+}
+
+PCGInstance create_pcg_instance(
+    RealmContext &ctx,
+    MappedParallelComputationGraph const &mpcg,
+    OptimizerAttrs const &optimizer_attrs,
+    std::optional<LossAttrs> const &loss_attrs,
+    std::optional<GenericTensorAccessorR> label_tensor,
+    std::optional<parallel_tensor_guid_t> logit_tensor,
+    std::optional<MappedOperatorTaskGroup> const &loss_mapping,
+    std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> const
+        &input_tensors,
+    ProfilingSettings const &profiling_settings,
+    DistributedDeviceHandle const &device_handle,
+    FFIterationConfig const &iteration_config) {
+  // Build the dynamic graph from the mapped PCG and run pass expansion.
+  DynamicOpenDataflowGraph dg =
+      make_dynamic_open_dataflow_graph_from_mpcg(mpcg);
+  dg = perform_pass_expansion(dg);
+  // Start from the caller's inputs; a label entry is added if there is a loss.
+  std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor> inputs =
+      input_tensors;
+  std::optional<DynamicValueAttrs> logit_grad_value;
+  if (loss_attrs) {
+    auto [dg2, label_v, logit_grad_v] = perform_loss_insertion(
+        dg,
+        assert_unwrap(loss_attrs),
+        dynamic_tensor_guid_t{assert_unwrap(logit_tensor)},
+        loss_mapping);
+    dg = dg2;
+    logit_grad_value = logit_grad_v;
+    inputs.insert(std::pair{label_v, assert_unwrap(label_tensor)});
+  }
+  // Finish the graph passes, then allocate instances for the final graph.
+  dg = perform_update_insertion(dg, optimizer_attrs);
+  dg = perform_shard_expansion(dg);
+  TensorInstanceBacking tensor_instance_backing =
+      perform_instance_allocation(dg, inputs, ctx);
+  // Re-find the logit gradient among the LOSS outputs of the final graph.
+  logit_grad_value =
+      transform(logit_grad_value, [&](DynamicValueAttrs const &lgv) {
+        for (DynamicNodeInvocation const &invocation : dg.invocations) {
+          if (invocation.node_attrs.task_type != DynamicTaskType::LOSS) {
+            continue;
+          }
+          for (auto const &[slot, value] : invocation.outputs) {
+            if (slot.slot_name == TensorSlotName::LOGIT &&
+                value.tensor_guid == lgv.tensor_guid &&
+                value.role == lgv.role) {
+              return value;
+            }
+          }
+        }
+        PANIC("couldn't find updated logit grad in the shard-expanded dynamic "
+              "graph");
+      });
+  // Keep only the instance (first of the pair) backing the logit gradient.
+  std::optional<Realm::RegionInstance> logit_grad_tensor =
+      transform(logit_grad_value, [&](DynamicValueAttrs const &lgv) {
+        return tensor_instance_backing.backing.at(lgv).first;
+      });
+
+  PerDeviceOpStateBacking device_state_backing =
+      perform_distributed_device_state_initialization(
+          ctx,
+          dg,
+          tensor_instance_backing,
+          profiling_settings,
+          device_handle,
+          iteration_config,
+          optimizer_attrs,
+          ctx.get_outstanding_events());
+
+  // Compute the topological ordering of the graph
+  auto [kwarg_graph, node_map] =
+      labelled_open_kwarg_dataflow_graph_from_dynamic_open_dataflow_graph(dg);
+  std::vector<Node> node_topo_order = get_topological_ordering(kwarg_graph);
+  std::vector<DynamicNodeInvocation> invocation_topo_order = transform(
+      node_topo_order, [&](Node node) { return node_map.at_l(node); });
+
+  return PCGInstance{/*ctx=*/ctx,
+                     /*execution_order=*/invocation_topo_order,
+                     /*tensor_instance_backing=*/tensor_instance_backing,
+                     /*device_state_backing=*/device_state_backing,
+                     /*optimizer_attrs=*/optimizer_attrs,
+                     /*logit_grad_tensor=*/logit_grad_tensor};
+}
+
+static std::unordered_map<dynamic_layer_guid_t, Realm::Event>
+    execute_distributed_dynamic_node_invocation_set(
+        RealmContext &ctx,
+        std::vector<DynamicNodeInvocation> const &invocations,
+        TensorInstanceBacking const &tensor_instance_backing,
+        PerDeviceOpStateBacking const &device_state_backing,
+        OptimizerAttrs const &optimizer_attrs,
+        ProfilingSettings const &profiling_settings,
+        DistributedDeviceHandle const &device_handle,
+        FFIterationConfig iteration_config) {
+  // For simplicity we'll track a dependency on all outstanding operations up to
+  // this point. This will create an effective barrier between phases.
+  DependencySet dependency_set{ctx.get_outstanding_events()};
+  return unordered_map_from_pairs(
+      transform(invocations, [&](DynamicNodeInvocation const &invocation) {
+        TrainingOperationAttrs op_attrs =
+            assert_unwrap(invocation.node_attrs.op_attrs);
+        if (op_attrs.is_pcg_op() && (op_attrs.require_pcg_op().is_input() ||
+                                     op_attrs.require_pcg_op().is_weight())) {
+          return std::pair{invocation.node_attrs.layer_guid,
+                           Realm::Event::NO_EVENT};
+        } // input/weight ops spawn no task and are reported with NO_EVENT
+        // Inputs contribute reader dependencies, outputs writer dependencies.
+        std::vector<Realm::Event> input_dependencies =
+            transform(vector_of(values(invocation.inputs)),
+                      [&](DynamicValueAttrs const &value) {
+                        return dependency_set.get_dependency_for_reader(value);
+                      });
+        std::vector<Realm::Event> output_dependencies =
+            transform(vector_of(values(invocation.outputs)),
+                      [&](DynamicValueAttrs const &value) {
+                        return dependency_set.get_dependency_for_writer(value);
+                      });
+        Realm::Event dependencies = Realm::Event::merge_events(
+            Realm::Event::merge_events(input_dependencies),
+            Realm::Event::merge_events(output_dependencies));
+        Realm::Processor target_proc = ctx.map_device_coord_to_processor(
+            assert_unwrap(invocation.node_attrs.device_coord));
+
+        TensorInstanceBacking tensor_backing =
+            subset_tensor_instance_backing_for_invocation(
+                tensor_instance_backing, invocation);
+        // NOTE(review): dependency_set is mutated below, so this assumes
+        Realm::Event result = // transform visits invocations in order - confirm
+            spawn_op_task(ctx,
+                          target_proc,
+                          invocation,
+                          tensor_backing,
+                          try_at(device_state_backing.backing, invocation),
+                          profiling_settings,
+                          device_handle.at(target_proc),
+                          iteration_config,
+                          optimizer_attrs,
+                          dependencies);
+        for (DynamicValueAttrs const &value : values(invocation.inputs)) {
+          dependency_set.add_reader(value, result);
+        }
+        for (DynamicValueAttrs const &value : values(invocation.outputs)) {
+          dependency_set.add_writer(value, result);
+        }
+        return std::pair{invocation.node_attrs.layer_guid, result};
+      }));
+}
+
+std::unordered_map<dynamic_layer_guid_t, Realm::Event>
+    perform_all_passes_for_pcg_instance(
+        PCGInstance &pcg_instance,
+        ProfilingSettings const &profiling_settings,
+        DistributedDeviceHandle const &device_handle,
+        FFIterationConfig iteration_config) {
+  // Issues every invocation in the stored execution order, then advances the
+  // optimizer attrs held by the instance for the next iteration.
+  std::vector<DynamicNodeInvocation> all_invocations =
+      pcg_instance.get_execution_order();
+  auto events = execute_distributed_dynamic_node_invocation_set(
+      /*ctx=*/pcg_instance.get_realm_context(),
+      /*invocations=*/all_invocations,
+      /*tensor_instance_backing=*/pcg_instance.get_tensor_instance_backing(),
+      /*device_state_backing=*/pcg_instance.get_device_state_backing(),
+      /*optimizer_attrs=*/pcg_instance.get_optimizer_attrs(),
+      /*profiling_settings=*/profiling_settings,
+      /*device_handle=*/device_handle,
+      /*iteration_config=*/iteration_config);
+  pcg_instance.update_optimizer_attrs_for_next_iter();
+  return events;
+}
+
+std::unordered_map<dynamic_layer_guid_t, Realm::Event>
+    perform_forward_pass_for_pcg_instance(
+        PCGInstance &pcg_instance,
+        ProfilingSettings const &profiling_settings,
+        DistributedDeviceHandle const &device_handle,
+        FFIterationConfig iteration_config) {
+  // Select the FWD invocations from the stored execution order and issue them.
+  auto is_forward = [](DynamicNodeInvocation const &invocation) {
+    return assert_unwrap(invocation.node_attrs.task_type) ==
+           DynamicTaskType::FWD;
+  };
+  std::vector<DynamicNodeInvocation> forward_invocations =
+      filter(pcg_instance.get_execution_order(), is_forward);
+
+  return execute_distributed_dynamic_node_invocation_set(
+      /*ctx=*/pcg_instance.get_realm_context(),
+      /*invocations=*/forward_invocations,
+      /*tensor_instance_backing=*/pcg_instance.get_tensor_instance_backing(),
+      /*device_state_backing=*/pcg_instance.get_device_state_backing(),
+      /*optimizer_attrs=*/pcg_instance.get_optimizer_attrs(),
+      /*profiling_settings=*/profiling_settings,
+      /*device_handle=*/device_handle,
+      /*iteration_config=*/iteration_config);
+}
+
+std::unordered_map<dynamic_layer_guid_t, Realm::Event>
+    perform_backward_pass_for_pcg_instance(
+        PCGInstance &pcg_instance,
+        ProfilingSettings const &profiling_settings,
+        DistributedDeviceHandle const &device_handle,
+        FFIterationConfig iteration_config) {
+  // Select the BWD invocations from the stored execution order and issue them.
+  auto is_backward = [](DynamicNodeInvocation const &invocation) {
+    return assert_unwrap(invocation.node_attrs.task_type) ==
+           DynamicTaskType::BWD;
+  };
+  std::vector<DynamicNodeInvocation> backward_invocations =
+      filter(pcg_instance.get_execution_order(), is_backward);
+
+  return execute_distributed_dynamic_node_invocation_set(
+      /*ctx=*/pcg_instance.get_realm_context(),
+      /*invocations=*/backward_invocations,
+      /*tensor_instance_backing=*/pcg_instance.get_tensor_instance_backing(),
+      /*device_state_backing=*/pcg_instance.get_device_state_backing(),
+      /*optimizer_attrs=*/pcg_instance.get_optimizer_attrs(),
+      /*profiling_settings=*/profiling_settings,
+      /*device_handle=*/device_handle,
+      /*iteration_config=*/iteration_config);
+}
+
+std::unordered_map<dynamic_layer_guid_t, Realm::Event>
+    perform_update_pass_for_pcg_instance(
+        PCGInstance &pcg_instance,
+        ProfilingSettings const &profiling_settings,
+        DistributedDeviceHandle const &device_handle,
+        FFIterationConfig iteration_config) {
+  // Select the UPD invocations from the stored execution order, issue them,
+  // and then advance the optimizer attrs held by the instance.
+  auto is_update = [](DynamicNodeInvocation const &invocation) {
+    return assert_unwrap(invocation.node_attrs.task_type) ==
+           DynamicTaskType::UPD;
+  };
+  std::vector<DynamicNodeInvocation> update_invocations =
+      filter(pcg_instance.get_execution_order(), is_update);
+
+  auto events = execute_distributed_dynamic_node_invocation_set(
+      /*ctx=*/pcg_instance.get_realm_context(),
+      /*invocations=*/update_invocations,
+      /*tensor_instance_backing=*/pcg_instance.get_tensor_instance_backing(),
+      /*device_state_backing=*/pcg_instance.get_device_state_backing(),
+      /*optimizer_attrs=*/pcg_instance.get_optimizer_attrs(),
+      /*profiling_settings=*/profiling_settings,
+      /*device_handle=*/device_handle,
+      /*iteration_config=*/iteration_config);
+  // The attrs are stepped only after the update tasks have been issued.
+  pcg_instance.update_optimizer_attrs_for_next_iter();
+  return events;
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/realm_allocator.cc b/lib/realm-execution/src/realm-execution/realm_allocator.cc
new file mode 100644
index 0000000000..194210cf5a
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/realm_allocator.cc
@@ -0,0 +1,64 @@
+#include "realm-execution/realm_allocator.h"
+#include "kernels/device.h"
+#include "pcg/device_type.dtg.h"
+#include "utils/containers/contains_key.h"
+#include "utils/containers/values.h"
+
+namespace FlexFlow {
+
+RealmAllocator::RealmAllocator(Realm::Processor processor, Realm::Memory memory)
+    : processor(processor), memory(memory) {} // stores both; allocates nothing
+
+RealmAllocator::~RealmAllocator() {
+  for (Realm::RegionInstance const &instance : values(this->ptr_instances)) {
+    instance.destroy(Realm::Event::NO_EVENT); // still-tracked instances only
+  }
+}
+
+void *RealmAllocator::allocate(size_t requested_memory_size) {
+  Realm::Rect<1> bounds{Realm::Point<1>::ZEROES(), // lo = 0, hi = size - 1
+                        Realm::Point<1>{requested_memory_size} -
+                            Realm::Point<1>::ONES()}; // NOTE(review): confirm size_t fits the coordinate type
+  std::vector<size_t> field_sizes{1}; // a single field of size 1
+  Realm::RegionInstance inst;
+  Realm::Event ready =
+      Realm::RegionInstance::create_instance(inst,
+                                             this->memory,
+                                             bounds,
+                                             field_sizes,
+                                             0 /*SOA*/,
+                                             Realm::ProfilingRequestSet{});
+  ready.wait(); // wait on the creation event before taking the pointer
+  void *ptr =
+      inst.pointer_untyped(/*offset=*/0, /*datalen=*/requested_memory_size);
+  ASSERT(ptr);
+  this->ptr_instances.insert({ptr, inst}); // looked up again by deallocate()
+  return ptr;
+}
+
+void RealmAllocator::deallocate(void *ptr) {
+  // Only pointers currently tracked in ptr_instances may be released.
+  ASSERT(contains_key(this->ptr_instances, ptr),
+         "Deallocating a pointer that was not allocated by this Allocator");
+  this->ptr_instances.at(ptr).destroy(Realm::Event::NO_EVENT);
+  this->ptr_instances.erase(ptr);
+}
+
+DeviceType RealmAllocator::get_allocation_device_type() const {
+  switch (this->processor.kind()) { // device type follows the processor kind
+    case Realm::Processor::Kind::LOC_PROC:
+      return DeviceType::CPU;
+    case Realm::Processor::Kind::TOC_PROC:
+      return DeviceType::GPU;
+    default: // any other processor kind has no DeviceType mapping here
+      PANIC("Unhandled Realm::Processor::Kind", this->processor.kind());
+  }
+}
+
+Allocator get_realm_allocator(Realm::Processor processor,
+                              Realm::Memory memory) {
+  // Wrap a RealmAllocator for (processor, memory) in the generic Allocator.
+  return Allocator::create<RealmAllocator>(processor, memory);
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/realm_context.cc b/lib/realm-execution/src/realm-execution/realm_context.cc
new file mode 100644
index 0000000000..10ed07118b
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/realm_context.cc
@@ -0,0 +1,253 @@
+#include "realm-execution/realm_context.h"
+#include "kernels/device_handle_t.dtg.h"
+#include "kernels/device_handle_t.h"
+#include "op-attrs/datatype.h"
+#include "op-attrs/tensor_dims.dtg.h"
+#include "pcg/device_id_t.h"
+#include "pcg/device_type.dtg.h"
+#include "realm-execution/realm_allocator.h"
+#include "realm-execution/tasks/realm_task_id_t.h"
+#include "realm-execution/tasks/task_id_t.dtg.h"
+#include "utils/containers/contains_key.h"
+#include "utils/containers/transform.h"
+#include "utils/nonnegative_int/nonnegative_int.h"
+#include "utils/one_to_many/one_to_many.h"
+#include "utils/positive_int/positive_int.h"
+
+namespace FlexFlow {
+
+RealmContext::RealmContext(Realm::Processor processor)
+    : processor(processor), // allocator is bound to get_nearest_memory(processor)
+      allocator(get_realm_allocator(
+          processor, RealmContext::get_nearest_memory(processor))) {}
+
+RealmContext::~RealmContext() {
+  // Block until everything still recorded as outstanding has finished.
+  if (!this->outstanding_events.empty()) {
+    this->merge_outstanding_events().wait();
+  }
+}
+
+static std::tuple<Realm::AddressSpace, Realm::Processor::Kind, nonnegative_int>
+    convert_machine_space_coordinate(
+        MachineSpaceCoordinate const &device_coord) {
+  // node_idx becomes the address space, device_type picks the processor kind,
+  // and device_idx is passed through unchanged.
+  Realm::AddressSpace address_space = int{device_coord.node_idx};
+  nonnegative_int index_in_node = device_coord.device_idx;
+
+  switch (device_coord.device_type) {
+    case DeviceType::CPU:
+      return std::tuple{
+          address_space, Realm::Processor::Kind::LOC_PROC, index_in_node};
+    case DeviceType::GPU:
+      return std::tuple{
+          address_space, Realm::Processor::Kind::TOC_PROC, index_in_node};
+    default:
+      PANIC("Unhandled DeviceType", fmt::to_string(device_coord.device_type));
+  }
+}
+
+Realm::Processor RealmContext::map_device_coord_to_processor(
+    MachineSpaceCoordinate const &device_coord) {
+  this->discover_machine_topology(); // fills this->processors if still empty
+  auto [as, kind, proc_in_node] =
+      convert_machine_space_coordinate(device_coord);
+  return this->processors.at(std::pair{as, kind}).at(int{proc_in_node});
+}
+
+Realm::Memory RealmContext::get_nearest_memory(Realm::Processor proc) {
+  if (!proc.exists()) {
+    return Realm::Memory::NO_MEMORY; // no processor, so no memory to pick
+  }
+
+  // FIXME: this isn't going to do what you expect until
+  // https://github.com/StanfordLegion/realm/pull/392 merges
+  Realm::Machine::MemoryQuery mq(Realm::Machine::get_machine());
+  mq.best_affinity_to(proc);
+  ASSERT(mq.count() > 0);
+  return mq.first();
+}
+
+Realm::Processor RealmContext::get_current_processor() const {
+  return this->processor; // the processor given at construction
+}
+
+Allocator &RealmContext::get_current_device_allocator() {
+  return this->allocator; // the allocator created in the constructor
+}
+
+device_id_t RealmContext::get_current_device_idx() const {
+  Realm::Processor proc = this->get_current_processor();
+  // Index of `proc` among processors of its own kind in its address space,
+  // i.e. the same (address space, kind) grouping used by `this->processors`.
+  // FIXME: find a more efficient way to implement this than scanning the
+  // machine every time
+  Realm::Machine::ProcessorQuery pq(Realm::Machine::get_machine());
+  pq.same_address_space_as(proc).only_kind(proc.kind());
+  nonnegative_int idx{0};
+  for (Realm::Processor p : pq) {
+    if (p == proc) {
+      break;
+    }
+    idx++;
+  }
+  switch (proc.kind()) {
+    case Realm::Processor::LOC_PROC:
+      return make_device_id_t_from_idx(idx, DeviceType::CPU);
+    case Realm::Processor::TOC_PROC:
+      return make_device_id_t_from_idx(idx, DeviceType::GPU);
+    default:
+      PANIC("Unhandled Realm::ProcessorKind", fmt::to_string(int{proc.kind()}));
+  }
+}
+
+Realm::Event
+    RealmContext::spawn_task(Realm::Processor proc,
+                             task_id_t task_id,
+                             void const *args,
+                             size_t arglen,
+                             Realm::ProfilingRequestSet const &requests,
+                             Realm::Event wait_on,
+                             int priority) {
+  // Launch the task on `proc` under its Realm task id and record the returned
+  // event so it is included in later outstanding-event merges.
+  auto realm_task_id = get_realm_task_id_for_task_id(task_id);
+  Realm::Event finished =
+      proc.spawn(realm_task_id, args, arglen, requests, wait_on, priority);
+
+  this->outstanding_events.push_back(finished);
+  return finished;
+}
+
+Realm::Event RealmContext::collective_spawn_task(Realm::Processor target_proc,
+                                                 task_id_t task_id,
+                                                 void const *args,
+                                                 size_t arglen,
+                                                 Realm::Event wait_on,
+                                                 int priority) {
+  // Collective launch: goes through `runtime.collective_spawn` rather than a
+  // processor's own spawn, and records the returned event as outstanding so
+  // it is included in later outstanding-event merges.
+  auto realm_task_id = get_realm_task_id_for_task_id(task_id);
+  Realm::Event finished = this->runtime.collective_spawn(
+      target_proc, realm_task_id, args, arglen, wait_on, priority);
+
+  this->outstanding_events.push_back(finished);
+  return finished;
+}
+
+template <int N, typename T = int>
+static Realm::Rect<N, T> rect_from_dims(TensorDims const &dims) {
+  // Bounds are inclusive: lo is all zeros and hi is (extent - 1) per dimension.
+  using Pt = Realm::Point<N, T>;
+  std::vector<int> values{dims.ff_ordered.begin(), dims.ff_ordered.end()};
+  ASSERT(values.size() == N);
+  return Realm::Rect<N, T>{Pt::ZEROES(), Pt{values.data()} - Pt::ONES()};
+}
+
+std::pair<Realm::RegionInstance, Realm::Event>
+    RealmContext::create_instance(Realm::Memory memory,
+                                  TensorShape const &shape,
+                                  Realm::ProfilingRequestSet const &prs,
+                                  Realm::Event wait_on) {
+  std::vector<size_t> field_sizes{ // one field, sized by the shape's data type
+      static_cast<size_t>(int{size_of_datatype(shape.data_type)})};
+  Realm::RegionInstance inst;
+  Realm::Event ready;
+  switch (shape.dims.ff_ordered.num_dims()) { // Rect<N> needs a compile-time N
+#if REALM_MAX_DIM >= 1
+    case 1:
+      ready =
+          Realm::RegionInstance::create_instance(inst,
+                                                 memory,
+                                                 rect_from_dims<1>(shape.dims),
+                                                 field_sizes,
+                                                 0 /*SOA*/,
+                                                 prs,
+                                                 wait_on);
+      break;
+#endif
+#if REALM_MAX_DIM >= 2
+    case 2:
+      ready =
+          Realm::RegionInstance::create_instance(inst,
+                                                 memory,
+                                                 rect_from_dims<2>(shape.dims),
+                                                 field_sizes,
+                                                 0 /*SOA*/,
+                                                 prs,
+                                                 wait_on);
+      break;
+#endif
+#if REALM_MAX_DIM >= 3
+    case 3:
+      ready =
+          Realm::RegionInstance::create_instance(inst,
+                                                 memory,
+                                                 rect_from_dims<3>(shape.dims),
+                                                 field_sizes,
+                                                 0 /*SOA*/,
+                                                 prs,
+                                                 wait_on);
+      break;
+#endif
+#if REALM_MAX_DIM >= 4
+    case 4:
+      ready =
+          Realm::RegionInstance::create_instance(inst,
+                                                 memory,
+                                                 rect_from_dims<4>(shape.dims),
+                                                 field_sizes,
+                                                 0 /*SOA*/,
+                                                 prs,
+                                                 wait_on);
+      break;
+#endif
+#if REALM_MAX_DIM >= 5
+    case 5:
+      ready =
+          Realm::RegionInstance::create_instance(inst,
+                                                 memory,
+                                                 rect_from_dims<5>(shape.dims),
+                                                 field_sizes,
+                                                 0 /*SOA*/,
+                                                 prs,
+                                                 wait_on);
+      break;
+#endif
+    default: // also reached for any dim count without a case above, e.g. 0
+      PANIC("TensorShape dims greater than REALM_MAX_DIM",
+            fmt::to_string(shape.dims.ff_ordered.num_dims()));
+      break;
+  }
+  this->outstanding_events.push_back(ready); // tracked like spawned tasks
+  return std::pair{inst, ready};
+}
+
+Realm::Event RealmContext::get_outstanding_events() {
+  Realm::Event result = this->merge_outstanding_events();
+  this->outstanding_events.push_back(result); // keep tracking the merged event
+  return result;
+}
+
+Realm::Event RealmContext::merge_outstanding_events() {
+  Realm::Event result = Realm::Event::merge_events(this->outstanding_events);
+  this->outstanding_events.clear(); // list is emptied; only `result` remains
+  return result;
+}
+
+void RealmContext::discover_machine_topology() {
+  // Populated on the first call only; later calls return immediately.
+  if (!this->processors.empty()) {
+    return;
+  }
+  // Group all processors by (address space, kind) in query order.
+  Realm::Machine::ProcessorQuery all_procs(Realm::Machine::get_machine());
+  for (Realm::Processor proc : all_procs) {
+    this->processors[std::pair{proc.address_space(), proc.kind()}]
+        .push_back(proc);
+  }
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/realm_manager.cc b/lib/realm-execution/src/realm-execution/realm_manager.cc
new file mode 100644
index 0000000000..fc74fffe5d
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/realm_manager.cc
@@ -0,0 +1,34 @@
+#include "realm-execution/realm_manager.h"
+#include "realm-execution/realm_context.h"
+#include "realm-execution/tasks/impl/controller_task.h"
+#include "realm-execution/tasks/realm_task_registry.h"
+
+namespace FlexFlow {
+
+RealmManager::RealmManager(int *argc, char ***argv)
+    : RealmContext(Realm::Processor::NO_PROC) {
+  bool ok = this->runtime.init(argc, argv);
+  ASSERT(ok);
+  // Register every task up front and block until all registrations finish
+  Realm::Event all_registered = register_all_tasks();
+  all_registered.wait();
+}
+
+RealmManager::~RealmManager() {
+  // Pass the merged pending events to shutdown, then block until it is done.
+  this->runtime.shutdown(this->merge_outstanding_events());
+  this->runtime.wait_for_shutdown();
+}
+
+Realm::Event
+    RealmManager::start_controller(std::function<void(RealmContext &)> thunk,
+                                   Realm::Event wait_on) {
+  // The controller runs on the first CPU (LOC_PROC) processor in the machine.
+  Realm::Machine::ProcessorQuery pq(Realm::Machine::get_machine());
+  Realm::Processor target_proc =
+      pq.only_kind(Realm::Processor::LOC_PROC).first();
+  ASSERT(target_proc.exists()); // first() yields NO_PROC on an empty query
+  return collective_spawn_controller_task(*this, target_proc, thunk, wait_on);
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/controller_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/controller_task.cc
new file mode 100644
index 0000000000..285e8acaa7
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/controller_task.cc
@@ -0,0 +1,39 @@
+#include "realm-execution/tasks/impl/op_task.h"
+#include "realm-execution/tasks/task_id_t.h"
+
+namespace FlexFlow {
+
+// Realm copies task arguments bytewise and std::function is not trivially
+// copyable, so the arguments carry only a pointer to a heap-allocated thunk.
+struct ControllerTaskArgs {
+  std::function<void(RealmContext &)> *thunk;
+};
+
+void controller_task_body(void const *args,
+                          size_t arglen,
+                          void const *userdata,
+                          size_t userlen,
+                          Realm::Processor proc) {
+  ASSERT(arglen == sizeof(ControllerTaskArgs));
+  auto task_args = *static_cast<ControllerTaskArgs const *>(args);
+  RealmContext ctx{proc};
+  (*task_args.thunk)(ctx);
+  delete task_args.thunk; // allocated by collective_spawn_controller_task
+}
+
+// NOTE(review): the pointer is only meaningful in this address space, and the
+// copy is never freed on a rank where the controller task does not execute.
+Realm::Event
+    collective_spawn_controller_task(RealmContext &ctx,
+                                     Realm::Processor &target_proc,
+                                     std::function<void(RealmContext &)> thunk,
+                                     Realm::Event precondition) {
+  ControllerTaskArgs task_args{new std::function<void(RealmContext &)>(thunk)};
+  return ctx.collective_spawn_task(target_proc,
+                                   task_id_t::CONTROLLER_TASK_ID,
+                                   &task_args,
+                                   sizeof(task_args),
+                                   precondition);
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_return_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_return_task.cc
new file mode 100644
index 0000000000..bda6f7781c
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_return_task.cc
@@ -0,0 +1,52 @@
+#include "realm-execution/tasks/impl/device_handle_init_task.h"
+#include "realm-execution/tasks/task_id_t.dtg.h"
+
+namespace FlexFlow {
+
+// Arguments of the device-handle-init return task (passed as raw bytes).
+struct DeviceHandleInitReturnTaskArgs {
+  DeviceHandleInitReturnTaskArgs() = delete;
+  DeviceHandleInitReturnTaskArgs(
+      DeviceSpecificManagedPerDeviceFFHandle handle,
+      Realm::Processor origin,
+      DeviceSpecificManagedPerDeviceFFHandle *destination)
+      : result(handle), origin_proc(origin), origin_result_ptr(destination) {}
+
+  DeviceSpecificManagedPerDeviceFFHandle result;
+  // Processor the return task is spawned on
+  Realm::Processor origin_proc;
+  // Location the return task writes `result` to
+  DeviceSpecificManagedPerDeviceFFHandle *origin_result_ptr;
+};
+
+void device_handle_init_return_task_body(void const *args,
+                                         size_t arglen,
+                                         void const *userdata,
+                                         size_t userlen,
+                                         Realm::Processor proc) {
+  ASSERT(arglen == sizeof(DeviceHandleInitReturnTaskArgs));
+  auto const *raw = static_cast<DeviceHandleInitReturnTaskArgs const *>(args);
+  DeviceHandleInitReturnTaskArgs task_args = *raw;
+  // The destination pointer is only dereferenced in the origin address space.
+  ASSERT(task_args.origin_proc.address_space() == proc.address_space());
+  *task_args.origin_result_ptr = task_args.result;
+}
+
+Realm::Event spawn_device_handle_init_return_task(
+    RealmContext &ctx,
+    Realm::Processor origin_proc,
+    DeviceSpecificManagedPerDeviceFFHandle const &result,
+    DeviceSpecificManagedPerDeviceFFHandle *origin_result_ptr,
+    Realm::Event precondition) {
+  // The argument struct is handed to spawn_task as a pointer/size pair.
+  DeviceHandleInitReturnTaskArgs msg{result, origin_proc, origin_result_ptr};
+
+  return ctx.spawn_task(origin_proc,
+                        task_id_t::DEVICE_HANDLE_INIT_RETURN_TASK_ID,
+                        &msg,
+                        sizeof(msg),
+                        Realm::ProfilingRequestSet{},
+                        precondition);
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc
new file mode 100644
index 0000000000..b806aa1277
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_handle_init_task.cc
@@ -0,0 +1,81 @@
+#include "realm-execution/tasks/impl/device_handle_init_task.h"
+#include "realm-execution/device_specific_managed_per_device_ff_handle.h"
+#include "realm-execution/tasks/impl/device_handle_init_return_task.h"
+#include "realm-execution/tasks/impl/device_handle_init_task_args.dtg.h"
+#include "realm-execution/tasks/impl/serializable_device_handle_init_task_args.h"
+#include "realm-execution/tasks/serializer/task_arg_serializer.h"
+#include "realm-execution/tasks/task_id_t.dtg.h"
+#include <type_traits>
+
+namespace FlexFlow {
+
+static std::optional<ManagedPerDeviceFFHandle *>
+    make_device_handle_for_processor(Realm::Processor processor,
+                                     size_t workSpaceSize,
+                                     bool allowTensorOpMathConversion) {
+  // LOC_PROC gets no handle; TOC_PROC gets a newly allocated one.
+  Realm::Processor::Kind const kind = processor.kind();
+  if (kind == Realm::Processor::LOC_PROC) {
+    return std::nullopt;
+  }
+  if (kind == Realm::Processor::TOC_PROC) {
+    return new ManagedPerDeviceFFHandle{initialize_multi_gpu_handle(
+        /*num_ranks=*/Realm::Machine::get_machine().get_address_space_count(),
+        /*my_rank=*/processor.address_space(),
+        /*workSpaceSize=*/workSpaceSize,
+        /*allowTensorOpMathConversion=*/allowTensorOpMathConversion)};
+  }
+  PANIC("Unhandled Realm::ProcessorKind", fmt::to_string(int{kind}));
+}
+
+void device_handle_init_task_body(void const *args,
+                                  size_t arglen,
+                                  void const *userdata,
+                                  size_t userlen,
+                                  Realm::Processor proc) {
+  SerializableDeviceHandleInitTaskArgs wire_args =
+      deserialize_task_args<SerializableDeviceHandleInitTaskArgs>(args, arglen);
+  DeviceHandleInitTaskArgs task_args =
+      device_handle_init_task_args_from_serializable(wire_args);
+  // Build this processor's handle, tagged with the current device index.
+  RealmContext ctx{proc};
+  DeviceSpecificManagedPerDeviceFFHandle managed_handle =
+      make_device_specific_managed_handle(
+          ctx.get_current_device_idx(),
+          make_device_handle_for_processor(
+              proc,
+              task_args.workSpaceSize,
+              task_args.allowTensorOpMathConversion));
+  // NOTE(review): the return task's event is not waited on here - confirm.
+  spawn_device_handle_init_return_task(ctx,
+                                       task_args.origin_proc,
+                                       managed_handle,
+                                       task_args.origin_result_ptr,
+                                       Realm::Event::NO_EVENT);
+}
+
+Realm::Event spawn_device_handle_init_task(
+    RealmContext &ctx,
+    Realm::Processor target_proc,
+    size_t workSpaceSize,
+    bool allowTensorOpMathConversion,
+    DeviceSpecificManagedPerDeviceFFHandle *result_ptr,
+    Realm::Event precondition) {
+  // The current processor is recorded as the origin that receives the result.
+  Realm::Processor const origin_proc = ctx.get_current_processor();
+  DeviceHandleInitTaskArgs task_args{
+      workSpaceSize, allowTensorOpMathConversion, origin_proc, result_ptr};
+  SerializableDeviceHandleInitTaskArgs wire_args =
+      device_handle_init_task_args_to_serializable(task_args);
+
+  // Serialize into a byte string; spawn_task receives its data and size.
+  std::string payload = serialize_task_args(wire_args);
+  return ctx.spawn_task(target_proc,
+                        task_id_t::DEVICE_HANDLE_INIT_TASK_ID,
+                        payload.data(),
+                        payload.size(),
+                        Realm::ProfilingRequestSet{},
+                        precondition);
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_return_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_return_task.cc
new file mode 100644
index 0000000000..a1a7eb84a8
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_return_task.cc
@@ -0,0 +1,52 @@
+#include "realm-execution/tasks/impl/device_state_init_return_task.h"
+#include "realm-execution/tasks/task_id_t.dtg.h"
+
+namespace FlexFlow {
+
+// Arguments of the device-state-init return task (passed as raw bytes).
+struct DeviceStateInitReturnTaskArgs {
+  DeviceStateInitReturnTaskArgs() = delete;
+  DeviceStateInitReturnTaskArgs(
+      DeviceSpecificPtr<PerDeviceOpState> state,
+      Realm::Processor origin,
+      DeviceSpecificPtr<PerDeviceOpState> *destination)
+      : result(state), origin_proc(origin), origin_result_ptr(destination) {}
+
+  DeviceSpecificPtr<PerDeviceOpState> result;
+  // Processor the return task is spawned on
+  Realm::Processor origin_proc;
+  // Location the return task writes `result` to
+  DeviceSpecificPtr<PerDeviceOpState> *origin_result_ptr;
+};
+
+void device_state_init_return_task_body(void const *args,
+                                        size_t arglen,
+                                        void const *userdata,
+                                        size_t userlen,
+                                        Realm::Processor proc) {
+  ASSERT(arglen == sizeof(DeviceStateInitReturnTaskArgs));
+  auto const *raw = static_cast<DeviceStateInitReturnTaskArgs const *>(args);
+  DeviceStateInitReturnTaskArgs task_args = *raw;
+  // The destination pointer is only dereferenced in the origin address space.
+  ASSERT(task_args.origin_proc.address_space() == proc.address_space());
+  *task_args.origin_result_ptr = task_args.result;
+}
+
+Realm::Event spawn_device_state_init_return_task(
+    RealmContext &ctx,
+    Realm::Processor origin_proc,
+    DeviceSpecificPtr<PerDeviceOpState> const &result,
+    DeviceSpecificPtr<PerDeviceOpState> *origin_result_ptr,
+    Realm::Event precondition) {
+  // The argument struct is handed to spawn_task as a pointer/size pair.
+  DeviceStateInitReturnTaskArgs msg{result, origin_proc, origin_result_ptr};
+
+  return ctx.spawn_task(origin_proc,
+                        task_id_t::DEVICE_STATE_INIT_RETURN_TASK_ID,
+                        &msg,
+                        sizeof(msg),
+                        Realm::ProfilingRequestSet{},
+                        precondition);
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
new file mode 100644
index 0000000000..50c8daffb0
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/device_state_init_task.cc
@@ -0,0 +1,117 @@
+#include "realm-execution/tasks/impl/device_state_init_task.h"
+#include "local-execution/device_state_initialization.h"
+#include "realm-execution/dynamic_tensor_accessor_from_instance.h"
+#include "realm-execution/tasks/impl/device_state_init_return_task.h"
+#include "realm-execution/tasks/impl/device_state_init_task_args.dtg.h"
+#include "realm-execution/tasks/impl/serializable_device_state_init_task_args.h"
+#include "realm-execution/tasks/serializer/task_arg_serializer.h"
+#include "realm-execution/tasks/task_id_t.dtg.h"
+#include "realm-execution/tasks/task_id_t.h"
+#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
+#include "task-spec/per_device_op_state.h"
+#include "utils/containers/map_values.h"
+#include "utils/containers/transform.h"
+#include "utils/optional.h"
+#include <optional>
+#include <type_traits>
+
+namespace FlexFlow {
+
+void device_state_init_task_body(void const *args,
+                                 size_t arglen,
+                                 void const *userdata,
+                                 size_t userlen,
+                                 Realm::Processor proc) {
+  // Decode the serialized argument buffer back into typed task arguments.
+  SerializableDeviceStateInitTaskArgs wire_args =
+      deserialize_task_args<SerializableDeviceStateInitTaskArgs>(args, arglen);
+  DeviceStateInitTaskArgs task_args =
+      device_state_init_task_args_from_serializable(wire_args);
+  RealmContext ctx{proc};
+  device_handle_t device_handle =
+      device_handle_t_from_device_specific_managed_handle(
+          task_args.device_handle, ctx.get_current_device_idx());
+
+  // Attach an accessor, built from the backing instance, to a tensor value.
+  auto with_accessor = [&](DynamicValueAttrs const &value) {
+    auto const &[inst, event] = task_args.tensor_backing.backing.at(value);
+    DynamicValueAttrs patched = value;
+    patched.accessor = dynamic_tensor_accessor_from_instance(
+        inst,
+        event,
+        assert_unwrap(value.parallel_tensor_shape),
+        Permissions::RW, // FIXME: get real permissions?
+        ctx.get_current_processor());
+    return patched;
+  };
+  DynamicNodeInvocation invocation = task_args.invocation;
+  invocation.inputs = map_values(invocation.inputs, with_accessor);
+  invocation.outputs = map_values(invocation.outputs, with_accessor);
+
+  DynamicNodeInvocation initialized =
+      initialize_node(invocation,
+                      ctx.get_current_device_allocator(),
+                      task_args.profiling_settings,
+                      device_handle,
+                      task_args.iteration_config,
+                      task_args.optimizer_attrs,
+                      ctx.get_current_device_idx());
+  DeviceSpecificPerDeviceOpState device_state =
+      assert_unwrap(initialized.node_attrs.per_device_op_state);
+  // Deliberately leaked: the pointer handed to the return task must stay valid
+  // after this task has finished.
+  PerDeviceOpState *leaked_state =
+      new PerDeviceOpState{get_device_state_from_device_specific(
+          device_state, ctx.get_current_device_idx())};
+  DeviceSpecificPtr<PerDeviceOpState> result_device_specific{
+      ctx.get_current_device_idx(), leaked_state};
+  spawn_device_state_init_return_task(ctx,
+                                      task_args.origin_proc,
+                                      result_device_specific,
+                                      task_args.origin_result_ptr,
+                                      Realm::Event::NO_EVENT);
+}
+
+std::optional<Realm::Event> spawn_device_state_init_task(
+    RealmContext &ctx,
+    Realm::Processor target_proc,
+    DynamicNodeInvocation const &invocation,
+    TensorInstanceBacking const &tensor_backing,
+    ProfilingSettings const &profiling_settings,
+    DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
+    FFIterationConfig const &iteration_config,
+    OptimizerAttrs const &optimizer_attrs,
+    DeviceSpecificPtr<PerDeviceOpState> *result_ptr,
+    Realm::Event precondition) {
+  // Resolve the init task first: nothing is built or sent if the op has none.
+  std::optional<task_id_t> task_id =
+      and_then(and_then(invocation.node_attrs.op_attrs,
+                        [](TrainingOperationAttrs const &op_attrs) {
+                          return op_attrs.try_require_pcg_op();
+                        }),
+               get_init_task_id_for_op_attrs);
+  if (!task_id.has_value()) {
+    return std::nullopt;
+  }
+  DeviceStateInitTaskArgs task_args{
+      invocation,
+      tensor_backing,
+      profiling_settings,
+      device_handle,
+      iteration_config,
+      optimizer_attrs,
+      ctx.get_current_processor(),
+      result_ptr,
+  };
+  std::string args = serialize_task_args(
+      device_state_init_task_args_to_serializable(task_args));
+  return ctx.spawn_task(target_proc,
+                        assert_unwrap(task_id),
+                        args.data(),
+                        args.size(),
+                        Realm::ProfilingRequestSet{},
+                        precondition);
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
new file mode 100644
index 0000000000..c7dcdb39c2
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/op_task.cc
@@ -0,0 +1,93 @@
+#include "realm-execution/tasks/impl/op_task.h"
+#include "local-execution/task_execution.h"
+#include "realm-execution/device_specific_managed_per_device_ff_handle.h"
+#include "realm-execution/dynamic_tensor_accessor_from_instance.h"
+#include "realm-execution/tasks/impl/op_task_args.dtg.h"
+#include "realm-execution/tasks/impl/serializable_op_task_args.h"
+#include "realm-execution/tasks/serializer/task_arg_serializer.h"
+#include "realm-execution/tasks/task_id_t.h"
+#include "task-spec/per_device_op_state.dtg.h"
+#include "task-spec/per_device_op_state.h"
+#include "task-spec/permissions.h"
+#include "utils/containers/map_values.h"
+#include "utils/containers/transform.h"
+#include "utils/optional.h"
+#include <type_traits>
+
+namespace FlexFlow {
+
+void op_task_body(void const *args,
+                  size_t arglen,
+                  void const *userdata,
+                  size_t userlen,
+                  Realm::Processor proc) {
+  // Decode the serialized argument buffer back into typed task arguments.
+  SerializableOpTaskArgs wire_args =
+      deserialize_task_args<SerializableOpTaskArgs>(args, arglen);
+  OpTaskArgs task_args = op_task_args_from_serializable(wire_args);
+  RealmContext ctx{proc};
+  device_handle_t device_handle =
+      device_handle_t_from_device_specific_managed_handle(
+          task_args.device_handle, ctx.get_current_device_idx());
+  // Attach an accessor, built from the backing instance, to a tensor value.
+  auto with_accessor = [&](DynamicValueAttrs const &value) {
+    auto const &[inst, event] = task_args.tensor_backing.backing.at(value);
+    DynamicValueAttrs patched = value;
+    patched.accessor = dynamic_tensor_accessor_from_instance(
+        inst,
+        event,
+        assert_unwrap(value.parallel_tensor_shape),
+        Permissions::RW, // FIXME: get real permissions?
+        ctx.get_current_processor());
+    return patched;
+  };
+  DynamicNodeInvocation invocation = task_args.invocation;
+  invocation.inputs = map_values(invocation.inputs, with_accessor);
+  invocation.outputs = map_values(invocation.outputs, with_accessor);
+
+  execute_dynamic_node_invocation(
+      /*invocation=*/invocation,
+      /*allocator=*/ctx.get_current_device_allocator(),
+      /*profiling_settings=*/task_args.profiling_settings,
+      /*ff_handle=*/device_handle,
+      /*per_device_op_state=*/
+      transform(and_then(task_args.device_state,
+                         [&](DeviceSpecificPtr<PerDeviceOpState> const &state) {
+                           return state.get(ctx.get_current_device_idx());
+                         }),
+                [](PerDeviceOpState *state_ptr) { return *state_ptr; }),
+      /*iteration_config=*/task_args.iteration_config,
+      /*optimizer_attrs=*/task_args.optimizer_attrs,
+      /*device_idx=*/ctx.get_current_device_idx());
+}
+
+Realm::Event spawn_op_task(
+    RealmContext &ctx,
+    Realm::Processor target_proc,
+    DynamicNodeInvocation const &invocation,
+    TensorInstanceBacking const &tensor_backing,
+    std::optional<DeviceSpecificPtr<PerDeviceOpState>> const &device_state,
+    ProfilingSettings const &profiling_settings,
+    DeviceSpecificManagedPerDeviceFFHandle const &device_handle,
+    FFIterationConfig const &iteration_config,
+    std::optional<OptimizerAttrs> const &optimizer_attrs,
+    Realm::Event precondition) {
+  OpTaskArgs task_args{invocation,
+                       tensor_backing,
+                       device_state,
+                       profiling_settings,
+                       device_handle,
+                       iteration_config,
+                       optimizer_attrs};
+  SerializableOpTaskArgs wire_args = op_task_args_to_serializable(task_args);
+  std::string payload = serialize_task_args(wire_args); // bytes for the task
+  return ctx.spawn_task(
+      target_proc,
+      assert_unwrap(get_task_id_for_op(invocation.node_attrs, optimizer_attrs)),
+      payload.data(),
+      payload.size(),
+      Realm::ProfilingRequestSet{},
+      precondition);
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_handle_init_task_args.cc b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_handle_init_task_args.cc
new file mode 100644
index 0000000000..a44a5a5db1
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_handle_init_task_args.cc
@@ -0,0 +1,28 @@
+#include "realm-execution/tasks/impl/serializable_device_handle_init_task_args.h"
+
+namespace FlexFlow {
+
+SerializableDeviceHandleInitTaskArgs
+    device_handle_init_task_args_to_serializable(
+        DeviceHandleInitTaskArgs const &args) {
+  auto const addr = reinterpret_cast<uintptr_t>(args.origin_result_ptr);
+  return SerializableDeviceHandleInitTaskArgs{
+      /*workSpaceSize=*/args.workSpaceSize,
+      /*allowTensorOpMathConversion=*/args.allowTensorOpMathConversion,
+      /*origin_proc=*/realm_processor_to_serializable(args.origin_proc),
+      /*origin_result_ptr=*/addr}; // pointer carried as an integer
+}
+
+DeviceHandleInitTaskArgs device_handle_init_task_args_from_serializable(
+    SerializableDeviceHandleInitTaskArgs const &args) {
+  // Undo the pointer-to-integer conversion done when serializing.
+  auto *result_ptr = reinterpret_cast<DeviceSpecificManagedPerDeviceFFHandle *>(
+      args.origin_result_ptr);
+  return DeviceHandleInitTaskArgs{
+      /*workSpaceSize=*/args.workSpaceSize,
+      /*allowTensorOpMathConversion=*/args.allowTensorOpMathConversion,
+      /*origin_proc=*/realm_processor_from_serializable(args.origin_proc),
+      /*origin_result_ptr=*/result_ptr};
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_state_init_task_args.cc b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_state_init_task_args.cc
new file mode 100644
index 0000000000..2e7e02b529
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_device_state_init_task_args.cc
@@ -0,0 +1,43 @@
+#include "realm-execution/tasks/impl/serializable_device_state_init_task_args.h"
+#include "realm-execution/tasks/serializer/serializable_device_specific_ptr.h"
+#include "realm-execution/tasks/serializer/serializable_realm_processor.h"
+#include "realm-execution/tasks/serializer/serializable_tensor_instance_backing.h"
+#include "task-spec/dynamic_graph/serializable_dynamic_node_invocation.h"
+
+namespace FlexFlow {
+
+SerializableDeviceStateInitTaskArgs device_state_init_task_args_to_serializable(
+    DeviceStateInitTaskArgs const &args) {
+  auto const addr = reinterpret_cast<uintptr_t>(args.origin_result_ptr);
+  return SerializableDeviceStateInitTaskArgs{
+      /*invocation=*/dynamic_node_invocation_to_serializable(args.invocation),
+      /*tensor_backing=*/
+      tensor_instance_backing_to_serializable(args.tensor_backing),
+      /*profiling_settings=*/args.profiling_settings,
+      /*device_handle=*/device_specific_ptr_to_serializable(args.device_handle),
+      /*iteration_config=*/args.iteration_config,
+      /*optimizer_attrs=*/args.optimizer_attrs,
+      /*origin_proc=*/realm_processor_to_serializable(args.origin_proc),
+      /*origin_result_ptr=*/addr}; // pointer carried as an integer
+}
+
+DeviceStateInitTaskArgs device_state_init_task_args_from_serializable(
+    SerializableDeviceStateInitTaskArgs const &args) {
+  // Undo the pointer-to-integer conversion done when serializing.
+  auto *result_ptr = reinterpret_cast<DeviceSpecificPtr<PerDeviceOpState> *>(
+      args.origin_result_ptr);
+  return DeviceStateInitTaskArgs{
+      /*invocation=*/dynamic_node_invocation_from_serializable(args.invocation),
+      /*tensor_backing=*/
+      tensor_instance_backing_from_serializable(args.tensor_backing),
+      /*profiling_settings=*/args.profiling_settings,
+      /*device_handle=*/
+      device_specific_ptr_from_serializable<ManagedPerDeviceFFHandle>(
+          args.device_handle),
+      /*iteration_config=*/args.iteration_config,
+      /*optimizer_attrs=*/args.optimizer_attrs,
+      /*origin_proc=*/realm_processor_from_serializable(args.origin_proc),
+      /*origin_result_ptr=*/result_ptr};
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc
new file mode 100644
index 0000000000..32d54adc37
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/impl/serializable_op_task_args.cc
@@ -0,0 +1,41 @@
+#include "realm-execution/tasks/impl/serializable_op_task_args.h"
+#include "realm-execution/tasks/serializer/serializable_device_specific_ptr.h"
+#include "realm-execution/tasks/serializer/serializable_tensor_instance_backing.h"
+#include "task-spec/dynamic_graph/serializable_dynamic_node_invocation.h"
+#include "utils/containers/transform.h"
+
+namespace FlexFlow {
+
+SerializableOpTaskArgs op_task_args_to_serializable(OpTaskArgs const &args) {
+  // The pointer conversion is applied to device_state through transform.
+  auto device_state = transform(
+      args.device_state, device_specific_ptr_to_serializable<PerDeviceOpState>);
+  return SerializableOpTaskArgs{
+      /*invocation=*/dynamic_node_invocation_to_serializable(args.invocation),
+      /*tensor_backing=*/
+      tensor_instance_backing_to_serializable(args.tensor_backing),
+      /*device_state=*/device_state,
+      /*profiling_settings=*/args.profiling_settings,
+      /*device_handle=*/device_specific_ptr_to_serializable(args.device_handle),
+      /*iteration_config=*/args.iteration_config,
+      /*optimizer_attrs=*/args.optimizer_attrs};
+}
+
+OpTaskArgs op_task_args_from_serializable(SerializableOpTaskArgs const &args) {
+  auto device_state = transform(
+      args.device_state,
+      device_specific_ptr_from_serializable<PerDeviceOpState>);
+  return OpTaskArgs{
+      /*invocation=*/dynamic_node_invocation_from_serializable(args.invocation),
+      /*tensor_backing=*/
+      tensor_instance_backing_from_serializable(args.tensor_backing),
+      /*device_state=*/device_state, // converted above via transform
+      /*profiling_settings=*/args.profiling_settings,
+      /*device_handle=*/
+      device_specific_ptr_from_serializable<ManagedPerDeviceFFHandle>(
+          args.device_handle),
+      /*iteration_config=*/args.iteration_config,
+      /*optimizer_attrs=*/args.optimizer_attrs};
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/realm_task_id_t.cc b/lib/realm-execution/src/realm-execution/tasks/realm_task_id_t.cc
new file mode 100644
index 0000000000..ec1aa143a6
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/realm_task_id_t.cc
@@ -0,0 +1,10 @@
+#include "realm-execution/tasks/realm_task_id_t.h"
+
+namespace FlexFlow {
+
+Realm::Processor::TaskFuncID get_realm_task_id_for_task_id(task_id_t task_id) {
+  auto const offset = static_cast<Realm::Processor::TaskFuncID>(task_id);
+  return Realm::Processor::TASK_ID_FIRST_AVAILABLE + offset;
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
new file mode 100644
index 0000000000..09d99655c0
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/realm_task_registry.cc
@@ -0,0 +1,159 @@
+#include "realm-execution/tasks/realm_task_registry.h"
+#include "realm-execution/tasks/impl/controller_task.h"
+#include "realm-execution/tasks/impl/device_handle_init_return_task.h"
+#include "realm-execution/tasks/impl/device_handle_init_task.h"
+#include "realm-execution/tasks/impl/device_state_init_return_task.h"
+#include "realm-execution/tasks/impl/device_state_init_task.h"
+#include "realm-execution/tasks/impl/op_task.h"
+#include "realm-execution/tasks/realm_task_id_t.h"
+#include "utils/exception.h"
+
+namespace FlexFlow {
+
+Realm::Event register_task(Realm::Processor::Kind target_kind,
+                           task_id_t func_id,
+                           void (*task_body)(void const *,
+                                             size_t,
+                                             void const *,
+                                             size_t,
+                                             Realm::Processor)) {
+  // Translate the FlexFlow task id into the id the task is registered under.
+  auto const realm_task_id = get_realm_task_id_for_task_id(func_id);
+#ifdef FF_USE_PREALM
+  Realm::prealm_task_name(realm_task_id, fmt::format("{}", func_id));
+#endif
+  Realm::CodeDescriptor descriptor(task_body);
+  return Realm::Processor::register_task_by_kind(target_kind,
+                                                 /*global=*/false,
+                                                 realm_task_id,
+                                                 descriptor,
+                                                 Realm::ProfilingRequestSet());
+}
+
+Realm::Event register_all_tasks() {
+  std::vector<Realm::Event> pending_registrations;
+  // Queue one registration of `body` under `id` for processors of `kind`.
+  auto enqueue = [&](Realm::Processor::Kind kind, task_id_t id, auto body) {
+    pending_registrations.push_back(register_task(kind, id, body));
+  };
+
+  std::vector<task_id_t> init_task_ids = {
+      // Init tasks
+      task_id_t::BATCHNORM_INIT_TASK_ID,
+      task_id_t::COMBINE_INIT_TASK_ID,
+      task_id_t::CONV2D_INIT_TASK_ID,
+      task_id_t::DROPOUT_INIT_TASK_ID,
+      task_id_t::ELEMENTBINARY_INIT_TASK_ID,
+      task_id_t::ELEMENTUNARY_INIT_TASK_ID,
+      task_id_t::GATHER_INIT_TASK_ID,
+      task_id_t::LAYERNORM_INIT_TASK_ID,
+      task_id_t::LINEAR_INIT_TASK_ID,
+      task_id_t::ATTENTION_INIT_TASK_ID,
+      task_id_t::POOL2D_INIT_TASK_ID,
+      task_id_t::REDUCE_INIT_TASK_ID,
+      task_id_t::REDUCTION_INIT_TASK_ID,
+      task_id_t::REPARTITION_INIT_TASK_ID,
+      task_id_t::REPLICATE_INIT_TASK_ID,
+      task_id_t::SOFTMAX_INIT_TASK_ID,
+  };
+
+  // Every init task shares one body, registered for LOC_PROC and TOC_PROC.
+  for (task_id_t task_id : init_task_ids) {
+    enqueue(Realm::Processor::LOC_PROC, task_id, device_state_init_task_body);
+    enqueue(Realm::Processor::TOC_PROC, task_id, device_state_init_task_body);
+  }
+
+  std::vector<task_id_t> task_ids = {
+      // Forward tasks
+      task_id_t::BATCHMATMUL_FWD_TASK_ID,
+      task_id_t::BATCHNORM_FWD_TASK_ID,
+      task_id_t::BROADCAST_FWD_TASK_ID,
+      task_id_t::CAST_FWD_TASK_ID,
+      task_id_t::COMBINE_FWD_TASK_ID,
+      task_id_t::CONCAT_FWD_TASK_ID,
+      task_id_t::CONV2D_FWD_TASK_ID,
+      task_id_t::DROPOUT_FWD_TASK_ID,
+      task_id_t::ELEMENTBINARY_FWD_TASK_ID,
+      task_id_t::ELEMENTUNARY_FWD_TASK_ID,
+      task_id_t::EMBED_FWD_TASK_ID,
+      task_id_t::FLAT_FWD_TASK_ID,
+      task_id_t::GATHER_FWD_TASK_ID,
+      task_id_t::LAYERNORM_FWD_TASK_ID,
+      task_id_t::LINEAR_FWD_TASK_ID,
+      task_id_t::ATTENTION_FWD_TASK_ID,
+      task_id_t::POOL2D_FWD_TASK_ID,
+      task_id_t::REDUCE_FWD_TASK_ID,
+      task_id_t::REDUCTION_FWD_TASK_ID,
+      task_id_t::REPARTITION_FWD_TASK_ID,
+      task_id_t::REPLICATE_FWD_TASK_ID,
+      task_id_t::RESHAPE_FWD_TASK_ID,
+      task_id_t::REVERSE_FWD_TASK_ID,
+      task_id_t::SOFTMAX_FWD_TASK_ID,
+      task_id_t::SPLIT_FWD_TASK_ID,
+      task_id_t::TOPK_FWD_TASK_ID,
+      task_id_t::TRANSPOSE_FWD_TASK_ID,
+
+      // Backward tasks
+      task_id_t::BATCHMATMUL_BWD_TASK_ID,
+      task_id_t::BATCHNORM_BWD_TASK_ID,
+      task_id_t::BROADCAST_BWD_TASK_ID,
+      task_id_t::CAST_BWD_TASK_ID,
+      task_id_t::COMBINE_BWD_TASK_ID,
+      task_id_t::CONCAT_BWD_TASK_ID,
+      task_id_t::CONV2D_BWD_TASK_ID,
+      task_id_t::DROPOUT_BWD_TASK_ID,
+      task_id_t::ELEMENTBINARY_BWD_TASK_ID,
+      task_id_t::ELEMENTUNARY_BWD_TASK_ID,
+      task_id_t::EMBED_BWD_TASK_ID,
+      task_id_t::FLAT_BWD_TASK_ID,
+      task_id_t::GATHER_BWD_TASK_ID,
+      task_id_t::LAYERNORM_BWD_TASK_ID,
+      task_id_t::LINEAR_BWD_TASK_ID,
+      task_id_t::ATTENTION_BWD_TASK_ID,
+      task_id_t::POOL2D_BWD_TASK_ID,
+      task_id_t::REDUCE_BWD_TASK_ID,
+      task_id_t::REDUCTION_BWD_TASK_ID,
+      task_id_t::REPARTITION_BWD_TASK_ID,
+      task_id_t::REPLICATE_BWD_TASK_ID,
+      task_id_t::RESHAPE_BWD_TASK_ID,
+      task_id_t::REVERSE_BWD_TASK_ID,
+      task_id_t::SOFTMAX_BWD_TASK_ID,
+      task_id_t::SPLIT_BWD_TASK_ID,
+      task_id_t::TOPK_BWD_TASK_ID,
+      task_id_t::TRANSPOSE_BWD_TASK_ID,
+
+      // Update tasks
+      task_id_t::SGD_UPD_NCCL_TASK_ID,
+      task_id_t::ADAM_UPD_NCCL_TASK_ID,
+
+      // Loss task
+      task_id_t::LOSS_BWD_TASK_ID,
+  };
+
+  // Forward, backward, update and loss tasks all dispatch through op_task_body.
+  for (task_id_t task_id : task_ids) {
+    enqueue(Realm::Processor::LOC_PROC, task_id, op_task_body);
+    enqueue(Realm::Processor::TOC_PROC, task_id, op_task_body);
+  }
+
+  // Runtime-internal tasks: the controller and both return tasks are only
+  // registered for LOC_PROC; device handle init is registered for both kinds.
+  enqueue(Realm::Processor::LOC_PROC,
+          task_id_t::CONTROLLER_TASK_ID,
+          controller_task_body);
+  enqueue(Realm::Processor::LOC_PROC,
+          task_id_t::DEVICE_HANDLE_INIT_TASK_ID,
+          device_handle_init_task_body);
+  enqueue(Realm::Processor::TOC_PROC,
+          task_id_t::DEVICE_HANDLE_INIT_TASK_ID,
+          device_handle_init_task_body);
+  enqueue(Realm::Processor::LOC_PROC,
+          task_id_t::DEVICE_HANDLE_INIT_RETURN_TASK_ID,
+          device_handle_init_return_task_body);
+  enqueue(Realm::Processor::LOC_PROC,
+          task_id_t::DEVICE_STATE_INIT_RETURN_TASK_ID,
+          device_state_init_return_task_body);
+  return Realm::Event::merge_events(pending_registrations);
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_event.cc b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_event.cc
new file mode 100644
index 0000000000..806059f3ed
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_event.cc
@@ -0,0 +1,20 @@
+#include "realm-execution/tasks/serializer/serializable_realm_event.h"
+
+namespace FlexFlow {
+
+// Captures a Realm event in its serializable form. Only the event's `id`
+// member is carried over.
+SerializableRealmEvent realm_event_to_serializable(Realm::Event const &event) {
+  auto const raw_id = event.id;
+  return SerializableRealmEvent{raw_id};
+}
+
+// Inverse of realm_event_to_serializable: rebuilds a Realm::Event from the id
+// held by the serializable form.
+Realm::Event
+    realm_event_from_serializable(SerializableRealmEvent const &event) {
+  auto const raw_id = event.id;
+  return Realm::Event{raw_id};
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_instance.cc b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_instance.cc
new file mode 100644
index 0000000000..0e58d6e36c
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_instance.cc
@@ -0,0 +1,35 @@
+#include "realm-execution/tasks/serializer/serializable_realm_instance.h"
+#include "utils/exception.h"
+#include <cstring>
+#include <type_traits>
+
+namespace FlexFlow {
+
+// The conversions below copy the raw bytes of a Realm::RegionInstance, which is
+// only well-defined when the type is trivially copyable.
+static_assert(std::is_trivially_copyable_v<Realm::RegionInstance>);
+
+// Serializes `inst` as the sizeof(Realm::RegionInstance) bytes of its object
+// representation.
+SerializableRealmInstance
+    realm_instance_to_serializable(Realm::RegionInstance const &inst) {
+  uint8_t const *data = reinterpret_cast<uint8_t const *>(&inst);
+  return SerializableRealmInstance{
+      std::vector<uint8_t>{data, data + sizeof(inst)}};
+}
+
+// Rebuilds a Realm::RegionInstance from bytes produced by
+// realm_instance_to_serializable. Trips the ASSERT when the byte count is not
+// exactly sizeof(Realm::RegionInstance).
+Realm::RegionInstance
+    realm_instance_from_serializable(SerializableRealmInstance const &inst) {
+  ASSERT(inst.instance.size() == sizeof(Realm::RegionInstance));
+  // Copy the bytes into a real object rather than dereferencing a
+  // reinterpret_cast of the buffer: no RegionInstance lives in the vector's
+  // storage, so reading through such a pointer breaks the aliasing rules.
+  Realm::RegionInstance result;
+  std::memcpy(&result, inst.instance.data(), sizeof(result));
+  return result;
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_processor.cc b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_processor.cc
new file mode 100644
index 0000000000..b16e2891c4
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_realm_processor.cc
@@ -0,0 +1,21 @@
+#include "realm-execution/tasks/serializer/serializable_realm_processor.h"
+
+namespace FlexFlow {
+
+// Captures a Realm processor in its serializable form. Only the processor's
+// `id` member is carried over.
+SerializableRealmProcessor
+    realm_processor_to_serializable(Realm::Processor const &proc) {
+  auto const raw_id = proc.id;
+  return SerializableRealmProcessor{raw_id};
+}
+
+// Inverse of realm_processor_to_serializable: rebuilds a Realm::Processor from
+// the id held by the serializable form.
+Realm::Processor
+    realm_processor_from_serializable(SerializableRealmProcessor const &proc) {
+  auto const raw_id = proc.id;
+  return Realm::Processor{raw_id};
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_tensor_instance_backing.cc b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_tensor_instance_backing.cc
new file mode 100644
index 0000000000..79a5176c4f
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/serializer/serializable_tensor_instance_backing.cc
@@ -0,0 +1,47 @@
+#include "realm-execution/tasks/serializer/serializable_tensor_instance_backing.h"
+#include "realm-execution/tasks/serializer/serializable_realm_event.h"
+#include "realm-execution/tasks/serializer/serializable_realm_instance.h"
+#include "task-spec/dynamic_graph/serializable_dynamic_value_attrs.h"
+#include "utils/containers/map_keys_and_values.h"
+
+namespace FlexFlow {
+
+// Produces the serializable form of `backing`: every key goes through
+// dynamic_value_attrs_to_serializable and every (instance, event) value is
+// converted member by member.
+SerializableTensorInstanceBacking tensor_instance_backing_to_serializable(
+    TensorInstanceBacking const &backing) {
+  auto value_to_serializable =
+      [](std::pair<Realm::RegionInstance, Realm::Event> const &entry) {
+        auto const &[instance, event] = entry;
+        return std::pair{realm_instance_to_serializable(instance),
+                         realm_event_to_serializable(event)};
+      };
+
+  return SerializableTensorInstanceBacking{
+      /*backing=*/map_keys_and_values(backing.backing,
+                                      dynamic_value_attrs_to_serializable,
+                                      value_to_serializable),
+  };
+}
+
+// Inverse of tensor_instance_backing_to_serializable: keys go through
+// dynamic_value_attrs_from_serializable and values are converted back.
+TensorInstanceBacking tensor_instance_backing_from_serializable(
+    SerializableTensorInstanceBacking const &backing) {
+  auto value_from_serializable =
+      [](std::pair<SerializableRealmInstance, SerializableRealmEvent> const
+             &entry) {
+        auto const &[instance, event] = entry;
+        return std::pair{realm_instance_from_serializable(instance),
+                         realm_event_from_serializable(event)};
+      };
+
+  return TensorInstanceBacking{
+      /*backing=*/map_keys_and_values(backing.backing,
+                                      dynamic_value_attrs_from_serializable,
+                                      value_from_serializable),
+  };
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tasks/task_id_t.cc b/lib/realm-execution/src/realm-execution/tasks/task_id_t.cc
new file mode 100644
index 0000000000..94e1b887e7
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tasks/task_id_t.cc
@@ -0,0 +1,197 @@
+#include "realm-execution/tasks/task_id_t.h"
+#include "pcg/optimizer_attrs.dtg.h"
+#include "pcg/optimizers/adam_optimizer_attrs.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_node_attrs.dtg.h"
+#include "utils/optional.h"
+#include "utils/overload.h"
+
+namespace FlexFlow {
+
+// Selects the task for a dynamic graph node from its task type:
+//   FWD, BWD -> forward / backward task of the node's PCG operator
+//   UPD      -> update task for `optimizer_attrs`
+//   LOSS     -> LOSS_BWD_TASK_ID
+// node_attrs.task_type, and whichever of op_attrs / optimizer_attrs the chosen
+// branch reads, are unwrapped with assert_unwrap. Other task types panic.
+std::optional<task_id_t>
+    get_task_id_for_op(DynamicNodeAttrs const &node_attrs,
+                       std::optional<OptimizerAttrs> const &optimizer_attrs) {
+  DynamicTaskType const task_type = assert_unwrap(node_attrs.task_type);
+
+  if (task_type == DynamicTaskType::FWD) {
+    return get_fwd_task_id_for_op_attrs(
+        assert_unwrap(node_attrs.op_attrs).require_pcg_op());
+  }
+  if (task_type == DynamicTaskType::BWD) {
+    return get_bwd_task_id_for_op_attrs(
+        assert_unwrap(node_attrs.op_attrs).require_pcg_op());
+  }
+  if (task_type == DynamicTaskType::UPD) {
+    return get_update_task_id_for_optimizer_attrs(
+        assert_unwrap(optimizer_attrs));
+  }
+  if (task_type == DynamicTaskType::LOSS) {
+    return task_id_t::LOSS_BWD_TASK_ID;
+  }
+  PANIC("Unhandled DynamicTaskType", task_type);
+}
+
+// Looks up the init task of an operator. Operators mapped to std::nullopt
+// have no init task in this table.
+std::optional<task_id_t>
+    get_init_task_id_for_op_attrs(PCGOperatorAttrs const &op_attrs) {
+  auto const init_task_for = overload{
+      [](BatchMatmulAttrs const &) { return std::nullopt; },
+      [](BatchNormAttrs const &) { return task_id_t::BATCHNORM_INIT_TASK_ID; },
+      [](BroadcastAttrs const &) { return std::nullopt; },
+      [](CastAttrs const &) { return std::nullopt; },
+      [](CombineAttrs const &) { return task_id_t::COMBINE_INIT_TASK_ID; },
+      [](ConcatAttrs const &) { return std::nullopt; },
+      [](Conv2DAttrs const &) { return task_id_t::CONV2D_INIT_TASK_ID; },
+      [](DropoutAttrs const &) { return task_id_t::DROPOUT_INIT_TASK_ID; },
+      [](ElementBinaryAttrs const &) {
+        return task_id_t::ELEMENTBINARY_INIT_TASK_ID;
+      },
+      [](ElementUnaryAttrs const &) {
+        return task_id_t::ELEMENTUNARY_INIT_TASK_ID;
+      },
+      [](EmbeddingAttrs const &) { return std::nullopt; },
+      [](FlatAttrs const &) { return std::nullopt; },
+      [](GatherAttrs const &) { return task_id_t::GATHER_INIT_TASK_ID; },
+      [](InputAttrs const &) { return std::nullopt; },
+      [](LayerNormAttrs const &) { return task_id_t::LAYERNORM_INIT_TASK_ID; },
+      [](LinearAttrs const &) { return task_id_t::LINEAR_INIT_TASK_ID; },
+      [](MultiHeadAttentionAttrs const &) {
+        return task_id_t::ATTENTION_INIT_TASK_ID;
+      },
+      [](NoopAttrs const &) { return std::nullopt; },
+      [](Pool2DAttrs const &) { return task_id_t::POOL2D_INIT_TASK_ID; },
+      [](ReduceAttrs const &) { return task_id_t::REDUCE_INIT_TASK_ID; },
+      [](ReductionAttrs const &) { return task_id_t::REDUCTION_INIT_TASK_ID; },
+      [](RepartitionAttrs const &) {
+        return task_id_t::REPARTITION_INIT_TASK_ID;
+      },
+      [](ReplicateAttrs const &) { return task_id_t::REPLICATE_INIT_TASK_ID; },
+      [](ReshapeAttrs const &) { return std::nullopt; },
+      [](ReverseAttrs const &) { return std::nullopt; },
+      [](SoftmaxAttrs const &) { return task_id_t::SOFTMAX_INIT_TASK_ID; },
+      [](SplitAttrs const &) { return std::nullopt; },
+      [](TopKAttrs const &) { return std::nullopt; },
+      [](TransposeAttrs const &) { return std::nullopt; },
+      [](WeightAttrs const &) { return std::nullopt; },
+  };
+  return op_attrs.visit<std::optional<task_id_t>>(init_task_for);
+}
+
+// Looks up the forward task of an operator. Input, Noop and Weight map to
+// std::nullopt; every other operator has a forward task id.
+std::optional<task_id_t>
+    get_fwd_task_id_for_op_attrs(PCGOperatorAttrs const &op_attrs) {
+  auto const fwd_task_for = overload{
+      [](BatchMatmulAttrs const &) {
+        return task_id_t::BATCHMATMUL_FWD_TASK_ID;
+      },
+      [](BatchNormAttrs const &) { return task_id_t::BATCHNORM_FWD_TASK_ID; },
+      [](BroadcastAttrs const &) { return task_id_t::BROADCAST_FWD_TASK_ID; },
+      [](CastAttrs const &) { return task_id_t::CAST_FWD_TASK_ID; },
+      [](CombineAttrs const &) { return task_id_t::COMBINE_FWD_TASK_ID; },
+      [](ConcatAttrs const &) { return task_id_t::CONCAT_FWD_TASK_ID; },
+      [](Conv2DAttrs const &) { return task_id_t::CONV2D_FWD_TASK_ID; },
+      [](DropoutAttrs const &) { return task_id_t::DROPOUT_FWD_TASK_ID; },
+      [](ElementBinaryAttrs const &) {
+        return task_id_t::ELEMENTBINARY_FWD_TASK_ID;
+      },
+      [](ElementUnaryAttrs const &) {
+        return task_id_t::ELEMENTUNARY_FWD_TASK_ID;
+      },
+      [](EmbeddingAttrs const &) { return task_id_t::EMBED_FWD_TASK_ID; },
+      [](FlatAttrs const &) { return task_id_t::FLAT_FWD_TASK_ID; },
+      [](GatherAttrs const &) { return task_id_t::GATHER_FWD_TASK_ID; },
+      [](InputAttrs const &) { return std::nullopt; },
+      [](LayerNormAttrs const &) { return task_id_t::LAYERNORM_FWD_TASK_ID; },
+      [](LinearAttrs const &) { return task_id_t::LINEAR_FWD_TASK_ID; },
+      [](MultiHeadAttentionAttrs const &) {
+        return task_id_t::ATTENTION_FWD_TASK_ID;
+      },
+      [](NoopAttrs const &) { return std::nullopt; },
+      [](Pool2DAttrs const &) { return task_id_t::POOL2D_FWD_TASK_ID; },
+      [](ReduceAttrs const &) { return task_id_t::REDUCE_FWD_TASK_ID; },
+      [](ReductionAttrs const &) { return task_id_t::REDUCTION_FWD_TASK_ID; },
+      [](RepartitionAttrs const &) {
+        return task_id_t::REPARTITION_FWD_TASK_ID;
+      },
+      [](ReplicateAttrs const &) { return task_id_t::REPLICATE_FWD_TASK_ID; },
+      [](ReshapeAttrs const &) { return task_id_t::RESHAPE_FWD_TASK_ID; },
+      [](ReverseAttrs const &) { return task_id_t::REVERSE_FWD_TASK_ID; },
+      [](SoftmaxAttrs const &) { return task_id_t::SOFTMAX_FWD_TASK_ID; },
+      [](SplitAttrs const &) { return task_id_t::SPLIT_FWD_TASK_ID; },
+      [](TopKAttrs const &) { return task_id_t::TOPK_FWD_TASK_ID; },
+      [](TransposeAttrs const &) { return task_id_t::TRANSPOSE_FWD_TASK_ID; },
+      [](WeightAttrs const &) { return std::nullopt; },
+  };
+  return op_attrs.visit<std::optional<task_id_t>>(fwd_task_for);
+}
+
+// Looks up the backward task of an operator. Input, Noop and Weight map to
+// std::nullopt; every other operator has a backward task id.
+std::optional<task_id_t>
+    get_bwd_task_id_for_op_attrs(PCGOperatorAttrs const &op_attrs) {
+  auto const bwd_task_for = overload{
+      [](BatchMatmulAttrs const &) {
+        return task_id_t::BATCHMATMUL_BWD_TASK_ID;
+      },
+      [](BatchNormAttrs const &) { return task_id_t::BATCHNORM_BWD_TASK_ID; },
+      [](BroadcastAttrs const &) { return task_id_t::BROADCAST_BWD_TASK_ID; },
+      [](CastAttrs const &) { return task_id_t::CAST_BWD_TASK_ID; },
+      [](CombineAttrs const &) { return task_id_t::COMBINE_BWD_TASK_ID; },
+      [](ConcatAttrs const &) { return task_id_t::CONCAT_BWD_TASK_ID; },
+      [](Conv2DAttrs const &) { return task_id_t::CONV2D_BWD_TASK_ID; },
+      [](DropoutAttrs const &) { return task_id_t::DROPOUT_BWD_TASK_ID; },
+      [](ElementBinaryAttrs const &) {
+        return task_id_t::ELEMENTBINARY_BWD_TASK_ID;
+      },
+      [](ElementUnaryAttrs const &) {
+        return task_id_t::ELEMENTUNARY_BWD_TASK_ID;
+      },
+      [](EmbeddingAttrs const &) { return task_id_t::EMBED_BWD_TASK_ID; },
+      [](FlatAttrs const &) { return task_id_t::FLAT_BWD_TASK_ID; },
+      [](GatherAttrs const &) { return task_id_t::GATHER_BWD_TASK_ID; },
+      [](InputAttrs const &) { return std::nullopt; },
+      [](LayerNormAttrs const &) { return task_id_t::LAYERNORM_BWD_TASK_ID; },
+      [](LinearAttrs const &) { return task_id_t::LINEAR_BWD_TASK_ID; },
+      [](MultiHeadAttentionAttrs const &) {
+        return task_id_t::ATTENTION_BWD_TASK_ID;
+      },
+      [](NoopAttrs const &) { return std::nullopt; },
+      [](Pool2DAttrs const &) { return task_id_t::POOL2D_BWD_TASK_ID; },
+      [](ReduceAttrs const &) { return task_id_t::REDUCE_BWD_TASK_ID; },
+      [](ReductionAttrs const &) { return task_id_t::REDUCTION_BWD_TASK_ID; },
+      [](RepartitionAttrs const &) {
+        return task_id_t::REPARTITION_BWD_TASK_ID;
+      },
+      [](ReplicateAttrs const &) { return task_id_t::REPLICATE_BWD_TASK_ID; },
+      [](ReshapeAttrs const &) { return task_id_t::RESHAPE_BWD_TASK_ID; },
+      [](ReverseAttrs const &) { return task_id_t::REVERSE_BWD_TASK_ID; },
+      [](SoftmaxAttrs const &) { return task_id_t::SOFTMAX_BWD_TASK_ID; },
+      [](SplitAttrs const &) { return task_id_t::SPLIT_BWD_TASK_ID; },
+      [](TopKAttrs const &) { return task_id_t::TOPK_BWD_TASK_ID; },
+      [](TransposeAttrs const &) { return task_id_t::TRANSPOSE_BWD_TASK_ID; },
+      [](WeightAttrs const &) { return std::nullopt; },
+  };
+  return op_attrs.visit<std::optional<task_id_t>>(bwd_task_for);
+}
+
+// Looks up the update task for an optimizer: SGD and Adam each map to their
+// *_UPD_NCCL_TASK_ID.
+std::optional<task_id_t> get_update_task_id_for_optimizer_attrs(
+    OptimizerAttrs const &optimizer_attrs) {
+  auto const update_task_for = overload{
+      [](SGDOptimizerAttrs const &) { return task_id_t::SGD_UPD_NCCL_TASK_ID; },
+      [](AdamOptimizerAttrs const &) {
+        return task_id_t::ADAM_UPD_NCCL_TASK_ID;
+      },
+  };
+  return optimizer_attrs.visit<std::optional<task_id_t>>(update_task_for);
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/src/realm-execution/tensor_instance_backing.cc b/lib/realm-execution/src/realm-execution/tensor_instance_backing.cc
new file mode 100644
index 0000000000..dea51d8c92
--- /dev/null
+++ b/lib/realm-execution/src/realm-execution/tensor_instance_backing.cc
@@ -0,0 +1,32 @@
+#include "realm-execution/tensor_instance_backing.h"
+#include "utils/containers/values.h"
+
+namespace FlexFlow {
+
+// Returns a backing with no entries.
+TensorInstanceBacking make_empty_tensor_instance_backing() {
+  TensorInstanceBacking empty{/*backing=*/{}};
+  return empty;
+}
+
+// Returns the part of `backing` that `invocation` touches: one entry for each
+// of its input and output values. Every such value must already be present in
+// `backing`, since entries are fetched with at().
+TensorInstanceBacking subset_tensor_instance_backing_for_invocation(
+    TensorInstanceBacking const &backing,
+    DynamicNodeInvocation const &invocation) {
+  TensorInstanceBacking subset = make_empty_tensor_instance_backing();
+  auto const copy_entry = [&](DynamicValueAttrs const &value) {
+    subset.backing.insert(std::pair{value, backing.backing.at(value)});
+  };
+
+  for (DynamicValueAttrs const &value : values(invocation.inputs)) {
+    copy_entry(value);
+  }
+  for (DynamicValueAttrs const &value : values(invocation.outputs)) {
+    copy_entry(value);
+  }
+  return subset;
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/test/CMakeLists.txt b/lib/realm-execution/test/CMakeLists.txt
new file mode 100644
index 0000000000..b3beff42c0
--- /dev/null
+++ b/lib/realm-execution/test/CMakeLists.txt
@@ -0,0 +1,15 @@
+ff_add_test_executable( # project-provided helper; its definition is not in this file
+  NAME
+    realm-execution-tests
+  SRC_PATTERNS
+    src/*.cc # NOTE(review): the tests sit in src/internal/ and src/realm-execution/ -- confirm the helper globs recursively
+  PRIVATE_INCLUDE
+    src/ # presumably what makes #include "internal/realm_test_utils.h" resolve -- confirm
+  DEPS
+    doctest # the tests include <doctest/doctest.h>
+    utils-test-common
+    realm-execution # the library these tests exercise
+    kernels
+    op-attrs
+    task-spec
+)
diff --git a/lib/realm-execution/test/src/internal/realm_test_utils.cc b/lib/realm-execution/test/src/internal/realm_test_utils.cc
new file mode 100644
index 0000000000..e381feb8de
--- /dev/null
+++ b/lib/realm-execution/test/src/internal/realm_test_utils.cc
@@ -0,0 +1,37 @@
+#include "internal/realm_test_utils.h"
+#include <fmt/format.h>
+#include <string>
+
+namespace FlexFlow {
+
+// Returns a NUL-terminated heap copy of `str` that is never freed. Realm
+// command-line arguments require char*, so the leak is intentional.
+static char *leak_string_contents(std::string const &str) {
+  char *leaked = new char[str.size() + 1];
+  str.copy(leaked, str.size());
+  leaked[str.size()] = '\0';
+  return leaked;
+}
+
+// Builds a fake argv for Realm: a placeholder program name, "-ll:cpu"
+// <num_cpus> and, only when num_gpus is positive, "-ll:gpu" <num_gpus>.
+std::vector<char *> make_fake_realm_args(positive_int num_cpus,
+                                         nonnegative_int num_gpus) {
+  std::vector<std::string> args = {
+      "fake_executable_name",
+      "-ll:cpu",
+      fmt::to_string(num_cpus),
+  };
+  if (num_gpus > 0) {
+    args.push_back("-ll:gpu");
+    args.push_back(fmt::to_string(num_gpus));
+  }
+
+  std::vector<char *> result;
+  for (std::string const &arg : args) {
+    result.push_back(leak_string_contents(arg));
+  }
+  return result;
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/test/src/internal/realm_test_utils.h b/lib/realm-execution/test/src/internal/realm_test_utils.h
new file mode 100644
index 0000000000..8e2775ad8b
--- /dev/null
+++ b/lib/realm-execution/test/src/internal/realm_test_utils.h
@@ -0,0 +1,15 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_TEST_SRC_INTERNAL_REALM_TEST_UTILS_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_TEST_SRC_INTERNAL_REALM_TEST_UTILS_H
+
+#include "utils/nonnegative_int/nonnegative_int.h"
+#include "utils/positive_int/positive_int.h"
+#include <vector>
+
+namespace FlexFlow {
+// Fake Realm argv: program name, -ll:cpu <num_cpus>, -ll:gpu <num_gpus> if > 0
+std::vector<char *> make_fake_realm_args(positive_int num_cpus,
+                                         nonnegative_int num_gpus); // strings are leaked on purpose
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/test/src/realm-execution/distributed_device_handle.cc b/lib/realm-execution/test/src/realm-execution/distributed_device_handle.cc
new file mode 100644
index 0000000000..aaefe337db
--- /dev/null
+++ b/lib/realm-execution/test/src/realm-execution/distributed_device_handle.cc
@@ -0,0 +1,75 @@
+#include "realm-execution/distributed_device_handle.h"
+#include "internal/realm_test_utils.h"
+#include "realm-execution/realm_manager.h"
+#include <doctest/doctest.h>
+
+namespace test {
+
+using namespace ::FlexFlow;
+namespace Realm = ::FlexFlow::Realm;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  // Starts Realm with 2 CPUs and no GPUs, then checks inside a controller that
+  // the distributed handle can be looked up for each CPU processor.
+  TEST_CASE("DistributedDeviceHandle") {
+    std::vector<char *> args =
+        make_fake_realm_args(/*num_cpus=*/2_p, /*num_gpus=*/0_n);
+    int argc = args.size();
+    char **argv = args.data();
+    RealmManager manager(&argc, &argv);
+
+    auto controller = [](RealmContext &ctx) {
+      DistributedDeviceHandle handle = create_distributed_device_handle(
+          /*ctx=*/ctx,
+          /*workSpaceSize=*/1024 * 1024,
+          /*allowTensorOpMathConversion=*/true);
+
+      // One at() lookup per CPU processor reported by the machine
+      Realm::Machine::ProcessorQuery cpus(Realm::Machine::get_machine());
+      cpus.only_kind(Realm::Processor::LOC_PROC);
+      CHECK(cpus.count() == 2);
+      for (Realm::Processor proc : cpus) {
+        handle.at(proc);
+      }
+    };
+    (void)manager.start_controller(controller);
+  }
+}
+
+TEST_SUITE(FF_CUDA_TEST_SUITE) {
+  // Same as above with 2 CPUs and 1 GPU: additionally looks up the handle for
+  // the single GPU processor.
+  TEST_CASE("DistributedDeviceHandle (GPU)") {
+    std::vector<char *> args =
+        make_fake_realm_args(/*num_cpus=*/2_p, /*num_gpus=*/1_n);
+    int argc = args.size();
+    char **argv = args.data();
+    RealmManager manager(&argc, &argv);
+
+    auto controller = [](RealmContext &ctx) {
+      DistributedDeviceHandle handle = create_distributed_device_handle(
+          /*ctx=*/ctx,
+          /*workSpaceSize=*/1024 * 1024,
+          /*allowTensorOpMathConversion=*/true);
+
+      // One at() lookup per CPU processor reported by the machine
+      Realm::Machine::ProcessorQuery cpus(Realm::Machine::get_machine());
+      cpus.only_kind(Realm::Processor::LOC_PROC);
+      CHECK(cpus.count() == 2);
+      for (Realm::Processor proc : cpus) {
+        handle.at(proc);
+      }
+
+      // ... and one per GPU processor
+      Realm::Machine::ProcessorQuery gpus(Realm::Machine::get_machine());
+      gpus.only_kind(Realm::Processor::TOC_PROC);
+      CHECK(gpus.count() == 1);
+      for (Realm::Processor proc : gpus) {
+        handle.at(proc);
+      }
+    };
+    (void)manager.start_controller(controller);
+  }
+}
+
+} // namespace test
diff --git a/lib/realm-execution/test/src/realm-execution/realm_manager.cc b/lib/realm-execution/test/src/realm-execution/realm_manager.cc
new file mode 100644
index 0000000000..450d7fd3ec
--- /dev/null
+++ b/lib/realm-execution/test/src/realm-execution/realm_manager.cc
@@ -0,0 +1,34 @@
+#include "realm-execution/realm_manager.h"
+#include "internal/realm_test_utils.h"
+#include "realm-execution/distributed_device_handle.h"
+#include <doctest/doctest.h>
+
+namespace test {
+
+using namespace ::FlexFlow;
+namespace Realm = ::FlexFlow::Realm;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  // Brings Realm up with a single CPU and runs a controller that reads a local
+  // captured by reference.
+  TEST_CASE("RealmManager") {
+    std::vector<char *> args =
+        make_fake_realm_args(/*num_cpus=*/1_p, /*num_gpus=*/0_n);
+    int argc = args.size();
+    char **argv = args.data();
+    RealmManager manager(&argc, &argv);
+
+    int some_data = 123;
+    auto controller = [&some_data](RealmContext &) {
+      // The captured local must still hold its value when the controller runs
+      ASSERT(some_data == 123);
+    };
+    Realm::Event controller_done = manager.start_controller(controller);
+
+    // some_data is captured by reference, so block until the controller has
+    // finished before it goes out of scope.
+    controller_done.wait();
+  }
+}
+
+} // namespace test
diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
new file mode 100644
index 0000000000..0914c054d7
--- /dev/null
+++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc
@@ -0,0 +1,487 @@
+#include "internal/realm_test_utils.h"
+#include "kernels/allocation.h"
+#include "kernels/compare_tensor_accessors.h"
+#include "kernels/copy_tensor_accessor.h"
+#include "kernels/format_accessor_contents.h"
+#include "kernels/tensor_accessor_reductions.h"
+#include "op-attrs/parallel_tensor_shape.h"
+#include "op-attrs/tensor_shape.dtg.h"
+#include "op-attrs/tensor_slot_name.dtg.h"
+#include "pcg/device_type.dtg.h"
+#include "pcg/machine_space_coordinate.dtg.h"
+#include "pcg/mapped_parallel_computation_graph/operator_atomic_task_shard_binding.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
+#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h"
+#include "realm-execution/distributed_device_handle.h"
+#include "realm-execution/dynamic_tensor_accessor_from_instance.h"
+#include "realm-execution/pcg_instance/pcg_instance.h"
+#include "realm-execution/realm_context.h"
+#include "realm-execution/realm_manager.h"
+#include "task-spec/permissions.h"
+#include "test/utils/doctest/check_kv.h"
+#include "utils/containers/require_only_key.h"
+#include <doctest/doctest.h>
+
+namespace test {
+
+using namespace ::FlexFlow;
+namespace Realm = ::FlexFlow::Realm;
+
+static bool did_loss_decrease(GenericTensorAccessorR const &first_epoch,
+                              GenericTensorAccessorR const &last_epoch,
+                              Allocator &allocator) {
+  auto cmp = compare_tensor_accessors_le(last_epoch, first_epoch, allocator);
+  return tensor_accessor_all(cmp); // presumably: every last <= first
+}
+
+TEST_SUITE(FF_TEST_SUITE) {
+  // Builds a two-layer linear network whose first layer (and its weight) is
+  // mapped to CPU 0 and whose second layer is mapped to CPU 1, runs five
+  // training passes, and checks via did_loss_decrease that the loss recorded
+  // after the last pass compares <= to the loss recorded after the first.
+  TEST_CASE("RealmBackend e2e Training (CPU Model Parallelism)") {
+    std::vector<char *> fake_args =
+        make_fake_realm_args(/*num_cpus=*/2_p, /*num_gpus=*/0_n);
+    int fake_argc = fake_args.size();
+    char **fake_argv = fake_args.data();
+
+    RealmManager manager(&fake_argc, &fake_argv);
+
+    (void)manager.start_controller([](RealmContext &ctx) {
+      Allocator allocator = ctx.get_current_device_allocator();
+
+      positive_int batch_size = 10_p;
+      positive_int data_dim = 16_p;
+      positive_int hidden_dim = 32_p;
+      positive_int output_dim = 1_p;
+
+      // construct computation graph
+      ParallelComputationGraph pcg = empty_parallel_computation_graph();
+
+      TensorShape input_tensor_shape = TensorShape{
+          TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT};
+
+      TensorShape label_tensor_shape = TensorShape{
+          TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT};
+      GenericTensorAccessorW label_tensor =
+          allocator.allocate_tensor(label_tensor_shape);
+
+      TensorShape weight_shape_1 = TensorShape{
+          TensorDims{FFOrdered{hidden_dim, data_dim}}, DataType::FLOAT};
+      TensorShape weight_shape_2 = TensorShape{
+          TensorDims{FFOrdered{output_dim, hidden_dim}}, DataType::FLOAT};
+
+      ParallelLayerAddedResult inputs_layer =
+          pcg_add_input_layer_with_grad(pcg, input_tensor_shape);
+      parallel_tensor_guid_t t_input =
+          require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT);
+
+      ParallelLayerAddedResult weights_layer_1 = add_parallel_layer(
+          pcg,
+          ParallelLayerAttrs{
+              PCGOperatorAttrs{WeightAttrs{
+                  weight_shape_1, InitializerAttrs{GlorotNormalAttrs{0}}}},
+              std::nullopt},
+          {},
+          {});
+      parallel_tensor_guid_t t_weights_1 =
+          require_only_key(weights_layer_1.outputs, TensorSlotName::OUTPUT);
+
+      ParallelLayerAddedResult weights_layer_2 = add_parallel_layer(
+          pcg,
+          ParallelLayerAttrs{
+              PCGOperatorAttrs{WeightAttrs{
+                  weight_shape_2, InitializerAttrs{GlorotNormalAttrs{0}}}},
+              std::nullopt},
+          {},
+          {});
+      parallel_tensor_guid_t t_weights_2 =
+          require_only_key(weights_layer_2.outputs, TensorSlotName::OUTPUT);
+
+      ParallelLayerAddedResult linear_operator_1 = add_parallel_layer(
+          pcg,
+          ParallelLayerAttrs{PCGOperatorAttrs{LinearAttrs{hidden_dim,
+                                                          /*use_bias=*/false,
+                                                          DataType::FLOAT,
+                                                          Activation::RELU,
+                                                          std::nullopt}},
+                             std::nullopt},
+          {
+              {
+                  TensorSlotName::INPUT,
+                  t_input,
+              },
+          },
+          {
+              {
+                  TensorSlotName::WEIGHT,
+                  t_weights_1,
+              },
+          });
+      parallel_tensor_guid_t t_linear_1 =
+          require_only_key(linear_operator_1.outputs, TensorSlotName::OUTPUT);
+
+      ParallelLayerAddedResult linear_operator_2 = add_parallel_layer(
+          pcg,
+          ParallelLayerAttrs{PCGOperatorAttrs{LinearAttrs{output_dim,
+                                                          /*use_bias=*/false,
+                                                          DataType::FLOAT,
+                                                          Activation::RELU,
+                                                          std::nullopt}},
+                             std::nullopt},
+          {
+              {
+                  TensorSlotName::INPUT,
+                  t_linear_1,
+              },
+          },
+          {
+              {
+                  TensorSlotName::WEIGHT,
+                  t_weights_2,
+              },
+          });
+      parallel_tensor_guid_t t_linear_2 =
+          require_only_key(linear_operator_2.outputs, TensorSlotName::OUTPUT);
+
+      MachineSpaceCoordinate cpu0{0_n, 0_n, DeviceType::CPU};
+      MachineSpaceCoordinate cpu1{0_n, 1_n, DeviceType::CPU};
+      ParallelTensorSpaceCoordinate tensor_coord0{0_n, 0_n, FFOrdered{0_n}};
+      MappedParallelComputationGraph mpcg{
+          pcg,
+          {
+              {inputs_layer.parallel_layer,
+               MappedOperatorTaskGroup{
+                   {{cpu0,
+                     OperatorAtomicTaskShardBinding{
+                         {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}},
+              {weights_layer_1.parallel_layer,
+               MappedOperatorTaskGroup{
+                   {{cpu0,
+                     OperatorAtomicTaskShardBinding{
+                         {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}},
+              {weights_layer_2.parallel_layer,
+               MappedOperatorTaskGroup{
+                   {{cpu1,
+                     OperatorAtomicTaskShardBinding{
+                         {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}},
+              {linear_operator_1.parallel_layer,
+               MappedOperatorTaskGroup{
+                   {{cpu0,
+                     OperatorAtomicTaskShardBinding{{
+                         {TensorSlotName::INPUT, tensor_coord0},
+                         {TensorSlotName::WEIGHT, tensor_coord0},
+                         {TensorSlotName::OUTPUT, tensor_coord0},
+                     }}}}}},
+              {linear_operator_2.parallel_layer,
+               MappedOperatorTaskGroup{
+                   {{cpu1,
+                     OperatorAtomicTaskShardBinding{{
+                         {TensorSlotName::INPUT, tensor_coord0},
+                         {TensorSlotName::WEIGHT, tensor_coord0},
+                         {TensorSlotName::OUTPUT, tensor_coord0},
+                     }}}}}},
+          },
+      };
+      MappedOperatorTaskGroup loss_mapping{
+          {{cpu0,
+            OperatorAtomicTaskShardBinding{{
+                {TensorSlotName::INPUT, tensor_coord0},
+                {TensorSlotName::LOGIT, tensor_coord0},
+            }}}}};
+
+      // instantiate computation graph
+      LossAttrs loss_attrs = LossAttrs{
+          NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}};
+      OptimizerAttrs optimizer_attrs =
+          OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001,
+                                           /*momentum=*/0.9,
+                                           /*nesterov=*/false,
+                                           /*weight_decay=*/0.001}};
+
+      std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor>
+          input_tensors;
+
+      DistributedDeviceHandle device_handle = create_distributed_device_handle(
+          ctx,
+          /*workSpaceSize=*/1024 * 1024,
+          /*allowTensorOpMathConversion=*/true);
+
+      PCGInstance pcg_instance = create_pcg_instance(
+          /*ctx=*/ctx,
+          /*mpcg=*/mpcg,
+          /*optimizer=*/optimizer_attrs,
+          /*loss=*/loss_attrs,
+          /*label_tensor=*/label_tensor,
+          /*logit_tensor=*/t_linear_2,
+          /*loss_mapping=*/loss_mapping,
+          /*input_tensors=*/input_tensors,
+          /*profiling_settings=*/ProfilingSettings{0, 0},
+          /*device_handle=*/device_handle,
+          /*iteration_config=*/FFIterationConfig{1_p});
+
+      // begin training loop
+      int num_epochs = 5;
+      std::vector<GenericTensorAccessorR> loss_values;
+
+      for (int i = 0; i < num_epochs; i++) {
+        perform_all_passes_for_pcg_instance(
+            /*instance=*/pcg_instance,
+            /*profiling_settings=*/ProfilingSettings{0, 0},
+            /*device_handle=*/device_handle,
+            /*iteration_config=*/FFIterationConfig{1_p});
+        loss_values.push_back(copy_tensor_accessor_r(
+            dynamic_tensor_accessor_from_instance(
+                pcg_instance.get_loss_tensor_instance().value(),
+                Realm::Event::NO_EVENT,
+                // NOTE(review): this shape is {output_dim, hidden_dim}, not the
+                // {batch_size, output_dim} shape used for the labels -- confirm.
+                lift_to_parallel(
+                    TensorShape{TensorDims{FFOrdered{output_dim, hidden_dim}},
+                                DataType::FLOAT}),
+                Permissions::RO,
+                ctx.get_current_processor())
+                .require_read(),
+            allocator));
+      }
+
+      // Check that the loss recorded after the last epoch is, per the "le"
+      // comparison in did_loss_decrease, no higher than after the first epoch
+      GenericTensorAccessorR first_epoch_loss = loss_values.at(0);
+      GenericTensorAccessorR last_epoch_loss = loss_values.back();
+      CHECK_MESSAGE(
+          did_loss_decrease(first_epoch_loss, last_epoch_loss, allocator),
+          check_kv("first_epoch_loss",
+                   format_accessor_r_contents(first_epoch_loss)),
+          check_kv("last_epoch_loss",
+                   format_accessor_r_contents(last_epoch_loss)));
+    });
+  }
+}
+
+TEST_SUITE(FF_CUDA_TEST_SUITE) {
+  TEST_CASE("RealmBackend e2e Training (GPU Model Parallelism)") {
+    // Builds a two-layer linear network as a PCG, maps every layer and the
+    // loss onto GPU 0, trains for num_epochs iterations, and uses
+    // did_loss_decrease to compare the first and last recorded loss.
+    std::vector<char *> fake_args =
+        make_fake_realm_args(/*num_cpus=*/1_p, /*num_gpus=*/1_n);
+    int fake_argc = fake_args.size();
+    char **fake_argv = fake_args.data();
+
+    RealmManager manager(&fake_argc, &fake_argv);
+
+    (void)manager.start_controller([](RealmContext &ctx) {
+      Allocator allocator = ctx.get_current_device_allocator();
+
+      positive_int batch_size = 10_p;
+      positive_int data_dim = 16_p;
+      positive_int hidden_dim = 32_p;
+      positive_int output_dim = 1_p;
+
+      // construct computation graph
+      ParallelComputationGraph pcg = empty_parallel_computation_graph();
+
+      TensorShape input_tensor_shape = TensorShape{
+          TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT};
+
+      TensorShape label_tensor_shape = TensorShape{
+          TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT};
+      GenericTensorAccessorW label_tensor =
+          allocator.allocate_tensor(label_tensor_shape);
+
+      TensorShape weight_shape_1 = TensorShape{
+          TensorDims{FFOrdered{hidden_dim, data_dim}}, DataType::FLOAT};
+      TensorShape weight_shape_2 = TensorShape{
+          TensorDims{FFOrdered{output_dim, hidden_dim}}, DataType::FLOAT};
+
+      ParallelLayerAddedResult inputs_layer =
+          pcg_add_input_layer_with_grad(pcg, input_tensor_shape);
+      parallel_tensor_guid_t t_input =
+          require_only_key(inputs_layer.outputs, TensorSlotName::OUTPUT);
+
+      ParallelLayerAddedResult weights_layer_1 = add_parallel_layer(
+          pcg,
+          ParallelLayerAttrs{
+              PCGOperatorAttrs{WeightAttrs{
+                  weight_shape_1, InitializerAttrs{GlorotNormalAttrs{0}}}},
+              std::nullopt},
+          {},
+          {});
+      parallel_tensor_guid_t t_weights_1 =
+          require_only_key(weights_layer_1.outputs, TensorSlotName::OUTPUT);
+
+      ParallelLayerAddedResult weights_layer_2 = add_parallel_layer(
+          pcg,
+          ParallelLayerAttrs{
+              PCGOperatorAttrs{WeightAttrs{
+                  weight_shape_2, InitializerAttrs{GlorotNormalAttrs{0}}}},
+              std::nullopt},
+          {},
+          {});
+      parallel_tensor_guid_t t_weights_2 =
+          require_only_key(weights_layer_2.outputs, TensorSlotName::OUTPUT);
+
+      ParallelLayerAddedResult linear_operator_1 = add_parallel_layer(
+          pcg,
+          ParallelLayerAttrs{PCGOperatorAttrs{LinearAttrs{hidden_dim,
+                                                          /*use_bias=*/false,
+                                                          DataType::FLOAT,
+                                                          Activation::RELU,
+                                                          std::nullopt}},
+                             std::nullopt},
+          {
+              {
+                  TensorSlotName::INPUT,
+                  t_input,
+              },
+          },
+          {
+              {
+                  TensorSlotName::WEIGHT,
+                  t_weights_1,
+              },
+          });
+      parallel_tensor_guid_t t_linear_1 =
+          require_only_key(linear_operator_1.outputs, TensorSlotName::OUTPUT);
+
+      ParallelLayerAddedResult linear_operator_2 = add_parallel_layer(
+          pcg,
+          ParallelLayerAttrs{PCGOperatorAttrs{LinearAttrs{output_dim,
+                                                          /*use_bias=*/false,
+                                                          DataType::FLOAT,
+                                                          Activation::RELU,
+                                                          std::nullopt}},
+                             std::nullopt},
+          {
+              {
+                  TensorSlotName::INPUT,
+                  t_linear_1,
+              },
+          },
+          {
+              {
+                  TensorSlotName::WEIGHT,
+                  t_weights_2,
+              },
+          });
+      parallel_tensor_guid_t t_linear_2 =
+          require_only_key(linear_operator_2.outputs, TensorSlotName::OUTPUT);
+
+      MachineSpaceCoordinate gpu0{0_n, 0_n, DeviceType::GPU};
+      ParallelTensorSpaceCoordinate tensor_coord0{0_n, 0_n, FFOrdered{0_n}};
+      MappedParallelComputationGraph mpcg{
+          pcg,
+          {
+              {inputs_layer.parallel_layer,
+               MappedOperatorTaskGroup{
+                   {{gpu0,
+                     OperatorAtomicTaskShardBinding{
+                         {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}},
+              {weights_layer_1.parallel_layer,
+               MappedOperatorTaskGroup{
+                   {{gpu0,
+                     OperatorAtomicTaskShardBinding{
+                         {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}},
+              {weights_layer_2.parallel_layer,
+               MappedOperatorTaskGroup{
+                   {{gpu0,
+                     OperatorAtomicTaskShardBinding{
+                         {{TensorSlotName::OUTPUT, tensor_coord0}}}}}}},
+              {linear_operator_1.parallel_layer,
+               MappedOperatorTaskGroup{
+                   {{gpu0,
+                     OperatorAtomicTaskShardBinding{{
+                         {TensorSlotName::INPUT, tensor_coord0},
+                         {TensorSlotName::WEIGHT, tensor_coord0},
+                         {TensorSlotName::OUTPUT, tensor_coord0},
+                     }}}}}},
+              {linear_operator_2.parallel_layer,
+               MappedOperatorTaskGroup{
+                   {{gpu0,
+                     OperatorAtomicTaskShardBinding{{
+                         {TensorSlotName::INPUT, tensor_coord0},
+                         {TensorSlotName::WEIGHT, tensor_coord0},
+                         {TensorSlotName::OUTPUT, tensor_coord0},
+                     }}}}}},
+          },
+      };
+      MappedOperatorTaskGroup loss_mapping{
+          {{gpu0,
+            OperatorAtomicTaskShardBinding{{
+                {TensorSlotName::INPUT, tensor_coord0},
+                {TensorSlotName::LOGIT, tensor_coord0},
+            }}}}};
+
+      // instantiate computation graph
+      LossAttrs loss_attrs = LossAttrs{
+          NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}};
+      OptimizerAttrs optimizer_attrs =
+          OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001,
+                                           /*momentum=*/0.9,
+                                           /*nesterov=*/false,
+                                           /*weight_decay=*/0.001}};
+
+      std::unordered_map<DynamicValueAttrs, DynamicTensorAccessor>
+          input_tensors;
+
+      DistributedDeviceHandle device_handle = create_distributed_device_handle(
+          ctx,
+          /*workSpaceSize=*/1024 * 1024,
+          /*allowTensorOpMathConversion=*/true);
+
+      PCGInstance pcg_instance = create_pcg_instance(
+          /*ctx=*/ctx,
+          /*mpcg=*/mpcg,
+          /*optimizer=*/optimizer_attrs,
+          /*loss=*/loss_attrs,
+          /*label_tensor=*/label_tensor,
+          /*logit_tensor=*/t_linear_2,
+          /*loss_mapping=*/loss_mapping,
+          /*input_tensors=*/input_tensors,
+          /*profiling_settings=*/ProfilingSettings{0, 0},
+          /*device_handle=*/device_handle,
+          /*iteration_config=*/FFIterationConfig{1_p});
+
+      // begin training loop
+      int num_epochs = 5;
+      std::vector<GenericTensorAccessorR> loss_values;
+
+      for (int i = 0; i < num_epochs; i++) {
+        perform_all_passes_for_pcg_instance(
+            /*instance=*/pcg_instance,
+            /*profiling_settings=*/ProfilingSettings{0, 0},
+            /*device_handle=*/device_handle,
+            /*iteration_config=*/FFIterationConfig{1_p});
+        // NOTE(review): the loss is read back as {output_dim, hidden_dim},
+        // but the label tensor is {batch_size, output_dim} -- confirm this
+        // is the shape the loss tensor instance actually has.
+        loss_values.push_back(copy_tensor_accessor_r(
+            dynamic_tensor_accessor_from_instance(
+                pcg_instance.get_loss_tensor_instance().value(),
+                Realm::Event::NO_EVENT,
+                lift_to_parallel(
+                    TensorShape{TensorDims{FFOrdered{output_dim, hidden_dim}},
+                                DataType::FLOAT}),
+                Permissions::RO,
+                ctx.get_current_processor())
+                .require_read(),
+            allocator));
+      }
+
+      // Assert that each sample in the batch has a lower loss in last epoch
+      // than the first epoch
+      GenericTensorAccessorR first_epoch_loss = loss_values.at(0);
+      GenericTensorAccessorR last_epoch_loss = loss_values.back();
+      CHECK_MESSAGE(
+          did_loss_decrease(first_epoch_loss, last_epoch_loss, allocator),
+          check_kv("first_epoch_loss",
+                   format_accessor_r_contents(first_epoch_loss)),
+          check_kv("last_epoch_loss",
+                   format_accessor_r_contents(last_epoch_loss)));
+    });
+  }
+}
+
+} // namespace test
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.toml
index c6e6673f33..bd64f52567 100644
--- a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.toml
+++ b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.toml
@@ -5,6 +5,7 @@ features = [
   "eq",
   "hash",
   "fmt",
+  "json",
 ]
 
 includes = [
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.toml
index 75e9099104..c9171b928b 100644
--- a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.toml
+++ b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.toml
@@ -5,6 +5,7 @@ features = [
   "eq",
   "hash",
   "fmt",
+  "json",
 ]
 
 includes = [
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/loss_insertion.h b/lib/task-spec/include/task-spec/dynamic_graph/loss_insertion.h
index c7cef3f06f..b3b2a465f8 100644
--- a/lib/task-spec/include/task-spec/dynamic_graph/loss_insertion.h
+++ b/lib/task-spec/include/task-spec/dynamic_graph/loss_insertion.h
@@ -6,12 +6,15 @@
 #include "task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.h"
 #include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
 #include "task-spec/dynamic_graph/loss_insertion_result.dtg.h"
+#include <optional>
 
 namespace FlexFlow {
 
-LossInsertionResult perform_loss_insertion(DynamicOpenDataflowGraph const &dg,
-                                           LossAttrs const &loss_attrs,
-                                           dynamic_tensor_guid_t logit_tensor);
+LossInsertionResult perform_loss_insertion(
+    DynamicOpenDataflowGraph const &dg,
+    LossAttrs const &loss_attrs,
+    dynamic_tensor_guid_t logit_tensor,
+    std::optional<MappedOperatorTaskGroup> const &loss_mapping);
 
 } // namespace FlexFlow
 
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.h b/lib/task-spec/include/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.h
new file mode 100644
index 0000000000..758a0c2813
--- /dev/null
+++ b/lib/task-spec/include/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.h
@@ -0,0 +1,18 @@
+#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_MAKE_DYNAMIC_OPEN_DATAFLOW_GRAPH_FROM_MPCG_H
+#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_MAKE_DYNAMIC_OPEN_DATAFLOW_GRAPH_FROM_MPCG_H
+
+#include "pcg/mapped_parallel_computation_graph/mapped_parallel_computation_graph.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h"
+
+namespace FlexFlow {
+
+/**
+ * @brief Converts a mapped PCG into a DynamicOpenDataflowGraph containing
+ * one invocation per parallel layer of the PCG.
+ */
+DynamicOpenDataflowGraph make_dynamic_open_dataflow_graph_from_mpcg(
+    MappedParallelComputationGraph const &);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_attrs.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_attrs.dtg.toml
new file mode 100644
index 0000000000..3c43e1d637
--- /dev/null
+++ b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_attrs.dtg.toml
@@ -0,0 +1,43 @@
+namespace = "FlexFlow"
+name = "SerializableDynamicNodeAttrs" # DynamicNodeAttrs minus per_device_op_state
+type = "struct"
+features = [
+  "eq",
+  "hash",
+  "fmt",
+  "json",
+]
+
+includes = [
+  "<optional>",
+  "task-spec/dynamic_graph/dynamic_task_type.dtg.h",
+  "pcg/machine_space_coordinate.dtg.h",
+  "pcg/mapped_parallel_computation_graph/mapped_operator_task_group.h",
+  "task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.h",
+  "task-spec/dynamic_graph/training_operation_attrs.dtg.h",
+]
+
+src_includes = [
+  "utils/fmt/optional.h", # fmt support for the std::optional fields
+  "utils/json/optional.h", # json support for the std::optional fields
+]
+
+[[fields]]
+name = "task_type"
+type = "std::optional<::FlexFlow::DynamicTaskType>"
+
+[[fields]]
+name = "device_coord"
+type = "std::optional<::FlexFlow::MachineSpaceCoordinate>"
+
+[[fields]]
+name = "mapping"
+type = "std::optional<::FlexFlow::MappedOperatorTaskGroup>"
+
+[[fields]]
+name = "op_attrs"
+type = "std::optional<::FlexFlow::TrainingOperationAttrs>"
+
+[[fields]]
+name = "layer_guid" # the only non-optional field
+type = "::FlexFlow::dynamic_layer_guid_t"
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_attrs.h b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_attrs.h
new file mode 100644
index 0000000000..7a274a1e7b
--- /dev/null
+++ b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_attrs.h
@@ -0,0 +1,25 @@
+#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_SERIALIZABLE_DYNAMIC_NODE_ATTRS_H
+#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_SERIALIZABLE_DYNAMIC_NODE_ATTRS_H
+
+#include "task-spec/dynamic_graph/dynamic_node_attrs.dtg.h"
+#include "task-spec/dynamic_graph/serializable_dynamic_node_attrs.dtg.h"
+
+namespace FlexFlow {
+
+/**
+ * @brief Copies the serializable fields of a DynamicNodeAttrs.
+ * @note per_device_op_state is not carried over.
+ */
+SerializableDynamicNodeAttrs
+    dynamic_node_attrs_to_serializable(DynamicNodeAttrs const &attrs);
+
+/**
+ * @brief Rebuilds a DynamicNodeAttrs from its serializable form.
+ * @note per_device_op_state is set to std::nullopt.
+ */
+DynamicNodeAttrs dynamic_node_attrs_from_serializable(
+    SerializableDynamicNodeAttrs const &attrs);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_invocation.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_invocation.dtg.toml
new file mode 100644
index 0000000000..01f4cc8876
--- /dev/null
+++ b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_invocation.dtg.toml
@@ -0,0 +1,33 @@
+namespace = "FlexFlow"
+name = "SerializableDynamicNodeInvocation" # DynamicNodeInvocation with serializable attrs
+type = "struct"
+features = [
+  "eq",
+  "fmt",
+  "hash",
+  "json",
+]
+
+includes = [
+  "<unordered_map>",
+  "task-spec/dynamic_graph/serializable_dynamic_node_attrs.dtg.h",
+  "task-spec/dynamic_graph/dynamic_tensor_slot.dtg.h",
+  "task-spec/dynamic_graph/serializable_dynamic_value_attrs.dtg.h",
+]
+
+src_includes = [
+  "utils/hash/unordered_map.h", # hash support for the map-typed fields
+  "utils/fmt/unordered_map.h", # fmt support for the map-typed fields
+]
+
+[[fields]]
+name = "inputs" # keyed by slot; values are the serializable value attrs
+type = "std::unordered_map<::FlexFlow::DynamicTensorSlot, ::FlexFlow::SerializableDynamicValueAttrs>"
+
+[[fields]]
+name = "node_attrs"
+type = "::FlexFlow::SerializableDynamicNodeAttrs"
+
+[[fields]]
+name = "outputs" # same key/value types as inputs
+type = "std::unordered_map<::FlexFlow::DynamicTensorSlot, ::FlexFlow::SerializableDynamicValueAttrs>"
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_invocation.h b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_invocation.h
new file mode 100644
index 0000000000..2bcdb9a898
--- /dev/null
+++ b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_node_invocation.h
@@ -0,0 +1,24 @@
+#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_SERIALIZABLE_DYNAMIC_NODE_INVOCATION_H
+#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_SERIALIZABLE_DYNAMIC_NODE_INVOCATION_H
+
+#include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
+#include "task-spec/dynamic_graph/serializable_dynamic_node_invocation.dtg.h"
+
+namespace FlexFlow {
+
+/**
+ * @brief Converts an invocation to its serializable form by converting its
+ * node attrs and every input/output value.
+ */
+SerializableDynamicNodeInvocation dynamic_node_invocation_to_serializable(
+    DynamicNodeInvocation const &invocation);
+
+/**
+ * @brief Rebuilds a DynamicNodeInvocation from its serializable form.
+ */
+DynamicNodeInvocation dynamic_node_invocation_from_serializable(
+    SerializableDynamicNodeInvocation const &invocation);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_value_attrs.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_value_attrs.dtg.toml
new file mode 100644
index 0000000000..6209bfa247
--- /dev/null
+++ b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_value_attrs.dtg.toml
@@ -0,0 +1,38 @@
+namespace = "FlexFlow"
+name = "SerializableDynamicValueAttrs" # DynamicValueAttrs minus accessor
+type = "struct"
+features = [
+  "eq",
+  "hash",
+  "fmt",
+  "json",
+]
+
+includes = [
+  "<optional>",
+  "task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.h",
+  "op-attrs/parallel_tensor_shape.dtg.h",
+  "op-attrs/parallel_tensor_space_coordinate.dtg.h",
+  "task-spec/dynamic_graph/dynamic_tensor_role.dtg.h",
+]
+
+src_includes = [
+  "utils/fmt/optional.h", # fmt support for the std::optional fields
+  "utils/json/optional.h", # json support for the std::optional fields
+]
+
+[[fields]]
+name = "tensor_guid" # the only non-optional field
+type = "::FlexFlow::dynamic_tensor_guid_t"
+
+[[fields]]
+name = "parallel_tensor_shape"
+type = "std::optional<::FlexFlow::ParallelTensorShape>"
+
+[[fields]]
+name = "shard_coord"
+type = "std::optional<::FlexFlow::ParallelTensorSpaceCoordinate>"
+
+[[fields]]
+name = "role"
+type = "std::optional<::FlexFlow::DynamicTensorRole>"
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_value_attrs.h b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_value_attrs.h
new file mode 100644
index 0000000000..6272265b7e
--- /dev/null
+++ b/lib/task-spec/include/task-spec/dynamic_graph/serializable_dynamic_value_attrs.h
@@ -0,0 +1,25 @@
+#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_SERIALIZABLE_DYNAMIC_VALUE_ATTRS_H
+#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DYNAMIC_GRAPH_SERIALIZABLE_DYNAMIC_VALUE_ATTRS_H
+
+#include "task-spec/dynamic_graph/dynamic_value_attrs.dtg.h"
+#include "task-spec/dynamic_graph/serializable_dynamic_value_attrs.dtg.h"
+
+namespace FlexFlow {
+
+/**
+ * @brief Copies the serializable fields of a DynamicValueAttrs.
+ * @note accessor is not carried over.
+ */
+SerializableDynamicValueAttrs
+    dynamic_value_attrs_to_serializable(DynamicValueAttrs const &attrs);
+
+/**
+ * @brief Rebuilds a DynamicValueAttrs from its serializable form.
+ * @note accessor is set to std::nullopt.
+ */
+DynamicValueAttrs dynamic_value_attrs_from_serializable(
+    SerializableDynamicValueAttrs const &attrs);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/task-spec/include/task-spec/dynamic_graph/training_operation_attrs.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/training_operation_attrs.dtg.toml
index 66c475b3a9..1051d8ac13 100644
--- a/lib/task-spec/include/task-spec/dynamic_graph/training_operation_attrs.dtg.toml
+++ b/lib/task-spec/include/task-spec/dynamic_graph/training_operation_attrs.dtg.toml
@@ -5,6 +5,7 @@ features = [
   "eq",
   "hash",
   "fmt",
+  "json",
 ]
 
 includes = [
diff --git a/lib/task-spec/include/task-spec/ops/impl/dropout.h b/lib/task-spec/include/task-spec/ops/impl/dropout.h
index a7b382ce62..192f2f8244 100644
--- a/lib/task-spec/include/task-spec/ops/impl/dropout.h
+++ b/lib/task-spec/include/task-spec/ops/impl/dropout.h
@@ -2,7 +2,6 @@
 #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_IMPL_DROPOUT_H
 
 #include "op-attrs/ops/dropout_attrs.dtg.h"
-#include "task-spec/task_id_t.dtg.h"
 #include "task-spec/task_impl_function.dtg.h"
 
 namespace FlexFlow {
diff --git a/lib/task-spec/include/task-spec/ops/op_task_id_t.dtg.toml b/lib/task-spec/include/task-spec/ops/op_task_id_t.dtg.toml
deleted file mode 100644
index 557da6cf4c..0000000000
--- a/lib/task-spec/include/task-spec/ops/op_task_id_t.dtg.toml
+++ /dev/null
@@ -1,18 +0,0 @@
-namespace = "FlexFlow"
-name = "op_task_id_t"
-type = "enum"
-features = [
-  "hash",
-  "json",
-  "rapidcheck",
-  "fmt",
-]
-
-[[values]]
-name = "INIT"
-
-[[values]]
-name = "FWD"
-
-[[values]]
-name = "BWD"
diff --git a/lib/task-spec/include/task-spec/task_id_with_noop_default_t.dtg.toml b/lib/task-spec/include/task-spec/task_id_with_noop_default_t.dtg.toml
deleted file mode 100644
index 50349d5773..0000000000
--- a/lib/task-spec/include/task-spec/task_id_with_noop_default_t.dtg.toml
+++ /dev/null
@@ -1,28 +0,0 @@
-namespace = "FlexFlow"
-name = "task_id_with_noop_default_t"
-type = "variant"
-features = [
-  "eq",
-  "ord",
-  "hash",
-  "fmt",
-  "rapidcheck",
-]
-
-includes = [
-  "task-spec/task_id_t.dtg.h",
-  "<utility>",
-]
-
-src_includes = [
-  "utils/rapidcheck/monostate.h",
-  "utils/fmt/monostate.h",
-]
-
-[[values]]
-type = "::FlexFlow::task_id_t"
-key = "real_task"
-
-[[values]]
-type = "std::monostate"
-key = "noop_task"
diff --git a/lib/task-spec/include/task-spec/task_id_with_noop_default_t.h b/lib/task-spec/include/task-spec/task_id_with_noop_default_t.h
deleted file mode 100644
index 054b73844e..0000000000
--- a/lib/task-spec/include/task-spec/task_id_with_noop_default_t.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_ID_WITH_NOOP_DEFAULT_T_H
-#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_ID_WITH_NOOP_DEFAULT_T_H
-
-#include "op-attrs/computation_graph_op_attrs.dtg.h"
-#include "op-attrs/operator_type.dtg.h"
-#include "task-spec/ops/op_task_id_t.dtg.h"
-#include "task-spec/task_id_with_noop_default_t.dtg.h"
-
-namespace FlexFlow {
-
-task_id_with_noop_default_t lift_task_id_t(task_id_t);
-task_id_with_noop_default_t default_noop_task();
-
-task_id_with_noop_default_t lower_op_task_id_to_task_id_with_noop_default_t(
-    op_task_id_t, ComputationGraphOpAttrs const &);
-
-task_id_with_noop_default_t
-    get_init_task_id_for_op_attrs(ComputationGraphOpAttrs const &);
-
-task_id_with_noop_default_t
-    get_fwd_task_id_for_op_attrs(ComputationGraphOpAttrs const &);
-
-task_id_with_noop_default_t
-    get_bwd_task_id_for_op_attrs(ComputationGraphOpAttrs const &);
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/task-spec/src/task-spec/dynamic_graph/loss_insertion.cc b/lib/task-spec/src/task-spec/dynamic_graph/loss_insertion.cc
index 4270119612..857fed1a84 100644
--- a/lib/task-spec/src/task-spec/dynamic_graph/loss_insertion.cc
+++ b/lib/task-spec/src/task-spec/dynamic_graph/loss_insertion.cc
@@ -12,9 +12,11 @@
 
 namespace FlexFlow {
 
-LossInsertionResult perform_loss_insertion(DynamicOpenDataflowGraph const &dg,
-                                           LossAttrs const &loss_attrs,
-                                           dynamic_tensor_guid_t logit_tensor) {
+LossInsertionResult perform_loss_insertion(
+    DynamicOpenDataflowGraph const &dg,
+    LossAttrs const &loss_attrs,
+    dynamic_tensor_guid_t logit_tensor,
+    std::optional<MappedOperatorTaskGroup> const &loss_mapping) {
   DynamicValueAttrs logit_value = assert_unwrap(
       find_output_value_attrs(dg, logit_tensor, mk_dynamic_tensor_role_fwd()));
 
@@ -45,7 +47,7 @@ LossInsertionResult perform_loss_insertion(DynamicOpenDataflowGraph const &dg,
       DynamicNodeAttrs{
           /*task_type=*/DynamicTaskType::LOSS,
           /*device_coord=*/std::nullopt,
-          /*mapping=*/std::nullopt,
+          /*mapping=*/loss_mapping,
           /*op_attrs=*/TrainingOperationAttrs{loss_attrs},
           /*layer_guid=*/mk_dynamic_layer_guid_for_loss(),
           /*per_device_op_state=*/std::nullopt,
diff --git a/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.cc b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.cc
new file mode 100644
index 0000000000..ced98dfd44
--- /dev/null
+++ b/lib/task-spec/src/task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.cc
@@ -0,0 +1,73 @@
+#include "task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_mpcg.h"
+#include "op-attrs/parallel_tensor_shape.h"
+#include "op-attrs/pcg_operator_attrs.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
+#include "pcg/parallel_computation_graph/parallel_tensor_attrs.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_layer_guid_t.dtg.h"
+#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h"
+#include "task-spec/dynamic_graph/dynamic_tensor_role.h"
+#include "utils/containers/generate_map.h"
+#include <optional>
+#include <unordered_map>
+#include <utility>
+
+namespace FlexFlow {
+
+/**
+ * Builds the initial dynamic graph for a mapped PCG: one invocation per
+ * parallel layer, carrying the layer's op attrs and its entry from
+ * mpcg.mapped_tasks. Task type, device coordinate, per-device state, slot
+ * roles, shard coordinates, accessors, and value roles are all left unset.
+ *
+ * The mapping is looked up with .at(), so every layer of mpcg.pcg is
+ * expected to have an entry in mpcg.mapped_tasks.
+ */
+DynamicOpenDataflowGraph make_dynamic_open_dataflow_graph_from_mpcg(
+    MappedParallelComputationGraph const &mpcg) {
+  DynamicOpenDataflowGraph result = make_empty_dynamic_open_dataflow_graph();
+
+  // Shared by the input and output maps: both turn a (slot name, PCG tensor)
+  // entry into a (slot, value) pair that carries only the guid and shape.
+  auto to_dynamic_slot_and_value = [&](TensorSlotName const &slot_name,
+                                       parallel_tensor_guid_t const &tensor) {
+    ParallelTensorAttrs tensor_attrs =
+        get_parallel_tensor_attrs(mpcg.pcg, tensor);
+    return std::pair<DynamicTensorSlot, DynamicValueAttrs>{
+        DynamicTensorSlot{
+            /*slot_name=*/slot_name,
+            /*slot_tensor_role=*/std::nullopt,
+        },
+        DynamicValueAttrs{
+            /*tensor_guid=*/dynamic_tensor_guid_t{tensor},
+            /*parallel_tensor_shape=*/tensor_attrs.shape,
+            /*shard_coord=*/std::nullopt,
+            /*accessor=*/std::nullopt,
+            /*role=*/std::nullopt,
+        },
+    };
+  };
+
+  for (auto const &[layer, layer_attrs] :
+       get_parallel_layer_attrs_mapping(mpcg.pcg)) {
+    DynamicNodeAttrs result_attrs{
+        /*task_type=*/std::nullopt,
+        /*device_coord=*/std::nullopt,
+        /*mapping=*/mpcg.mapped_tasks.at(layer),
+        /*op_attrs=*/TrainingOperationAttrs{layer_attrs.op_attrs},
+        /*layer_guid=*/dynamic_layer_guid_t{layer},
+        /*per_device_op_state=*/std::nullopt,
+    };
+
+    std::unordered_map<DynamicTensorSlot, DynamicValueAttrs> result_inputs =
+        transform(get_incoming_tensors(mpcg.pcg, layer),
+                  to_dynamic_slot_and_value);
+    std::unordered_map<DynamicTensorSlot, DynamicValueAttrs> result_outputs =
+        transform(get_outgoing_tensors(mpcg.pcg, layer),
+                  to_dynamic_slot_and_value);
+
+    result.invocations.emplace(result_inputs, result_attrs, result_outputs);
+  }
+
+  return result;
+}
+
+} // namespace FlexFlow
diff --git a/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_node_attrs.cc b/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_node_attrs.cc
new file mode 100644
index 0000000000..d613194d14
--- /dev/null
+++ b/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_node_attrs.cc
@@ -0,0 +1,34 @@
+#include "task-spec/dynamic_graph/serializable_dynamic_node_attrs.h"
+#include <optional>
+
+namespace FlexFlow {
+
+// Copies every field of DynamicNodeAttrs except per_device_op_state, which
+// is not passed to the SerializableDynamicNodeAttrs constructor.
+SerializableDynamicNodeAttrs
+    dynamic_node_attrs_to_serializable(DynamicNodeAttrs const &attrs) {
+  SerializableDynamicNodeAttrs serializable{
+      /*task_type=*/attrs.task_type,
+      /*device_coord=*/attrs.device_coord,
+      /*mapping=*/attrs.mapping,
+      /*op_attrs=*/attrs.op_attrs,
+      /*layer_guid=*/attrs.layer_guid,
+  };
+  return serializable;
+}
+
+// Restores the copied fields; per_device_op_state comes back as std::nullopt.
+DynamicNodeAttrs dynamic_node_attrs_from_serializable(
+    SerializableDynamicNodeAttrs const &attrs) {
+  DynamicNodeAttrs restored{
+      /*task_type=*/attrs.task_type,
+      /*device_coord=*/attrs.device_coord,
+      /*mapping=*/attrs.mapping,
+      /*op_attrs=*/attrs.op_attrs,
+      /*layer_guid=*/attrs.layer_guid,
+      /*per_device_op_state=*/std::nullopt,
+  };
+  return restored;
+}
+
+} // namespace FlexFlow
diff --git a/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_node_invocation.cc b/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_node_invocation.cc
new file mode 100644
index 0000000000..334623ee67
--- /dev/null
+++ b/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_node_invocation.cc
@@ -0,0 +1,41 @@
+#include "task-spec/dynamic_graph/serializable_dynamic_node_invocation.h"
+#include "task-spec/dynamic_graph/serializable_dynamic_node_attrs.h"
+#include "task-spec/dynamic_graph/serializable_dynamic_value_attrs.h"
+#include "utils/containers/map_values.h"
+
+namespace FlexFlow {
+
+// Converts the node attrs and every input/output value to its serializable
+// form via the per-type conversion functions.
+SerializableDynamicNodeInvocation dynamic_node_invocation_to_serializable(
+    DynamicNodeInvocation const &invocation) {
+  auto serializable_inputs =
+      map_values(invocation.inputs, dynamic_value_attrs_to_serializable);
+  auto serializable_outputs =
+      map_values(invocation.outputs, dynamic_value_attrs_to_serializable);
+
+  return SerializableDynamicNodeInvocation{
+      /*inputs=*/serializable_inputs,
+      /*node_attrs=*/dynamic_node_attrs_to_serializable(invocation.node_attrs),
+      /*outputs=*/serializable_outputs,
+  };
+}
+
+// Rebuilds a DynamicNodeInvocation by converting the node attrs and every
+// input/output value back via the per-type conversion functions.
+DynamicNodeInvocation dynamic_node_invocation_from_serializable(
+    SerializableDynamicNodeInvocation const &invocation) {
+  auto restored_inputs =
+      map_values(invocation.inputs, dynamic_value_attrs_from_serializable);
+  auto restored_outputs =
+      map_values(invocation.outputs, dynamic_value_attrs_from_serializable);
+
+  return DynamicNodeInvocation{
+      /*inputs=*/restored_inputs,
+      /*node_attrs=*/
+      dynamic_node_attrs_from_serializable(invocation.node_attrs),
+      /*outputs=*/restored_outputs,
+  };
+}
+
+} // namespace FlexFlow
diff --git a/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_value_attrs.cc b/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_value_attrs.cc
new file mode 100644
index 0000000000..2dc0b509ab
--- /dev/null
+++ b/lib/task-spec/src/task-spec/dynamic_graph/serializable_dynamic_value_attrs.cc
@@ -0,0 +1,32 @@
+#include "task-spec/dynamic_graph/serializable_dynamic_value_attrs.h"
+#include <optional>
+
+namespace FlexFlow {
+
+// Copies every field of DynamicValueAttrs except accessor, which is not
+// passed to the SerializableDynamicValueAttrs constructor.
+SerializableDynamicValueAttrs
+    dynamic_value_attrs_to_serializable(DynamicValueAttrs const &attrs) {
+  SerializableDynamicValueAttrs serializable{
+      /*tensor_guid=*/attrs.tensor_guid,
+      /*parallel_tensor_shape=*/attrs.parallel_tensor_shape,
+      /*shard_coord=*/attrs.shard_coord,
+      /*role=*/attrs.role,
+  };
+  return serializable;
+}
+
+// Restores the copied fields; accessor comes back as std::nullopt.
+DynamicValueAttrs dynamic_value_attrs_from_serializable(
+    SerializableDynamicValueAttrs const &attrs) {
+  DynamicValueAttrs restored{
+      /*tensor_guid=*/attrs.tensor_guid,
+      /*parallel_tensor_shape=*/attrs.parallel_tensor_shape,
+      /*shard_coord=*/attrs.shard_coord,
+      /*accessor=*/std::nullopt,
+      /*role=*/attrs.role,
+  };
+  return restored;
+}
+
+} // namespace FlexFlow
diff --git a/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc b/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc
index ea253b63f8..402e0ef055 100644
--- a/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc
+++ b/lib/task-spec/src/task-spec/dynamic_graph/shard_expansion.cc
@@ -15,7 +15,7 @@ bool value_is_shard_expanded(DynamicValueAttrs const &n) {
 
 bool no_part_of_graph_is_shard_expanded(DynamicOpenDataflowGraph const &g) {
   auto slot_is_shard_expanded = [](DynamicTensorSlot const &) -> bool {
-    return true;
+    return false;
   };
 
   return no_part_of_dynamic_graph_satisfies(g,
@@ -81,4 +81,19 @@ std::unordered_set<DynamicNodeInvocation>
       });
 }
 
+// Applies perform_shard_expansion_for_invocation to every invocation of g.
+// Asserts g has no shard-expanded part and the result is fully expanded.
+DynamicOpenDataflowGraph
+    perform_shard_expansion(DynamicOpenDataflowGraph const &g) {
+  ASSERT(no_part_of_graph_is_shard_expanded(g));
+
+  auto expand_invocation = [](DynamicNodeInvocation const &invocation) {
+    return perform_shard_expansion_for_invocation(invocation);
+  };
+  DynamicOpenDataflowGraph result =
+      flatmap_dynamic_invocation_set(g, expand_invocation);
+  ASSERT(graph_is_fully_shard_expanded(result));
+  return result;
+}
+
 } // namespace FlexFlow
diff --git a/lib/task-spec/src/task-spec/task_id_with_noop_default_t.cc b/lib/task-spec/src/task-spec/task_id_with_noop_default_t.cc
deleted file mode 100644
index 20e0d00c57..0000000000
--- a/lib/task-spec/src/task-spec/task_id_with_noop_default_t.cc
+++ /dev/null
@@ -1,243 +0,0 @@
-#include "task-spec/task_id_with_noop_default_t.h"
-#include "utils/overload.h"
-
-namespace FlexFlow {
-
-task_id_with_noop_default_t lift_task_id_t(task_id_t task_id) {
-  return task_id_with_noop_default_t{task_id};
-}
-
-task_id_with_noop_default_t default_noop_task() {
-  return task_id_with_noop_default_t{std::monostate{}};
-}
-
-task_id_with_noop_default_t lower_op_task_id_to_task_id_with_noop_default_t(
-    op_task_id_t op_task_id, ComputationGraphOpAttrs const &op_attrs) {
-  switch (op_task_id) {
-    case op_task_id_t::INIT:
-      return get_init_task_id_for_op_attrs(op_attrs);
-    case op_task_id_t::FWD:
-      return get_fwd_task_id_for_op_attrs(op_attrs);
-    case op_task_id_t::BWD:
-      return get_bwd_task_id_for_op_attrs(op_attrs);
-    default:
-      PANIC("Unhandled op_task_id_t", op_task_id);
-  }
-}
-
-task_id_with_noop_default_t
-    get_init_task_id_for_op_attrs(ComputationGraphOpAttrs const &op_attrs) {
-
-  return op_attrs.visit<task_id_with_noop_default_t>(overload{
-      [](BatchMatmulAttrs const &) { return default_noop_task(); },
-      [](BatchNormAttrs const &) {
-        return lift_task_id_t(task_id_t::BATCHNORM_INIT_TASK_ID);
-      },
-      [](BroadcastAttrs const &) { return default_noop_task(); },
-      [](CastAttrs const &) { return default_noop_task(); },
-      [](ConcatAttrs const &) { return default_noop_task(); },
-      [](Conv2DAttrs const &) {
-        return lift_task_id_t(task_id_t::CONV2D_INIT_TASK_ID);
-      },
-      [](DropoutAttrs const &) {
-        return lift_task_id_t(task_id_t::DROPOUT_INIT_TASK_ID);
-      },
-      [](ElementBinaryAttrs const &) {
-        return lift_task_id_t(task_id_t::ELEMENTBINARY_INIT_TASK_ID);
-      },
-      [](ElementUnaryAttrs const &) {
-        return lift_task_id_t(task_id_t::ELEMENTUNARY_INIT_TASK_ID);
-      },
-      [](EmbeddingAttrs const &) { return default_noop_task(); },
-      [](FlatAttrs const &) { return default_noop_task(); },
-      [](GatherAttrs const &) {
-        return lift_task_id_t(task_id_t::GATHER_INIT_TASK_ID);
-      },
-      [](InputAttrs const &) { return default_noop_task(); },
-      [](LayerNormAttrs const &) {
-        return lift_task_id_t(task_id_t::LAYERNORM_INIT_TASK_ID);
-      },
-      [](LinearAttrs const &) {
-        return lift_task_id_t(task_id_t::LINEAR_INIT_TASK_ID);
-      },
-      [](MultiHeadAttentionAttrs const &) {
-        return lift_task_id_t(task_id_t::ATTENTION_INIT_TASK_ID);
-      },
-      [](NoopAttrs const &) { return default_noop_task(); },
-      [](Pool2DAttrs const &) {
-        return lift_task_id_t(task_id_t::POOL2D_INIT_TASK_ID);
-      },
-      [](ReduceAttrs const &) {
-        return lift_task_id_t(task_id_t::REDUCE_INIT_TASK_ID);
-      },
-      [](ReshapeAttrs const &) { return default_noop_task(); },
-      [](ReverseAttrs const &) { return default_noop_task(); },
-      [](SoftmaxAttrs const &) {
-        return lift_task_id_t(task_id_t::SOFTMAX_INIT_TASK_ID);
-      },
-      [](SplitAttrs const &) { return default_noop_task(); },
-      [](TopKAttrs const &) { return default_noop_task(); },
-      [](TransposeAttrs const &) { return default_noop_task(); },
-      [](WeightAttrs const &) { return default_noop_task(); },
-  });
-}
-
-task_id_with_noop_default_t
-    get_fwd_task_id_for_op_attrs(ComputationGraphOpAttrs const &op_attrs) {
-
-  return op_attrs.visit<task_id_with_noop_default_t>(overload{
-      [](BatchMatmulAttrs const &) {
-        return lift_task_id_t(task_id_t::BATCHMATMUL_FWD_TASK_ID);
-      },
-      [](BatchNormAttrs const &) {
-        return lift_task_id_t(task_id_t::BATCHNORM_FWD_TASK_ID);
-      },
-      [](BroadcastAttrs const &) {
-        return lift_task_id_t(task_id_t::BROADCAST_FWD_TASK_ID);
-      },
-      [](CastAttrs const &) {
-        return lift_task_id_t(task_id_t::CAST_FWD_TASK_ID);
-      },
-      [](ConcatAttrs const &) {
-        return lift_task_id_t(task_id_t::CONCAT_FWD_TASK_ID);
-      },
-      [](Conv2DAttrs const &) {
-        return lift_task_id_t(task_id_t::CONV2D_FWD_TASK_ID);
-      },
-      [](DropoutAttrs const &) {
-        return lift_task_id_t(task_id_t::DROPOUT_FWD_TASK_ID);
-      },
-      [](ElementBinaryAttrs const &) {
-        return lift_task_id_t(task_id_t::ELEMENTBINARY_FWD_TASK_ID);
-      },
-      [](ElementUnaryAttrs const &) {
-        return lift_task_id_t(task_id_t::ELEMENTUNARY_FWD_TASK_ID);
-      },
-      [](EmbeddingAttrs const &) {
-        return lift_task_id_t(task_id_t::EMBED_FWD_TASK_ID);
-      },
-      [](FlatAttrs const &) {
-        return lift_task_id_t(task_id_t::FLAT_FWD_TASK_ID);
-      },
-      [](GatherAttrs const &) {
-        return lift_task_id_t(task_id_t::GATHER_FWD_TASK_ID);
-      },
-      [](InputAttrs const &) { return default_noop_task(); },
-      [](LayerNormAttrs const &) {
-        return lift_task_id_t(task_id_t::LAYERNORM_FWD_TASK_ID);
-      },
-      [](LinearAttrs const &) {
-        return lift_task_id_t(task_id_t::LINEAR_FWD_TASK_ID);
-      },
-      [](MultiHeadAttentionAttrs const &) {
-        return lift_task_id_t(task_id_t::ATTENTION_FWD_TASK_ID);
-      },
-      [](NoopAttrs const &) { return default_noop_task(); },
-      [](Pool2DAttrs const &) {
-        return lift_task_id_t(task_id_t::POOL2D_FWD_TASK_ID);
-      },
-      [](ReduceAttrs const &) {
-        return lift_task_id_t(task_id_t::REDUCE_FWD_TASK_ID);
-      },
-      [](ReshapeAttrs const &) {
-        return lift_task_id_t(task_id_t::RESHAPE_FWD_TASK_ID);
-      },
-      [](ReverseAttrs const &) {
-        return lift_task_id_t(task_id_t::REVERSE_FWD_TASK_ID);
-      },
-      [](SoftmaxAttrs const &) {
-        return lift_task_id_t(task_id_t::SOFTMAX_FWD_TASK_ID);
-      },
-      [](SplitAttrs const &) {
-        return lift_task_id_t(task_id_t::SPLIT_FWD_TASK_ID);
-      },
-      [](TopKAttrs const &) {
-        return lift_task_id_t(task_id_t::TOPK_FWD_TASK_ID);
-      },
-      [](TransposeAttrs const &) {
-        return lift_task_id_t(task_id_t::TRANSPOSE_FWD_TASK_ID);
-      },
-      [](WeightAttrs const &) { return default_noop_task(); },
-  });
-}
-
-task_id_with_noop_default_t
-    get_bwd_task_id_for_op_attrs(ComputationGraphOpAttrs const &op_attrs) {
-
-  return op_attrs.visit<task_id_with_noop_default_t>(overload{
-      [](BatchMatmulAttrs const &) {
-        return lift_task_id_t(task_id_t::BATCHMATMUL_BWD_TASK_ID);
-      },
-      [](BatchNormAttrs const &) {
-        return lift_task_id_t(task_id_t::BATCHNORM_BWD_TASK_ID);
-      },
-      [](BroadcastAttrs const &) {
-        return lift_task_id_t(task_id_t::BROADCAST_BWD_TASK_ID);
-      },
-      [](CastAttrs const &) {
-        return lift_task_id_t(task_id_t::CAST_BWD_TASK_ID);
-      },
-      [](ConcatAttrs const &) {
-        return lift_task_id_t(task_id_t::CONCAT_BWD_TASK_ID);
-      },
-      [](Conv2DAttrs const &) {
-        return lift_task_id_t(task_id_t::CONV2D_BWD_TASK_ID);
-      },
-      [](DropoutAttrs const &) {
-        return lift_task_id_t(task_id_t::DROPOUT_BWD_TASK_ID);
-      },
-      [](ElementBinaryAttrs const &) {
-        return lift_task_id_t(task_id_t::ELEMENTBINARY_BWD_TASK_ID);
-      },
-      [](ElementUnaryAttrs const &) {
-        return lift_task_id_t(task_id_t::ELEMENTUNARY_BWD_TASK_ID);
-      },
-      [](EmbeddingAttrs const &) {
-        return lift_task_id_t(task_id_t::EMBED_BWD_TASK_ID);
-      },
-      [](FlatAttrs const &) {
-        return lift_task_id_t(task_id_t::FLAT_BWD_TASK_ID);
-      },
-      [](GatherAttrs const &) {
-        return lift_task_id_t(task_id_t::GATHER_BWD_TASK_ID);
-      },
-      [](InputAttrs const &) { return default_noop_task(); },
-      [](LayerNormAttrs const &) {
-        return lift_task_id_t(task_id_t::LAYERNORM_BWD_TASK_ID);
-      },
-      [](LinearAttrs const &) {
-        return lift_task_id_t(task_id_t::LINEAR_BWD_TASK_ID);
-      },
-      [](MultiHeadAttentionAttrs const &) {
-        return lift_task_id_t(task_id_t::ATTENTION_BWD_TASK_ID);
-      },
-      [](NoopAttrs const &) { return default_noop_task(); },
-      [](Pool2DAttrs const &) {
-        return lift_task_id_t(task_id_t::POOL2D_BWD_TASK_ID);
-      },
-      [](ReduceAttrs const &) {
-        return lift_task_id_t(task_id_t::REDUCE_BWD_TASK_ID);
-      },
-      [](ReshapeAttrs const &) {
-        return lift_task_id_t(task_id_t::RESHAPE_BWD_TASK_ID);
-      },
-      [](ReverseAttrs const &) {
-        return lift_task_id_t(task_id_t::REVERSE_BWD_TASK_ID);
-      },
-      [](SoftmaxAttrs const &) {
-        return lift_task_id_t(task_id_t::SOFTMAX_BWD_TASK_ID);
-      },
-      [](SplitAttrs const &) {
-        return lift_task_id_t(task_id_t::SPLIT_BWD_TASK_ID);
-      },
-      [](TopKAttrs const &) {
-        return lift_task_id_t(task_id_t::TOPK_BWD_TASK_ID);
-      },
-      [](TransposeAttrs const &) {
-        return lift_task_id_t(task_id_t::TRANSPOSE_BWD_TASK_ID);
-      },
-      [](WeightAttrs const &) { return default_noop_task(); },
-  });
-}
-
-} // namespace FlexFlow
diff --git a/lib/utils/include/utils/graph/kwarg_dataflow_graph/kwarg_dataflow_output.dtg.toml b/lib/utils/include/utils/graph/kwarg_dataflow_graph/kwarg_dataflow_output.dtg.toml
index f286fb90a7..5b537eac88 100644
--- a/lib/utils/include/utils/graph/kwarg_dataflow_graph/kwarg_dataflow_output.dtg.toml
+++ b/lib/utils/include/utils/graph/kwarg_dataflow_graph/kwarg_dataflow_output.dtg.toml
@@ -6,6 +6,7 @@ features = [
   "ord",
   "hash",
   "fmt",
+  "json",
 ]
 
 template_params = [