diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index ba80244b3b..ec8aa43721 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -34,9 +35,7 @@ #include #include #include -#include #include -#include #include #include @@ -296,11 +295,8 @@ branch_and_bound_t::branch_and_bound_t( template f_t branch_and_bound_t::get_lower_bound() { - f_t lower_bound = lower_bound_ceiling_.load(); - f_t heap_lower_bound = node_queue_.get_lower_bound(); - f_t worker_lower_bound = worker_pool_.get_lower_bound(); - lower_bound = std::min(heap_lower_bound, lower_bound); - lower_bound = std::min(worker_lower_bound, lower_bound); + f_t lower_bound = lower_bound_ceiling_.load(); + lower_bound = std::min(bfs_worker_pool_.get_lower_bound(), lower_bound); if (std::isfinite(lower_bound)) { return lower_bound; @@ -707,9 +703,11 @@ void branch_and_bound_t::set_final_solution(mip_solution_t& if (solver_status_ == mip_status_t::TIME_LIMIT) { settings_.log.printf("Time limit reached. Stopping the solver...\n"); } + if (solver_status_ == mip_status_t::WORK_LIMIT) { settings_.log.printf("Work limit reached. Stopping the solver...\n"); } + if (solver_status_ == mip_status_t::NODE_LIMIT) { settings_.log.printf("Node limit reached. 
Stopping the solver...\n"); } @@ -837,7 +835,7 @@ branch_variable_t branch_and_bound_t::variable_selection( std::vector& solution = worker->leaf_solution.x; switch (worker->search_strategy) { - case search_strategy_t::BEST_FIRST: + case BEST_FIRST: if (settings_.reliability_branching != 0) { branch_var = pc_.reliable_variable_selection(node_ptr, @@ -846,7 +844,7 @@ branch_variable_t branch_and_bound_t::variable_selection( var_types_, exploration_stats_, upper_bound_, - worker_pool_.num_idle_workers(), + bfs_worker_pool_.num_idle_workers(), new_slacks_, original_lp_); } else { @@ -857,17 +855,18 @@ branch_variable_t branch_and_bound_t::variable_selection( return {branch_var, round_dir}; - case search_strategy_t::COEFFICIENT_DIVING: + case COEFFICIENT_DIVING: return coefficient_diving( original_lp_, fractional, solution, var_up_locks_, var_down_locks_, log); - case search_strategy_t::LINE_SEARCH_DIVING: + case LINE_SEARCH_DIVING: return line_search_diving(fractional, solution, root_relax_soln_.x, log); - case search_strategy_t::PSEUDOCOST_DIVING: + case PSEUDOCOST_DIVING: return pseudocost_diving(pc_, fractional, solution, root_relax_soln_.x, log); - case search_strategy_t::GUIDED_DIVING: + case GUIDED_DIVING: + assert(incumbent_.has_incumbent); mutex_upper_.lock(); current_incumbent = incumbent_.x; mutex_upper_.unlock(); @@ -1085,12 +1084,12 @@ struct deterministic_diving_policy_t : deterministic_policy_base_t> { using base = deterministic_policy_base_t>; - std::deque*>& stack; + circular_deque_t*>& stack; i_t max_backtrack_depth; deterministic_diving_policy_t(branch_and_bound_t& bnb, deterministic_diving_worker_t& worker, - std::deque*>& stack, + circular_deque_t*>& stack, i_t max_backtrack_depth) : base(bnb, worker), stack(stack), max_backtrack_depth(max_backtrack_depth) { @@ -1244,7 +1243,7 @@ std::pair branch_and_bound_t::updat policy.select_branch_variable(node_ptr, leaf_fractional, leaf_solution.x); round_dir = dir; - assert(node_ptr->vstatus.size() == 
leaf_problem.num_cols); + assert(worker->leaf_vstatus.size() == leaf_problem.num_cols); assert(branch_var >= 0); assert(dir != branch_direction_t::NONE); @@ -1258,7 +1257,7 @@ std::pair branch_and_bound_t::updat branch_var, leaf_solution.x[branch_var], num_frac, - node_ptr->vstatus, + worker->leaf_vstatus, leaf_problem, log); search_tree.update(node_ptr, node_status_t::HAS_CHILDREN); @@ -1341,8 +1340,9 @@ dual::status_t branch_and_bound_t::solve_node_lp( } #endif - std::vector& leaf_vstatus = node_ptr->vstatus; - assert(leaf_vstatus.size() == worker->leaf_problem.num_cols); + worker->leaf_vstatus = + decompress_vstatus(node_ptr->packed_vstatus, worker->leaf_problem.num_cols); + assert(worker->leaf_vstatus.size() == worker->leaf_problem.num_cols); simplex_solver_settings_t lp_settings = settings_; lp_settings.concurrent_halt = &node_concurrent_halt_; @@ -1401,7 +1401,7 @@ dual::status_t branch_and_bound_t::solve_node_lp( lp_start_time, worker->leaf_problem, lp_settings, - leaf_vstatus, + worker->leaf_vstatus, worker->basis_factors, worker->basic_list, worker->nonbasic_list, @@ -1418,7 +1418,7 @@ dual::status_t branch_and_bound_t::solve_node_lp( worker->basis_factors, worker->basic_list, worker->nonbasic_list, - leaf_vstatus, + worker->leaf_vstatus, worker->leaf_edge_norms); lp_status = convert_lp_status_to_dual_status(second_status); @@ -1434,11 +1434,19 @@ dual::status_t branch_and_bound_t::solve_node_lp( return lp_status; } + template -void branch_and_bound_t::plunge_with(branch_and_bound_worker_t* worker) +void branch_and_bound_t::plunge_with(bfs_worker_t* worker, + mip_node_t* start_node) { - std::deque*> stack; - stack.push_front(worker->start_node); + assert(worker != nullptr && worker->is_active.load()); + assert(start_node != nullptr); + + // Stack holds at most 2 entries: the preferred child + its sibling. + // The sibling is evicted to the queue before a new pair of children is added. 
+ circular_deque_t*> stack(4); + stack.push_front(start_node); + worker->recompute_basis = true; worker->recompute_bounds = true; @@ -1449,6 +1457,36 @@ void branch_and_bound_t::plunge_with(branch_and_bound_worker_t 0 && (solver_status_ == mip_status_t::UNSET && is_running_) && rel_gap > settings_.relative_mip_gap_tol && abs_gap > settings_.absolute_mip_gap_tol) { + if (worker->worker_id == 0) { repair_heuristic_solutions(); } + + // Launch a new diving task if any worker is idle + if (worker->total_active_diving_workers < worker->total_max_diving_workers && + worker->node_queue.diving_queue_size() > 0) { + launch_diving_worker(worker); + } + + // If any best-first worker becomes idle, hand it a node from the local best-first queue. + if (bfs_worker_pool_.num_idle_workers() > 0 && worker->node_queue.best_first_queue_size() > 0) { + worker->node_queue.lock(); + mip_node_t* node = worker->node_queue.pop_best_first(); + + // We need to temporarily save the lower bound in this worker so it is + // considered when calculating the global lower bound. + f_t node_lower_bound = node ? 
node->lower_bound : std::numeric_limits::infinity(); + worker->lower_bound = std::min(worker->lower_bound.load(), node_lower_bound); + + worker->node_queue.unlock(); + + if (node != nullptr) { + if (!launch_bfs_worker({node})) { + worker->node_queue.lock(); + worker->node_queue.push(node); + worker->node_queue.unlock(); + } + } + } + + assert(stack.size() <= 2); mip_node_t* node_ptr = stack.front(); stack.pop_front(); @@ -1458,7 +1496,7 @@ void branch_and_bound_t::plunge_with(branch_and_bound_worker_tlower_bound = node_ptr->lower_bound; + worker->lower_bound = std::min(worker->node_queue.get_lower_bound(), node_ptr->lower_bound); if (node_ptr->lower_bound > upper_bound_.load()) { search_tree_.graphviz_node(settings_.log, node_ptr, "cutoff", node_ptr->lower_bound); @@ -1469,13 +1507,31 @@ void branch_and_bound_t::plunge_with(branch_and_bound_worker_t settings_.time_limit) { + f_t now = toc(exploration_stats_.start_time); + + if (worker->worker_id == 0) { + f_t time_since_last_log = + exploration_stats_.last_log == 0 ? 
1.0 : toc(exploration_stats_.last_log); + i_t nodes_since_last_log = exploration_stats_.nodes_since_last_log; + + if (((nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) && + time_since_last_log >= 1) || + (time_since_last_log > 30) || now > settings_.time_limit) { + report(' ', upper_bound_, lower_bound, node_ptr->depth, node_ptr->integer_infeasible); + exploration_stats_.last_log = tic(); + exploration_stats_.nodes_since_last_log = 0; + } + } + + if (now > settings_.time_limit) { solver_status_ = mip_status_t::TIME_LIMIT; + stack.push_front(node_ptr); break; } if (exploration_stats_.nodes_explored >= settings_.node_limit) { solver_status_ = mip_status_t::NODE_LIMIT; + stack.push_front(node_ptr); break; } @@ -1483,11 +1539,17 @@ void branch_and_bound_t::plunge_with(branch_and_bound_worker_t::plunge_with(branch_and_bound_worker_t 0) { mip_node_t* node = stack.back(); stack.pop_back(); - node_queue_.push(node); + worker->node_queue.lock(); + worker->node_queue.push(node); + worker->node_queue.unlock(); } exploration_stats_.nodes_unexplored += 2; if (round_dir == branch_direction_t::UP) { - if (node_queue_.best_first_queue_size() < min_node_queue_size_) { - node_queue_.push(node_ptr->get_down_child()); + if (worker->node_queue.best_first_queue_size() < min_node_queue_size_) { + worker->node_queue.lock(); + worker->node_queue.push(node_ptr->get_down_child()); + worker->node_queue.unlock(); } else { stack.push_front(node_ptr->get_down_child()); } stack.push_front(node_ptr->get_up_child()); } else { - if (node_queue_.best_first_queue_size() < min_node_queue_size_) { - node_queue_.push(node_ptr->get_up_child()); + if (worker->node_queue.best_first_queue_size() < min_node_queue_size_) { + worker->node_queue.lock(); + worker->node_queue.push(node_ptr->get_up_child()); + worker->node_queue.unlock(); } else { stack.push_front(node_ptr->get_up_child()); } @@ -1539,31 +1607,116 @@ void 
branch_and_bound_t::plunge_with(branch_and_bound_worker_tnode_queue.lock(); + worker->node_queue.push(node); + worker->node_queue.unlock(); + } +} - if (stack.size() > 0 && - (rel_gap <= settings_.relative_mip_gap_tol || abs_gap <= settings_.absolute_mip_gap_tol)) { - // If the solver converged according to the gap rules, but we still have nodes to explore - // in the stack, then we should add all the pending nodes back to the heap so the lower - // bound of the solver is set to the correct value. - while (!stack.empty()) { - auto node = stack.front(); - stack.pop_front(); - node_queue_.push(node); - } +template +bfs_worker_t* branch_and_bound_t::launch_bfs_worker( + const std::vector*>& start_nodes) +{ + // Take an idle node from the pool + bfs_worker_t* idle_worker = bfs_worker_pool_.pop_idle_worker(); + if (!idle_worker) { return nullptr; } + + if (toc(exploration_stats_.start_time) > settings_.time_limit || + solver_status_ != mip_status_t::UNSET) { + bfs_worker_pool_.return_worker_to_pool(idle_worker); + return nullptr; } - if (settings_.num_threads > 1) { - worker_pool_.return_worker_to_pool(worker); - active_workers_per_strategy_[BEST_FIRST]--; + idle_worker->init(start_nodes); + idle_worker->set_active(); + +#pragma omp task affinity(*idle_worker) priority(99) default(none) firstprivate(idle_worker) + best_first_search_with(idle_worker); + + return idle_worker; +} + +template +void branch_and_bound_t::best_first_search_with(bfs_worker_t* worker) +{ + f_t lower_bound = get_lower_bound(); + f_t abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); + f_t rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); + f_t steal_chance = settings_.bnb_steal_chance >= 0 ? 
settings_.bnb_steal_chance : 0.05; + + worker->calculate_num_diving_workers(bfs_worker_pool_.num_workers(), + diving_worker_pool_.num_workers(), + has_solver_space_incumbent(), + settings_.diving_settings); + + while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol && + rel_gap > settings_.relative_mip_gap_tol && + worker->node_queue.best_first_queue_size() > 0) { + // If the guided diving was disabled previously due to the lack of an incumbent solution, + // re-enable as soon as a new incumbent is found. + if (diving_worker_pool_.num_workers() > 0 && settings_.diving_settings.guided_diving != 0 && + worker->max_diving_workers[GUIDED_DIVING] == 0) { + if (has_solver_space_incumbent()) { + worker->calculate_num_diving_workers(bfs_worker_pool_.num_workers(), + diving_worker_pool_.num_workers(), + has_solver_space_incumbent(), + settings_.diving_settings); + } + } + + worker->node_queue.lock(); + mip_node_t* start_node = worker->node_queue.pop_best_first(); + if (!start_node) { + worker->node_queue.unlock(); + continue; + } + + worker->lower_bound = start_node->lower_bound; + worker->node_queue.unlock(); + + if (upper_bound_.load() < start_node->lower_bound) { + // This node was put on the heap earlier but its lower bound is now greater than the + // current upper bound + search_tree_.graphviz_node(settings_.log, start_node, "cutoff", start_node->lower_bound); + search_tree_.update(start_node, node_status_t::FATHOMED); + --exploration_stats_.nodes_unexplored; + continue; + } + + plunge_with(worker, start_node); + + lower_bound = get_lower_bound(); + abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); + rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); + + if (abs_gap <= settings_.absolute_mip_gap_tol || rel_gap <= settings_.relative_mip_gap_tol) { + node_concurrent_halt_ = 1; + solver_status_ = mip_status_t::OPTIMAL; + break; + } + + // Steal a node with some probability or when 
it is empty. The victim is determined at random. + if (worker->node_queue.best_first_queue_size() == 0 || + worker->rng.next_double() < steal_chance) { + for (i_t i = 0; i < settings_.bnb_max_steal_attempts; ++i) { + i_t k = worker->rng.uniform(0, bfs_worker_pool_.num_workers()); + if (worker->steal_node_from(bfs_worker_pool_[k], settings_.bnb_nodes_per_steal)) { break; } + } + } } + + worker->set_inactive(); + bfs_worker_pool_.return_worker_to_pool(worker); } template -void branch_and_bound_t::dive_with(branch_and_bound_worker_t* worker) +void branch_and_bound_t::dive_with(diving_worker_t* worker) { raft::common::nvtx::range scope("BB::diving_thread"); logger_t log; @@ -1576,8 +1729,8 @@ void branch_and_bound_t::dive_with(branch_and_bound_worker_t worker->recompute_basis = true; worker->recompute_bounds = true; - search_tree_t dive_tree(std::move(*worker->start_node)); - std::deque*> stack; + search_tree_t dive_tree(std::move(worker->start_node)); + circular_deque_t*> stack(2 * diving_backtrack_limit + 4); stack.push_front(&dive_tree.root); branch_and_bound_stats_t dive_stats; @@ -1612,11 +1765,9 @@ void branch_and_bound_t::dive_with(branch_and_bound_worker_t if (lp_status == dual::status_t::TIME_LIMIT) { solver_status_ = mip_status_t::TIME_LIMIT; break; - } else if (lp_status == dual::status_t::CONCURRENT_LIMIT) { - break; - } else if (lp_status == dual::status_t::ITERATION_LIMIT) { - break; } + if (lp_status == dual::status_t::CONCURRENT_LIMIT) { break; } + if (lp_status == dual::status_t::ITERATION_LIMIT) { break; } ++dive_stats.nodes_explored; @@ -1635,7 +1786,7 @@ void branch_and_bound_t::dive_with(branch_and_bound_worker_t } } - // Remove nodes that we no longer can backtrack to (i.e., from the current node, we can only + // Remove nodes that we can no longer backtrack to (i.e., from the current node, we can only // backtrack to a node that is has a depth of at most 5 levels lower than the current node). 
if (stack.size() > 1 && stack.front()->depth - stack.back()->depth > diving_backtrack_limit) { stack.pop_back(); @@ -1647,220 +1798,80 @@ void branch_and_bound_t::dive_with(branch_and_bound_worker_t abs_gap = compute_user_abs_gap(original_lp_, upper_bound, lower_bound); } - worker_pool_.return_worker_to_pool(worker); - active_workers_per_strategy_[search_strategy]--; + worker->set_inactive(); + diving_worker_pool_.return_worker_to_pool(worker); } template -void branch_and_bound_t::run_scheduler() +bool branch_and_bound_t::launch_diving_worker(bfs_worker_t* bfs_worker) { - diving_heuristics_settings_t diving_settings = settings_.diving_settings; - const i_t num_workers = 2 * settings_.num_threads; - - if (!has_solver_space_incumbent()) { diving_settings.guided_diving = false; } - std::vector strategies = get_search_strategies(diving_settings); - std::array max_num_workers_per_type = - get_max_workers(num_workers, strategies); + // Get an idle worker. + diving_worker_t* diving_worker = diving_worker_pool_.pop_idle_worker(); + if (diving_worker == nullptr) { return false; } - worker_pool_.init(num_workers, original_lp_, Arow_, var_types_, settings_); - active_workers_per_strategy_.fill(0); + bfs_worker->node_queue.lock(); + mip_node_t* start_node = bfs_worker->node_queue.pop_diving(); -#ifdef CUOPT_LOG_DEBUG - for (auto strategy : strategies) { - settings_.log.debug("%c%d: max num of workers = %d", - feasible_solution_symbol(strategy), - strategy, - max_num_workers_per_type[strategy]); + if (!start_node || upper_bound_.load() < start_node->lower_bound || + start_node->depth < settings_.diving_settings.min_node_depth) { + bfs_worker->node_queue.unlock(); + diving_worker_pool_.return_worker_to_pool(diving_worker); + return false; } -#endif - - f_t lower_bound = get_lower_bound(); - f_t abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); - f_t rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); - i_t last_node_depth = 
0; - i_t last_int_infeas = 0; - - while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol && - rel_gap > settings_.relative_mip_gap_tol && - (active_workers_per_strategy_[0] > 0 || node_queue_.best_first_queue_size() > 0)) { - bool launched_any_task = false; - - repair_heuristic_solutions(); - - // If the guided diving was disabled previously due to the lack of an incumbent solution, - // re-enable as soon as a new incumbent is found. - if (settings_.diving_settings.guided_diving != diving_settings.guided_diving) { - if (has_solver_space_incumbent()) { - diving_settings.guided_diving = settings_.diving_settings.guided_diving; - strategies = get_search_strategies(diving_settings); - max_num_workers_per_type = get_max_workers(num_workers, strategies); - -#ifdef CUOPT_LOG_DEBUG - for (auto type : strategies) { - settings_.log.debug("%c%d: max num of workers = %d", - feasible_solution_symbol(type), - type, - max_num_workers_per_type[type]); - } -#endif - } - } - - f_t now = toc(exploration_stats_.start_time); - f_t time_since_last_log = - exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log); - i_t nodes_since_last_log = exploration_stats_.nodes_since_last_log; - - if (((nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) && - time_since_last_log >= 1) || - (time_since_last_log > 30) || now > settings_.time_limit) { - i_t queue_size = node_queue_.best_first_queue_size(); - i_t depth = queue_size > 0 ? node_queue_.bfs_top()->depth : last_node_depth; - i_t int_infeas = queue_size > 0 ? 
node_queue_.bfs_top()->integer_infeasible : last_int_infeas; - report(' ', upper_bound_, lower_bound, depth, int_infeas); - exploration_stats_.last_log = tic(); - exploration_stats_.nodes_since_last_log = 0; - } - - if (now > settings_.time_limit) { - solver_status_ = mip_status_t::TIME_LIMIT; - break; - } - - for (auto strategy : strategies) { - if (active_workers_per_strategy_[strategy] >= max_num_workers_per_type[strategy]) { - continue; - } - - // Get an idle worker. - branch_and_bound_worker_t* worker = worker_pool_.get_idle_worker(); - if (worker == nullptr) { break; } - - if (strategy == BEST_FIRST) { - // If there any node left in the heap, we pop the top node and explore it. - std::optional*> start_node = node_queue_.pop_best_first(); - - if (!start_node.has_value()) { continue; } - if (upper_bound_.load() < start_node.value()->lower_bound) { - // This node was put on the heap earlier but its lower bound is now greater than the - // current upper bound - search_tree_.graphviz_node( - settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound); - search_tree_.update(start_node.value(), node_status_t::FATHOMED); - continue; - } - - // Remove the worker from the idle list. 
- worker_pool_.pop_idle_worker(); - worker->init_best_first(start_node.value(), original_lp_); - last_node_depth = start_node.value()->depth; - last_int_infeas = start_node.value()->integer_infeasible; - active_workers_per_strategy_[strategy]++; - launched_any_task = true; -#pragma omp task affinity(worker) default(none) firstprivate(worker) - plunge_with(worker); + diving_worker->init(start_node, original_lp_); + bfs_worker->node_queue.unlock(); - } else { - std::optional*> start_node = node_queue_.pop_diving(); + bool is_feasible = diving_worker->presolve_start_bounds(settings_); + if (!is_feasible) { + diving_worker_pool_.return_worker_to_pool(diving_worker); + return false; + } - if (!start_node.has_value()) { continue; } - if (upper_bound_.load() < start_node.value()->lower_bound || - start_node.value()->depth < diving_settings.min_node_depth) { - continue; - } + if (toc(exploration_stats_.start_time) > settings_.time_limit || + solver_status_ != mip_status_t::UNSET) { + diving_worker_pool_.return_worker_to_pool(diving_worker); + return false; + } - bool is_feasible = - worker->init_diving(start_node.value(), strategy, original_lp_, settings_); - if (!is_feasible) { continue; } + for (int i = 1; i < num_search_strategies; ++i) { + auto strategy = search_strategies[i]; - // Remove the worker from the idle list. 
- worker_pool_.pop_idle_worker(); - active_workers_per_strategy_[strategy]++; - launched_any_task = true; + if (bfs_worker->active_diving_workers[strategy] < bfs_worker->max_diving_workers[strategy]) { + diving_worker->search_strategy = strategy; + diving_worker->bfs_worker = bfs_worker; + diving_worker->set_active(); + bfs_worker->active_diving_workers[strategy]++; + bfs_worker->total_active_diving_workers++; -#pragma omp task affinity(worker) default(none) firstprivate(worker) - dive_with(worker); - } - } + assert(bfs_worker->active_diving_workers[strategy].load() <= + bfs_worker->max_diving_workers[strategy]); + assert(bfs_worker->total_active_diving_workers.load() <= + bfs_worker->total_max_diving_workers); - lower_bound = get_lower_bound(); - abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); - rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); +#pragma omp task affinity(*diving_worker) default(none) firstprivate(diving_worker) + dive_with(diving_worker); - if (abs_gap <= settings_.absolute_mip_gap_tol || rel_gap <= settings_.relative_mip_gap_tol) { - node_concurrent_halt_ = 1; - solver_status_ = mip_status_t::OPTIMAL; - break; + return true; } - - // If no new task was launched in this iteration, suspend temporarily the - // execution of the scheduler. As of 8/Jan/2026, GCC does not - // implement taskyield, but LLVM does. 
- if (!launched_any_task) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); } } + + diving_worker_pool_.return_worker_to_pool(diving_worker); + return false; } template void branch_and_bound_t::single_threaded_solve() { - raft::common::nvtx::range scope("BB::single_threaded_solve"); - worker_pool_.init(1, original_lp_, Arow_, var_types_, settings_); - branch_and_bound_worker_t* worker = worker_pool_.get_idle_worker(); - - f_t lower_bound = get_lower_bound(); - f_t abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); - f_t rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); - - while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol && - rel_gap > settings_.relative_mip_gap_tol && node_queue_.best_first_queue_size() > 0) { - repair_heuristic_solutions(); - - f_t now = toc(exploration_stats_.start_time); - f_t time_since_last_log = - exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log); - i_t nodes_since_last_log = exploration_stats_.nodes_since_last_log; - - if (((nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) && - time_since_last_log >= 1) || - (time_since_last_log > 30) || now > settings_.time_limit) { - i_t depth = node_queue_.bfs_top()->depth; - i_t int_infeas = node_queue_.bfs_top()->integer_infeasible; - report(' ', upper_bound_, lower_bound, depth, int_infeas); - exploration_stats_.last_log = tic(); - exploration_stats_.nodes_since_last_log = 0; - } - - if (now > settings_.time_limit) { - solver_status_ = mip_status_t::TIME_LIMIT; - break; - } - - // If there any node left in the heap, we pop the top node and explore it. 
- std::optional*> start_node = node_queue_.pop_best_first(); - - if (!start_node.has_value()) { continue; } - if (upper_bound_.load() < start_node.value()->lower_bound) { - // This node was put on the heap earlier but its lower bound is now greater than the - // current upper bound - search_tree_.graphviz_node( - settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound); - search_tree_.update(start_node.value(), node_status_t::FATHOMED); - continue; - } - - worker->init_best_first(start_node.value(), original_lp_); - plunge_with(worker); - - lower_bound = get_lower_bound(); - abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); - rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); - - if (abs_gap <= settings_.absolute_mip_gap_tol || rel_gap <= settings_.relative_mip_gap_tol) { - solver_status_ = mip_status_t::OPTIMAL; - break; - } - } + bfs_worker_pool_.init(1, original_lp_, Arow_, var_types_, settings_); + bfs_worker_t* worker = bfs_worker_pool_.pop_idle_worker(); + + node_queue_t& node_queue = worker->node_queue; + node_queue.push(search_tree_.root.get_down_child()); + node_queue.push(search_tree_.root.get_up_child()); + worker->lower_bound = worker->node_queue.get_lower_bound(); + worker->set_active(); + best_first_search_with(worker); } template @@ -2595,8 +2606,6 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut root_vstatus_, original_lp_, log); - node_queue_.push(search_tree_.root.get_down_child()); - node_queue_.push(search_tree_.root.get_up_child()); settings_.log.printf("Exploring the B&B tree using %d threads\n\n", settings_.num_threads); node_concurrent_halt_ = 0; @@ -2605,7 +2614,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut exploration_stats_.nodes_unexplored = 2; exploration_stats_.nodes_since_last_log = 0; exploration_stats_.last_log = tic(); - min_node_queue_size_ = 2 * settings_.num_threads; + min_node_queue_size_ = 20; if 
(settings_.diving_settings.coefficient_diving != 0) { calculate_variable_locks(original_lp_, var_up_locks_, var_down_locks_); @@ -2625,7 +2634,19 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut if (settings_.deterministic) { run_deterministic_coordinator(Arow_); } else if (settings_.num_threads > 1) { - run_scheduler(); + const i_t num_workers = settings_.num_threads; + const i_t num_bfs_workers = std::max(settings_.num_threads / 2, 1); + const i_t num_diving_workers = num_workers - num_bfs_workers; + bfs_worker_pool_.init(num_bfs_workers, original_lp_, Arow_, var_types_, settings_); + diving_worker_pool_.init( + num_diving_workers, original_lp_, Arow_, var_types_, settings_, num_bfs_workers); + + if (num_bfs_workers > 1) { + launch_bfs_worker({search_tree_.root.get_up_child()}); + launch_bfs_worker({search_tree_.root.get_down_child()}); + } else { + launch_bfs_worker({search_tree_.root.get_up_child(), search_tree_.root.get_down_child()}); + } } else { single_threaded_solve(); } @@ -2633,37 +2654,45 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut is_running_ = false; + if (solver_status_ == mip_status_t::UNSET && + toc(exploration_stats_.start_time) > settings_.time_limit) { + solver_status_ = mip_status_t::TIME_LIMIT; + } + // Compute final lower bound f_t lower_bound; if (deterministic_mode_enabled_) { lower_bound = deterministic_compute_lower_bound(); solver_status_ = deterministic_global_termination_status_; } else { - if (node_queue_.best_first_queue_size() > 0) { + lower_bound = std::numeric_limits::infinity(); + + for (int i = 0; i < bfs_worker_pool_.num_workers(); ++i) { + bfs_worker_t* worker = bfs_worker_pool_[i]; + // We need to clear the queue and use the info in the search tree for the lower bound - while (node_queue_.best_first_queue_size() > 0) { - std::optional*> start_node = node_queue_.pop_best_first(); + while (worker->node_queue.best_first_queue_size() > 0) { + mip_node_t* start_node = 
worker->node_queue.pop_best_first(); - if (!start_node.has_value()) { continue; } - if (upper_bound_.load() < start_node.value()->lower_bound) { + if (!start_node) { continue; } + if (upper_bound_.load() < start_node->lower_bound) { // This node was put on the heap earlier but its lower bound is now greater than the // current upper bound - search_tree_.graphviz_node( - settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound); - search_tree_.update(start_node.value(), node_status_t::FATHOMED); - continue; + search_tree_.graphviz_node(settings_.log, start_node, "cutoff", start_node->lower_bound); + search_tree_.update(start_node, node_status_t::FATHOMED); + --exploration_stats_.nodes_unexplored; } else { - node_queue_.push( - start_node.value()); // Needed to ensure we don't lose the correct lower bound + // Needed to ensure we don't lose the correct lower bound + worker->node_queue.push(start_node); + lower_bound = std::min(lower_bound, worker->node_queue.get_lower_bound()); break; } } - lower_bound = node_queue_.best_first_queue_size() > 0 ? 
node_queue_.get_lower_bound() - : search_tree_.root.lower_bound; - } else { - lower_bound = search_tree_.root.lower_bound; } + + if (!std::isfinite(lower_bound)) { lower_bound = search_tree_.root.lower_bound; } } + set_final_solution(solution, lower_bound); return solver_status_; } @@ -2792,17 +2821,9 @@ void branch_and_bound_t::run_deterministic_coordinator(const csr_matri deterministic_horizon_step_ = 0.50; // Compute worker counts using the same formula as reliability-branching scheduler - const i_t num_workers = settings_.num_threads; - std::vector search_strategies = - get_search_strategies(settings_.diving_settings); - std::array max_num_workers = - get_max_workers(num_workers, search_strategies); - - const int num_bfs_workers = max_num_workers[search_strategy_t::BEST_FIRST]; - int num_diving_workers = 0; - for (size_t i = 1; i < search_strategies.size(); ++i) { - num_diving_workers += max_num_workers[search_strategies[i]]; - } + const i_t num_workers = settings_.num_threads; + const i_t num_bfs_workers = std::max(num_workers / 2, 1); + const i_t num_diving_workers = num_workers - num_bfs_workers; deterministic_mode_enabled_ = true; deterministic_current_horizon_ = deterministic_horizon_step_; @@ -3201,10 +3222,10 @@ node_status_t branch_and_bound_t::solve_node_deterministic( // Solve LP relaxation worker.leaf_solution.resize(worker.leaf_problem.num_rows, worker.leaf_problem.num_cols); - std::vector& leaf_vstatus = node_ptr->vstatus; - i_t node_iter = 0; - f_t lp_start_time = tic(); - std::vector leaf_edge_norms = edge_norms_; + worker.leaf_vstatus = decompress_vstatus(node_ptr->packed_vstatus, worker.leaf_problem.num_cols); + i_t node_iter = 0; + f_t lp_start_time = tic(); + std::vector leaf_edge_norms = edge_norms_; dual::status_t lp_status = dual_phase2_with_advanced_basis(2, 0, @@ -3212,7 +3233,7 @@ node_status_t branch_and_bound_t::solve_node_deterministic( lp_start_time, worker.leaf_problem, lp_settings, - leaf_vstatus, + worker.leaf_vstatus, 
worker.basis_factors, worker.basic_list, worker.nonbasic_list, @@ -3230,7 +3251,7 @@ node_status_t branch_and_bound_t::solve_node_deterministic( worker.basis_factors, worker.basic_list, worker.nonbasic_list, - leaf_vstatus, + worker.leaf_vstatus, leaf_edge_norms, &worker.work_context); lp_status = convert_lp_status_to_dual_status(second_status); @@ -3688,8 +3709,10 @@ void branch_and_bound_t::deterministic_assign_diving_nodes() continue; // this worker is full, try next one } - auto entry = diving_heap_.pop(); - if (entry.has_value()) { worker.enqueue_dive_node(entry.value().node, original_lp_); } + if (!diving_heap_.empty()) { + auto entry = diving_heap_.pop(); + worker.enqueue_dive_node(entry.node, original_lp_); + } } diving_heap_.clear(); @@ -3739,11 +3762,6 @@ void branch_and_bound_t::deterministic_dive( { raft::common::nvtx::range scope("BB::deterministic_dive"); - // Create local search tree for the dive - search_tree_t dive_tree(std::move(entry.node)); - std::deque*> stack; - stack.push_front(&dive_tree.root); - worker.dive_lower = std::move(entry.resolved_lower); worker.dive_upper = std::move(entry.resolved_upper); @@ -3753,6 +3771,11 @@ void branch_and_bound_t::deterministic_dive( worker.lp_iters_this_dive = 0; worker.recompute_bounds_and_basis = true; + // Create local search tree for the dive + search_tree_t dive_tree(std::move(entry.node)); + circular_deque_t*> stack(2 * max_backtrack_depth + 4); + stack.push_front(&dive_tree.root); + while (!stack.empty() && deterministic_global_termination_status_ == mip_status_t::UNSET && nodes_this_dive < max_nodes_per_dive) { mip_node_t* node_ptr = stack.front(); @@ -3813,18 +3836,19 @@ void branch_and_bound_t::deterministic_dive( // Solve LP relaxation worker.leaf_solution.resize(worker.leaf_problem.num_rows, worker.leaf_problem.num_cols); - std::vector& leaf_vstatus = node_ptr->vstatus; - i_t node_iter = 0; - f_t lp_start_time = tic(); - std::vector leaf_edge_norms = edge_norms_; + i_t node_iter = 0; + f_t 
lp_start_time = tic(); + std::vector leaf_edge_norms = edge_norms_; + worker.leaf_vstatus = + decompress_vstatus(node_ptr->packed_vstatus, worker.leaf_problem.num_cols); dual::status_t lp_status = dual_phase2_with_advanced_basis(2, 0, worker.recompute_bounds_and_basis, lp_start_time, worker.leaf_problem, lp_settings, - leaf_vstatus, + worker.leaf_vstatus, worker.basis_factors, worker.basic_list, worker.nonbasic_list, @@ -3841,7 +3865,7 @@ void branch_and_bound_t::deterministic_dive( worker.basis_factors, worker.basic_list, worker.nonbasic_list, - leaf_vstatus, + worker.leaf_vstatus, leaf_edge_norms, &worker.work_context); lp_status = convert_lp_status_to_dual_status(second_status); diff --git a/cpp/src/branch_and_bound/branch_and_bound.hpp b/cpp/src/branch_and_bound/branch_and_bound.hpp index ae1a225e9a..391afc8a71 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.hpp +++ b/cpp/src/branch_and_bound/branch_and_bound.hpp @@ -234,18 +234,14 @@ class branch_and_bound_t { // Pseudocosts pseudo_costs_t pc_; - // Heap storing the nodes waiting to be explored. - node_queue_t node_queue_; - // Search tree search_tree_t search_tree_; - // Count the number of workers per type that either are being executed or - // are waiting to be executed. - std::array, num_search_strategies> active_workers_per_strategy_; + // Worker pool dedicated to the best-first search + bfs_worker_pool_t bfs_worker_pool_; - // Worker pool - branch_and_bound_worker_pool_t worker_pool_; + // Worker pool dedicated to diving + diving_worker_pool_t diving_worker_pool_; // Global status of the solver. omp_atomic_t solver_status_; @@ -287,21 +283,26 @@ class branch_and_bound_t { // Repairs low-quality solutions from the heuristics, if it is applicable. void repair_heuristic_solutions(); + // Launch a new bfs worker initialized from the `start_node`. + bfs_worker_t* launch_bfs_worker(const std::vector*>&); + // Launch a new diving worker from a given bfs worker. 
The dive will start + // from the node at the top of the local heap. + bool launch_diving_worker(bfs_worker_t* bfs_worker); + + // Perform best-first search with a given bfs worker. + void best_first_search_with(bfs_worker_t* worker); + // We use best-first to pick the `start_node` and then perform a depth-first search // from this node (i.e., a plunge). It can only backtrack to a sibling node. // Unexplored nodes in the subtree are inserted back into the global heap. - void plunge_with(branch_and_bound_worker_t* worker); + void plunge_with(bfs_worker_t* worker, mip_node_t* start_node); // Perform a deep dive in the subtree determined by the `start_node` in order // to find integer feasible solutions. - void dive_with(branch_and_bound_worker_t* worker); - - // Run the scheduler whose will schedule and manage - // all the other workers. - void run_scheduler(); + void dive_with(diving_worker_t* worker); // Run the branch-and-bound algorithm in single threaded mode. - // This disable all diving heuristics. + // This disables all diving heuristics. 
void single_threaded_solve(); // Solve the LP relaxation of a leaf node diff --git a/cpp/src/branch_and_bound/constants.hpp b/cpp/src/branch_and_bound/constants.hpp index 39bfa0bf3a..4693e6789a 100644 --- a/cpp/src/branch_and_bound/constants.hpp +++ b/cpp/src/branch_and_bound/constants.hpp @@ -6,6 +6,7 @@ /* clang-format on */ #pragma once +#include namespace cuopt::linear_programming::dual_simplex { @@ -24,6 +25,9 @@ enum search_strategy_t : int { COEFFICIENT_DIVING = 4 // Coefficient diving (9.2.1) }; +constexpr std::array search_strategies = { + BEST_FIRST, PSEUDOCOST_DIVING, LINE_SEARCH_DIVING, GUIDED_DIVING, COEFFICIENT_DIVING}; + enum class branch_direction_t { NONE = -1, DOWN = 0, UP = 1 }; enum class branch_and_bound_mode_t { PARALLEL = 0, DETERMINISTIC = 1 }; diff --git a/cpp/src/branch_and_bound/deterministic_workers.hpp b/cpp/src/branch_and_bound/deterministic_workers.hpp index 53d7e4ef65..a63dad1bcc 100644 --- a/cpp/src/branch_and_bound/deterministic_workers.hpp +++ b/cpp/src/branch_and_bound/deterministic_workers.hpp @@ -195,8 +195,8 @@ class deterministic_bfs_worker_t plunge_stack.pop_front(); return node; } - auto node_opt = backlog.pop(); - return node_opt.has_value() ? node_opt.value() : nullptr; + + return !backlog.empty() ? backlog.pop() : nullptr; } size_t queue_size() const diff --git a/cpp/src/branch_and_bound/diving_heuristics.cpp b/cpp/src/branch_and_bound/diving_heuristics.cpp index a0bb731c1e..4989e48c6a 100644 --- a/cpp/src/branch_and_bound/diving_heuristics.cpp +++ b/cpp/src/branch_and_bound/diving_heuristics.cpp @@ -92,7 +92,7 @@ branch_variable_t pseudocost_diving(pseudo_costs_t& pc, f_t score = 0; branch_direction_t dir = branch_direction_t::DOWN; - f_t root_val = (j < static_cast(root_solution.size())) ? 
root_solution[j] : solution[j]; + f_t root_val = root_solution[j]; if (solution[j] < root_val - f_t(0.4)) { score = score_down; diff --git a/cpp/src/branch_and_bound/mip_node.hpp b/cpp/src/branch_and_bound/mip_node.hpp index 694a7099c4..c2c2a328a4 100644 --- a/cpp/src/branch_and_bound/mip_node.hpp +++ b/cpp/src/branch_and_bound/mip_node.hpp @@ -40,24 +40,6 @@ inline bool inactive_status(node_status_t status) template class mip_node_t { public: - ~mip_node_t() - { - // Iterative teardown to avoid stack overflow on deep trees. - // Detach all descendants breadth-first, then destroy them as leaves. - std::vector> nodes; - for (auto& c : children) { - if (c) { nodes.push_back(std::move(c)); } - } - // nodes.size() grows so that this loop only terminates when only leaves remain - for (size_t i = 0; i < nodes.size(); ++i) { - for (auto& c : nodes[i]->children) { - if (c) { nodes.push_back(std::move(c)); } - } - } - - // scope-exit ensure destruction of all detached leaves - } - mip_node_t(mip_node_t&&) = default; mip_node_t& operator=(mip_node_t&&) = default; @@ -73,7 +55,7 @@ class mip_node_t { branch_var_upper(std::numeric_limits::infinity()), fractional_val(std::numeric_limits::infinity()), objective_estimate(std::numeric_limits::infinity()), - vstatus(0) + packed_vstatus(0) { children[0] = nullptr; children[1] = nullptr; @@ -89,7 +71,7 @@ class mip_node_t { branch_dir(branch_direction_t::NONE), integer_infeasible(-1), objective_estimate(std::numeric_limits::infinity()), - vstatus(basis) + packed_vstatus(compress_vstatus(basis)) { children[0] = nullptr; children[1] = nullptr; @@ -113,7 +95,7 @@ class mip_node_t { fractional_val(branch_var_value), integer_infeasible(integer_inf), objective_estimate(parent_node->objective_estimate), - vstatus(basis) + packed_vstatus(compress_vstatus(basis)) { branch_var_lower = branch_direction == branch_direction_t::DOWN ? 
problem.lower[branch_var] : std::ceil(branch_var_value); @@ -123,6 +105,22 @@ class mip_node_t { children[1] = nullptr; } + ~mip_node_t() + { + // Iterative teardown to avoid stack overflow on deep trees. + // Detach all descendants breadth-first, then destroy them as leaves. + std::vector> nodes; + for (auto& c : children) { + if (c) { nodes.push_back(std::move(c)); } + } + // nodes.size() grows so that this loop only terminates when only leaves remain + for (size_t i = 0; i < nodes.size(); ++i) { + for (auto& c : nodes[i]->children) { + if (c) { nodes.push_back(std::move(c)); } + } + } + } + void get_variable_bounds(std::vector& lower, std::vector& upper, std::vector& bounds_changed) const @@ -168,7 +166,7 @@ class mip_node_t { children[0] = std::move(down_child); children[1] = std::move(up_child); // When we add children we no longer need to store our basis - vstatus.clear(); + packed_vstatus = {}; } bool is_inactive() const @@ -256,7 +254,7 @@ class mip_node_t { // This method creates a copy of the current node // with its parent set to `nullptr` // This detaches the node from the tree. - mip_node_t detach_copy() const + mip_node_t detach_copy() const { mip_node_t copy; copy.lower_bound = lower_bound; @@ -264,7 +262,7 @@ class mip_node_t { copy.depth = depth; copy.node_id = node_id; copy.integer_infeasible = integer_infeasible; - copy.vstatus = vstatus; + copy.packed_vstatus = packed_vstatus; copy.branch_var = branch_var; copy.branch_dir = branch_dir; copy.branch_var_lower = branch_var_lower; @@ -295,7 +293,11 @@ class mip_node_t { mip_node_t* parent; std::unique_ptr children[2]; - std::vector vstatus; + std::vector packed_vstatus; + + // Indicate if we can dive from this node or not. This is set to false when + // this node was already selected for diving once. 
+ omp_atomic_t can_dive{true}; // Worker-local identification for deterministic ordering: // - origin_worker_id: which worker created this node diff --git a/cpp/src/branch_and_bound/node_queue.hpp b/cpp/src/branch_and_bound/node_queue.hpp index 09d030c96e..15c50c99b7 100644 --- a/cpp/src/branch_and_bound/node_queue.hpp +++ b/cpp/src/branch_and_bound/node_queue.hpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #include @@ -29,12 +28,14 @@ class heap_t { { buffer.push_back(node); std::push_heap(buffer.begin(), buffer.end(), comp); + ++num_entries_; } void push(T&& node) { buffer.push_back(std::move(node)); std::push_heap(buffer.begin(), buffer.end(), comp); + ++num_entries_; } template @@ -42,39 +43,99 @@ class heap_t { { buffer.emplace_back(std::forward(args)...); std::push_heap(buffer.begin(), buffer.end(), comp); + ++num_entries_; + assert(num_entries_.load() == buffer.size()); } - std::optional pop() + T pop() { - if (buffer.empty()) return std::nullopt; - std::pop_heap(buffer.begin(), buffer.end(), comp); T node = std::move(buffer.back()); buffer.pop_back(); + --num_entries_; + assert(num_entries_.load() == buffer.size()); return node; } - size_t size() const { return buffer.size(); } + size_t size() const { return num_entries_; } T& top() { return buffer.front(); } - void clear() { buffer.clear(); } - bool empty() const { return buffer.empty(); } + + void clear() + { + buffer.clear(); + num_entries_ = 0; + } + + bool empty() const { return num_entries_ == 0; } // Read-only access to underlying buffer for iteration without modification const std::vector& data() const { return buffer; } private: std::vector buffer; + omp_atomic_t num_entries_{0}; Comp comp; }; -// A queue storing the nodes waiting to be explored/dived from. +// A queue storing the nodes waiting to be explored. Before calling pop or push in parallel, +// the mutex NEEDS to be acquired via the `lock()` method. It must the released afterwards with +// `unlock()`. 
template class node_queue_t { + public: + void push(mip_node_t* new_node) + { + assert(new_node != nullptr); + auto entry = std::make_shared(new_node); + best_first_heap_.push(entry); + if (new_node->can_dive) diving_heap_.push(entry); + lower_bound_ = best_first_heap_.top()->lower_bound; + } + + mip_node_t* pop_best_first() + { + if (best_first_heap_.empty()) { return nullptr; } + auto entry = best_first_heap_.pop(); + lower_bound_ = best_first_heap_.empty() ? std::numeric_limits::infinity() + : best_first_heap_.top()->lower_bound; + mip_node_t* node = std::exchange(entry->node, nullptr); + assert(node != nullptr); + return node; + } + + mip_node_t* pop_diving() + { + while (!diving_heap_.empty()) { + auto entry = diving_heap_.pop(); + if (entry->node != nullptr) { + entry->node->can_dive = false; + return entry->node; + } + } + return nullptr; + } + + void lock() { mutex_.lock(); } + void unlock() { mutex_.unlock(); } + + mip_node_t* bfs_top() + { + return best_first_heap_.empty() ? nullptr : best_first_heap_.top()->node; + } + + i_t diving_queue_size() { return diving_heap_.size(); } + i_t best_first_queue_size() { return best_first_heap_.size(); } + + f_t get_lower_bound() + { + return best_first_heap_.empty() ? 
std::numeric_limits::infinity() : lower_bound_.load(); + } + private: struct heap_entry_t { mip_node_t* node = nullptr; - f_t lower_bound = -inf; - f_t score = inf; + f_t lower_bound = -std::numeric_limits::infinity(); + f_t score = std::numeric_limits::infinity(); heap_entry_t(mip_node_t* new_node) : node(new_node), lower_bound(new_node->lower_bound), score(new_node->objective_estimate) @@ -102,67 +163,11 @@ class node_queue_t { } }; - heap_t, lower_bound_comp> best_first_heap; - heap_t, score_comp> diving_heap; - omp_mutex_t mutex; - - public: - void push(mip_node_t* new_node) - { - std::lock_guard lock(mutex); - auto entry = std::make_shared(new_node); - best_first_heap.push(entry); - diving_heap.push(entry); - } - - std::optional*> pop_best_first() - { - std::lock_guard lock(mutex); - auto entry = best_first_heap.pop(); - - if (entry.has_value()) { return std::exchange(entry.value()->node, nullptr); } - - return std::nullopt; - } - - std::optional*> pop_diving() - { - std::lock_guard lock(mutex); - - while (!diving_heap.empty()) { - auto entry = diving_heap.pop(); - - if (entry.has_value()) { - if (auto node_ptr = entry.value()->node; node_ptr != nullptr) { return node_ptr; } - } - } + heap_t, lower_bound_comp> best_first_heap_; + heap_t, score_comp> diving_heap_; + omp_mutex_t mutex_; - return std::nullopt; - } - - i_t diving_queue_size() - { - std::lock_guard lock(mutex); - return diving_heap.size(); - } - - i_t best_first_queue_size() - { - std::lock_guard lock(mutex); - return best_first_heap.size(); - } - - f_t get_lower_bound() - { - std::lock_guard lock(mutex); - return best_first_heap.empty() ? inf : best_first_heap.top()->lower_bound; - } - - mip_node_t* bfs_top() - { - std::lock_guard lock(mutex); - return best_first_heap.empty() ? 
nullptr : best_first_heap.top()->node; - } + omp_atomic_t lower_bound_{std::numeric_limits::infinity()}; }; } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 564019a15e..184e7ea102 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -1402,8 +1402,7 @@ i_t pseudo_costs_t::reliable_variable_selection( // If `reliable_threshold == 0`, then we set the uninitialized pseudocosts to the average. // Otherwise, the best ones are initialized via strong branching, while the other are ignored. // - // In the latter, we are not using the average pseudocost (which calculated in the `initialized` - // method). + // So we only need to initialize the average for the former. if (reliable_threshold == 0) { avg_down = compute_pseudocost_average_down(); avg_up = compute_pseudocost_average_up(); @@ -1555,7 +1554,7 @@ i_t pseudo_costs_t::reliable_variable_selection( single_pivot_objective_change_estimate(worker->leaf_problem, settings, *AT, - node_ptr->vstatus, + worker->leaf_vstatus, j, basic_map[j], leaf_solution, @@ -1659,7 +1658,7 @@ i_t pseudo_costs_t::reliable_variable_selection( const auto [obj, status] = trial_branching(worker->leaf_problem, settings, var_types, - node_ptr->vstatus, + worker->leaf_vstatus, worker->leaf_edge_norms, worker->basis_factors, worker->basic_list, @@ -1704,7 +1703,7 @@ i_t pseudo_costs_t::reliable_variable_selection( const auto [obj, status] = trial_branching(worker->leaf_problem, settings, var_types, - node_ptr->vstatus, + worker->leaf_vstatus, worker->leaf_edge_norms, worker->basis_factors, worker->basic_list, diff --git a/cpp/src/branch_and_bound/worker.hpp b/cpp/src/branch_and_bound/worker.hpp index 87689e57bb..acb8ccca87 100644 --- a/cpp/src/branch_and_bound/worker.hpp +++ b/cpp/src/branch_and_bound/worker.hpp @@ -9,13 +9,13 @@ #include #include +#include #include #include #include 
-#include #include namespace cuopt::linear_programming::dual_simplex { @@ -31,16 +31,35 @@ struct branch_and_bound_stats_t { omp_atomic_t last_log = 0.0; }; +template +bool is_search_strategy_enabled(search_strategy_t strategy, + bool has_incumbent, + diving_heuristics_settings_t settings) +{ + switch (strategy) { + case BEST_FIRST: return true; + case PSEUDOCOST_DIVING: return settings.pseudocost_diving != 0; + case LINE_SEARCH_DIVING: return settings.line_search_diving != 0; + case GUIDED_DIVING: return settings.guided_diving != 0 && has_incumbent; + case COEFFICIENT_DIVING: return settings.coefficient_diving != 0; + default: return false; + } +} + template class branch_and_bound_worker_t { public: - const i_t worker_id; + using float_type = f_t; + using int_type = i_t; + + i_t worker_id; omp_atomic_t search_strategy; omp_atomic_t is_active; omp_atomic_t lower_bound; lp_problem_t leaf_problem; lp_solution_t leaf_solution; + std::vector leaf_vstatus; std::vector leaf_edge_norms; basis_update_mpf_t basis_factors; @@ -52,7 +71,6 @@ class branch_and_bound_worker_t { std::vector start_lower; std::vector start_upper; - mip_node_t* start_node; pcgenerator_t rng; @@ -63,53 +81,23 @@ class branch_and_bound_worker_t { const lp_problem_t& original_lp, const csr_matrix_t& Arow, const std::vector& var_type, - const simplex_solver_settings_t& settings) + const simplex_solver_settings_t& settings, + uint64_t rng_offset = 0) : worker_id(worker_id), search_strategy(BEST_FIRST), is_active(false), lower_bound(-std::numeric_limits::infinity()), leaf_problem(original_lp), leaf_solution(original_lp.num_rows, original_lp.num_cols), + leaf_vstatus(original_lp.num_cols), basis_factors(original_lp.num_rows, settings.refactor_frequency), basic_list(original_lp.num_rows), nonbasic_list(), node_presolver(leaf_problem, Arow, {}, var_type), bounds_changed(original_lp.num_cols, false), - rng(settings.random_seed + pcgenerator_t::default_seed + worker_id, - pcgenerator_t::default_stream ^ 
worker_id) - { - } - - // Set the `start_node` for best-first search. - void init_best_first(mip_node_t* node, const lp_problem_t& original_lp) - { - start_node = node; - start_lower = original_lp.lower; - start_upper = original_lp.upper; - search_strategy = BEST_FIRST; - lower_bound = node->lower_bound; - is_active = true; - } - - // Initialize the worker for diving, setting the `start_node`, `start_lower` and - // `start_upper`. Returns `true` if the starting node is feasible via - // bounds propagation. - bool init_diving(mip_node_t* node, - search_strategy_t type, - const lp_problem_t& original_lp, - const simplex_solver_settings_t& settings) + rng(settings.random_seed + pcgenerator_t::default_seed + rng_offset + worker_id, + pcgenerator_t::default_stream ^ (worker_id + rng_offset)) { - internal_node = node->detach_copy(); - start_node = &internal_node; - start_lower = original_lp.lower; - start_upper = original_lp.upper; - search_strategy = type; - lower_bound = node->lower_bound; - is_active = true; - - std::fill(bounds_changed.begin(), bounds_changed.end(), false); - node->get_variable_bounds(start_lower, start_upper, bounds_changed); - return node_presolver.bounds_strengthening(settings, bounds_changed, start_lower, start_upper); } // Set the variables bounds for the LP relaxation in the current node. @@ -134,14 +122,172 @@ class branch_and_bound_worker_t { settings, bounds_changed, leaf_problem.lower, leaf_problem.upper); } - private: - // For diving, we need to store the full node instead of - // of just a pointer, since it is not stored in the tree anymore. - // To keep the same interface across all worker types, - // this will be used as a temporary storage and - // will be pointed by `start_node`. - // For exploration, this will not be used. 
- mip_node_t internal_node; + void set_active() { is_active = true; } +}; + +template +class bfs_worker_t : public branch_and_bound_worker_t { + public: + using Base = branch_and_bound_worker_t; + bfs_worker_t(i_t worker_id, + const lp_problem_t& original_lp, + const csr_matrix_t& Arow, + const std::vector& var_type, + const simplex_solver_settings_t& settings, + uint64_t rng_offset = 0) + : Base(worker_id, original_lp, Arow, var_type, settings, rng_offset) + { + this->start_lower = original_lp.lower; + this->start_upper = original_lp.upper; + this->search_strategy = BEST_FIRST; + + max_diving_workers.fill(0); + active_diving_workers.fill(0); + total_active_diving_workers = 0; + } + + void init(const std::vector*>& nodes) + { + assert(!this->is_active.load()); + assert(node_queue.best_first_queue_size() == 0); + assert(nodes.size() > 0); + + for (auto* node : nodes) { + assert(node != nullptr); + node_queue.push(node); + } + + this->lower_bound = node_queue.get_lower_bound(); + } + + void set_inactive() { this->is_active = false; } + + // Steal nodes from another worker + bool steal_node_from(bfs_worker_t* other, i_t num_nodes) + { + bool steal = false; + assert(num_nodes > 0); + + if (!other->is_active || this == other || + other->node_queue.best_first_queue_size() < 2 * num_nodes) { + return steal; + } + + while (num_nodes > 0) { + other->node_queue.lock(); + mip_node_t* node = other->node_queue.best_first_queue_size() > num_nodes + ? other->node_queue.pop_best_first() + : nullptr; + other->node_queue.unlock(); + if (node == nullptr) { break; } + + this->node_queue.lock(); + this->node_queue.push(node); + this->node_queue.unlock(); + --num_nodes; + steal = true; + } + + return steal; + } + + // Calculate the number of diving workers that this worker can launch. Having a fixed number + // of workers allows the solver to be more deterministic. 
+ void calculate_num_diving_workers(i_t num_bfs_workers, + i_t total_diving_workers, + bool has_incumbent, + const diving_heuristics_settings_t& settings) + { + i_t num_active = 0; + for (i_t i = 1; i < num_search_strategies; ++i) { + num_active += is_search_strategy_enabled(search_strategies[i], has_incumbent, settings); + } + + total_max_diving_workers = 0; + max_diving_workers.fill(0); + if (num_active == 0) { return; } + + for (size_t i = 1, k = 0; i < num_search_strategies; ++i) { + if (is_search_strategy_enabled(search_strategies[i], has_incumbent, settings)) { + // Calculate the number of workers for a given diving heuristic + i_t start = std::floor((double)k * total_diving_workers / num_active); + i_t end = std::floor((double)(k + 1) * total_diving_workers / num_active); + i_t workers_per_type = end - start; + + // Calculate the number of diving workers allocated to this (best-first) worker + start = std::floor((double)this->worker_id * workers_per_type / num_bfs_workers); + end = std::floor((double)(this->worker_id + 1) * workers_per_type / num_bfs_workers); + max_diving_workers[i] = end - start; + total_max_diving_workers += max_diving_workers[i]; + ++k; + } + } + } + + // The worker-local node heap. + node_queue_t node_queue; + + // The number of diving workers of each type that this (best-first) worker can launch. + std::array max_diving_workers; + + // The number of active diving workers of each type associated with this (best-first) worker. 
+ std::array, num_search_strategies> active_diving_workers; + + // Keep track of the total number of active diving worker that are associated with this + // (best-first) worker + omp_atomic_t total_active_diving_workers{0}; + + // The maximum number of diving worker that are associated with this + // (best-first) worker + i_t total_max_diving_workers{0}; +}; + +template +class diving_worker_t : public branch_and_bound_worker_t { + public: + using Base = branch_and_bound_worker_t; + using Base::Base; + + // After calling this routine, you need to set `is_active = true` when the worker is ready. + // Note that the starting node may be dropped if become infeasible via bound propagation. + void init(const mip_node_t* node, const lp_problem_t& original_lp) + { + // Creates a copy of the node that is disconnected from the main tree, such that the + // diving does not modify the main tree. We need to store the variables bounds + // associated with this node, since we cannot retrieve it from the tree + start_node = node->detach_copy(); + this->start_lower = original_lp.lower; + this->start_upper = original_lp.upper; + this->lower_bound = node->lower_bound; + std::fill(this->bounds_changed.begin(), this->bounds_changed.end(), false); + node->get_variable_bounds(this->start_lower, this->start_upper, this->bounds_changed); + } + + // Apply bound strengthening to the starting variable bounds + bool presolve_start_bounds(const simplex_solver_settings_t& settings) + { + return this->node_presolver.bounds_strengthening( + settings, this->bounds_changed, this->start_lower, this->start_upper); + } + + // Set this node inactive + void set_inactive() + { + assert(this->is_active.load()); + assert(bfs_worker != nullptr); + assert(bfs_worker->active_diving_workers[this->search_strategy].load() > 0); + assert(bfs_worker->total_active_diving_workers.load() > 0); + + this->is_active = false; + --bfs_worker->active_diving_workers[this->search_strategy]; + 
--bfs_worker->total_active_diving_workers; + } + + mip_node_t start_node; + + // The best-first worker that is associated with this diving worker. Used for controlling the + // number of active diving workers. + bfs_worker_t* bfs_worker{nullptr}; }; } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/branch_and_bound/worker_pool.hpp b/cpp/src/branch_and_bound/worker_pool.hpp index 2b52b6e7bf..9396f48c04 100644 --- a/cpp/src/branch_and_bound/worker_pool.hpp +++ b/cpp/src/branch_and_bound/worker_pool.hpp @@ -8,57 +8,62 @@ #pragma once #include +#include namespace cuopt::linear_programming::dual_simplex { -template -class branch_and_bound_worker_pool_t { +template +class worker_pool_t { public: + using i_t = typename WorkerType::int_type; + using f_t = typename WorkerType::float_type; + void init(i_t num_workers, const lp_problem_t& original_lp, const csr_matrix_t& Arow, const std::vector& var_type, - const simplex_solver_settings_t& settings) + const simplex_solver_settings_t& settings, + const uint64_t rng_offset = 0) { + assert(!is_initialized); + assert(num_workers > 0); + workers_.resize(num_workers); num_idle_workers_ = num_workers; + idle_workers_.clear_resize(num_workers); for (i_t i = 0; i < num_workers; ++i) { - workers_[i] = std::make_unique>( - i, original_lp, Arow, var_type, settings); - idle_workers_.push_front(i); + workers_[i] = + std::make_unique(i, original_lp, Arow, var_type, settings, rng_offset); + idle_workers_.push_back(i); } is_initialized = true; } - // Here, we are assuming that the scheduler is the only - // thread that can retrieve/pop an idle worker. 
- branch_and_bound_worker_t* get_idle_worker() + WorkerType* pop_idle_worker() { - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); if (idle_workers_.empty()) { return nullptr; } else { i_t idx = idle_workers_.front(); - return workers_[idx].get(); - } - } - - // Here, we are assuming that the scheduler is the only - // thread that can retrieve/pop an idle worker. - void pop_idle_worker() - { - std::lock_guard lock(mutex_); - if (!idle_workers_.empty()) { idle_workers_.pop_front(); num_idle_workers_--; + assert(idle_workers_.size() == static_cast(num_idle_workers_.load())); + assert(idx >= 0 && static_cast(idx) < workers_.size()); + return workers_[idx].get(); } } - void return_worker_to_pool(branch_and_bound_worker_t* worker) + void return_worker_to_pool(WorkerType* worker) { - worker->is_active = false; - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); + assert(worker != nullptr); + assert(workers_[worker->worker_id].get() == worker); + assert(!worker->is_active.load()); + assert(static_cast(num_idle_workers_.load()) == idle_workers_.size()); + assert(idle_workers_.size() <= workers_.size()); + idle_workers_.push_back(worker->worker_id); num_idle_workers_++; } @@ -69,7 +74,7 @@ class branch_and_bound_worker_pool_t { if (is_initialized) { for (i_t i = 0; i < workers_.size(); ++i) { - if (workers_[i]->search_strategy == BEST_FIRST && workers_[i]->is_active) { + if (workers_[i]->is_active.load()) { lower_bound = std::min(workers_[i]->lower_bound.load(), lower_bound); } } @@ -78,53 +83,35 @@ class branch_and_bound_worker_pool_t { return lower_bound; } - i_t num_idle_workers() { return num_idle_workers_; } + WorkerType* operator[](i_t id) + { + assert(id >= 0 && static_cast(id) < workers_.size()); + assert(workers_[id] != nullptr); + return workers_[id].get(); + } + WorkerType* operator[](i_t id) const + { + assert(id >= 0 && static_cast(id) < workers_.size()); + assert(workers_[id] != nullptr); + return workers_[id].get(); + } + + i_t 
num_idle_workers() const { return num_idle_workers_; } + i_t num_workers() const { return workers_.size(); } private: - // Worker pool - std::vector>> workers_; + std::vector> workers_; bool is_initialized = false; omp_mutex_t mutex_; - std::deque idle_workers_; - omp_atomic_t num_idle_workers_; + circular_deque_t idle_workers_; + omp_atomic_t num_idle_workers_{0}; }; -template -std::vector get_search_strategies( - diving_heuristics_settings_t settings) -{ - std::vector types; - types.reserve(num_search_strategies); - types.push_back(BEST_FIRST); - if (settings.pseudocost_diving != 0) { types.push_back(PSEUDOCOST_DIVING); } - if (settings.line_search_diving != 0) { types.push_back(LINE_SEARCH_DIVING); } - if (settings.guided_diving != 0) { types.push_back(GUIDED_DIVING); } - if (settings.coefficient_diving != 0) { types.push_back(COEFFICIENT_DIVING); } - return types; -} - -template -std::array get_max_workers( - i_t num_workers, const std::vector& strategies) -{ - std::array max_num_workers; - max_num_workers.fill(0); - - i_t bfs_workers = std::max(strategies.size() == 1 ? 
num_workers : num_workers / 4, 1); - max_num_workers[BEST_FIRST] = bfs_workers; - - i_t diving_workers = (num_workers - bfs_workers); - i_t m = strategies.size() - 1; - - for (size_t i = 1, k = 0; i < strategies.size(); ++i) { - i_t start = (double)k * diving_workers / m; - i_t end = (double)(k + 1) * diving_workers / m; - max_num_workers[strategies[i]] = end - start; - ++k; - } +template +using bfs_worker_pool_t = worker_pool_t>; - return max_num_workers; -} +template +using diving_worker_pool_t = worker_pool_t>; } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/initial_basis.cpp b/cpp/src/dual_simplex/initial_basis.cpp index d69bf8877e..dcb1615359 100644 --- a/cpp/src/dual_simplex/initial_basis.cpp +++ b/cpp/src/dual_simplex/initial_basis.cpp @@ -18,6 +18,90 @@ namespace cuopt::linear_programming::dual_simplex { +uint8_t encode(variable_status_t vstatus) +{ + assert(vstatus != variable_status_t::SUPERBASIC && + "packed_variable_status_t does not support superbasic variables"); + + uint8_t val = 0; + switch (vstatus) { + case variable_status_t::BASIC: val = 0b00; break; + case variable_status_t::NONBASIC_LOWER: val = 0b01; break; + case variable_status_t::NONBASIC_UPPER: val = 0b10; break; + case variable_status_t::NONBASIC_FIXED: val = 0b01; break; + default: val = 0b11; + } + + return val; +} + +variable_status_t decode(uint8_t val) +{ + val &= 0b11; + if (val == 0b00) return variable_status_t::BASIC; + if (val == 0b01) return variable_status_t::NONBASIC_LOWER; + if (val == 0b10) return variable_status_t::NONBASIC_UPPER; + return variable_status_t::NONBASIC_FREE; +} + +std::vector compress_vstatus(const std::vector& vstatus) +{ + size_t n = vstatus.size() / 4; + size_t has_tail = (vstatus.size() % 4 > 0); + + std::vector packed_vstatus; + packed_vstatus.resize(n + has_tail); + + for (size_t i = 0; i < n; ++i) { + size_t j = i * 4; + packed_vstatus[i] = 0; + packed_vstatus[i] |= encode(vstatus[j]); + packed_vstatus[i] |= 
encode(vstatus[j + 1]) << 2; + packed_vstatus[i] |= encode(vstatus[j + 2]) << 4; + packed_vstatus[i] |= encode(vstatus[j + 3]) << 6; + } + + if (has_tail) { + size_t j = n * 4; + packed_vstatus[n] = 0; + packed_vstatus[n] |= encode(vstatus[j]); + if (j + 1 < vstatus.size()) packed_vstatus[n] |= encode(vstatus[j + 1]) << 2; + if (j + 2 < vstatus.size()) packed_vstatus[n] |= encode(vstatus[j + 2]) << 4; + if (j + 3 < vstatus.size()) packed_vstatus[n] |= encode(vstatus[j + 3]) << 6; + } + + return packed_vstatus; +} + +std::vector decompress_vstatus(const std::vector& packed_vstatus, + size_t vstatus_size) +{ + size_t n = vstatus_size / 4; + size_t has_tail = (vstatus_size % 4 > 0); + + std::vector vstatus; + vstatus.resize(vstatus_size); + assert(packed_vstatus.size() == n + has_tail); + + for (size_t i = 0; i < n; ++i) { + size_t j = i * 4; + vstatus[j] = decode(packed_vstatus[i]); + vstatus[j + 1] = decode(packed_vstatus[i] >> 2); + vstatus[j + 2] = decode(packed_vstatus[i] >> 4); + vstatus[j + 3] = decode(packed_vstatus[i] >> 6); + } + + if (has_tail) { + size_t j = n * 4; + vstatus[j] = decode(packed_vstatus[n]); + if (j + 1 < vstatus.size()) vstatus[j + 1] = decode(packed_vstatus[n] >> 2); + if (j + 2 < vstatus.size()) vstatus[j + 2] = decode(packed_vstatus[n] >> 4); + if (j + 3 < vstatus.size()) vstatus[j + 3] = decode(packed_vstatus[n] >> 6); + } + + return vstatus; +} + template i_t initial_basis_selection(const lp_problem_t& problem, const simplex_solver_settings_t& settings, diff --git a/cpp/src/dual_simplex/initial_basis.hpp b/cpp/src/dual_simplex/initial_basis.hpp index 646785fbd2..22a172dbe0 100644 --- a/cpp/src/dual_simplex/initial_basis.hpp +++ b/cpp/src/dual_simplex/initial_basis.hpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -23,6 +23,10 @@ enum class variable_status_t : int8_t { SUPERBASIC = 4 }; +std::vector compress_vstatus(const std::vector& vstatus); +std::vector decompress_vstatus(const std::vector& packed_vstatus, + size_t vstatus_size); + template i_t initial_basis_selection(const lp_problem_t& problem, const simplex_solver_settings_t& settings, diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp index cfc120e477..0f35a7d479 100644 --- a/cpp/src/dual_simplex/simplex_solver_settings.hpp +++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp @@ -110,6 +110,9 @@ struct simplex_solver_settings_t { mip_batch_pdlp_reliability_branching(0), strong_branching_simplex_iteration_limit(-1), random_seed(0), + bnb_steal_chance(-1), + bnb_nodes_per_steal(10), + bnb_max_steal_attempts(3), reliability_branching(-1), inside_mip(0), sub_mip(0), @@ -198,13 +201,21 @@ struct simplex_solver_settings_t { // PDLP only // Set the maximum number of simplex iterations allowed per trial branch when applying // strong branching to the root node. - // -1 - Automatic (iteration limit = 200) - // 0, 1 - Estimate the objective change using a single pivot of dual simplex - // >1 - Set as the iteration limit in dual simplex + // -1 - automatic (iteration limit = 200) + // 0, 1 - estimate the objective change using a single pivot of dual simplex + // >1 - set as the iteration limit in dual simplex i_t strong_branching_simplex_iteration_limit; diving_heuristics_settings_t diving_settings; // Settings for the diving heuristics + // In B&B, indicate the chance in which a worker can steal a node from another worker. + // -1 - automatic (0.05) + // 0 - disable + // >0 - set the stealing chance [0, 1] + f_t bnb_steal_chance; + i_t bnb_nodes_per_steal; + i_t bnb_max_steal_attempts; + // Settings for the reliability branching. 
// - -1: automatic // - 0: disable (use pseudocost branching instead) diff --git a/cpp/src/mip_heuristics/solve.cu b/cpp/src/mip_heuristics/solve.cu index 682be92a54..2c2c05a795 100644 --- a/cpp/src/mip_heuristics/solve.cu +++ b/cpp/src/mip_heuristics/solve.cu @@ -715,6 +715,12 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, solver_stats_t{}, op_problem.get_handle_ptr()->get_stream()); + // The outer solver opens an omp parallel region in solve.cu, so this inner team would + // collapse to a single thread under the default OMP_MAX_ACTIVE_LEVELS=1 and only worker 0 + // would execute. Enable two active levels locally and restore on the way out. + const int saved_max_active_levels = omp_get_max_active_levels(); + if (saved_max_active_levels < 2) { omp_set_max_active_levels(2); } + // Creates the OpenMP thread pool. It will be shared across the entire MIP solver. #pragma omp parallel num_threads(num_threads) default(none) \ shared(sol, op_problem, settings_const, exception) @@ -731,15 +737,16 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, } } // Implicit barrier + if (saved_max_active_levels < 2) { omp_set_max_active_levels(saved_max_active_levels); } + if (exception) { std::rethrow_exception(exception); } return sol; } template -mip_solution_t solve_mip( - raft::handle_t const* handle_ptr, - const cuopt::mps_parser::mps_data_model_t& mps_data_model, - mip_solver_settings_t const& settings) +mip_solution_t solve_mip(raft::handle_t const* handle_ptr, + const mps_parser::mps_data_model_t& mps_data_model, + mip_solver_settings_t const& settings) { auto op_problem = mps_data_model_to_optimization_problem(handle_ptr, mps_data_model); return solve_mip(op_problem, settings); diff --git a/cpp/src/utilities/circular_deque.hpp b/cpp/src/utilities/circular_deque.hpp new file mode 100644 index 0000000000..6e9d8f5b05 --- /dev/null +++ b/cpp/src/utilities/circular_deque.hpp @@ -0,0 +1,114 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: 
Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once + +#include +#include +#include + +namespace cuopt { + +// A fixed-capacity double-ended queue backed by a circular buffer. +// All operations are O(1) with no dynamic allocation after construction. +// +// Preconditions (asserted in debug builds): +// - push_front / push_back : size() < capacity() +// - pop_front / pop_back : !empty() +// - front / back : !empty() +template +class circular_deque_t { + public: + circular_deque_t() : buffer_(1), capacity_(1), head_(0), tail_(0) {} + + // Allocates storage for exactly `capacity` elements up front. + explicit circular_deque_t(size_t capacity) + : buffer_(capacity + 1), // +1 to distinguish full (next(tail)==head) from empty (head==tail) + capacity_(capacity + 1), + head_(0), + tail_(0) + { + assert(capacity > 0); + } + + bool empty() const { return head_ == tail_; } + bool full() const { return next(tail_) == head_; } + + size_t size() const { return (tail_ - head_ + capacity_) % capacity_; } + size_t capacity() const { return capacity_ - 1; } + + void clear_resize(size_t new_capacity) + { + assert(new_capacity > 0); + head_ = 0; + tail_ = 0; + capacity_ = new_capacity + 1; + buffer_.resize(capacity_); + } + + void push_back(T val) + { + assert(!full()); + buffer_[tail_] = std::move(val); + tail_ = next(tail_); + } + + void push_front(T val) + { + assert(!full()); + head_ = prev(head_); + buffer_[head_] = std::move(val); + } + + T pop_front() + { + assert(!empty()); + T val = std::move(buffer_[head_]); + head_ = next(head_); + return val; + } + + T pop_back() + { + assert(!empty()); + tail_ = prev(tail_); + return std::move(buffer_[tail_]); + } + + T& front() + { + assert(!empty()); + return buffer_[head_]; + } + const T& front() const + { + assert(!empty()); + return buffer_[head_]; + } + + T& back() + { + assert(!empty()); + return buffer_[prev(tail_)]; + } + 
const T& back() const + { + assert(!empty()); + return buffer_[prev(tail_)]; + } + + private: + size_t next(size_t idx) const { return (idx + 1) % capacity_; } + size_t prev(size_t idx) const { return (idx + capacity_ - 1) % capacity_; } + + std::vector buffer_; + size_t capacity_; + size_t head_; + size_t tail_; +}; + +} // namespace cuopt diff --git a/cpp/src/utilities/omp_helpers.hpp b/cpp/src/utilities/omp_helpers.hpp index a13b9ec887..a4209adf14 100644 --- a/cpp/src/utilities/omp_helpers.hpp +++ b/cpp/src/utilities/omp_helpers.hpp @@ -20,11 +20,9 @@ namespace cuopt { class omp_mutex_t { public: omp_mutex_t() : mutex(new omp_lock_t) { omp_init_lock(mutex.get()); } - - omp_mutex_t(const omp_mutex_t&) = delete; - omp_mutex_t(omp_mutex_t&& other) { *this = std::move(other); } + omp_mutex_t(const omp_mutex_t&) = delete; omp_mutex_t& operator=(const omp_mutex_t&) = delete; omp_mutex_t& operator=(omp_mutex_t&& other) @@ -203,17 +201,10 @@ class omp_atomic_t { private: T val; -#ifndef __NVCC__ friend double fetch_min(omp_atomic_t& atomic_var, double other); friend double fetch_max(omp_atomic_t& atomic_var, double other); -#endif }; -// Atomic CAS are only supported in OpenMP v5.1 -// (gcc 12+ or clang 14+), however, nvcc (or the host compiler) cannot -// parse it correctly yet -#ifndef __NVCC__ - // Free non-template functions are necessary because of a clang 20 bug // when omp atomic compare is used within a templated context. // see https://github.com/llvm/llvm-project/issues/127466 @@ -238,7 +229,6 @@ inline double fetch_max(omp_atomic_t& atomic_var, double other) } return old; } -#endif #endif diff --git a/cpp/src/utilities/pcgenerator.hpp b/cpp/src/utilities/pcgenerator.hpp index 29a865f02f..e83e5f36ad 100644 --- a/cpp/src/utilities/pcgenerator.hpp +++ b/cpp/src/utilities/pcgenerator.hpp @@ -21,12 +21,11 @@ class pcgenerator_t { static constexpr uint64_t default_stream = 0xda3e39cb94b95bdbULL; /** - * @brief ctor. 
Initializes the PCG - * @param rng_state is the generator state used for initializing the generator - * @param subsequence specifies the subsequence to be generated out of 2^64 possible subsequences - * In a parallel setting, like threads of a CUDA kernel, each thread is required to generate a - * unique set of random numbers. This can be achieved by initializing the generator with same - * rng_state for all the threads and diststreamt values for subsequence. + * @brief Initializes the PCG generator. + * @param seed Generator state seed. + * @param subsequence Selects one of 2^64 independent subsequences. Use distinct values per + * thread to guarantee non-overlapping streams in parallel contexts. + * @param offset Number of outputs to skip ahead before the first draw. */ pcgenerator_t(const uint64_t seed = default_seed, const uint64_t subsequence = default_stream, @@ -35,7 +34,12 @@ class pcgenerator_t { set_seed(seed, subsequence, offset); } - // Set the seed, subsequence and offset of the PCG + /** + * @brief Re-seeds the generator. + * @param seed Generator state seed. + * @param subsequence Selects one of 2^64 independent subsequences. + * @param offset Number of outputs to skip ahead before the first draw. + */ void set_seed(uint64_t seed, const uint64_t subsequence = default_stream, uint64_t offset = 0) { state = uint64_t(0); @@ -47,8 +51,12 @@ class pcgenerator_t { skipahead(offset); } - // Based on "Random Number Generation with Arbitrary Strides" F. B. Brown - // Link https://mcnp.lanl.gov/pdf_files/anl-rn-arb-stride.pdf + /** + * @brief Advances the generator state by @p offset steps in O(log offset) time. + * + * Uses the closed-form LCG jump described in "Random Number Generation with Arbitrary Strides" + * (F. B. Brown, https://mcnp.lanl.gov/pdf_files/anl-rn-arb-stride.pdf). 
+ */ void skipahead(uint64_t offset) { uint64_t G = 1; @@ -68,9 +76,7 @@ class pcgenerator_t { } /** - * @defgroup NextRand Generate the next random number - * @brief This code is derived from PCG basic code - * @{ + * @returns the next uniformly distributed 32-bit unsigned integer. */ uint32_t next_u32() { @@ -83,6 +89,9 @@ class pcgenerator_t { return ret; } + /** + * @returns the next uniformly distributed 64-bit unsigned integer. + */ uint64_t next_u64() { uint64_t ret; @@ -93,6 +102,10 @@ class pcgenerator_t { return ret; } + /** + * @returns the next uniformly distributed non-negative 32-bit signed integer in [0, + * INT32_MAX]. + */ int32_t next_i32() { int32_t ret; @@ -102,6 +115,10 @@ class pcgenerator_t { return ret; } + /** + * @returns the next uniformly distributed non-negative 64-bit signed integer in [0, + * INT64_MAX]. + */ int64_t next_i64() { int64_t ret; @@ -111,10 +128,19 @@ class pcgenerator_t { return ret; } - float next_float() { return static_cast((next_u32() >> 8) * 0x1.0p-24); } + /** + * @returns a uniformly distributed float in [0, 1). + */ + float next_float() { return (next_u32() >> 8) * 0x1.0p-24; } - double next_double() { return static_cast((next_u64() >> 11) * 0x1.0p-53); } + /** + * @returns a uniformly distributed double in [0, 1). + */ + double next_double() { return (next_u64() >> 11) * 0x1.0p-53; } + /** + * @returns the next random value of type @p T. + */ template T next() { @@ -130,9 +156,11 @@ class pcgenerator_t { void next(float& ret) { ret = next_float(); } void next(double& ret) { ret = next_double(); } - /// Draws a sample from a uniform distribution. The samples are uniformly distributed over - /// the semi-closed interval `[low, high)`. This routine may have a **slight bias** toward - /// some numbers in the range (scaling by floating-point). + /** + * @brief Draws a sample from a uniform distribution over `[low, high)`. + * + * May have a slight bias toward some values due to floating-point scaling. 
+ */ template T uniform(T low, T high) { @@ -141,7 +169,9 @@ class pcgenerator_t { return low + (val * range); } - // Shuffles the contents of a sequence using the Fisher–Yates algorithm. + /** + * @brief Shuffles @p seq in-place using the Fisher-Yates algorithm. + */ template void shuffle(std::vector& seq) {