diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index ba80244b3b..ec8aa43721 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -34,9 +35,7 @@ #include #include #include -#include #include -#include #include #include @@ -296,11 +295,8 @@ branch_and_bound_t::branch_and_bound_t( template f_t branch_and_bound_t::get_lower_bound() { - f_t lower_bound = lower_bound_ceiling_.load(); - f_t heap_lower_bound = node_queue_.get_lower_bound(); - f_t worker_lower_bound = worker_pool_.get_lower_bound(); - lower_bound = std::min(heap_lower_bound, lower_bound); - lower_bound = std::min(worker_lower_bound, lower_bound); + f_t lower_bound = lower_bound_ceiling_.load(); + lower_bound = std::min(bfs_worker_pool_.get_lower_bound(), lower_bound); if (std::isfinite(lower_bound)) { return lower_bound; @@ -707,9 +703,11 @@ void branch_and_bound_t::set_final_solution(mip_solution_t& if (solver_status_ == mip_status_t::TIME_LIMIT) { settings_.log.printf("Time limit reached. Stopping the solver...\n"); } + if (solver_status_ == mip_status_t::WORK_LIMIT) { settings_.log.printf("Work limit reached. Stopping the solver...\n"); } + if (solver_status_ == mip_status_t::NODE_LIMIT) { settings_.log.printf("Node limit reached. 
Stopping the solver...\n"); } @@ -837,7 +835,7 @@ branch_variable_t branch_and_bound_t::variable_selection( std::vector& solution = worker->leaf_solution.x; switch (worker->search_strategy) { - case search_strategy_t::BEST_FIRST: + case BEST_FIRST: if (settings_.reliability_branching != 0) { branch_var = pc_.reliable_variable_selection(node_ptr, @@ -846,7 +844,7 @@ branch_variable_t branch_and_bound_t::variable_selection( var_types_, exploration_stats_, upper_bound_, - worker_pool_.num_idle_workers(), + bfs_worker_pool_.num_idle_workers(), new_slacks_, original_lp_); } else { @@ -857,17 +855,18 @@ branch_variable_t branch_and_bound_t::variable_selection( return {branch_var, round_dir}; - case search_strategy_t::COEFFICIENT_DIVING: + case COEFFICIENT_DIVING: return coefficient_diving( original_lp_, fractional, solution, var_up_locks_, var_down_locks_, log); - case search_strategy_t::LINE_SEARCH_DIVING: + case LINE_SEARCH_DIVING: return line_search_diving(fractional, solution, root_relax_soln_.x, log); - case search_strategy_t::PSEUDOCOST_DIVING: + case PSEUDOCOST_DIVING: return pseudocost_diving(pc_, fractional, solution, root_relax_soln_.x, log); - case search_strategy_t::GUIDED_DIVING: + case GUIDED_DIVING: + assert(incumbent_.has_incumbent); mutex_upper_.lock(); current_incumbent = incumbent_.x; mutex_upper_.unlock(); @@ -1085,12 +1084,12 @@ struct deterministic_diving_policy_t : deterministic_policy_base_t> { using base = deterministic_policy_base_t>; - std::deque*>& stack; + circular_deque_t*>& stack; i_t max_backtrack_depth; deterministic_diving_policy_t(branch_and_bound_t& bnb, deterministic_diving_worker_t& worker, - std::deque*>& stack, + circular_deque_t*>& stack, i_t max_backtrack_depth) : base(bnb, worker), stack(stack), max_backtrack_depth(max_backtrack_depth) { @@ -1244,7 +1243,7 @@ std::pair branch_and_bound_t::updat policy.select_branch_variable(node_ptr, leaf_fractional, leaf_solution.x); round_dir = dir; - assert(node_ptr->vstatus.size() == 
leaf_problem.num_cols); + assert(worker->leaf_vstatus.size() == leaf_problem.num_cols); assert(branch_var >= 0); assert(dir != branch_direction_t::NONE); @@ -1258,7 +1257,7 @@ std::pair branch_and_bound_t::updat branch_var, leaf_solution.x[branch_var], num_frac, - node_ptr->vstatus, + worker->leaf_vstatus, leaf_problem, log); search_tree.update(node_ptr, node_status_t::HAS_CHILDREN); @@ -1341,8 +1340,9 @@ dual::status_t branch_and_bound_t::solve_node_lp( } #endif - std::vector& leaf_vstatus = node_ptr->vstatus; - assert(leaf_vstatus.size() == worker->leaf_problem.num_cols); + worker->leaf_vstatus = + decompress_vstatus(node_ptr->packed_vstatus, worker->leaf_problem.num_cols); + assert(worker->leaf_vstatus.size() == worker->leaf_problem.num_cols); simplex_solver_settings_t lp_settings = settings_; lp_settings.concurrent_halt = &node_concurrent_halt_; @@ -1401,7 +1401,7 @@ dual::status_t branch_and_bound_t::solve_node_lp( lp_start_time, worker->leaf_problem, lp_settings, - leaf_vstatus, + worker->leaf_vstatus, worker->basis_factors, worker->basic_list, worker->nonbasic_list, @@ -1418,7 +1418,7 @@ dual::status_t branch_and_bound_t::solve_node_lp( worker->basis_factors, worker->basic_list, worker->nonbasic_list, - leaf_vstatus, + worker->leaf_vstatus, worker->leaf_edge_norms); lp_status = convert_lp_status_to_dual_status(second_status); @@ -1434,11 +1434,19 @@ dual::status_t branch_and_bound_t::solve_node_lp( return lp_status; } + template -void branch_and_bound_t::plunge_with(branch_and_bound_worker_t* worker) +void branch_and_bound_t::plunge_with(bfs_worker_t* worker, + mip_node_t* start_node) { - std::deque*> stack; - stack.push_front(worker->start_node); + assert(worker != nullptr && worker->is_active.load()); + assert(start_node != nullptr); + + // Stack holds at most 2 entries: the preferred child + its sibling. + // The sibling is evicted to the queue before a new pair of children is added. 
+ circular_deque_t*> stack(4); + stack.push_front(start_node); + worker->recompute_basis = true; worker->recompute_bounds = true; @@ -1449,6 +1457,36 @@ void branch_and_bound_t::plunge_with(branch_and_bound_worker_t 0 && (solver_status_ == mip_status_t::UNSET && is_running_) && rel_gap > settings_.relative_mip_gap_tol && abs_gap > settings_.absolute_mip_gap_tol) { + if (worker->worker_id == 0) { repair_heuristic_solutions(); } + + // Launch a new diving task if any worker is idle + if (worker->total_active_diving_workers < worker->total_max_diving_workers && + worker->node_queue.diving_queue_size() > 0) { + launch_diving_worker(worker); + } + + // If any best-first worker becomes idle, hand it a node from the local best-first queue. + if (bfs_worker_pool_.num_idle_workers() > 0 && worker->node_queue.best_first_queue_size() > 0) { + worker->node_queue.lock(); + mip_node_t* node = worker->node_queue.pop_best_first(); + + // We need to temporarily save the lower bound in this worker so it is + // considered when calculating the global lower bound. + f_t node_lower_bound = node ? 
node->lower_bound : std::numeric_limits::infinity(); + worker->lower_bound = std::min(worker->lower_bound.load(), node_lower_bound); + + worker->node_queue.unlock(); + + if (node != nullptr) { + if (!launch_bfs_worker({node})) { + worker->node_queue.lock(); + worker->node_queue.push(node); + worker->node_queue.unlock(); + } + } + } + + assert(stack.size() <= 2); mip_node_t* node_ptr = stack.front(); stack.pop_front(); @@ -1458,7 +1496,7 @@ void branch_and_bound_t::plunge_with(branch_and_bound_worker_tlower_bound = node_ptr->lower_bound; + worker->lower_bound = std::min(worker->node_queue.get_lower_bound(), node_ptr->lower_bound); if (node_ptr->lower_bound > upper_bound_.load()) { search_tree_.graphviz_node(settings_.log, node_ptr, "cutoff", node_ptr->lower_bound); @@ -1469,13 +1507,31 @@ void branch_and_bound_t::plunge_with(branch_and_bound_worker_t settings_.time_limit) { + f_t now = toc(exploration_stats_.start_time); + + if (worker->worker_id == 0) { + f_t time_since_last_log = + exploration_stats_.last_log == 0 ? 
1.0 : toc(exploration_stats_.last_log); + i_t nodes_since_last_log = exploration_stats_.nodes_since_last_log; + + if (((nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) && + time_since_last_log >= 1) || + (time_since_last_log > 30) || now > settings_.time_limit) { + report(' ', upper_bound_, lower_bound, node_ptr->depth, node_ptr->integer_infeasible); + exploration_stats_.last_log = tic(); + exploration_stats_.nodes_since_last_log = 0; + } + } + + if (now > settings_.time_limit) { solver_status_ = mip_status_t::TIME_LIMIT; + stack.push_front(node_ptr); break; } if (exploration_stats_.nodes_explored >= settings_.node_limit) { solver_status_ = mip_status_t::NODE_LIMIT; + stack.push_front(node_ptr); break; } @@ -1483,11 +1539,17 @@ void branch_and_bound_t::plunge_with(branch_and_bound_worker_t::plunge_with(branch_and_bound_worker_t 0) { mip_node_t* node = stack.back(); stack.pop_back(); - node_queue_.push(node); + worker->node_queue.lock(); + worker->node_queue.push(node); + worker->node_queue.unlock(); } exploration_stats_.nodes_unexplored += 2; if (round_dir == branch_direction_t::UP) { - if (node_queue_.best_first_queue_size() < min_node_queue_size_) { - node_queue_.push(node_ptr->get_down_child()); + if (worker->node_queue.best_first_queue_size() < min_node_queue_size_) { + worker->node_queue.lock(); + worker->node_queue.push(node_ptr->get_down_child()); + worker->node_queue.unlock(); } else { stack.push_front(node_ptr->get_down_child()); } stack.push_front(node_ptr->get_up_child()); } else { - if (node_queue_.best_first_queue_size() < min_node_queue_size_) { - node_queue_.push(node_ptr->get_up_child()); + if (worker->node_queue.best_first_queue_size() < min_node_queue_size_) { + worker->node_queue.lock(); + worker->node_queue.push(node_ptr->get_up_child()); + worker->node_queue.unlock(); } else { stack.push_front(node_ptr->get_up_child()); } @@ -1539,31 +1607,116 @@ void 
branch_and_bound_t::plunge_with(branch_and_bound_worker_tnode_queue.lock(); + worker->node_queue.push(node); + worker->node_queue.unlock(); + } +} - if (stack.size() > 0 && - (rel_gap <= settings_.relative_mip_gap_tol || abs_gap <= settings_.absolute_mip_gap_tol)) { - // If the solver converged according to the gap rules, but we still have nodes to explore - // in the stack, then we should add all the pending nodes back to the heap so the lower - // bound of the solver is set to the correct value. - while (!stack.empty()) { - auto node = stack.front(); - stack.pop_front(); - node_queue_.push(node); - } +template +bfs_worker_t* branch_and_bound_t::launch_bfs_worker( + const std::vector*>& start_nodes) +{ + // Take an idle node from the pool + bfs_worker_t* idle_worker = bfs_worker_pool_.pop_idle_worker(); + if (!idle_worker) { return nullptr; } + + if (toc(exploration_stats_.start_time) > settings_.time_limit || + solver_status_ != mip_status_t::UNSET) { + bfs_worker_pool_.return_worker_to_pool(idle_worker); + return nullptr; } - if (settings_.num_threads > 1) { - worker_pool_.return_worker_to_pool(worker); - active_workers_per_strategy_[BEST_FIRST]--; + idle_worker->init(start_nodes); + idle_worker->set_active(); + +#pragma omp task affinity(*idle_worker) priority(99) default(none) firstprivate(idle_worker) + best_first_search_with(idle_worker); + + return idle_worker; +} + +template +void branch_and_bound_t::best_first_search_with(bfs_worker_t* worker) +{ + f_t lower_bound = get_lower_bound(); + f_t abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); + f_t rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); + f_t steal_chance = settings_.bnb_steal_chance >= 0 ? 
settings_.bnb_steal_chance : 0.05; + + worker->calculate_num_diving_workers(bfs_worker_pool_.num_workers(), + diving_worker_pool_.num_workers(), + has_solver_space_incumbent(), + settings_.diving_settings); + + while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol && + rel_gap > settings_.relative_mip_gap_tol && + worker->node_queue.best_first_queue_size() > 0) { + // If the guided diving was disabled previously due to the lack of an incumbent solution, + // re-enable as soon as a new incumbent is found. + if (diving_worker_pool_.num_workers() > 0 && settings_.diving_settings.guided_diving != 0 && + worker->max_diving_workers[GUIDED_DIVING] == 0) { + if (has_solver_space_incumbent()) { + worker->calculate_num_diving_workers(bfs_worker_pool_.num_workers(), + diving_worker_pool_.num_workers(), + has_solver_space_incumbent(), + settings_.diving_settings); + } + } + + worker->node_queue.lock(); + mip_node_t* start_node = worker->node_queue.pop_best_first(); + if (!start_node) { + worker->node_queue.unlock(); + continue; + } + + worker->lower_bound = start_node->lower_bound; + worker->node_queue.unlock(); + + if (upper_bound_.load() < start_node->lower_bound) { + // This node was put on the heap earlier but its lower bound is now greater than the + // current upper bound + search_tree_.graphviz_node(settings_.log, start_node, "cutoff", start_node->lower_bound); + search_tree_.update(start_node, node_status_t::FATHOMED); + --exploration_stats_.nodes_unexplored; + continue; + } + + plunge_with(worker, start_node); + + lower_bound = get_lower_bound(); + abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); + rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); + + if (abs_gap <= settings_.absolute_mip_gap_tol || rel_gap <= settings_.relative_mip_gap_tol) { + node_concurrent_halt_ = 1; + solver_status_ = mip_status_t::OPTIMAL; + break; + } + + // Steal a node with some probability or when 
it is empty. The victim is determined at random. + if (worker->node_queue.best_first_queue_size() == 0 || + worker->rng.next_double() < steal_chance) { + for (i_t i = 0; i < settings_.bnb_max_steal_attempts; ++i) { + i_t k = worker->rng.uniform(0, bfs_worker_pool_.num_workers()); + if (worker->steal_node_from(bfs_worker_pool_[k], settings_.bnb_nodes_per_steal)) { break; } + } + } } + + worker->set_inactive(); + bfs_worker_pool_.return_worker_to_pool(worker); } template -void branch_and_bound_t::dive_with(branch_and_bound_worker_t* worker) +void branch_and_bound_t::dive_with(diving_worker_t* worker) { raft::common::nvtx::range scope("BB::diving_thread"); logger_t log; @@ -1576,8 +1729,8 @@ void branch_and_bound_t::dive_with(branch_and_bound_worker_t worker->recompute_basis = true; worker->recompute_bounds = true; - search_tree_t dive_tree(std::move(*worker->start_node)); - std::deque*> stack; + search_tree_t dive_tree(std::move(worker->start_node)); + circular_deque_t*> stack(2 * diving_backtrack_limit + 4); stack.push_front(&dive_tree.root); branch_and_bound_stats_t dive_stats; @@ -1612,11 +1765,9 @@ void branch_and_bound_t::dive_with(branch_and_bound_worker_t if (lp_status == dual::status_t::TIME_LIMIT) { solver_status_ = mip_status_t::TIME_LIMIT; break; - } else if (lp_status == dual::status_t::CONCURRENT_LIMIT) { - break; - } else if (lp_status == dual::status_t::ITERATION_LIMIT) { - break; } + if (lp_status == dual::status_t::CONCURRENT_LIMIT) { break; } + if (lp_status == dual::status_t::ITERATION_LIMIT) { break; } ++dive_stats.nodes_explored; @@ -1635,7 +1786,7 @@ void branch_and_bound_t::dive_with(branch_and_bound_worker_t } } - // Remove nodes that we no longer can backtrack to (i.e., from the current node, we can only + // Remove nodes that we can no longer backtrack to (i.e., from the current node, we can only // backtrack to a node that is has a depth of at most 5 levels lower than the current node). 
if (stack.size() > 1 && stack.front()->depth - stack.back()->depth > diving_backtrack_limit) { stack.pop_back(); @@ -1647,220 +1798,80 @@ void branch_and_bound_t::dive_with(branch_and_bound_worker_t abs_gap = compute_user_abs_gap(original_lp_, upper_bound, lower_bound); } - worker_pool_.return_worker_to_pool(worker); - active_workers_per_strategy_[search_strategy]--; + worker->set_inactive(); + diving_worker_pool_.return_worker_to_pool(worker); } template -void branch_and_bound_t::run_scheduler() +bool branch_and_bound_t::launch_diving_worker(bfs_worker_t* bfs_worker) { - diving_heuristics_settings_t diving_settings = settings_.diving_settings; - const i_t num_workers = 2 * settings_.num_threads; - - if (!has_solver_space_incumbent()) { diving_settings.guided_diving = false; } - std::vector strategies = get_search_strategies(diving_settings); - std::array max_num_workers_per_type = - get_max_workers(num_workers, strategies); + // Get an idle worker. + diving_worker_t* diving_worker = diving_worker_pool_.pop_idle_worker(); + if (diving_worker == nullptr) { return false; } - worker_pool_.init(num_workers, original_lp_, Arow_, var_types_, settings_); - active_workers_per_strategy_.fill(0); + bfs_worker->node_queue.lock(); + mip_node_t* start_node = bfs_worker->node_queue.pop_diving(); -#ifdef CUOPT_LOG_DEBUG - for (auto strategy : strategies) { - settings_.log.debug("%c%d: max num of workers = %d", - feasible_solution_symbol(strategy), - strategy, - max_num_workers_per_type[strategy]); + if (!start_node || upper_bound_.load() < start_node->lower_bound || + start_node->depth < settings_.diving_settings.min_node_depth) { + bfs_worker->node_queue.unlock(); + diving_worker_pool_.return_worker_to_pool(diving_worker); + return false; } -#endif - - f_t lower_bound = get_lower_bound(); - f_t abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); - f_t rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); - i_t last_node_depth = 
0; - i_t last_int_infeas = 0; - - while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol && - rel_gap > settings_.relative_mip_gap_tol && - (active_workers_per_strategy_[0] > 0 || node_queue_.best_first_queue_size() > 0)) { - bool launched_any_task = false; - - repair_heuristic_solutions(); - - // If the guided diving was disabled previously due to the lack of an incumbent solution, - // re-enable as soon as a new incumbent is found. - if (settings_.diving_settings.guided_diving != diving_settings.guided_diving) { - if (has_solver_space_incumbent()) { - diving_settings.guided_diving = settings_.diving_settings.guided_diving; - strategies = get_search_strategies(diving_settings); - max_num_workers_per_type = get_max_workers(num_workers, strategies); - -#ifdef CUOPT_LOG_DEBUG - for (auto type : strategies) { - settings_.log.debug("%c%d: max num of workers = %d", - feasible_solution_symbol(type), - type, - max_num_workers_per_type[type]); - } -#endif - } - } - - f_t now = toc(exploration_stats_.start_time); - f_t time_since_last_log = - exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log); - i_t nodes_since_last_log = exploration_stats_.nodes_since_last_log; - - if (((nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) && - time_since_last_log >= 1) || - (time_since_last_log > 30) || now > settings_.time_limit) { - i_t queue_size = node_queue_.best_first_queue_size(); - i_t depth = queue_size > 0 ? node_queue_.bfs_top()->depth : last_node_depth; - i_t int_infeas = queue_size > 0 ? 
node_queue_.bfs_top()->integer_infeasible : last_int_infeas; - report(' ', upper_bound_, lower_bound, depth, int_infeas); - exploration_stats_.last_log = tic(); - exploration_stats_.nodes_since_last_log = 0; - } - - if (now > settings_.time_limit) { - solver_status_ = mip_status_t::TIME_LIMIT; - break; - } - - for (auto strategy : strategies) { - if (active_workers_per_strategy_[strategy] >= max_num_workers_per_type[strategy]) { - continue; - } - - // Get an idle worker. - branch_and_bound_worker_t* worker = worker_pool_.get_idle_worker(); - if (worker == nullptr) { break; } - - if (strategy == BEST_FIRST) { - // If there any node left in the heap, we pop the top node and explore it. - std::optional*> start_node = node_queue_.pop_best_first(); - - if (!start_node.has_value()) { continue; } - if (upper_bound_.load() < start_node.value()->lower_bound) { - // This node was put on the heap earlier but its lower bound is now greater than the - // current upper bound - search_tree_.graphviz_node( - settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound); - search_tree_.update(start_node.value(), node_status_t::FATHOMED); - continue; - } - - // Remove the worker from the idle list. 
- worker_pool_.pop_idle_worker(); - worker->init_best_first(start_node.value(), original_lp_); - last_node_depth = start_node.value()->depth; - last_int_infeas = start_node.value()->integer_infeasible; - active_workers_per_strategy_[strategy]++; - launched_any_task = true; -#pragma omp task affinity(worker) default(none) firstprivate(worker) - plunge_with(worker); + diving_worker->init(start_node, original_lp_); + bfs_worker->node_queue.unlock(); - } else { - std::optional*> start_node = node_queue_.pop_diving(); + bool is_feasible = diving_worker->presolve_start_bounds(settings_); + if (!is_feasible) { + diving_worker_pool_.return_worker_to_pool(diving_worker); + return false; + } - if (!start_node.has_value()) { continue; } - if (upper_bound_.load() < start_node.value()->lower_bound || - start_node.value()->depth < diving_settings.min_node_depth) { - continue; - } + if (toc(exploration_stats_.start_time) > settings_.time_limit || + solver_status_ != mip_status_t::UNSET) { + diving_worker_pool_.return_worker_to_pool(diving_worker); + return false; + } - bool is_feasible = - worker->init_diving(start_node.value(), strategy, original_lp_, settings_); - if (!is_feasible) { continue; } + for (int i = 1; i < num_search_strategies; ++i) { + auto strategy = search_strategies[i]; - // Remove the worker from the idle list. 
- worker_pool_.pop_idle_worker(); - active_workers_per_strategy_[strategy]++; - launched_any_task = true; + if (bfs_worker->active_diving_workers[strategy] < bfs_worker->max_diving_workers[strategy]) { + diving_worker->search_strategy = strategy; + diving_worker->bfs_worker = bfs_worker; + diving_worker->set_active(); + bfs_worker->active_diving_workers[strategy]++; + bfs_worker->total_active_diving_workers++; -#pragma omp task affinity(worker) default(none) firstprivate(worker) - dive_with(worker); - } - } + assert(bfs_worker->active_diving_workers[strategy].load() <= + bfs_worker->max_diving_workers[strategy]); + assert(bfs_worker->total_active_diving_workers.load() <= + bfs_worker->total_max_diving_workers); - lower_bound = get_lower_bound(); - abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); - rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); +#pragma omp task affinity(*diving_worker) default(none) firstprivate(diving_worker) + dive_with(diving_worker); - if (abs_gap <= settings_.absolute_mip_gap_tol || rel_gap <= settings_.relative_mip_gap_tol) { - node_concurrent_halt_ = 1; - solver_status_ = mip_status_t::OPTIMAL; - break; + return true; } - - // If no new task was launched in this iteration, suspend temporarily the - // execution of the scheduler. As of 8/Jan/2026, GCC does not - // implement taskyield, but LLVM does. 
- if (!launched_any_task) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); } } + + diving_worker_pool_.return_worker_to_pool(diving_worker); + return false; } template void branch_and_bound_t::single_threaded_solve() { - raft::common::nvtx::range scope("BB::single_threaded_solve"); - worker_pool_.init(1, original_lp_, Arow_, var_types_, settings_); - branch_and_bound_worker_t* worker = worker_pool_.get_idle_worker(); - - f_t lower_bound = get_lower_bound(); - f_t abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); - f_t rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); - - while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol && - rel_gap > settings_.relative_mip_gap_tol && node_queue_.best_first_queue_size() > 0) { - repair_heuristic_solutions(); - - f_t now = toc(exploration_stats_.start_time); - f_t time_since_last_log = - exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log); - i_t nodes_since_last_log = exploration_stats_.nodes_since_last_log; - - if (((nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) && - time_since_last_log >= 1) || - (time_since_last_log > 30) || now > settings_.time_limit) { - i_t depth = node_queue_.bfs_top()->depth; - i_t int_infeas = node_queue_.bfs_top()->integer_infeasible; - report(' ', upper_bound_, lower_bound, depth, int_infeas); - exploration_stats_.last_log = tic(); - exploration_stats_.nodes_since_last_log = 0; - } - - if (now > settings_.time_limit) { - solver_status_ = mip_status_t::TIME_LIMIT; - break; - } - - // If there any node left in the heap, we pop the top node and explore it. 
- std::optional*> start_node = node_queue_.pop_best_first(); - - if (!start_node.has_value()) { continue; } - if (upper_bound_.load() < start_node.value()->lower_bound) { - // This node was put on the heap earlier but its lower bound is now greater than the - // current upper bound - search_tree_.graphviz_node( - settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound); - search_tree_.update(start_node.value(), node_status_t::FATHOMED); - continue; - } - - worker->init_best_first(start_node.value(), original_lp_); - plunge_with(worker); - - lower_bound = get_lower_bound(); - abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); - rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); - - if (abs_gap <= settings_.absolute_mip_gap_tol || rel_gap <= settings_.relative_mip_gap_tol) { - solver_status_ = mip_status_t::OPTIMAL; - break; - } - } + bfs_worker_pool_.init(1, original_lp_, Arow_, var_types_, settings_); + bfs_worker_t* worker = bfs_worker_pool_.pop_idle_worker(); + + node_queue_t& node_queue = worker->node_queue; + node_queue.push(search_tree_.root.get_down_child()); + node_queue.push(search_tree_.root.get_up_child()); + worker->lower_bound = worker->node_queue.get_lower_bound(); + worker->set_active(); + best_first_search_with(worker); } template @@ -2595,8 +2606,6 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut root_vstatus_, original_lp_, log); - node_queue_.push(search_tree_.root.get_down_child()); - node_queue_.push(search_tree_.root.get_up_child()); settings_.log.printf("Exploring the B&B tree using %d threads\n\n", settings_.num_threads); node_concurrent_halt_ = 0; @@ -2605,7 +2614,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut exploration_stats_.nodes_unexplored = 2; exploration_stats_.nodes_since_last_log = 0; exploration_stats_.last_log = tic(); - min_node_queue_size_ = 2 * settings_.num_threads; + min_node_queue_size_ = 20; if 
(settings_.diving_settings.coefficient_diving != 0) { calculate_variable_locks(original_lp_, var_up_locks_, var_down_locks_); @@ -2625,7 +2634,19 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut if (settings_.deterministic) { run_deterministic_coordinator(Arow_); } else if (settings_.num_threads > 1) { - run_scheduler(); + const i_t num_workers = settings_.num_threads; + const i_t num_bfs_workers = std::max(settings_.num_threads / 2, 1); + const i_t num_diving_workers = num_workers - num_bfs_workers; + bfs_worker_pool_.init(num_bfs_workers, original_lp_, Arow_, var_types_, settings_); + diving_worker_pool_.init( + num_diving_workers, original_lp_, Arow_, var_types_, settings_, num_bfs_workers); + + if (num_bfs_workers > 1) { + launch_bfs_worker({search_tree_.root.get_up_child()}); + launch_bfs_worker({search_tree_.root.get_down_child()}); + } else { + launch_bfs_worker({search_tree_.root.get_up_child(), search_tree_.root.get_down_child()}); + } } else { single_threaded_solve(); } @@ -2633,37 +2654,45 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut is_running_ = false; + if (solver_status_ == mip_status_t::UNSET && + toc(exploration_stats_.start_time) > settings_.time_limit) { + solver_status_ = mip_status_t::TIME_LIMIT; + } + // Compute final lower bound f_t lower_bound; if (deterministic_mode_enabled_) { lower_bound = deterministic_compute_lower_bound(); solver_status_ = deterministic_global_termination_status_; } else { - if (node_queue_.best_first_queue_size() > 0) { + lower_bound = std::numeric_limits::infinity(); + + for (int i = 0; i < bfs_worker_pool_.num_workers(); ++i) { + bfs_worker_t* worker = bfs_worker_pool_[i]; + // We need to clear the queue and use the info in the search tree for the lower bound - while (node_queue_.best_first_queue_size() > 0) { - std::optional*> start_node = node_queue_.pop_best_first(); + while (worker->node_queue.best_first_queue_size() > 0) { + mip_node_t* start_node = 
worker->node_queue.pop_best_first(); - if (!start_node.has_value()) { continue; } - if (upper_bound_.load() < start_node.value()->lower_bound) { + if (!start_node) { continue; } + if (upper_bound_.load() < start_node->lower_bound) { // This node was put on the heap earlier but its lower bound is now greater than the // current upper bound - search_tree_.graphviz_node( - settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound); - search_tree_.update(start_node.value(), node_status_t::FATHOMED); - continue; + search_tree_.graphviz_node(settings_.log, start_node, "cutoff", start_node->lower_bound); + search_tree_.update(start_node, node_status_t::FATHOMED); + --exploration_stats_.nodes_unexplored; } else { - node_queue_.push( - start_node.value()); // Needed to ensure we don't lose the correct lower bound + // Needed to ensure we don't lose the correct lower bound + worker->node_queue.push(start_node); + lower_bound = std::min(lower_bound, worker->node_queue.get_lower_bound()); break; } } - lower_bound = node_queue_.best_first_queue_size() > 0 ? 
node_queue_.get_lower_bound() - : search_tree_.root.lower_bound; - } else { - lower_bound = search_tree_.root.lower_bound; } + + if (!std::isfinite(lower_bound)) { lower_bound = search_tree_.root.lower_bound; } } + set_final_solution(solution, lower_bound); return solver_status_; } @@ -2792,17 +2821,9 @@ void branch_and_bound_t::run_deterministic_coordinator(const csr_matri deterministic_horizon_step_ = 0.50; // Compute worker counts using the same formula as reliability-branching scheduler - const i_t num_workers = settings_.num_threads; - std::vector search_strategies = - get_search_strategies(settings_.diving_settings); - std::array max_num_workers = - get_max_workers(num_workers, search_strategies); - - const int num_bfs_workers = max_num_workers[search_strategy_t::BEST_FIRST]; - int num_diving_workers = 0; - for (size_t i = 1; i < search_strategies.size(); ++i) { - num_diving_workers += max_num_workers[search_strategies[i]]; - } + const i_t num_workers = settings_.num_threads; + const i_t num_bfs_workers = std::max(num_workers / 2, 1); + const i_t num_diving_workers = num_workers - num_bfs_workers; deterministic_mode_enabled_ = true; deterministic_current_horizon_ = deterministic_horizon_step_; @@ -3201,10 +3222,10 @@ node_status_t branch_and_bound_t::solve_node_deterministic( // Solve LP relaxation worker.leaf_solution.resize(worker.leaf_problem.num_rows, worker.leaf_problem.num_cols); - std::vector& leaf_vstatus = node_ptr->vstatus; - i_t node_iter = 0; - f_t lp_start_time = tic(); - std::vector leaf_edge_norms = edge_norms_; + worker.leaf_vstatus = decompress_vstatus(node_ptr->packed_vstatus, worker.leaf_problem.num_cols); + i_t node_iter = 0; + f_t lp_start_time = tic(); + std::vector leaf_edge_norms = edge_norms_; dual::status_t lp_status = dual_phase2_with_advanced_basis(2, 0, @@ -3212,7 +3233,7 @@ node_status_t branch_and_bound_t::solve_node_deterministic( lp_start_time, worker.leaf_problem, lp_settings, - leaf_vstatus, + worker.leaf_vstatus, 
worker.basis_factors, worker.basic_list, worker.nonbasic_list, @@ -3230,7 +3251,7 @@ node_status_t branch_and_bound_t::solve_node_deterministic( worker.basis_factors, worker.basic_list, worker.nonbasic_list, - leaf_vstatus, + worker.leaf_vstatus, leaf_edge_norms, &worker.work_context); lp_status = convert_lp_status_to_dual_status(second_status); @@ -3688,8 +3709,10 @@ void branch_and_bound_t::deterministic_assign_diving_nodes() continue; // this worker is full, try next one } - auto entry = diving_heap_.pop(); - if (entry.has_value()) { worker.enqueue_dive_node(entry.value().node, original_lp_); } + if (!diving_heap_.empty()) { + auto entry = diving_heap_.pop(); + worker.enqueue_dive_node(entry.node, original_lp_); + } } diving_heap_.clear(); @@ -3739,11 +3762,6 @@ void branch_and_bound_t::deterministic_dive( { raft::common::nvtx::range scope("BB::deterministic_dive"); - // Create local search tree for the dive - search_tree_t dive_tree(std::move(entry.node)); - std::deque*> stack; - stack.push_front(&dive_tree.root); - worker.dive_lower = std::move(entry.resolved_lower); worker.dive_upper = std::move(entry.resolved_upper); @@ -3753,6 +3771,11 @@ void branch_and_bound_t::deterministic_dive( worker.lp_iters_this_dive = 0; worker.recompute_bounds_and_basis = true; + // Create local search tree for the dive + search_tree_t dive_tree(std::move(entry.node)); + circular_deque_t*> stack(2 * max_backtrack_depth + 4); + stack.push_front(&dive_tree.root); + while (!stack.empty() && deterministic_global_termination_status_ == mip_status_t::UNSET && nodes_this_dive < max_nodes_per_dive) { mip_node_t* node_ptr = stack.front(); @@ -3813,18 +3836,19 @@ void branch_and_bound_t::deterministic_dive( // Solve LP relaxation worker.leaf_solution.resize(worker.leaf_problem.num_rows, worker.leaf_problem.num_cols); - std::vector& leaf_vstatus = node_ptr->vstatus; - i_t node_iter = 0; - f_t lp_start_time = tic(); - std::vector leaf_edge_norms = edge_norms_; + i_t node_iter = 0; + f_t 
lp_start_time = tic(); + std::vector leaf_edge_norms = edge_norms_; + worker.leaf_vstatus = + decompress_vstatus(node_ptr->packed_vstatus, worker.leaf_problem.num_cols); dual::status_t lp_status = dual_phase2_with_advanced_basis(2, 0, worker.recompute_bounds_and_basis, lp_start_time, worker.leaf_problem, lp_settings, - leaf_vstatus, + worker.leaf_vstatus, worker.basis_factors, worker.basic_list, worker.nonbasic_list, @@ -3841,7 +3865,7 @@ void branch_and_bound_t::deterministic_dive( worker.basis_factors, worker.basic_list, worker.nonbasic_list, - leaf_vstatus, + worker.leaf_vstatus, leaf_edge_norms, &worker.work_context); lp_status = convert_lp_status_to_dual_status(second_status); diff --git a/cpp/src/branch_and_bound/branch_and_bound.hpp b/cpp/src/branch_and_bound/branch_and_bound.hpp index ae1a225e9a..391afc8a71 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.hpp +++ b/cpp/src/branch_and_bound/branch_and_bound.hpp @@ -234,18 +234,14 @@ class branch_and_bound_t { // Pseudocosts pseudo_costs_t pc_; - // Heap storing the nodes waiting to be explored. - node_queue_t node_queue_; - // Search tree search_tree_t search_tree_; - // Count the number of workers per type that either are being executed or - // are waiting to be executed. - std::array, num_search_strategies> active_workers_per_strategy_; + // Worker pool dedicated to the best-first search + bfs_worker_pool_t bfs_worker_pool_; - // Worker pool - branch_and_bound_worker_pool_t worker_pool_; + // Worker pool dedicated to diving + diving_worker_pool_t diving_worker_pool_; // Global status of the solver. omp_atomic_t solver_status_; @@ -287,21 +283,26 @@ class branch_and_bound_t { // Repairs low-quality solutions from the heuristics, if it is applicable. void repair_heuristic_solutions(); + // Launch a new bfs worker initialized from the `start_node`. + bfs_worker_t* launch_bfs_worker(const std::vector*>&); + // Launch a new diving worker from a given bfs worker. 
The dive will start + // from the node at the top of the local heap. + bool launch_diving_worker(bfs_worker_t* bfs_worker); + + // Perform best-first search with a given bfs worker. + void best_first_search_with(bfs_worker_t* worker); + // We use best-first to pick the `start_node` and then perform a depth-first search // from this node (i.e., a plunge). It can only backtrack to a sibling node. // Unexplored nodes in the subtree are inserted back into the global heap. - void plunge_with(branch_and_bound_worker_t* worker); + void plunge_with(bfs_worker_t* worker, mip_node_t* start_node); // Perform a deep dive in the subtree determined by the `start_node` in order // to find integer feasible solutions. - void dive_with(branch_and_bound_worker_t* worker); - - // Run the scheduler whose will schedule and manage - // all the other workers. - void run_scheduler(); + void dive_with(diving_worker_t* worker); // Run the branch-and-bound algorithm in single threaded mode. - // This disable all diving heuristics. + // This disables all diving heuristics. 
void single_threaded_solve(); // Solve the LP relaxation of a leaf node diff --git a/cpp/src/branch_and_bound/constants.hpp b/cpp/src/branch_and_bound/constants.hpp index 39bfa0bf3a..4693e6789a 100644 --- a/cpp/src/branch_and_bound/constants.hpp +++ b/cpp/src/branch_and_bound/constants.hpp @@ -6,6 +6,7 @@ /* clang-format on */ #pragma once +#include namespace cuopt::linear_programming::dual_simplex { @@ -24,6 +25,9 @@ enum search_strategy_t : int { COEFFICIENT_DIVING = 4 // Coefficient diving (9.2.1) }; +constexpr std::array search_strategies = { + BEST_FIRST, PSEUDOCOST_DIVING, LINE_SEARCH_DIVING, GUIDED_DIVING, COEFFICIENT_DIVING}; + enum class branch_direction_t { NONE = -1, DOWN = 0, UP = 1 }; enum class branch_and_bound_mode_t { PARALLEL = 0, DETERMINISTIC = 1 }; diff --git a/cpp/src/branch_and_bound/deterministic_workers.hpp b/cpp/src/branch_and_bound/deterministic_workers.hpp index 53d7e4ef65..a63dad1bcc 100644 --- a/cpp/src/branch_and_bound/deterministic_workers.hpp +++ b/cpp/src/branch_and_bound/deterministic_workers.hpp @@ -195,8 +195,8 @@ class deterministic_bfs_worker_t plunge_stack.pop_front(); return node; } - auto node_opt = backlog.pop(); - return node_opt.has_value() ? node_opt.value() : nullptr; + + return !backlog.empty() ? backlog.pop() : nullptr; } size_t queue_size() const diff --git a/cpp/src/branch_and_bound/diving_heuristics.cpp b/cpp/src/branch_and_bound/diving_heuristics.cpp index a0bb731c1e..4989e48c6a 100644 --- a/cpp/src/branch_and_bound/diving_heuristics.cpp +++ b/cpp/src/branch_and_bound/diving_heuristics.cpp @@ -92,7 +92,7 @@ branch_variable_t pseudocost_diving(pseudo_costs_t& pc, f_t score = 0; branch_direction_t dir = branch_direction_t::DOWN; - f_t root_val = (j < static_cast(root_solution.size())) ? 
root_solution[j] : solution[j]; + f_t root_val = root_solution[j]; if (solution[j] < root_val - f_t(0.4)) { score = score_down; diff --git a/cpp/src/branch_and_bound/mip_node.hpp b/cpp/src/branch_and_bound/mip_node.hpp index 694a7099c4..c2c2a328a4 100644 --- a/cpp/src/branch_and_bound/mip_node.hpp +++ b/cpp/src/branch_and_bound/mip_node.hpp @@ -40,24 +40,6 @@ inline bool inactive_status(node_status_t status) template class mip_node_t { public: - ~mip_node_t() - { - // Iterative teardown to avoid stack overflow on deep trees. - // Detach all descendants breadth-first, then destroy them as leaves. - std::vector> nodes; - for (auto& c : children) { - if (c) { nodes.push_back(std::move(c)); } - } - // nodes.size() grows so that this loop only terminates when only leaves remain - for (size_t i = 0; i < nodes.size(); ++i) { - for (auto& c : nodes[i]->children) { - if (c) { nodes.push_back(std::move(c)); } - } - } - - // scope-exit ensure destruction of all detached leaves - } - mip_node_t(mip_node_t&&) = default; mip_node_t& operator=(mip_node_t&&) = default; @@ -73,7 +55,7 @@ class mip_node_t { branch_var_upper(std::numeric_limits::infinity()), fractional_val(std::numeric_limits::infinity()), objective_estimate(std::numeric_limits::infinity()), - vstatus(0) + packed_vstatus(0) { children[0] = nullptr; children[1] = nullptr; @@ -89,7 +71,7 @@ class mip_node_t { branch_dir(branch_direction_t::NONE), integer_infeasible(-1), objective_estimate(std::numeric_limits::infinity()), - vstatus(basis) + packed_vstatus(compress_vstatus(basis)) { children[0] = nullptr; children[1] = nullptr; @@ -113,7 +95,7 @@ class mip_node_t { fractional_val(branch_var_value), integer_infeasible(integer_inf), objective_estimate(parent_node->objective_estimate), - vstatus(basis) + packed_vstatus(compress_vstatus(basis)) { branch_var_lower = branch_direction == branch_direction_t::DOWN ? 
problem.lower[branch_var] : std::ceil(branch_var_value); @@ -123,6 +105,22 @@ class mip_node_t { children[1] = nullptr; } + ~mip_node_t() + { + // Iterative teardown to avoid stack overflow on deep trees. + // Detach all descendants breadth-first, then destroy them as leaves. + std::vector> nodes; + for (auto& c : children) { + if (c) { nodes.push_back(std::move(c)); } + } + // nodes.size() grows so that this loop only terminates when only leaves remain + for (size_t i = 0; i < nodes.size(); ++i) { + for (auto& c : nodes[i]->children) { + if (c) { nodes.push_back(std::move(c)); } + } + } + } + void get_variable_bounds(std::vector& lower, std::vector& upper, std::vector& bounds_changed) const @@ -168,7 +166,7 @@ class mip_node_t { children[0] = std::move(down_child); children[1] = std::move(up_child); // When we add children we no longer need to store our basis - vstatus.clear(); + packed_vstatus = {}; } bool is_inactive() const @@ -256,7 +254,7 @@ class mip_node_t { // This method creates a copy of the current node // with its parent set to `nullptr` // This detaches the node from the tree. - mip_node_t detach_copy() const + mip_node_t detach_copy() const { mip_node_t copy; copy.lower_bound = lower_bound; @@ -264,7 +262,7 @@ class mip_node_t { copy.depth = depth; copy.node_id = node_id; copy.integer_infeasible = integer_infeasible; - copy.vstatus = vstatus; + copy.packed_vstatus = packed_vstatus; copy.branch_var = branch_var; copy.branch_dir = branch_dir; copy.branch_var_lower = branch_var_lower; @@ -295,7 +293,11 @@ class mip_node_t { mip_node_t* parent; std::unique_ptr children[2]; - std::vector vstatus; + std::vector packed_vstatus; + + // Indicate if we can dive from this node or not. This is set to false when + // this node was already selected for diving once. 
+ omp_atomic_t can_dive{true}; // Worker-local identification for deterministic ordering: // - origin_worker_id: which worker created this node diff --git a/cpp/src/branch_and_bound/node_queue.hpp b/cpp/src/branch_and_bound/node_queue.hpp index 09d030c96e..15c50c99b7 100644 --- a/cpp/src/branch_and_bound/node_queue.hpp +++ b/cpp/src/branch_and_bound/node_queue.hpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #include @@ -29,12 +28,14 @@ class heap_t { { buffer.push_back(node); std::push_heap(buffer.begin(), buffer.end(), comp); + ++num_entries_; } void push(T&& node) { buffer.push_back(std::move(node)); std::push_heap(buffer.begin(), buffer.end(), comp); + ++num_entries_; } template @@ -42,39 +43,99 @@ class heap_t { { buffer.emplace_back(std::forward(args)...); std::push_heap(buffer.begin(), buffer.end(), comp); + ++num_entries_; + assert(num_entries_.load() == buffer.size()); } - std::optional pop() + T pop() { - if (buffer.empty()) return std::nullopt; - std::pop_heap(buffer.begin(), buffer.end(), comp); T node = std::move(buffer.back()); buffer.pop_back(); + --num_entries_; + assert(num_entries_.load() == buffer.size()); return node; } - size_t size() const { return buffer.size(); } + size_t size() const { return num_entries_; } T& top() { return buffer.front(); } - void clear() { buffer.clear(); } - bool empty() const { return buffer.empty(); } + + void clear() + { + buffer.clear(); + num_entries_ = 0; + } + + bool empty() const { return num_entries_ == 0; } // Read-only access to underlying buffer for iteration without modification const std::vector& data() const { return buffer; } private: std::vector buffer; + omp_atomic_t num_entries_{0}; Comp comp; }; -// A queue storing the nodes waiting to be explored/dived from. +// A queue storing the nodes waiting to be explored. Before calling pop or push in parallel, +// the mutex NEEDS to be acquired via the `lock()` method. It must the released afterwards with +// `unlock()`. 
template class node_queue_t { + public: + void push(mip_node_t* new_node) + { + assert(new_node != nullptr); + auto entry = std::make_shared(new_node); + best_first_heap_.push(entry); + if (new_node->can_dive) diving_heap_.push(entry); + lower_bound_ = best_first_heap_.top()->lower_bound; + } + + mip_node_t* pop_best_first() + { + if (best_first_heap_.empty()) { return nullptr; } + auto entry = best_first_heap_.pop(); + lower_bound_ = best_first_heap_.empty() ? std::numeric_limits::infinity() + : best_first_heap_.top()->lower_bound; + mip_node_t* node = std::exchange(entry->node, nullptr); + assert(node != nullptr); + return node; + } + + mip_node_t* pop_diving() + { + while (!diving_heap_.empty()) { + auto entry = diving_heap_.pop(); + if (entry->node != nullptr) { + entry->node->can_dive = false; + return entry->node; + } + } + return nullptr; + } + + void lock() { mutex_.lock(); } + void unlock() { mutex_.unlock(); } + + mip_node_t* bfs_top() + { + return best_first_heap_.empty() ? nullptr : best_first_heap_.top()->node; + } + + i_t diving_queue_size() { return diving_heap_.size(); } + i_t best_first_queue_size() { return best_first_heap_.size(); } + + f_t get_lower_bound() + { + return best_first_heap_.empty() ? 
std::numeric_limits::infinity() : lower_bound_.load(); + } + private: struct heap_entry_t { mip_node_t* node = nullptr; - f_t lower_bound = -inf; - f_t score = inf; + f_t lower_bound = -std::numeric_limits::infinity(); + f_t score = std::numeric_limits::infinity(); heap_entry_t(mip_node_t* new_node) : node(new_node), lower_bound(new_node->lower_bound), score(new_node->objective_estimate) @@ -102,67 +163,11 @@ class node_queue_t { } }; - heap_t, lower_bound_comp> best_first_heap; - heap_t, score_comp> diving_heap; - omp_mutex_t mutex; - - public: - void push(mip_node_t* new_node) - { - std::lock_guard lock(mutex); - auto entry = std::make_shared(new_node); - best_first_heap.push(entry); - diving_heap.push(entry); - } - - std::optional*> pop_best_first() - { - std::lock_guard lock(mutex); - auto entry = best_first_heap.pop(); - - if (entry.has_value()) { return std::exchange(entry.value()->node, nullptr); } - - return std::nullopt; - } - - std::optional*> pop_diving() - { - std::lock_guard lock(mutex); - - while (!diving_heap.empty()) { - auto entry = diving_heap.pop(); - - if (entry.has_value()) { - if (auto node_ptr = entry.value()->node; node_ptr != nullptr) { return node_ptr; } - } - } + heap_t, lower_bound_comp> best_first_heap_; + heap_t, score_comp> diving_heap_; + omp_mutex_t mutex_; - return std::nullopt; - } - - i_t diving_queue_size() - { - std::lock_guard lock(mutex); - return diving_heap.size(); - } - - i_t best_first_queue_size() - { - std::lock_guard lock(mutex); - return best_first_heap.size(); - } - - f_t get_lower_bound() - { - std::lock_guard lock(mutex); - return best_first_heap.empty() ? inf : best_first_heap.top()->lower_bound; - } - - mip_node_t* bfs_top() - { - std::lock_guard lock(mutex); - return best_first_heap.empty() ? 
nullptr : best_first_heap.top()->node; - } + omp_atomic_t lower_bound_{std::numeric_limits::infinity()}; }; } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 564019a15e..184e7ea102 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -1402,8 +1402,7 @@ i_t pseudo_costs_t::reliable_variable_selection( // If `reliable_threshold == 0`, then we set the uninitialized pseudocosts to the average. // Otherwise, the best ones are initialized via strong branching, while the other are ignored. // - // In the latter, we are not using the average pseudocost (which calculated in the `initialized` - // method). + // So we only need to initialize the average for the former. if (reliable_threshold == 0) { avg_down = compute_pseudocost_average_down(); avg_up = compute_pseudocost_average_up(); @@ -1555,7 +1554,7 @@ i_t pseudo_costs_t::reliable_variable_selection( single_pivot_objective_change_estimate(worker->leaf_problem, settings, *AT, - node_ptr->vstatus, + worker->leaf_vstatus, j, basic_map[j], leaf_solution, @@ -1659,7 +1658,7 @@ i_t pseudo_costs_t::reliable_variable_selection( const auto [obj, status] = trial_branching(worker->leaf_problem, settings, var_types, - node_ptr->vstatus, + worker->leaf_vstatus, worker->leaf_edge_norms, worker->basis_factors, worker->basic_list, @@ -1704,7 +1703,7 @@ i_t pseudo_costs_t::reliable_variable_selection( const auto [obj, status] = trial_branching(worker->leaf_problem, settings, var_types, - node_ptr->vstatus, + worker->leaf_vstatus, worker->leaf_edge_norms, worker->basis_factors, worker->basic_list, diff --git a/cpp/src/branch_and_bound/worker.hpp b/cpp/src/branch_and_bound/worker.hpp index 87689e57bb..acb8ccca87 100644 --- a/cpp/src/branch_and_bound/worker.hpp +++ b/cpp/src/branch_and_bound/worker.hpp @@ -9,13 +9,13 @@ #include #include +#include #include #include #include 
-#include #include namespace cuopt::linear_programming::dual_simplex { @@ -31,16 +31,35 @@ struct branch_and_bound_stats_t { omp_atomic_t last_log = 0.0; }; +template +bool is_search_strategy_enabled(search_strategy_t strategy, + bool has_incumbent, + diving_heuristics_settings_t settings) +{ + switch (strategy) { + case BEST_FIRST: return true; + case PSEUDOCOST_DIVING: return settings.pseudocost_diving != 0; + case LINE_SEARCH_DIVING: return settings.line_search_diving != 0; + case GUIDED_DIVING: return settings.guided_diving != 0 && has_incumbent; + case COEFFICIENT_DIVING: return settings.coefficient_diving != 0; + default: return false; + } +} + template class branch_and_bound_worker_t { public: - const i_t worker_id; + using float_type = f_t; + using int_type = i_t; + + i_t worker_id; omp_atomic_t search_strategy; omp_atomic_t is_active; omp_atomic_t lower_bound; lp_problem_t leaf_problem; lp_solution_t leaf_solution; + std::vector leaf_vstatus; std::vector leaf_edge_norms; basis_update_mpf_t basis_factors; @@ -52,7 +71,6 @@ class branch_and_bound_worker_t { std::vector start_lower; std::vector start_upper; - mip_node_t* start_node; pcgenerator_t rng; @@ -63,53 +81,23 @@ class branch_and_bound_worker_t { const lp_problem_t& original_lp, const csr_matrix_t& Arow, const std::vector& var_type, - const simplex_solver_settings_t& settings) + const simplex_solver_settings_t& settings, + uint64_t rng_offset = 0) : worker_id(worker_id), search_strategy(BEST_FIRST), is_active(false), lower_bound(-std::numeric_limits::infinity()), leaf_problem(original_lp), leaf_solution(original_lp.num_rows, original_lp.num_cols), + leaf_vstatus(original_lp.num_cols), basis_factors(original_lp.num_rows, settings.refactor_frequency), basic_list(original_lp.num_rows), nonbasic_list(), node_presolver(leaf_problem, Arow, {}, var_type), bounds_changed(original_lp.num_cols, false), - rng(settings.random_seed + pcgenerator_t::default_seed + worker_id, - pcgenerator_t::default_stream ^ 
worker_id) - { - } - - // Set the `start_node` for best-first search. - void init_best_first(mip_node_t* node, const lp_problem_t& original_lp) - { - start_node = node; - start_lower = original_lp.lower; - start_upper = original_lp.upper; - search_strategy = BEST_FIRST; - lower_bound = node->lower_bound; - is_active = true; - } - - // Initialize the worker for diving, setting the `start_node`, `start_lower` and - // `start_upper`. Returns `true` if the starting node is feasible via - // bounds propagation. - bool init_diving(mip_node_t* node, - search_strategy_t type, - const lp_problem_t& original_lp, - const simplex_solver_settings_t& settings) + rng(settings.random_seed + pcgenerator_t::default_seed + rng_offset + worker_id, + pcgenerator_t::default_stream ^ (worker_id + rng_offset)) { - internal_node = node->detach_copy(); - start_node = &internal_node; - start_lower = original_lp.lower; - start_upper = original_lp.upper; - search_strategy = type; - lower_bound = node->lower_bound; - is_active = true; - - std::fill(bounds_changed.begin(), bounds_changed.end(), false); - node->get_variable_bounds(start_lower, start_upper, bounds_changed); - return node_presolver.bounds_strengthening(settings, bounds_changed, start_lower, start_upper); } // Set the variables bounds for the LP relaxation in the current node. @@ -134,14 +122,172 @@ class branch_and_bound_worker_t { settings, bounds_changed, leaf_problem.lower, leaf_problem.upper); } - private: - // For diving, we need to store the full node instead of - // of just a pointer, since it is not stored in the tree anymore. - // To keep the same interface across all worker types, - // this will be used as a temporary storage and - // will be pointed by `start_node`. - // For exploration, this will not be used. 
- mip_node_t internal_node; + void set_active() { is_active = true; } +}; + +template +class bfs_worker_t : public branch_and_bound_worker_t { + public: + using Base = branch_and_bound_worker_t; + bfs_worker_t(i_t worker_id, + const lp_problem_t& original_lp, + const csr_matrix_t& Arow, + const std::vector& var_type, + const simplex_solver_settings_t& settings, + uint64_t rng_offset = 0) + : Base(worker_id, original_lp, Arow, var_type, settings, rng_offset) + { + this->start_lower = original_lp.lower; + this->start_upper = original_lp.upper; + this->search_strategy = BEST_FIRST; + + max_diving_workers.fill(0); + active_diving_workers.fill(0); + total_active_diving_workers = 0; + } + + void init(const std::vector*>& nodes) + { + assert(!this->is_active.load()); + assert(node_queue.best_first_queue_size() == 0); + assert(nodes.size() > 0); + + for (auto* node : nodes) { + assert(node != nullptr); + node_queue.push(node); + } + + this->lower_bound = node_queue.get_lower_bound(); + } + + void set_inactive() { this->is_active = false; } + + // Steal nodes from another worker + bool steal_node_from(bfs_worker_t* other, i_t num_nodes) + { + bool steal = false; + assert(num_nodes > 0); + + if (!other->is_active || this == other || + other->node_queue.best_first_queue_size() < 2 * num_nodes) { + return steal; + } + + while (num_nodes > 0) { + other->node_queue.lock(); + mip_node_t* node = other->node_queue.best_first_queue_size() > num_nodes + ? other->node_queue.pop_best_first() + : nullptr; + other->node_queue.unlock(); + if (node == nullptr) { break; } + + this->node_queue.lock(); + this->node_queue.push(node); + this->node_queue.unlock(); + --num_nodes; + steal = true; + } + + return steal; + } + + // Calculate the number of diving workers that this worker can launch. Having a fixed number + // of workers allows the solver to be more deterministic. 
+ void calculate_num_diving_workers(i_t num_bfs_workers, + i_t total_diving_workers, + bool has_incumbent, + const diving_heuristics_settings_t& settings) + { + i_t num_active = 0; + for (i_t i = 1; i < num_search_strategies; ++i) { + num_active += is_search_strategy_enabled(search_strategies[i], has_incumbent, settings); + } + + total_max_diving_workers = 0; + max_diving_workers.fill(0); + if (num_active == 0) { return; } + + for (size_t i = 1, k = 0; i < num_search_strategies; ++i) { + if (is_search_strategy_enabled(search_strategies[i], has_incumbent, settings)) { + // Calculate the number of workers for a given diving heuristic + i_t start = std::floor((double)k * total_diving_workers / num_active); + i_t end = std::floor((double)(k + 1) * total_diving_workers / num_active); + i_t workers_per_type = end - start; + + // Calculate the number of diving workers allocated to this (best-first) worker + start = std::floor((double)this->worker_id * workers_per_type / num_bfs_workers); + end = std::floor((double)(this->worker_id + 1) * workers_per_type / num_bfs_workers); + max_diving_workers[i] = end - start; + total_max_diving_workers += max_diving_workers[i]; + ++k; + } + } + } + + // The worker-local node heap. + node_queue_t node_queue; + + // The number of diving workers of each type that this (best-first) worker can launch. + std::array max_diving_workers; + + // The number of active diving workers of each type associated with this (best-first) worker. 
+ std::array, num_search_strategies> active_diving_workers; + + // Keep track of the total number of active diving worker that are associated with this + // (best-first) worker + omp_atomic_t total_active_diving_workers{0}; + + // The maximum number of diving worker that are associated with this + // (best-first) worker + i_t total_max_diving_workers{0}; +}; + +template +class diving_worker_t : public branch_and_bound_worker_t { + public: + using Base = branch_and_bound_worker_t; + using Base::Base; + + // After calling this routine, you need to set `is_active = true` when the worker is ready. + // Note that the starting node may be dropped if become infeasible via bound propagation. + void init(const mip_node_t* node, const lp_problem_t& original_lp) + { + // Creates a copy of the node that is disconnected from the main tree, such that the + // diving does not modify the main tree. We need to store the variables bounds + // associated with this node, since we cannot retrieve it from the tree + start_node = node->detach_copy(); + this->start_lower = original_lp.lower; + this->start_upper = original_lp.upper; + this->lower_bound = node->lower_bound; + std::fill(this->bounds_changed.begin(), this->bounds_changed.end(), false); + node->get_variable_bounds(this->start_lower, this->start_upper, this->bounds_changed); + } + + // Apply bound strengthening to the starting variable bounds + bool presolve_start_bounds(const simplex_solver_settings_t& settings) + { + return this->node_presolver.bounds_strengthening( + settings, this->bounds_changed, this->start_lower, this->start_upper); + } + + // Set this node inactive + void set_inactive() + { + assert(this->is_active.load()); + assert(bfs_worker != nullptr); + assert(bfs_worker->active_diving_workers[this->search_strategy].load() > 0); + assert(bfs_worker->total_active_diving_workers.load() > 0); + + this->is_active = false; + --bfs_worker->active_diving_workers[this->search_strategy]; + 
--bfs_worker->total_active_diving_workers; + } + + mip_node_t start_node; + + // The best-first worker that is associated with this diving worker. Used for controlling the + // number of active diving workers. + bfs_worker_t* bfs_worker{nullptr}; }; } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/branch_and_bound/worker_pool.hpp b/cpp/src/branch_and_bound/worker_pool.hpp index 2b52b6e7bf..9396f48c04 100644 --- a/cpp/src/branch_and_bound/worker_pool.hpp +++ b/cpp/src/branch_and_bound/worker_pool.hpp @@ -8,57 +8,62 @@ #pragma once #include +#include namespace cuopt::linear_programming::dual_simplex { -template -class branch_and_bound_worker_pool_t { +template +class worker_pool_t { public: + using i_t = typename WorkerType::int_type; + using f_t = typename WorkerType::float_type; + void init(i_t num_workers, const lp_problem_t& original_lp, const csr_matrix_t& Arow, const std::vector& var_type, - const simplex_solver_settings_t& settings) + const simplex_solver_settings_t& settings, + const uint64_t rng_offset = 0) { + assert(!is_initialized); + assert(num_workers > 0); + workers_.resize(num_workers); num_idle_workers_ = num_workers; + idle_workers_.clear_resize(num_workers); for (i_t i = 0; i < num_workers; ++i) { - workers_[i] = std::make_unique>( - i, original_lp, Arow, var_type, settings); - idle_workers_.push_front(i); + workers_[i] = + std::make_unique(i, original_lp, Arow, var_type, settings, rng_offset); + idle_workers_.push_back(i); } is_initialized = true; } - // Here, we are assuming that the scheduler is the only - // thread that can retrieve/pop an idle worker. 
- branch_and_bound_worker_t* get_idle_worker() + WorkerType* pop_idle_worker() { - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); if (idle_workers_.empty()) { return nullptr; } else { i_t idx = idle_workers_.front(); - return workers_[idx].get(); - } - } - - // Here, we are assuming that the scheduler is the only - // thread that can retrieve/pop an idle worker. - void pop_idle_worker() - { - std::lock_guard lock(mutex_); - if (!idle_workers_.empty()) { idle_workers_.pop_front(); num_idle_workers_--; + assert(idle_workers_.size() == static_cast(num_idle_workers_.load())); + assert(idx >= 0 && static_cast(idx) < workers_.size()); + return workers_[idx].get(); } } - void return_worker_to_pool(branch_and_bound_worker_t* worker) + void return_worker_to_pool(WorkerType* worker) { - worker->is_active = false; - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); + assert(worker != nullptr); + assert(workers_[worker->worker_id].get() == worker); + assert(!worker->is_active.load()); + assert(static_cast(num_idle_workers_.load()) == idle_workers_.size()); + assert(idle_workers_.size() <= workers_.size()); + idle_workers_.push_back(worker->worker_id); num_idle_workers_++; } @@ -69,7 +74,7 @@ class branch_and_bound_worker_pool_t { if (is_initialized) { for (i_t i = 0; i < workers_.size(); ++i) { - if (workers_[i]->search_strategy == BEST_FIRST && workers_[i]->is_active) { + if (workers_[i]->is_active.load()) { lower_bound = std::min(workers_[i]->lower_bound.load(), lower_bound); } } @@ -78,53 +83,35 @@ class branch_and_bound_worker_pool_t { return lower_bound; } - i_t num_idle_workers() { return num_idle_workers_; } + WorkerType* operator[](i_t id) + { + assert(id >= 0 && static_cast(id) < workers_.size()); + assert(workers_[id] != nullptr); + return workers_[id].get(); + } + WorkerType* operator[](i_t id) const + { + assert(id >= 0 && static_cast(id) < workers_.size()); + assert(workers_[id] != nullptr); + return workers_[id].get(); + } + + i_t 
num_idle_workers() const { return num_idle_workers_; } + i_t num_workers() const { return workers_.size(); } private: - // Worker pool - std::vector>> workers_; + std::vector> workers_; bool is_initialized = false; omp_mutex_t mutex_; - std::deque idle_workers_; - omp_atomic_t num_idle_workers_; + circular_deque_t idle_workers_; + omp_atomic_t num_idle_workers_{0}; }; -template -std::vector get_search_strategies( - diving_heuristics_settings_t settings) -{ - std::vector types; - types.reserve(num_search_strategies); - types.push_back(BEST_FIRST); - if (settings.pseudocost_diving != 0) { types.push_back(PSEUDOCOST_DIVING); } - if (settings.line_search_diving != 0) { types.push_back(LINE_SEARCH_DIVING); } - if (settings.guided_diving != 0) { types.push_back(GUIDED_DIVING); } - if (settings.coefficient_diving != 0) { types.push_back(COEFFICIENT_DIVING); } - return types; -} - -template -std::array get_max_workers( - i_t num_workers, const std::vector& strategies) -{ - std::array max_num_workers; - max_num_workers.fill(0); - - i_t bfs_workers = std::max(strategies.size() == 1 ? 
num_workers : num_workers / 4, 1); - max_num_workers[BEST_FIRST] = bfs_workers; - - i_t diving_workers = (num_workers - bfs_workers); - i_t m = strategies.size() - 1; - - for (size_t i = 1, k = 0; i < strategies.size(); ++i) { - i_t start = (double)k * diving_workers / m; - i_t end = (double)(k + 1) * diving_workers / m; - max_num_workers[strategies[i]] = end - start; - ++k; - } +template +using bfs_worker_pool_t = worker_pool_t>; - return max_num_workers; -} +template +using diving_worker_pool_t = worker_pool_t>; } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/initial_basis.cpp b/cpp/src/dual_simplex/initial_basis.cpp index d69bf8877e..dcb1615359 100644 --- a/cpp/src/dual_simplex/initial_basis.cpp +++ b/cpp/src/dual_simplex/initial_basis.cpp @@ -18,6 +18,90 @@ namespace cuopt::linear_programming::dual_simplex { +uint8_t encode(variable_status_t vstatus) +{ + assert(vstatus != variable_status_t::SUPERBASIC && + "packed_variable_status_t does not support superbasic variables"); + + uint8_t val = 0; + switch (vstatus) { + case variable_status_t::BASIC: val = 0b00; break; + case variable_status_t::NONBASIC_LOWER: val = 0b01; break; + case variable_status_t::NONBASIC_UPPER: val = 0b10; break; + case variable_status_t::NONBASIC_FIXED: val = 0b01; break; + default: val = 0b11; + } + + return val; +} + +variable_status_t decode(uint8_t val) +{ + val &= 0b11; + if (val == 0b00) return variable_status_t::BASIC; + if (val == 0b01) return variable_status_t::NONBASIC_LOWER; + if (val == 0b10) return variable_status_t::NONBASIC_UPPER; + return variable_status_t::NONBASIC_FREE; +} + +std::vector compress_vstatus(const std::vector& vstatus) +{ + size_t n = vstatus.size() / 4; + size_t has_tail = (vstatus.size() % 4 > 0); + + std::vector packed_vstatus; + packed_vstatus.resize(n + has_tail); + + for (size_t i = 0; i < n; ++i) { + size_t j = i * 4; + packed_vstatus[i] = 0; + packed_vstatus[i] |= encode(vstatus[j]); + packed_vstatus[i] |= 
encode(vstatus[j + 1]) << 2; + packed_vstatus[i] |= encode(vstatus[j + 2]) << 4; + packed_vstatus[i] |= encode(vstatus[j + 3]) << 6; + } + + if (has_tail) { + size_t j = n * 4; + packed_vstatus[n] = 0; + packed_vstatus[n] |= encode(vstatus[j]); + if (j + 1 < vstatus.size()) packed_vstatus[n] |= encode(vstatus[j + 1]) << 2; + if (j + 2 < vstatus.size()) packed_vstatus[n] |= encode(vstatus[j + 2]) << 4; + if (j + 3 < vstatus.size()) packed_vstatus[n] |= encode(vstatus[j + 3]) << 6; + } + + return packed_vstatus; +} + +std::vector decompress_vstatus(const std::vector& packed_vstatus, + size_t vstatus_size) +{ + size_t n = vstatus_size / 4; + size_t has_tail = (vstatus_size % 4 > 0); + + std::vector vstatus; + vstatus.resize(vstatus_size); + assert(packed_vstatus.size() == n + has_tail); + + for (size_t i = 0; i < n; ++i) { + size_t j = i * 4; + vstatus[j] = decode(packed_vstatus[i]); + vstatus[j + 1] = decode(packed_vstatus[i] >> 2); + vstatus[j + 2] = decode(packed_vstatus[i] >> 4); + vstatus[j + 3] = decode(packed_vstatus[i] >> 6); + } + + if (has_tail) { + size_t j = n * 4; + vstatus[j] = decode(packed_vstatus[n]); + if (j + 1 < vstatus.size()) vstatus[j + 1] = decode(packed_vstatus[n] >> 2); + if (j + 2 < vstatus.size()) vstatus[j + 2] = decode(packed_vstatus[n] >> 4); + if (j + 3 < vstatus.size()) vstatus[j + 3] = decode(packed_vstatus[n] >> 6); + } + + return vstatus; +} + template i_t initial_basis_selection(const lp_problem_t& problem, const simplex_solver_settings_t& settings, diff --git a/cpp/src/dual_simplex/initial_basis.hpp b/cpp/src/dual_simplex/initial_basis.hpp index 646785fbd2..22a172dbe0 100644 --- a/cpp/src/dual_simplex/initial_basis.hpp +++ b/cpp/src/dual_simplex/initial_basis.hpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -23,6 +23,10 @@ enum class variable_status_t : int8_t { SUPERBASIC = 4 }; +std::vector compress_vstatus(const std::vector& vstatus); +std::vector decompress_vstatus(const std::vector& packed_vstatus, + size_t vstatus_size); + template i_t initial_basis_selection(const lp_problem_t& problem, const simplex_solver_settings_t& settings, diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp index cfc120e477..0f35a7d479 100644 --- a/cpp/src/dual_simplex/simplex_solver_settings.hpp +++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp @@ -110,6 +110,9 @@ struct simplex_solver_settings_t { mip_batch_pdlp_reliability_branching(0), strong_branching_simplex_iteration_limit(-1), random_seed(0), + bnb_steal_chance(-1), + bnb_nodes_per_steal(10), + bnb_max_steal_attempts(3), reliability_branching(-1), inside_mip(0), sub_mip(0), @@ -198,13 +201,21 @@ struct simplex_solver_settings_t { // PDLP only // Set the maximum number of simplex iterations allowed per trial branch when applying // strong branching to the root node. - // -1 - Automatic (iteration limit = 200) - // 0, 1 - Estimate the objective change using a single pivot of dual simplex - // >1 - Set as the iteration limit in dual simplex + // -1 - automatic (iteration limit = 200) + // 0, 1 - estimate the objective change using a single pivot of dual simplex + // >1 - set as the iteration limit in dual simplex i_t strong_branching_simplex_iteration_limit; diving_heuristics_settings_t diving_settings; // Settings for the diving heuristics + // In B&B, indicate the chance in which a worker can steal a node from another worker. + // -1 - automatic (0.05) + // 0 - disable + // >0 - set the stealing chance [0, 1] + f_t bnb_steal_chance; + i_t bnb_nodes_per_steal; + i_t bnb_max_steal_attempts; + // Settings for the reliability branching. 
// - -1: automatic // - 0: disable (use pseudocost branching instead) diff --git a/cpp/src/mip_heuristics/solve.cu b/cpp/src/mip_heuristics/solve.cu index 682be92a54..2c2c05a795 100644 --- a/cpp/src/mip_heuristics/solve.cu +++ b/cpp/src/mip_heuristics/solve.cu @@ -715,6 +715,12 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, solver_stats_t{}, op_problem.get_handle_ptr()->get_stream()); + // The outer solver opens an omp parallel region in solve.cu, so this inner team would + // collapse to a single thread under the default OMP_MAX_ACTIVE_LEVELS=1 and only worker 0 + // would execute. Enable two active levels locally and restore on the way out. + const int saved_max_active_levels = omp_get_max_active_levels(); + if (saved_max_active_levels < 2) { omp_set_max_active_levels(2); } + // Creates the OpenMP thread pool. It will be shared across the entire MIP solver. #pragma omp parallel num_threads(num_threads) default(none) \ shared(sol, op_problem, settings_const, exception) @@ -731,15 +737,16 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, } } // Implicit barrier + if (saved_max_active_levels < 2) { omp_set_max_active_levels(saved_max_active_levels); } + if (exception) { std::rethrow_exception(exception); } return sol; } template -mip_solution_t solve_mip( - raft::handle_t const* handle_ptr, - const cuopt::mps_parser::mps_data_model_t& mps_data_model, - mip_solver_settings_t const& settings) +mip_solution_t solve_mip(raft::handle_t const* handle_ptr, + const mps_parser::mps_data_model_t& mps_data_model, + mip_solver_settings_t const& settings) { auto op_problem = mps_data_model_to_optimization_problem(handle_ptr, mps_data_model); return solve_mip(op_problem, settings); diff --git a/cpp/src/utilities/circular_deque.hpp b/cpp/src/utilities/circular_deque.hpp new file mode 100644 index 0000000000..6e9d8f5b05 --- /dev/null +++ b/cpp/src/utilities/circular_deque.hpp @@ -0,0 +1,114 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: 
Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once + +#include +#include +#include + +namespace cuopt { + +// A fixed-capacity double-ended queue backed by a circular buffer. +// All operations are O(1) with no dynamic allocation after construction. +// +// Preconditions (asserted in debug builds): +// - push_front / push_back : size() < capacity() +// - pop_front / pop_back : !empty() +// - front / back : !empty() +template +class circular_deque_t { + public: + circular_deque_t() : buffer_(1), capacity_(1), head_(0), tail_(0) {} + + // Allocates storage for exactly `capacity` elements up front. + explicit circular_deque_t(size_t capacity) + : buffer_(capacity + 1), // +1 to distinguish full (next(tail)==head) from empty (head==tail) + capacity_(capacity + 1), + head_(0), + tail_(0) + { + assert(capacity > 0); + } + + bool empty() const { return head_ == tail_; } + bool full() const { return next(tail_) == head_; } + + size_t size() const { return (tail_ - head_ + capacity_) % capacity_; } + size_t capacity() const { return capacity_ - 1; } + + void clear_resize(size_t new_capacity) + { + assert(new_capacity > 0); + head_ = 0; + tail_ = 0; + capacity_ = new_capacity + 1; + buffer_.resize(capacity_); + } + + void push_back(T val) + { + assert(!full()); + buffer_[tail_] = std::move(val); + tail_ = next(tail_); + } + + void push_front(T val) + { + assert(!full()); + head_ = prev(head_); + buffer_[head_] = std::move(val); + } + + T pop_front() + { + assert(!empty()); + T val = std::move(buffer_[head_]); + head_ = next(head_); + return val; + } + + T pop_back() + { + assert(!empty()); + tail_ = prev(tail_); + return std::move(buffer_[tail_]); + } + + T& front() + { + assert(!empty()); + return buffer_[head_]; + } + const T& front() const + { + assert(!empty()); + return buffer_[head_]; + } + + T& back() + { + assert(!empty()); + return buffer_[prev(tail_)]; + } + 
const T& back() const + { + assert(!empty()); + return buffer_[prev(tail_)]; + } + + private: + size_t next(size_t idx) const { return (idx + 1) % capacity_; } + size_t prev(size_t idx) const { return (idx + capacity_ - 1) % capacity_; } + + std::vector buffer_; + size_t capacity_; + size_t head_; + size_t tail_; +}; + +} // namespace cuopt diff --git a/cpp/src/utilities/omp_helpers.hpp b/cpp/src/utilities/omp_helpers.hpp index a13b9ec887..a4209adf14 100644 --- a/cpp/src/utilities/omp_helpers.hpp +++ b/cpp/src/utilities/omp_helpers.hpp @@ -20,11 +20,9 @@ namespace cuopt { class omp_mutex_t { public: omp_mutex_t() : mutex(new omp_lock_t) { omp_init_lock(mutex.get()); } - - omp_mutex_t(const omp_mutex_t&) = delete; - omp_mutex_t(omp_mutex_t&& other) { *this = std::move(other); } + omp_mutex_t(const omp_mutex_t&) = delete; omp_mutex_t& operator=(const omp_mutex_t&) = delete; omp_mutex_t& operator=(omp_mutex_t&& other) @@ -203,17 +201,10 @@ class omp_atomic_t { private: T val; -#ifndef __NVCC__ friend double fetch_min(omp_atomic_t& atomic_var, double other); friend double fetch_max(omp_atomic_t& atomic_var, double other); -#endif }; -// Atomic CAS are only supported in OpenMP v5.1 -// (gcc 12+ or clang 14+), however, nvcc (or the host compiler) cannot -// parse it correctly yet -#ifndef __NVCC__ - // Free non-template functions are necessary because of a clang 20 bug // when omp atomic compare is used within a templated context. // see https://github.com/llvm/llvm-project/issues/127466 @@ -238,7 +229,6 @@ inline double fetch_max(omp_atomic_t& atomic_var, double other) } return old; } -#endif #endif diff --git a/cpp/src/utilities/pcgenerator.hpp b/cpp/src/utilities/pcgenerator.hpp index 29a865f02f..e83e5f36ad 100644 --- a/cpp/src/utilities/pcgenerator.hpp +++ b/cpp/src/utilities/pcgenerator.hpp @@ -21,12 +21,11 @@ class pcgenerator_t { static constexpr uint64_t default_stream = 0xda3e39cb94b95bdbULL; /** - * @brief ctor. 
Initializes the PCG - * @param rng_state is the generator state used for initializing the generator - * @param subsequence specifies the subsequence to be generated out of 2^64 possible subsequences - * In a parallel setting, like threads of a CUDA kernel, each thread is required to generate a - * unique set of random numbers. This can be achieved by initializing the generator with same - * rng_state for all the threads and diststreamt values for subsequence. + * @brief Initializes the PCG generator. + * @param seed Generator state seed. + * @param subsequence Selects one of 2^64 independent subsequences. Use distinct values per + * thread to guarantee non-overlapping streams in parallel contexts. + * @param offset Number of outputs to skip ahead before the first draw. */ pcgenerator_t(const uint64_t seed = default_seed, const uint64_t subsequence = default_stream, @@ -35,7 +34,12 @@ class pcgenerator_t { set_seed(seed, subsequence, offset); } - // Set the seed, subsequence and offset of the PCG + /** + * @brief Re-seeds the generator. + * @param seed Generator state seed. + * @param subsequence Selects one of 2^64 independent subsequences. + * @param offset Number of outputs to skip ahead before the first draw. + */ void set_seed(uint64_t seed, const uint64_t subsequence = default_stream, uint64_t offset = 0) { state = uint64_t(0); @@ -47,8 +51,12 @@ class pcgenerator_t { skipahead(offset); } - // Based on "Random Number Generation with Arbitrary Strides" F. B. Brown - // Link https://mcnp.lanl.gov/pdf_files/anl-rn-arb-stride.pdf + /** + * @brief Advances the generator state by @p offset steps in O(log offset) time. + * + * Uses the closed-form LCG jump described in "Random Number Generation with Arbitrary Strides" + * (F. B. Brown, https://mcnp.lanl.gov/pdf_files/anl-rn-arb-stride.pdf). 
+ */ void skipahead(uint64_t offset) { uint64_t G = 1; @@ -68,9 +76,7 @@ class pcgenerator_t { } /** - * @defgroup NextRand Generate the next random number - * @brief This code is derived from PCG basic code - * @{ + * @returns the next uniformly distributed 32-bit unsigned integer. */ uint32_t next_u32() { @@ -83,6 +89,9 @@ class pcgenerator_t { return ret; } + /** + * @returns the next uniformly distributed 64-bit unsigned integer. + */ uint64_t next_u64() { uint64_t ret; @@ -93,6 +102,10 @@ class pcgenerator_t { return ret; } + /** + * @returns the next uniformly distributed non-negative 32-bit signed integer in [0, + * INT32_MAX]. + */ int32_t next_i32() { int32_t ret; @@ -102,6 +115,10 @@ class pcgenerator_t { return ret; } + /** + * @returns the next uniformly distributed non-negative 64-bit signed integer in [0, + * INT64_MAX]. + */ int64_t next_i64() { int64_t ret; @@ -111,10 +128,19 @@ class pcgenerator_t { return ret; } - float next_float() { return static_cast((next_u32() >> 8) * 0x1.0p-24); } + /** + * @returns a uniformly distributed float in [0, 1). + */ + float next_float() { return (next_u32() >> 8) * 0x1.0p-24; } - double next_double() { return static_cast((next_u64() >> 11) * 0x1.0p-53); } + /** + * @returns a uniformly distributed double in [0, 1). + */ + double next_double() { return (next_u64() >> 11) * 0x1.0p-53; } + /** + * @returns the next random value of type @p T. + */ template T next() { @@ -130,9 +156,11 @@ class pcgenerator_t { void next(float& ret) { ret = next_float(); } void next(double& ret) { ret = next_double(); } - /// Draws a sample from a uniform distribution. The samples are uniformly distributed over - /// the semi-closed interval `[low, high)`. This routine may have a **slight bias** toward - /// some numbers in the range (scaling by floating-point). + /** + * @brief Draws a sample from a uniform distribution over `[low, high)`. + * + * May have a slight bias toward some values due to floating-point scaling. 
+ */ template T uniform(T low, T high) { @@ -141,7 +169,9 @@ class pcgenerator_t { return low + (val * range); } - // Shuffles the contents of a sequence using the Fisher–Yates algorithm. + /** + * @brief Shuffles @p seq in-place using the Fisher-Yates algorithm. + */ template void shuffle(std::vector& seq) {