diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 145850d4d..acc652407 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -35,6 +35,7 @@ dependencies: - libcusolver-dev - libcusparse-dev - libgrpc >=1.78.0,<1.80.0a0 +- libnvjitlink-dev - libprotobuf - libraft-headers==26.6.*,>=0.0.0a0 - librmm==26.6.*,>=0.0.0a0 diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 293b49fbe..ae6772cca 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -35,6 +35,7 @@ dependencies: - libcusolver-dev - libcusparse-dev - libgrpc >=1.78.0,<1.80.0a0 +- libnvjitlink-dev - libprotobuf - libraft-headers==26.6.*,>=0.0.0a0 - librmm==26.6.*,>=0.0.0a0 diff --git a/conda/environments/all_cuda-132_arch-aarch64.yaml b/conda/environments/all_cuda-132_arch-aarch64.yaml index fa8844a1f..17d986fee 100644 --- a/conda/environments/all_cuda-132_arch-aarch64.yaml +++ b/conda/environments/all_cuda-132_arch-aarch64.yaml @@ -35,6 +35,7 @@ dependencies: - libcusolver-dev - libcusparse-dev - libgrpc >=1.78.0,<1.80.0a0 +- libnvjitlink-dev - libprotobuf - libraft-headers==26.6.*,>=0.0.0a0 - librmm==26.6.*,>=0.0.0a0 diff --git a/conda/environments/all_cuda-132_arch-x86_64.yaml b/conda/environments/all_cuda-132_arch-x86_64.yaml index a37d8718c..62d876e71 100644 --- a/conda/environments/all_cuda-132_arch-x86_64.yaml +++ b/conda/environments/all_cuda-132_arch-x86_64.yaml @@ -35,6 +35,7 @@ dependencies: - libcusolver-dev - libcusparse-dev - libgrpc >=1.78.0,<1.80.0a0 +- libnvjitlink-dev - libprotobuf - libraft-headers==26.6.*,>=0.0.0a0 - librmm==26.6.*,>=0.0.0a0 diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5fa0186c3..94b5a151e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -451,7 +451,10 @@ set_target_properties(cuopt 
CXX_SCAN_FOR_MODULES OFF ) -target_compile_definitions(cuopt PUBLIC "CUOPT_LOG_ACTIVE_LEVEL=RAPIDS_LOGGER_LOG_LEVEL_${LIBCUOPT_LOGGING_LEVEL}") +target_compile_definitions(cuopt + PUBLIC "CUOPT_LOG_ACTIVE_LEVEL=RAPIDS_LOGGER_LOG_LEVEL_${LIBCUOPT_LOGGING_LEVEL}" + PUBLIC CUSPARSE_ENABLE_EXPERIMENTAL_API +) target_compile_options(cuopt PRIVATE "$<$:${CUOPT_CXX_FLAGS}>" diff --git a/cpp/src/pdlp/cusparse_view.cu b/cpp/src/pdlp/cusparse_view.cu index 359bb7e92..6a36a21b0 100644 --- a/cpp/src/pdlp/cusparse_view.cu +++ b/cpp/src/pdlp/cusparse_view.cu @@ -270,6 +270,153 @@ void my_cusparsespmm_preprocess(cusparseHandle_t handle, } #endif +#if CUDA_VER_13_2_UP +// SpMVOp symbols, resolved at runtime via dlsym +using cusparseSpMVOp_destroyDescr_sig = cusparse_sig; +using cusparseSpMVOp_destroyPlan_sig = cusparse_sig; +using cusparseSpMVOp_bufferSize_sig = cusparse_sig; +using cusparseSpMVOp_createDescr_sig = cusparse_sig; +using cusparseSpMVOp_createPlan_sig = + cusparse_sig; +using cusparseSpMVOp_sig = cusparse_sig; + +cusparse_spmvop_descr_wrapper_t::cusparse_spmvop_descr_wrapper_t() + : descr_(nullptr), need_destruction_(false) +{ +} + +cusparse_spmvop_descr_wrapper_t::~cusparse_spmvop_descr_wrapper_t() +{ + if (!need_destruction_) { return; } + static const auto destroy = + dynamic_load_runtime::function("cusparseSpMVOp_destroyDescr"); + RAFT_CUSPARSE_TRY_NO_THROW((*destroy)(descr_)); +} + +cusparse_spmvop_descr_wrapper_t::cusparse_spmvop_descr_wrapper_t( + const cusparse_spmvop_descr_wrapper_t& other) + : descr_(other.descr_), need_destruction_(false) +{ +} + +cusparse_spmvop_descr_wrapper_t& cusparse_spmvop_descr_wrapper_t::operator=( + cusparse_spmvop_descr_wrapper_t&& other) +{ + if (need_destruction_) { + static const auto destroy = dynamic_load_runtime::function( + "cusparseSpMVOp_destroyDescr"); + RAFT_CUSPARSE_TRY((*destroy)(descr_)); + } + descr_ = other.descr_; + need_destruction_ = other.need_destruction_; + other.need_destruction_ = false; + return *this; +} 
+ +void cusparse_spmvop_descr_wrapper_t::create(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseSpMatDescr_t matA, + cusparseDnVecDescr_t vecX, + cusparseDnVecDescr_t vecY, + cusparseDnVecDescr_t vecZ, + cudaDataType computeType, + rmm::device_uvector& buffer) +{ + static const auto destroy = + dynamic_load_runtime::function("cusparseSpMVOp_destroyDescr"); + static const auto create = + dynamic_load_runtime::function("cusparseSpMVOp_createDescr"); + if (need_destruction_) { RAFT_CUSPARSE_TRY((*destroy)(descr_)); } + RAFT_CUSPARSE_TRY( + (*create)(handle, &descr_, opA, matA, vecX, vecY, vecZ, computeType, buffer.data())); + need_destruction_ = true; +} + +cusparse_spmvop_descr_wrapper_t::operator cusparseSpMVOpDescr_t() const { return descr_; } + +cusparse_spmvop_plan_wrapper_t::cusparse_spmvop_plan_wrapper_t() + : plan_(nullptr), need_destruction_(false) +{ +} + +cusparse_spmvop_plan_wrapper_t::~cusparse_spmvop_plan_wrapper_t() +{ + if (!need_destruction_) { return; } + static const auto destroy = + dynamic_load_runtime::function("cusparseSpMVOp_destroyPlan"); + RAFT_CUSPARSE_TRY_NO_THROW((*destroy)(plan_)); +} + +cusparse_spmvop_plan_wrapper_t::cusparse_spmvop_plan_wrapper_t( + const cusparse_spmvop_plan_wrapper_t& other) + : plan_(other.plan_), need_destruction_(false) +{ +} + +cusparse_spmvop_plan_wrapper_t& cusparse_spmvop_plan_wrapper_t::operator=( + cusparse_spmvop_plan_wrapper_t&& other) +{ + if (need_destruction_) { + static const auto destroy = + dynamic_load_runtime::function("cusparseSpMVOp_destroyPlan"); + RAFT_CUSPARSE_TRY((*destroy)(plan_)); + } + plan_ = other.plan_; + need_destruction_ = other.need_destruction_; + other.need_destruction_ = false; + return *this; +} + +void cusparse_spmvop_plan_wrapper_t::create(cusparseHandle_t handle, cusparseSpMVOpDescr_t descr) +{ + static const auto destroy = + dynamic_load_runtime::function("cusparseSpMVOp_destroyPlan"); + static const auto create = + 
dynamic_load_runtime::function("cusparseSpMVOp_createPlan"); + if (need_destruction_) { RAFT_CUSPARSE_TRY((*destroy)(plan_)); } + // cuOpt does not supply user-provided LTO IR; pass nullptr/0 so cuSPARSE JITs internally. + RAFT_CUSPARSE_TRY((*create)(handle, descr, &plan_, /*ltoIRBuf=*/nullptr, /*ltoIRSize=*/0)); + need_destruction_ = true; +} + +cusparse_spmvop_plan_wrapper_t::operator cusparseSpMVOpPlan_t() const { return plan_; } + +void cusparse_spmvop_run(cusparseHandle_t handle, + cusparseSpMVOpPlan_t plan, + const void* alpha, + const void* beta, + cusparseDnVecDescr_t vecX, + cusparseDnVecDescr_t vecY, + cusparseDnVecDescr_t vecZ, + cudaStream_t stream) +{ + static const auto func = dynamic_load_runtime::function("cusparseSpMVOp"); + RAFT_CUSPARSE_TRY(cusparseSetStream(handle, stream)); + RAFT_CUSPARSE_TRY((*func)(handle, plan, alpha, beta, vecX, vecY, vecZ)); +} +#endif + // This cstr is used in pdhg, step size strategy and in cuPDLPx infeasible detection // A_T is owned by the scaled problem // It was already transposed in the scaled_problem version @@ -304,6 +451,8 @@ cusparse_view_t::cusparse_view_t( A_T_indices_{op_problem_scaled.reverse_constraints}, buffer_non_transpose{0, handle_ptr->get_stream()}, buffer_transpose{0, handle_ptr->get_stream()}, + buffer_non_transpose_spmvop{0, handle_ptr->get_stream()}, + buffer_transpose_spmvop{0, handle_ptr->get_stream()}, buffer_transpose_batch{0, handle_ptr->get_stream()}, buffer_non_transpose_batch{0, handle_ptr->get_stream()}, buffer_transpose_batch_row_row_{0, handle_ptr->get_stream()}, @@ -717,6 +866,8 @@ cusparse_view_t::cusparse_view_t( A_T_indices_{_A_T_indices}, buffer_non_transpose{0, handle_ptr->get_stream()}, buffer_transpose{0, handle_ptr->get_stream()}, + buffer_non_transpose_spmvop{0, handle_ptr->get_stream()}, + buffer_transpose_spmvop{0, handle_ptr->get_stream()}, buffer_transpose_batch{0, handle_ptr->get_stream()}, buffer_non_transpose_batch{0, handle_ptr->get_stream()}, 
buffer_transpose_batch_row_row_{0, handle_ptr->get_stream()}, @@ -926,6 +1077,8 @@ cusparse_view_t::cusparse_view_t( tmp_dual(existing_cusparse_view.tmp_dual), buffer_non_transpose{0, handle_ptr->get_stream()}, buffer_transpose{0, handle_ptr->get_stream()}, + buffer_non_transpose_spmvop{0, handle_ptr->get_stream()}, + buffer_transpose_spmvop{0, handle_ptr->get_stream()}, buffer_transpose_batch{0, handle_ptr->get_stream()}, buffer_non_transpose_batch{0, handle_ptr->get_stream()}, buffer_transpose_batch_row_row_{0, handle_ptr->get_stream()}, @@ -1041,6 +1194,8 @@ cusparse_view_t::cusparse_view_t( : handle_ptr_(handle_ptr), buffer_non_transpose{0, handle_ptr->get_stream()}, buffer_transpose{0, handle_ptr->get_stream()}, + buffer_non_transpose_spmvop{0, handle_ptr->get_stream()}, + buffer_transpose_spmvop{0, handle_ptr->get_stream()}, buffer_transpose_batch{0, handle_ptr->get_stream()}, buffer_non_transpose_batch{0, handle_ptr->get_stream()}, buffer_transpose_batch_row_row_{0, handle_ptr->get_stream()}, @@ -1182,6 +1337,77 @@ bool is_cusparse_runtime_mixed_precision_supported() return (major > 12) || (major == 12 && minor >= 5); } +bool is_cusparse_runtime_spmvop_supported() +{ +#if CUDA_VER_13_2_UP + // Probe the runtime to ensure cusparseSpMVOp is supported + static const bool supported = + dynamic_load_runtime::function("cusparseSpMVOp").has_value(); + return supported; +#else + return false; +#endif +} + +// Creates SpMVOp plans. Must be called after scale_problem() so plans use the scaled matrix. 
+template +void cusparse_view_t::create_spmv_op_plans(bool is_reflected) +{ +#if CUDA_VER_13_2_UP + if (!is_cusparse_runtime_spmvop_supported() || !(std::is_same_v)) { return; } + static const auto buffer_size = + dynamic_load_runtime::function("cusparseSpMVOp_bufferSize"); + CUSPARSE_CHECK(cusparseSetStream(handle_ptr_->get_cusparse_handle(), handle_ptr_->get_stream())); + // Prepare buffers for At_y SpMVOp + size_t buffer_size_transpose = 0; + RAFT_CUSPARSE_TRY((*buffer_size)(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + A_T, + dual_solution, + current_AtY, + current_AtY, + CUDA_R_64F, + &buffer_size_transpose)); + buffer_transpose_spmvop.resize(buffer_size_transpose, handle_ptr_->get_stream()); + + spmv_op_descr_A_t_.create(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + A_T, + dual_solution, + current_AtY, + current_AtY, + CUDA_R_64F, + buffer_transpose_spmvop); + + spmv_op_plan_A_t_.create(handle_ptr_->get_cusparse_handle(), spmv_op_descr_A_t_); + + // Only prepare buffers for A_x if we are using reflected_halpern + if (is_reflected) { + size_t buffer_size_non_transpose = 0; + RAFT_CUSPARSE_TRY((*buffer_size)(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + A, + reflected_primal_solution, + dual_gradient, + dual_gradient, + CUDA_R_64F, + &buffer_size_non_transpose)); + buffer_non_transpose_spmvop.resize(buffer_size_non_transpose, handle_ptr_->get_stream()); + + spmv_op_descr_A_.create(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + A, + reflected_primal_solution, + dual_gradient, + dual_gradient, + CUDA_R_64F, + buffer_non_transpose_spmvop); + + spmv_op_plan_A_.create(handle_ptr_->get_cusparse_handle(), spmv_op_descr_A_); + } +#endif +} + #if MIP_INSTANTIATE_FLOAT || PDLP_INSTANTIATE_FLOAT template class cusparse_sp_mat_descr_wrapper_t; template class cusparse_dn_vec_descr_wrapper_t; diff --git a/cpp/src/pdlp/cusparse_view.hpp 
b/cpp/src/pdlp/cusparse_view.hpp index c6d0ddea6..f6d68b21e 100644 --- a/cpp/src/pdlp/cusparse_view.hpp +++ b/cpp/src/pdlp/cusparse_view.hpp @@ -20,6 +20,8 @@ #include +#define CUDA_VER_13_2_UP (CUDART_VERSION >= 13020) + namespace cuopt::linear_programming::detail { template @@ -79,6 +81,52 @@ class cusparse_dn_mat_descr_wrapper_t { bool need_destruction_; }; +#if CUDA_VER_13_2_UP +// RAII wrapper around cusparse SpMVOp objects. All the buffers are owned by the cusparse_view_t. +class cusparse_spmvop_descr_wrapper_t { + public: + cusparse_spmvop_descr_wrapper_t(); + ~cusparse_spmvop_descr_wrapper_t(); + + cusparse_spmvop_descr_wrapper_t(const cusparse_spmvop_descr_wrapper_t& other); + cusparse_spmvop_descr_wrapper_t& operator=(cusparse_spmvop_descr_wrapper_t&& other); + cusparse_spmvop_descr_wrapper_t& operator=(const cusparse_spmvop_descr_wrapper_t& other) = delete; + + void create(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseSpMatDescr_t matA, + cusparseDnVecDescr_t vecX, + cusparseDnVecDescr_t vecY, + cusparseDnVecDescr_t vecZ, + cudaDataType computeType, + rmm::device_uvector& buffer); + + operator cusparseSpMVOpDescr_t() const; + + private: + cusparseSpMVOpDescr_t descr_; + bool need_destruction_; +}; + +class cusparse_spmvop_plan_wrapper_t { + public: + cusparse_spmvop_plan_wrapper_t(); + ~cusparse_spmvop_plan_wrapper_t(); + + cusparse_spmvop_plan_wrapper_t(const cusparse_spmvop_plan_wrapper_t& other); + cusparse_spmvop_plan_wrapper_t& operator=(cusparse_spmvop_plan_wrapper_t&& other); + cusparse_spmvop_plan_wrapper_t& operator=(const cusparse_spmvop_plan_wrapper_t& other) = delete; + + void create(cusparseHandle_t handle, cusparseSpMVOpDescr_t descr); + + operator cusparseSpMVOpPlan_t() const; + + private: + cusparseSpMVOpPlan_t plan_; + bool need_destruction_; +}; +#endif + template class cusparse_view_t { public: @@ -172,6 +220,17 @@ class cusparse_view_t { rmm::device_uvector buffer_non_transpose; rmm::device_uvector buffer_transpose; + 
// SpMVOp buffers for A and A_T + rmm::device_uvector buffer_non_transpose_spmvop{0, handle_ptr_->get_stream()}; + rmm::device_uvector buffer_transpose_spmvop{0, handle_ptr_->get_stream()}; + +#if CUDA_VER_13_2_UP + // SpMVOp descriptors and plans for A and A_T (descr before plan so dtor destroys plan first) + cusparse_spmvop_descr_wrapper_t spmv_op_descr_A_; + cusparse_spmvop_plan_wrapper_t spmv_op_plan_A_; + cusparse_spmvop_descr_wrapper_t spmv_op_descr_A_t_; + cusparse_spmvop_plan_wrapper_t spmv_op_plan_A_t_; +#endif // reuse buffers for cusparse spmm rmm::device_uvector buffer_transpose_batch; rmm::device_uvector buffer_non_transpose_batch; @@ -212,6 +271,8 @@ class cusparse_view_t { // Redirects the cuSPARSE CSR structure pointers from op_problem_scaled_ to the original problem // so the duplicated row/column buffers can be freed. void redirect_cusparse_csr_structure_pointers(const problem_t& original_problem); + // Creates SpMVOp plans. Must be called after scale_problem() so plans use the scaled matrix. + void create_spmv_op_plans(bool is_reflected); }; // Mixed precision SpMV: FP32 matrix with FP64 vectors and FP64 compute type @@ -268,4 +329,21 @@ void my_cusparsespmm_preprocess(cusparseHandle_t handle, bool is_cusparse_runtime_mixed_precision_supported(); +// False if cuda version < 13.2 or runtime cuSPARSE does not export SpMVOp symbols. True otherwise. +bool is_cusparse_runtime_spmvop_supported(); + +#if CUDA_VER_13_2_UP +// Dispatches to the runtime cusparseSpMVOp via dlsym so callers (e.g., pdhg.cu) never +// reference the symbol statically. Caller must have verified +// is_cusparse_runtime_spmvop_supported(). 
+void cusparse_spmvop_run(cusparseHandle_t handle, + cusparseSpMVOpPlan_t plan, + const void* alpha, + const void* beta, + cusparseDnVecDescr_t vecX, + cusparseDnVecDescr_t vecY, + cusparseDnVecDescr_t vecZ, + cudaStream_t stream); +#endif + } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu index d9dbb083f..b243023a2 100644 --- a/cpp/src/pdlp/pdhg.cu +++ b/cpp/src/pdlp/pdhg.cu @@ -444,6 +444,63 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_uvector +void pdhg_solver_t::spmvop_At_y() +{ +#if CUDA_VER_13_2_UP + if (is_cusparse_runtime_spmvop_supported()) { + cusparse_spmvop_run(handle_ptr_->get_cusparse_handle(), + cusparse_view_.spmv_op_plan_A_t_, + reusable_device_scalar_value_1_.data(), + reusable_device_scalar_value_0_.data(), + cusparse_view_.dual_solution, + cusparse_view_.current_AtY, + cusparse_view_.current_AtY, + stream_view_.value()); + return; + } +#endif + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view_.A_T, + cusparse_view_.dual_solution, + reusable_device_scalar_value_0_.data(), + cusparse_view_.current_AtY, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view_.buffer_transpose.data(), + stream_view_)); +} + +template +void pdhg_solver_t::spmvop_A_x() +{ +#if CUDA_VER_13_2_UP + if (is_cusparse_runtime_spmvop_supported()) { + cusparse_spmvop_run(handle_ptr_->get_cusparse_handle(), + cusparse_view_.spmv_op_plan_A_, + reusable_device_scalar_value_1_.data(), + reusable_device_scalar_value_0_.data(), + cusparse_view_.reflected_primal_solution, + cusparse_view_.dual_gradient, + cusparse_view_.dual_gradient, + stream_view_.value()); + return; + } +#endif + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view_.A, + 
cusparse_view_.reflected_primal_solution, + reusable_device_scalar_value_0_.data(), + cusparse_view_.dual_gradient, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view_.buffer_non_transpose.data(), + stream_view_)); +} + template void pdhg_solver_t::compute_At_y() { @@ -462,9 +519,10 @@ void pdhg_solver_t::compute_At_y() CUSPARSE_SPMV_CSR_ALG2, cusparse_view_.buffer_transpose_mixed_.data(), stream_view_); + } else { + spmvop_At_y(); } - } - if (!cusparse_view_.mixed_precision_enabled_) { + } else { RAFT_CUSPARSE_TRY( raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -510,9 +568,10 @@ void pdhg_solver_t::compute_A_x() CUSPARSE_SPMV_CSR_ALG2, cusparse_view_.buffer_non_transpose_mixed_.data(), stream_view_); + } else { + spmvop_A_x(); } - } - if (!cusparse_view_.mixed_precision_enabled_) { + } else { RAFT_CUSPARSE_TRY( raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, diff --git a/cpp/src/pdlp/pdhg.hpp b/cpp/src/pdlp/pdhg.hpp index 52f45dc83..7eb4008ea 100644 --- a/cpp/src/pdlp/pdhg.hpp +++ b/cpp/src/pdlp/pdhg.hpp @@ -95,6 +95,8 @@ class pdhg_solver_t { void compute_primal_projection(rmm::device_uvector& primal_step_size); void compute_At_y(); void compute_A_x(); + void spmvop_At_y(); + void spmvop_A_x(); bool batch_mode_{false}; raft::handle_t const* handle_ptr_{nullptr}; diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 49c77e44d..fd0cc9ffc 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -2316,6 +2316,12 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co compute_initial_primal_weight(); initial_scaling_strategy_.scale_problem(); + if constexpr (std::is_same_v) { + if (!batch_mode_ && !pdhg_solver_.get_cusparse_view().mixed_precision_enabled_) { + pdhg_solver_.get_cusparse_view().create_spmv_op_plans( + settings_.hyper_params.use_reflected_primal_dual); + } + } // Update FP32 matrix copies for mixed precision SpMV 
after scaling pdhg_solver_.get_cusparse_view().update_mixed_precision_matrices(); diff --git a/dependencies.yaml b/dependencies.yaml index 4b647a706..d3424f17c 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -701,6 +701,7 @@ dependencies: - libcusolver-dev - libcusparse-dev - cuda-nvtx-dev + - libnvjitlink-dev cuda_wheels: