From fb3425bbbd1f76cf505947dce051b684e2b11f9d Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Tue, 10 Feb 2026 15:33:27 -0800
Subject: [PATCH 1/5] speedup symmetrization

---
 cpp/src/pdlp/optimization_problem.cu | 115 +++++++++++++--------------
 1 file changed, 55 insertions(+), 60 deletions(-)

diff --git a/cpp/src/pdlp/optimization_problem.cu b/cpp/src/pdlp/optimization_problem.cu
index d0888dd3a..036214bee 100644
--- a/cpp/src/pdlp/optimization_problem.cu
+++ b/cpp/src/pdlp/optimization_problem.cu
@@ -188,85 +188,80 @@ void optimization_problem_t<i_t, f_t>::set_quadratic_objective_matrix(
     size_offsets >= 1, error_type_t::ValidationError, "Q_offsets must have at least 1 element");
   cuopt_expects(Q_offsets != nullptr, error_type_t::ValidationError, "Q_offsets cannot be null");
 
-  // Replace Q with Q + Q^T
-  i_t qn    = size_offsets - 1;  // Number of variables
-  i_t q_nnz = size_indices;
-  // Construct H = Q + Q^T in triplet form first
-  std::vector<i_t> H_i;
-  std::vector<i_t> H_j;
-  std::vector<f_t> H_x;
-
-  H_i.reserve(2 * q_nnz);
-  H_j.reserve(2 * q_nnz);
-  H_x.reserve(2 * q_nnz);
+  // Build Q + Q^T using optimized 2-pass algorithm (no COO intermediate)
+  // Memory: ~3× nnz, ~2x faster than original COO-based approach
+  i_t qn = size_offsets - 1;  // Number of variables
 
+  // Pass 1: Count entries per row in Q + Q^T
+  std::vector<i_t> row_counts(qn, 0);
   for (i_t i = 0; i < qn; ++i) {
-    i_t row_start = Q_offsets[i];
-    i_t row_end   = Q_offsets[i + 1];
-    for (i_t p = row_start; p < row_end; ++p) {
+    for (i_t p = Q_offsets[i]; p < Q_offsets[i + 1]; ++p) {
+      i_t j = Q_indices[p];
+      row_counts[i]++;
+      if (i != j) { row_counts[j]++; }
+    }
+  }
+
+  // Build temporary offsets via prefix sum
+  std::vector<i_t> temp_offsets(qn + 1);
+  temp_offsets[0] = 0;
+  for (i_t i = 0; i < qn; ++i) {
+    temp_offsets[i + 1] = temp_offsets[i] + row_counts[i];
+  }
+
+  i_t total_entries = temp_offsets[qn];
+  std::vector<i_t> temp_indices(total_entries);
+  std::vector<f_t> temp_values(total_entries);
+
+  // Pass 2: Fill entries directly
+  std::vector<i_t> row_pos = temp_offsets;  // Copy for tracking insertion positions
+
+  for (i_t i = 0; i < qn; ++i) {
+    for (i_t p = Q_offsets[i]; p < Q_offsets[i + 1]; ++p) {
       i_t j = Q_indices[p];
       f_t x = Q_values[p];
-      // Add H(i,j)
-      H_i.push_back(i);
-      H_j.push_back(j);
-      if (i == j) { H_x.push_back(2 * x); }
+
+      // Add entry (i, j) with value 2x for diagonal, x for off-diagonal
+      temp_indices[row_pos[i]] = j;
+      temp_values[row_pos[i]]  = (i == j) ? (2 * x) : x;
+      row_pos[i]++;
+
+      // Add transpose entry (j, i) if off-diagonal
       if (i != j) {
-        H_x.push_back(x);
-        // Add H(j,i)
-        H_i.push_back(j);
-        H_j.push_back(i);
-        H_x.push_back(x);
+        temp_indices[row_pos[j]] = i;
+        temp_values[row_pos[j]]  = x;
+        row_pos[j]++;
       }
     }
   }
-  // Convert H to CSR format
-  // Get row counts
-  i_t H_nz = H_x.size();
-  std::vector<i_t> H_row_counts(qn, 0);
-  for (i_t k = 0; k < H_nz; ++k) {
-    H_row_counts[H_i[k]]++;
-  }
-  std::vector<i_t> H_cumulative_counts(qn + 1, 0);
-  for (i_t k = 0; k < qn; ++k) {
-    H_cumulative_counts[k + 1] = H_cumulative_counts[k] + H_row_counts[k];
-  }
-  std::vector<i_t> H_row_starts = H_cumulative_counts;
-  std::vector<i_t> H_indices(H_nz);
-  std::vector<f_t> H_values(H_nz);
-  for (i_t k = 0; k < H_nz; ++k) {
-    i_t p        = H_cumulative_counts[H_i[k]]++;
-    H_indices[p] = H_j[k];
-    H_values[p]  = H_x[k];
-  }
-
-  // H_row_starts, H_indices, H_values are the CSR representation of H
-  // But this contains duplicate entries
 
+  // Pass 3: Deduplicate and build final CSR
   std::vector<i_t> workspace(qn, -1);
   Q_offsets_.resize(qn + 1);
-  std::fill(Q_offsets_.begin(), Q_offsets_.end(), 0);
-  Q_indices_.resize(H_nz);
-  Q_values_.resize(H_nz);
+  Q_indices_.resize(total_entries);
+  Q_values_.resize(total_entries);
+
   i_t nz = 0;
   for (i_t i = 0; i < qn; ++i) {
-    i_t q               = nz;  // row i will start at q
-    const i_t row_start = H_row_starts[i];
-    const i_t row_end   = H_row_starts[i + 1];
-    for (i_t p = row_start; p < row_end; ++p) {
-      i_t j = H_indices[p];
-      if (workspace[j] >= q) {
-        Q_values_[workspace[j]] += H_values[p];  // H(i,j) is duplicate
+    i_t row_start_out = nz;
+    Q_offsets_[i]     = row_start_out;
+
+    for (i_t p = temp_offsets[i]; p < temp_offsets[i + 1]; ++p) {
+      i_t j = temp_indices[p];
+      f_t x = temp_values[p];
+
+      if (workspace[j] >= row_start_out) {
+        Q_values_[workspace[j]] += x;
       } else {
-        workspace[j]   = nz;  // record where column j occurs
-        Q_indices_[nz] = j;   // keep H(i,j)
-        Q_values_[nz]  = H_values[p];
+        workspace[j]   = nz;
+        Q_indices_[nz] = j;
+        Q_values_[nz]  = x;
         nz++;
       }
     }
-    Q_offsets_[i] = q;  // record start of row i
   }
 
-  Q_offsets_[qn] = nz;  // finalize Q
+  Q_offsets_[qn] = nz;
   Q_indices_.resize(nz);
   Q_values_.resize(nz);
   // FIX ME:: check for positive semi definite matrix

From 4a73954a726d94bbf579253024ecb81ac1402ec3 Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Tue, 10 Feb 2026 15:37:18 -0800
Subject: [PATCH 2/5] separate out heper function

---
 cpp/src/pdlp/optimization_problem.cu        |   1 +
 cpp/src/utilities/sparse_matrix_helpers.hpp | 154 ++++++++++++++++++++
 2 files changed, 155 insertions(+)
 create mode 100644 cpp/src/utilities/sparse_matrix_helpers.hpp

diff --git a/cpp/src/pdlp/optimization_problem.cu b/cpp/src/pdlp/optimization_problem.cu
index 036214bee..8893ea9a4 100644
--- a/cpp/src/pdlp/optimization_problem.cu
+++ b/cpp/src/pdlp/optimization_problem.cu
@@ -15,6 +15,7 @@
 #include <mps_parser/writer.hpp>
 #include <utilities/copy_helpers.hpp>
 #include <utilities/logger.hpp>
+#include <utilities/sparse_matrix_helpers.hpp>
 
 #include <raft/core/copy.hpp>
 #include <raft/core/cuda_support.hpp>
diff --git a/cpp/src/utilities/sparse_matrix_helpers.hpp b/cpp/src/utilities/sparse_matrix_helpers.hpp
new file mode 100644
index 000000000..28e26c617
--- /dev/null
+++ b/cpp/src/utilities/sparse_matrix_helpers.hpp
@@ -0,0 +1,154 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#pragma once
+
+#include <vector>
+
+namespace cuopt {
+
+/**
+ * @brief Symmetrize a CSR matrix by computing A + A^T
+ *
+ * Given a CSR matrix A, computes the symmetric matrix H = A + A^T.
+ * Diagonal entries are doubled (A[i,i] + A[i,i] = 2*A[i,i]).
+ * Off-diagonal entries are summed (H[i,j] = A[i,j] + A[j,i]).
+ *
+ * This is useful for quadratic programming where the objective is
+ * (1/2) x^T Q x, and Q needs to be symmetric.
+ *
+ * @tparam i_t Integer type for indices
+ * @tparam f_t Floating point type for values
+ *
+ * @param[in] in_values   CSR values array
+ * @param[in] in_indices  CSR column indices array
+ * @param[in] in_offsets  CSR row offsets array (size = n_rows + 1)
+ * @param[in] n_rows      Number of rows (and columns, assuming square matrix)
+ * @param[out] out_values   Output CSR values
+ * @param[out] out_indices  Output CSR column indices
+ * @param[out] out_offsets  Output CSR row offsets
+ */
+template <typename i_t, typename f_t>
+void symmetrize_csr(const f_t* in_values,
+                    const i_t* in_indices,
+                    const i_t* in_offsets,
+                    i_t n_rows,
+                    std::vector<f_t>& out_values,
+                    std::vector<i_t>& out_indices,
+                    std::vector<i_t>& out_offsets)
+{
+  // Optimized 2-pass algorithm (no COO intermediate)
+  // Memory: ~3× nnz, ~2x faster than COO-based approach
+
+  // Pass 1: Count entries per row in A + A^T
+  std::vector<i_t> row_counts(n_rows, 0);
+  for (i_t i = 0; i < n_rows; ++i) {
+    for (i_t p = in_offsets[i]; p < in_offsets[i + 1]; ++p) {
+      i_t j = in_indices[p];
+      row_counts[i]++;
+      if (i != j) { row_counts[j]++; }
+    }
+  }
+
+  // Build temporary offsets via prefix sum
+  std::vector<i_t> temp_offsets(n_rows + 1);
+  temp_offsets[0] = 0;
+  for (i_t i = 0; i < n_rows; ++i) {
+    temp_offsets[i + 1] = temp_offsets[i] + row_counts[i];
+  }
+
+  i_t total_entries = temp_offsets[n_rows];
+  std::vector<i_t> temp_indices(total_entries);
+  std::vector<f_t> temp_values(total_entries);
+
+  // Pass 2: Fill entries directly
+  std::vector<i_t> row_pos = temp_offsets;  // Copy for tracking insertion positions
+
+  for (i_t i = 0; i < n_rows; ++i) {
+    for (i_t p = in_offsets[i]; p < in_offsets[i + 1]; ++p) {
+      i_t j = in_indices[p];
+      f_t x = in_values[p];
+
+      // Add entry (i, j) with value 2x for diagonal, x for off-diagonal
+      temp_indices[row_pos[i]] = j;
+      temp_values[row_pos[i]]  = (i == j) ? (2 * x) : x;
+      row_pos[i]++;
+
+      // Add transpose entry (j, i) if off-diagonal
+      if (i != j) {
+        temp_indices[row_pos[j]] = i;
+        temp_values[row_pos[j]]  = x;
+        row_pos[j]++;
+      }
+    }
+  }
+
+  // Pass 3: Deduplicate and build final CSR
+  std::vector<i_t> workspace(n_rows, -1);
+  out_offsets.resize(n_rows + 1);
+  out_indices.resize(total_entries);
+  out_values.resize(total_entries);
+
+  i_t nz = 0;
+  for (i_t i = 0; i < n_rows; ++i) {
+    i_t row_start_out = nz;
+    out_offsets[i]    = row_start_out;
+
+    for (i_t p = temp_offsets[i]; p < temp_offsets[i + 1]; ++p) {
+      i_t j = temp_indices[p];
+      f_t x = temp_values[p];
+
+      if (workspace[j] >= row_start_out) {
+        out_values[workspace[j]] += x;
+      } else {
+        workspace[j]    = nz;
+        out_indices[nz] = j;
+        out_values[nz]  = x;
+        nz++;
+      }
+    }
+  }
+
+  out_offsets[n_rows] = nz;
+  out_indices.resize(nz);
+  out_values.resize(nz);
+}
+
+/**
+ * @brief Symmetrize a CSR matrix in-place using std::vector
+ *
+ * Convenience overload that takes and returns std::vectors.
+ *
+ * @tparam i_t Integer type for indices
+ * @tparam f_t Floating point type for values
+ *
+ * @param[in] in_values   Input CSR values
+ * @param[in] in_indices  Input CSR column indices
+ * @param[in] in_offsets  Input CSR row offsets
+ * @param[out] out_values   Output CSR values (can be same as in_values)
+ * @param[out] out_indices  Output CSR column indices (can be same as in_indices)
+ * @param[out] out_offsets  Output CSR row offsets (can be same as in_offsets)
+ */
+template <typename i_t, typename f_t>
+void symmetrize_csr(const std::vector<f_t>& in_values,
+                    const std::vector<i_t>& in_indices,
+                    const std::vector<i_t>& in_offsets,
+                    std::vector<f_t>& out_values,
+                    std::vector<i_t>& out_indices,
+                    std::vector<i_t>& out_offsets)
+{
+  i_t n_rows = static_cast<i_t>(in_offsets.size()) - 1;
+  symmetrize_csr(in_values.data(),
+                 in_indices.data(),
+                 in_offsets.data(),
+                 n_rows,
+                 out_values,
+                 out_indices,
+                 out_offsets);
+}
+
+}  // namespace cuopt

From fd72185552d1f888db6eb079792c96d8580af798 Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Thu, 12 Feb 2026 11:07:57 -0800
Subject: [PATCH 3/5] Support quadobj writing to mps

---
 .../include/mps_parser/mps_writer.hpp         |  18 +-
 cpp/libmps_parser/src/mps_writer.cpp          | 139 ++++++++++
 cpp/libmps_parser/tests/mps_parser_test.cpp   | 237 +++++++++++++++++-
 3 files changed, 391 insertions(+), 3 deletions(-)

diff --git a/cpp/libmps_parser/include/mps_parser/mps_writer.hpp b/cpp/libmps_parser/include/mps_parser/mps_writer.hpp
index 0d16c6add..30f2fdf94 100644
--- a/cpp/libmps_parser/include/mps_parser/mps_writer.hpp
+++ b/cpp/libmps_parser/include/mps_parser/mps_writer.hpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -8,9 +8,11 @@
 #pragma once
 
 #include <mps_parser/data_model_view.hpp>
+#include <mps_parser/mps_data_model.hpp>
 
 #include <stdarg.h>
 #include <limits>
+#include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
@@ -31,10 +33,16 @@ class mps_writer_t {
    * @brief Ctor. Takes a data model view as input and writes it out as a MPS formatted file
    *
    * @param[in] problem Data model view to write
-   * @param[in] file Path to the MPS file to write
    */
   mps_writer_t(const data_model_view_t<i_t, f_t>& problem);
 
+  /**
+   * @brief Ctor. Takes a data model as input and writes it out as a MPS formatted file
+   *
+   * @param[in] problem Data model to write
+   */
+  mps_writer_t(const mps_data_model_t<i_t, f_t>& problem);
+
   /**
    * @brief Writes the problem to an MPS formatted file
    *
@@ -43,7 +51,13 @@ class mps_writer_t {
   void write(const std::string& mps_file_path);
 
  private:
+  // Owned view (created when constructing from mps_data_model_t)
+  std::unique_ptr<data_model_view_t<i_t, f_t>> owned_view_;
+  // Reference to the view (either external or owned)
   const data_model_view_t<i_t, f_t>& problem_;
+
+  // Helper to create view from data model
+  static data_model_view_t<i_t, f_t> create_view(const mps_data_model_t<i_t, f_t>& model);
 };  // class mps_writer_t
 
 }  // namespace cuopt::mps_parser
diff --git a/cpp/libmps_parser/src/mps_writer.cpp b/cpp/libmps_parser/src/mps_writer.cpp
index 4c562ec24..8111830df 100644
--- a/cpp/libmps_parser/src/mps_writer.cpp
+++ b/cpp/libmps_parser/src/mps_writer.cpp
@@ -8,7 +8,9 @@
 #include <mps_parser/mps_writer.hpp>
 
 #include <mps_parser/data_model_view.hpp>
+#include <mps_parser/mps_data_model.hpp>
 #include <utilities/error.hpp>
+#include <utilities/sparse_matrix_utils.hpp>
 
 #include <cmath>
 #include <fstream>
@@ -16,6 +18,7 @@
 #include <iostream>
 #include <limits>
 #include <map>
+#include <memory>
 
 namespace cuopt::mps_parser {
 
@@ -24,6 +27,92 @@ mps_writer_t<i_t, f_t>::mps_writer_t(const data_model_view_t<i_t, f_t>& problem)
 {
 }
 
+template <typename i_t, typename f_t>
+data_model_view_t<i_t, f_t> mps_writer_t<i_t, f_t>::create_view(
+  const mps_data_model_t<i_t, f_t>& model)
+{
+  data_model_view_t<i_t, f_t> view;
+
+  // Set basic data
+  view.set_maximize(model.get_sense());
+
+  // Constraint matrix
+  const auto& A_values  = model.get_constraint_matrix_values();
+  const auto& A_indices = model.get_constraint_matrix_indices();
+  const auto& A_offsets = model.get_constraint_matrix_offsets();
+  if (!A_values.empty()) {
+    view.set_csr_constraint_matrix(A_values.data(),
+                                   static_cast<i_t>(A_values.size()),
+                                   A_indices.data(),
+                                   static_cast<i_t>(A_indices.size()),
+                                   A_offsets.data(),
+                                   static_cast<i_t>(A_offsets.size()));
+  }
+
+  // Constraint bounds
+  const auto& b = model.get_constraint_bounds();
+  if (!b.empty()) { view.set_constraint_bounds(b.data(), static_cast<i_t>(b.size())); }
+
+  // Objective coefficients
+  const auto& c = model.get_objective_coefficients();
+  if (!c.empty()) { view.set_objective_coefficients(c.data(), static_cast<i_t>(c.size())); }
+
+  view.set_objective_scaling_factor(model.get_objective_scaling_factor());
+  view.set_objective_offset(model.get_objective_offset());
+
+  // Variable bounds
+  const auto& lb = model.get_variable_lower_bounds();
+  const auto& ub = model.get_variable_upper_bounds();
+  if (!lb.empty()) { view.set_variable_lower_bounds(lb.data(), static_cast<i_t>(lb.size())); }
+  if (!ub.empty()) { view.set_variable_upper_bounds(ub.data(), static_cast<i_t>(ub.size())); }
+
+  // Variable types
+  const auto& var_types = model.get_variable_types();
+  if (!var_types.empty()) {
+    view.set_variable_types(var_types.data(), static_cast<i_t>(var_types.size()));
+  }
+
+  // Row types
+  const auto& row_types = model.get_row_types();
+  if (!row_types.empty()) {
+    view.set_row_types(row_types.data(), static_cast<i_t>(row_types.size()));
+  }
+
+  // Constraint bounds (lower/upper)
+  const auto& cl = model.get_constraint_lower_bounds();
+  const auto& cu = model.get_constraint_upper_bounds();
+  if (!cl.empty()) { view.set_constraint_lower_bounds(cl.data(), static_cast<i_t>(cl.size())); }
+  if (!cu.empty()) { view.set_constraint_upper_bounds(cu.data(), static_cast<i_t>(cu.size())); }
+
+  // Names
+  view.set_problem_name(model.get_problem_name());
+  view.set_objective_name(model.get_objective_name());
+  view.set_variable_names(model.get_variable_names());
+  view.set_row_names(model.get_row_names());
+
+  // Quadratic objective
+  const auto& Q_values  = model.get_quadratic_objective_values();
+  const auto& Q_indices = model.get_quadratic_objective_indices();
+  const auto& Q_offsets = model.get_quadratic_objective_offsets();
+  if (!Q_values.empty()) {
+    view.set_quadratic_objective_matrix(Q_values.data(),
+                                        static_cast<i_t>(Q_values.size()),
+                                        Q_indices.data(),
+                                        static_cast<i_t>(Q_indices.size()),
+                                        Q_offsets.data(),
+                                        static_cast<i_t>(Q_offsets.size()));
+  }
+
+  return view;
+}
+
+template <typename i_t, typename f_t>
+mps_writer_t<i_t, f_t>::mps_writer_t(const mps_data_model_t<i_t, f_t>& problem)
+  : owned_view_(std::make_unique<data_model_view_t<i_t, f_t>>(create_view(problem))),
+    problem_(*owned_view_)
+{
+}
+
 template <typename i_t, typename f_t>
 void mps_writer_t<i_t, f_t>::write(const std::string& mps_file_path)
 {
@@ -280,6 +369,56 @@ void mps_writer_t<i_t, f_t>::write(const std::string& mps_file_path)
     }
   }
 
+  // QUADOBJ section for quadratic objective terms (if present)
+  // MPS format: QUADOBJ stores upper triangular elements (row <= col)
+  // MPS uses (1/2) x^T H x, cuOpt uses x^T Q x
+  // For equivalence: H[i,j] = Q[i,j] + Q[j,i] (works for both diagonal and off-diagonal)
+  // We symmetrize Q first (H = Q + Q^T), then extract upper triangular
+  if (problem_.has_quadratic_objective()) {
+    auto Q_values_span  = problem_.get_quadratic_objective_values();
+    auto Q_indices_span = problem_.get_quadratic_objective_indices();
+    auto Q_offsets_span = problem_.get_quadratic_objective_offsets();
+
+    // Copy span data to local vectors for indexed access
+    std::vector<f_t> Q_values(Q_values_span.data(), Q_values_span.data() + Q_values_span.size());
+    std::vector<i_t> Q_indices(Q_indices_span.data(),
+                               Q_indices_span.data() + Q_indices_span.size());
+    std::vector<i_t> Q_offsets(Q_offsets_span.data(),
+                               Q_offsets_span.data() + Q_offsets_span.size());
+
+    if (Q_values.size() > 0) {
+      // Symmetrize Q: compute H = Q + Q^T
+      std::vector<f_t> H_values;
+      std::vector<i_t> H_indices;
+      std::vector<i_t> H_offsets;
+      symmetrize_csr(Q_values, Q_indices, Q_offsets, H_values, H_indices, H_offsets);
+
+      i_t n_rows = static_cast<i_t>(H_offsets.size()) - 1;
+
+      mps_file << "QUADOBJ\n";
+
+      // Write upper triangular entries from symmetric H
+      for (i_t i = 0; i < n_rows; ++i) {
+        std::string row_name = static_cast<size_t>(i) < problem_.get_variable_names().size()
+                                 ? problem_.get_variable_names()[i]
+                                 : "C" + std::to_string(i);
+
+        for (i_t p = H_offsets[i]; p < H_offsets[i + 1]; ++p) {
+          i_t j = H_indices[p];
+          f_t v = H_values[p];
+
+          // Only write upper triangular (i <= j)
+          if (i <= j && v != f_t(0)) {
+            std::string col_name = static_cast<size_t>(j) < problem_.get_variable_names().size()
+                                     ? problem_.get_variable_names()[j]
+                                     : "C" + std::to_string(j);
+            mps_file << "    " << row_name << " " << col_name << " " << v << "\n";
+          }
+        }
+      }
+    }
+  }
+
   mps_file << "ENDATA\n";
   mps_file.close();
 }
diff --git a/cpp/libmps_parser/tests/mps_parser_test.cpp b/cpp/libmps_parser/tests/mps_parser_test.cpp
index 510719d02..f915fb2df 100644
--- a/cpp/libmps_parser/tests/mps_parser_test.cpp
+++ b/cpp/libmps_parser/tests/mps_parser_test.cpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -8,10 +8,12 @@
 #include <utilities/common_utils.hpp>
 
 #include <mps_parser.hpp>
+#include <mps_parser/mps_writer.hpp>
 #include <mps_parser/parser.hpp>
 
 #include <gtest/gtest.h>
 
+#include <cmath>
 #include <cstdint>
 #include <filesystem>
 #include <sstream>
@@ -892,4 +894,237 @@ TEST(qps_parser, test_qps_files)
   }
 }
 
+// ================================================================================================
+// MPS Round-Trip Tests (Read -> Write -> Read -> Compare)
+// ================================================================================================
+
+// Helper function to compare two data models
+template <typename i_t, typename f_t>
+void compare_data_models(const mps_data_model_t<i_t, f_t>& original,
+                         const mps_data_model_t<i_t, f_t>& reloaded,
+                         f_t tol = 1e-9)
+{
+  // Compare basic dimensions
+  EXPECT_EQ(original.get_n_variables(), reloaded.get_n_variables());
+  EXPECT_EQ(original.get_n_constraints(), reloaded.get_n_constraints());
+
+  // Compare objective coefficients
+  auto orig_c   = original.get_objective_coefficients();
+  auto reload_c = reloaded.get_objective_coefficients();
+  ASSERT_EQ(orig_c.size(), reload_c.size());
+  for (size_t i = 0; i < orig_c.size(); ++i) {
+    EXPECT_NEAR(orig_c[i], reload_c[i], tol) << "Objective coefficient mismatch at index " << i;
+  }
+
+  // Compare constraint matrix values
+  auto orig_A   = original.get_constraint_matrix_values();
+  auto reload_A = reloaded.get_constraint_matrix_values();
+  ASSERT_EQ(orig_A.size(), reload_A.size());
+  for (size_t i = 0; i < orig_A.size(); ++i) {
+    EXPECT_NEAR(orig_A[i], reload_A[i], tol) << "Constraint matrix value mismatch at index " << i;
+  }
+
+  // Compare constraint matrix indices
+  auto orig_A_idx   = original.get_constraint_matrix_indices();
+  auto reload_A_idx = reloaded.get_constraint_matrix_indices();
+  ASSERT_EQ(orig_A_idx.size(), reload_A_idx.size());
+  for (size_t i = 0; i < orig_A_idx.size(); ++i) {
+    EXPECT_EQ(orig_A_idx[i], reload_A_idx[i]) << "Constraint matrix index mismatch at index " << i;
+  }
+
+  // Compare constraint matrix offsets
+  auto orig_A_off   = original.get_constraint_matrix_offsets();
+  auto reload_A_off = reloaded.get_constraint_matrix_offsets();
+  ASSERT_EQ(orig_A_off.size(), reload_A_off.size());
+  for (size_t i = 0; i < orig_A_off.size(); ++i) {
+    EXPECT_EQ(orig_A_off[i], reload_A_off[i]) << "Constraint matrix offset mismatch at index " << i;
+  }
+
+  // Compare variable bounds
+  auto orig_lb   = original.get_variable_lower_bounds();
+  auto reload_lb = reloaded.get_variable_lower_bounds();
+  ASSERT_EQ(orig_lb.size(), reload_lb.size());
+  for (size_t i = 0; i < orig_lb.size(); ++i) {
+    if (std::isinf(orig_lb[i]) && std::isinf(reload_lb[i])) {
+      EXPECT_EQ(std::signbit(orig_lb[i]), std::signbit(reload_lb[i]))
+        << "Variable lower bound infinity sign mismatch at index " << i;
+    } else {
+      EXPECT_NEAR(orig_lb[i], reload_lb[i], tol) << "Variable lower bound mismatch at index " << i;
+    }
+  }
+
+  auto orig_ub   = original.get_variable_upper_bounds();
+  auto reload_ub = reloaded.get_variable_upper_bounds();
+  ASSERT_EQ(orig_ub.size(), reload_ub.size());
+  for (size_t i = 0; i < orig_ub.size(); ++i) {
+    if (std::isinf(orig_ub[i]) && std::isinf(reload_ub[i])) {
+      EXPECT_EQ(std::signbit(orig_ub[i]), std::signbit(reload_ub[i]))
+        << "Variable upper bound infinity sign mismatch at index " << i;
+    } else {
+      EXPECT_NEAR(orig_ub[i], reload_ub[i], tol) << "Variable upper bound mismatch at index " << i;
+    }
+  }
+
+  // Compare constraint bounds
+  auto orig_cl   = original.get_constraint_lower_bounds();
+  auto reload_cl = reloaded.get_constraint_lower_bounds();
+  ASSERT_EQ(orig_cl.size(), reload_cl.size());
+  for (size_t i = 0; i < orig_cl.size(); ++i) {
+    if (std::isinf(orig_cl[i]) && std::isinf(reload_cl[i])) {
+      EXPECT_EQ(std::signbit(orig_cl[i]), std::signbit(reload_cl[i]))
+        << "Constraint lower bound infinity sign mismatch at index " << i;
+    } else {
+      EXPECT_NEAR(orig_cl[i], reload_cl[i], tol)
+        << "Constraint lower bound mismatch at index " << i;
+    }
+  }
+
+  auto orig_cu   = original.get_constraint_upper_bounds();
+  auto reload_cu = reloaded.get_constraint_upper_bounds();
+  ASSERT_EQ(orig_cu.size(), reload_cu.size());
+  for (size_t i = 0; i < orig_cu.size(); ++i) {
+    if (std::isinf(orig_cu[i]) && std::isinf(reload_cu[i])) {
+      EXPECT_EQ(std::signbit(orig_cu[i]), std::signbit(reload_cu[i]))
+        << "Constraint upper bound infinity sign mismatch at index " << i;
+    } else {
+      EXPECT_NEAR(orig_cu[i], reload_cu[i], tol)
+        << "Constraint upper bound mismatch at index " << i;
+    }
+  }
+
+  // Compare quadratic objective if present
+  EXPECT_EQ(original.has_quadratic_objective(), reloaded.has_quadratic_objective());
+  if (original.has_quadratic_objective() && reloaded.has_quadratic_objective()) {
+    auto orig_Q       = original.get_quadratic_objective_values();
+    auto orig_Q_idx   = original.get_quadratic_objective_indices();
+    auto orig_Q_off   = original.get_quadratic_objective_offsets();
+    auto reload_Q     = reloaded.get_quadratic_objective_values();
+    auto reload_Q_idx = reloaded.get_quadratic_objective_indices();
+    auto reload_Q_off = reloaded.get_quadratic_objective_offsets();
+
+    // Compare Q matrix structure and values
+    ASSERT_EQ(orig_Q.size(), reload_Q.size()) << "Q values size mismatch";
+    ASSERT_EQ(orig_Q_idx.size(), reload_Q_idx.size()) << "Q indices size mismatch";
+    ASSERT_EQ(orig_Q_off.size(), reload_Q_off.size()) << "Q offsets size mismatch";
+
+    for (size_t i = 0; i < orig_Q.size(); ++i) {
+      EXPECT_NEAR(orig_Q[i], reload_Q[i], tol) << "Q value mismatch at index " << i;
+    }
+    for (size_t i = 0; i < orig_Q_idx.size(); ++i) {
+      EXPECT_EQ(orig_Q_idx[i], reload_Q_idx[i]) << "Q index mismatch at index " << i;
+    }
+    for (size_t i = 0; i < orig_Q_off.size(); ++i) {
+      EXPECT_EQ(orig_Q_off[i], reload_Q_off[i]) << "Q offset mismatch at index " << i;
+    }
+  }
+}
+
+TEST(mps_roundtrip, linear_programming_basic)
+{
+  std::string input_file =
+    cuopt::test::get_rapids_dataset_root_dir() + "/linear_programming/good-mps-1.mps";
+  std::string temp_file = "/tmp/mps_roundtrip_lp_test.mps";
+
+  // Read original
+  auto original = parse_mps<int, double>(input_file, true);
+
+  // Write to temp file
+  mps_writer_t<int, double> writer(original);
+  writer.write(temp_file);
+
+  // Read back
+  auto reloaded = parse_mps<int, double>(temp_file, false);
+
+  // Compare
+  compare_data_models(original, reloaded);
+
+  // Cleanup
+  std::filesystem::remove(temp_file);
+}
+
+TEST(mps_roundtrip, linear_programming_with_bounds)
+{
+  if (!file_exists("linear_programming/lp_model_with_var_bounds.mps")) {
+    GTEST_SKIP() << "Test file not found";
+  }
+
+  std::string input_file =
+    cuopt::test::get_rapids_dataset_root_dir() + "/linear_programming/lp_model_with_var_bounds.mps";
+  std::string temp_file = "/tmp/mps_roundtrip_lp_bounds_test.mps";
+
+  // Read original
+  auto original = parse_mps<int, double>(input_file, false);
+
+  // Write to temp file
+  mps_writer_t<int, double> writer(original);
+  writer.write(temp_file);
+
+  // Read back
+  auto reloaded = parse_mps<int, double>(temp_file, false);
+
+  // Compare
+  compare_data_models(original, reloaded);
+
+  // Cleanup
+  std::filesystem::remove(temp_file);
+}
+
+TEST(mps_roundtrip, quadratic_programming_qp_test_1)
+{
+  if (!file_exists("quadratic_programming/QP_Test_1.qps")) {
+    GTEST_SKIP() << "Test file not found";
+  }
+
+  std::string input_file =
+    cuopt::test::get_rapids_dataset_root_dir() + "/quadratic_programming/QP_Test_1.qps";
+  std::string temp_file = "/tmp/mps_roundtrip_qp_test_1.mps";
+
+  // Read original
+  auto original = parse_mps<int, double>(input_file, false);
+  ASSERT_TRUE(original.has_quadratic_objective()) << "Original should have quadratic objective";
+
+  // Write to temp file
+  mps_writer_t<int, double> writer(original);
+  writer.write(temp_file);
+
+  // Read back
+  auto reloaded = parse_mps<int, double>(temp_file, false);
+  ASSERT_TRUE(reloaded.has_quadratic_objective()) << "Reloaded should have quadratic objective";
+
+  // Compare
+  compare_data_models(original, reloaded);
+
+  // Cleanup
+  std::filesystem::remove(temp_file);
+}
+
+TEST(mps_roundtrip, quadratic_programming_qp_test_2)
+{
+  if (!file_exists("quadratic_programming/QP_Test_2.qps")) {
+    GTEST_SKIP() << "Test file not found";
+  }
+
+  std::string input_file =
+    cuopt::test::get_rapids_dataset_root_dir() + "/quadratic_programming/QP_Test_2.qps";
+  std::string temp_file = "/tmp/mps_roundtrip_qp_test_2.mps";
+
+  // Read original
+  auto original = parse_mps<int, double>(input_file, false);
+  ASSERT_TRUE(original.has_quadratic_objective()) << "Original should have quadratic objective";
+
+  // Write to temp file
+  mps_writer_t<int, double> writer(original);
+  writer.write(temp_file);
+
+  // Read back
+  auto reloaded = parse_mps<int, double>(temp_file, false);
+  ASSERT_TRUE(reloaded.has_quadratic_objective()) << "Reloaded should have quadratic objective";
+
+  // Compare
+  compare_data_models(original, reloaded);
+
+  // Cleanup
+  std::filesystem::remove(temp_file);
+}
+
 }  // namespace cuopt::mps_parser

From 6a165d232022dbdbccee65b8f2302658f4545df5 Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Fri, 13 Feb 2026 07:56:35 -0800
Subject: [PATCH 4/5] add missing header

---
 .../src/utilities/sparse_matrix_utils.hpp     | 137 ++++++++++++++++++
 1 file changed, 137 insertions(+)
 create mode 100644 cpp/libmps_parser/src/utilities/sparse_matrix_utils.hpp

diff --git a/cpp/libmps_parser/src/utilities/sparse_matrix_utils.hpp b/cpp/libmps_parser/src/utilities/sparse_matrix_utils.hpp
new file mode 100644
index 000000000..72a9472df
--- /dev/null
+++ b/cpp/libmps_parser/src/utilities/sparse_matrix_utils.hpp
@@ -0,0 +1,137 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#pragma once
+
+#include <vector>
+
+namespace cuopt::mps_parser {
+
+/**
+ * @brief Symmetrize a CSR matrix by computing A + A^T
+ *
+ * Given a CSR matrix A, computes the symmetric matrix H = A + A^T.
+ * Diagonal entries are doubled (A[i,i] + A[i,i] = 2*A[i,i]).
+ * Off-diagonal entries are summed (H[i,j] = A[i,j] + A[j,i]).
+ *
+ * @tparam i_t Integer type for indices
+ * @tparam f_t Floating point type for values
+ *
+ * @param[in] in_values   CSR values array
+ * @param[in] in_indices  CSR column indices array
+ * @param[in] in_offsets  CSR row offsets array (size = n_rows + 1)
+ * @param[in] n_rows      Number of rows (and columns, assuming square matrix)
+ * @param[out] out_values   Output CSR values
+ * @param[out] out_indices  Output CSR column indices
+ * @param[out] out_offsets  Output CSR row offsets
+ */
+template <typename i_t, typename f_t>
+void symmetrize_csr(const f_t* in_values,
+                    const i_t* in_indices,
+                    const i_t* in_offsets,
+                    i_t n_rows,
+                    std::vector<f_t>& out_values,
+                    std::vector<i_t>& out_indices,
+                    std::vector<i_t>& out_offsets)
+{
+  // Optimized 3-pass algorithm
+  // Pass 1: Count entries per row in A + A^T
+  std::vector<i_t> row_counts(n_rows, 0);
+  for (i_t i = 0; i < n_rows; ++i) {
+    for (i_t p = in_offsets[i]; p < in_offsets[i + 1]; ++p) {
+      i_t j = in_indices[p];
+      row_counts[i]++;
+      if (i != j) { row_counts[j]++; }
+    }
+  }
+
+  // Build temporary offsets via prefix sum
+  std::vector<i_t> temp_offsets(n_rows + 1);
+  temp_offsets[0] = 0;
+  for (i_t i = 0; i < n_rows; ++i) {
+    temp_offsets[i + 1] = temp_offsets[i] + row_counts[i];
+  }
+
+  i_t total_entries = temp_offsets[n_rows];
+  std::vector<i_t> temp_indices(total_entries);
+  std::vector<f_t> temp_values(total_entries);
+
+  // Pass 2: Fill entries directly
+  std::vector<i_t> row_pos = temp_offsets;  // Copy for tracking insertion positions
+
+  for (i_t i = 0; i < n_rows; ++i) {
+    for (i_t p = in_offsets[i]; p < in_offsets[i + 1]; ++p) {
+      i_t j = in_indices[p];
+      f_t x = in_values[p];
+
+      // Add entry (i, j) with value 2x for diagonal, x for off-diagonal
+      temp_indices[row_pos[i]] = j;
+      temp_values[row_pos[i]]  = (i == j) ? (2 * x) : x;
+      row_pos[i]++;
+
+      // Add transpose entry (j, i) if off-diagonal
+      if (i != j) {
+        temp_indices[row_pos[j]] = i;
+        temp_values[row_pos[j]]  = x;
+        row_pos[j]++;
+      }
+    }
+  }
+
+  // Pass 3: Deduplicate and build final CSR
+  std::vector<i_t> workspace(n_rows, -1);
+  out_offsets.resize(n_rows + 1);
+  out_indices.resize(total_entries);
+  out_values.resize(total_entries);
+
+  i_t nz = 0;
+  for (i_t i = 0; i < n_rows; ++i) {
+    i_t row_start_out = nz;
+    out_offsets[i]    = row_start_out;
+
+    for (i_t p = temp_offsets[i]; p < temp_offsets[i + 1]; ++p) {
+      i_t j = temp_indices[p];
+      f_t x = temp_values[p];
+
+      if (workspace[j] >= row_start_out) {
+        out_values[workspace[j]] += x;
+      } else {
+        workspace[j]    = nz;
+        out_indices[nz] = j;
+        out_values[nz]  = x;
+        nz++;
+      }
+    }
+  }
+
+  out_offsets[n_rows] = nz;
+  out_indices.resize(nz);
+  out_values.resize(nz);
+}
+
+/**
+ * @brief Symmetrize a CSR matrix using std::vector (convenience overload)
+ */
+template <typename i_t, typename f_t>
+void symmetrize_csr(const std::vector<f_t>& in_values,
+                    const std::vector<i_t>& in_indices,
+                    const std::vector<i_t>& in_offsets,
+                    std::vector<f_t>& out_values,
+                    std::vector<i_t>& out_indices,
+                    std::vector<i_t>& out_offsets)
+{
+  i_t n_rows = static_cast<i_t>(in_offsets.size()) - 1;
+  symmetrize_csr(in_values.data(),
+                 in_indices.data(),
+                 in_offsets.data(),
+                 n_rows,
+                 out_values,
+                 out_indices,
+                 out_offsets);
+}
+
+}  // namespace cuopt::mps_parser

From 7654cccbce7334898e4f6c47f6e12572017f5621 Mon Sep 17 00:00:00 2001
From: Rajesh Gandham <rgandham@nvidia.com>
Date: Thu, 5 Mar 2026 06:52:50 -0800
Subject: [PATCH 5/5] Use kahan summation for post solve residual computation

---
 cpp/src/barrier/barrier.cu        |  2 ++
 cpp/src/dual_simplex/presolve.cpp | 17 ++++++++++++-----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu
index 075323744..0d52b5a75 100644
--- a/cpp/src/barrier/barrier.cu
+++ b/cpp/src/barrier/barrier.cu
@@ -3481,6 +3481,8 @@ lp_status_t barrier_solver_t<i_t, f_t>::solve(f_t start_time,
     f_t norm_b = vector_norm_inf<i_t, f_t>(data.b, stream_view_);
     f_t norm_c = vector_norm_inf<i_t, f_t>(data.c, stream_view_);
 
+    settings.log.printf("norm_b: %.2e, norm_c: %.2e\n", norm_b, norm_c);
+
     f_t quad_objective = 0.0;
     if (data.Q.n > 0) {
       dense_vector_t<i_t, f_t> Qx(data.Q.n);
diff --git a/cpp/src/dual_simplex/presolve.cpp b/cpp/src/dual_simplex/presolve.cpp
index b9ee41951..d2a68d96d 100644
--- a/cpp/src/dual_simplex/presolve.cpp
+++ b/cpp/src/dual_simplex/presolve.cpp
@@ -891,14 +891,19 @@ i_t presolve(const lp_problem_t<i_t, f_t>& original,
       }
     }
 
+    std::vector<f_t> kahan_compensation(problem.num_rows, 0.0);
     for (i_t j = 0; j < problem.num_cols; j++) {
       if (lower_bounds_removed[j]) {
         i_t col_start = problem.A.col_start[j];
         i_t col_end   = problem.A.col_start[j + 1];
         for (i_t p = col_start; p < col_end; p++) {
-          i_t i   = problem.A.i[p];
-          f_t aij = problem.A.x[p];
-          problem.rhs[i] -= aij * problem.lower[j];
+          i_t i                 = problem.A.i[p];
+          f_t aij               = problem.A.x[p];
+          f_t val               = -aij * problem.lower[j];
+          f_t y                 = val - kahan_compensation[i];
+          f_t t                 = problem.rhs[i] + y;
+          kahan_compensation[i] = (t - problem.rhs[i]) - y;
+          problem.rhs[i]        = t;
         }
         problem.obj_constant += old_objective[j] * problem.lower[j];
         problem.upper[j] -= problem.lower[j];
@@ -1496,13 +1501,15 @@ void uncrush_solution(const presolve_info_t<i_t, f_t>& presolve_info,
   }
 
   if (presolve_info.removed_lower_bounds.size() > 0) {
-    settings.log.printf("Post-solve: Handling removed lower bounds %d\n",
-                        presolve_info.removed_lower_bounds.size());
+    i_t num_lower_bounds = 0;
+
     // We removed some lower bounds so we need to map the crushed solution back to the original
     // variables
     for (i_t j = 0; j < input_x.size(); j++) {
+      if (presolve_info.removed_lower_bounds[j] != 0.0) { num_lower_bounds++; }
       input_x[j] += presolve_info.removed_lower_bounds[j];
     }
+    settings.log.printf("Post-solve: Handling removed lower bounds %d\n", num_lower_bounds);
   }
   assert(uncrushed_x.size() == input_x.size());
   assert(uncrushed_y.size() == input_y.size());