diff --git a/.cursor/AGENTS.md b/.agents/AGENTS.md similarity index 100% rename from .cursor/AGENTS.md rename to .agents/AGENTS.md diff --git a/.cursor/rules/skill-evolution.mdc b/.agents/rules/skill-evolution.mdc similarity index 100% rename from .cursor/rules/skill-evolution.mdc rename to .agents/rules/skill-evolution.mdc diff --git a/.cursor/skills b/.agents/skills similarity index 100% rename from .cursor/skills rename to .agents/skills diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 4c5df380f6..6b17bbe882 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -4,69 +4,51 @@ "name": "NVIDIA" }, "metadata": { - "description": "Agent skills for NVIDIA cuOpt: routing (VRP, TSP, PDP), LP/MILP/QP, installation (Python/C/developer), and REST server.", - "version": "26.04.00" + "description": "Agent skills for NVIDIA cuOpt: routing (VRP, TSP, PDP), LP/MILP/QP, installation (user/developer), and REST server.", + "version": "26.06.00" }, "plugins": [ { "name": "cuopt-user-rules", "source": "./skills/cuopt-user-rules", "skills": "./", - "description": "Base behavior rules for using NVIDIA cuOpt. Read first when helping users with cuOpt (routing, LP/MILP, QP, installation, server)." + "description": "Base rules for end users calling NVIDIA cuOpt (routing/LP/MILP/QP/install/server). Not for cuOpt internals — use cuopt-developer for those." }, { "name": "cuopt-developer", "source": "./skills/cuopt-developer", "skills": "./", - "description": "Contribute to NVIDIA cuOpt codebase including C++/CUDA, Python, server, docs, and CI. Use when the user wants to modify solver internals, add features, submit PRs, or understand the codebase architecture." + "description": "Modify, build, test, debug, and contribute to NVIDIA cuOpt (C++/CUDA, Python, server, CI). Use for solver internals, PRs, DCO, and code conventions." 
}, { - "name": "cuopt-installation-common", - "source": "./skills/cuopt-installation-common", + "name": "cuopt-install", + "source": "./skills/cuopt-install", "skills": "./", - "description": "Install cuOpt — system and environment requirements only. Domain concepts; no install commands or interface guidance." + "description": "Install cuOpt for Python, C, or as a server (pip, conda, Docker) — system requirements, install commands, and verification. Use when the user wants to install or verify cuOpt for any user-facing interface." }, { - "name": "cuopt-installation-api-python", - "source": "./skills/cuopt-installation-api-python", + "name": "numerical-optimization-formulation", + "source": "./skills/numerical-optimization-formulation", "skills": "./", - "description": "Install cuOpt for Python — pip, conda, Docker, verification. Use when the user is installing or verifying the Python API." + "description": "Numerical optimization (LP, MILP, QP) — concepts, problem-text parsing, and formulation patterns. What LP, MILP, and QP are, required formulation questions, modeling elements, common patterns, and how to parse problem statements (parameters, constraints, decisions, objective). Domain concepts; no API or interface." }, { - "name": "cuopt-installation-api-c", - "source": "./skills/cuopt-installation-api-c", + "name": "cuopt-numerical-optimization-api-python", + "source": "./skills/cuopt-numerical-optimization-api-python", "skills": "./", - "description": "Install cuOpt for C — conda, locate lib/headers, verification. Use when the user is installing or verifying the C API." + "description": "Solve LP, MILP, and QP (beta) with the Python API. Use when the user asks about optimization with linear or quadratic objectives, linear constraints, integer variables, scheduling, resource allocation, facility location, production planning, portfolio optimization, or least squares." 
}, { - "name": "cuopt-installation-developer", - "source": "./skills/cuopt-installation-developer", + "name": "cuopt-numerical-optimization-api-c", + "source": "./skills/cuopt-numerical-optimization-api-c", "skills": "./", - "description": "Developer installation — build cuOpt from source, run tests. Use when the user wants to set up a dev environment to contribute or modify cuOpt." + "description": "LP, MILP, and QP (beta) with cuOpt — C API only. Use when the user is embedding LP, MILP, or QP in C/C++." }, { - "name": "lp-milp-formulation", - "source": "./skills/lp-milp-formulation", + "name": "cuopt-numerical-optimization-api-cli", + "source": "./skills/cuopt-numerical-optimization-api-cli", "skills": "./", - "description": "LP/MILP concepts and going from problem text to formulation. What LP/MILP are, required formulation questions, typical modeling elements, and how to parse problem statements." - }, - { - "name": "cuopt-lp-milp-api-python", - "source": "./skills/cuopt-lp-milp-api-python", - "skills": "./", - "description": "Solve LP and MILP with the Python API. Use when the user asks about optimization with linear constraints, integer variables, scheduling, resource allocation, facility location, or production planning." - }, - { - "name": "cuopt-lp-milp-api-c", - "source": "./skills/cuopt-lp-milp-api-c", - "skills": "./", - "description": "LP and MILP with cuOpt — C API only. Use when the user is embedding LP/MILP in C/C++." - }, - { - "name": "cuopt-lp-milp-api-cli", - "source": "./skills/cuopt-lp-milp-api-cli", - "skills": "./", - "description": "LP and MILP with cuOpt — CLI only (MPS files, cuopt_cli). Use when the user is solving from MPS via command line." + "description": "LP, MILP, and QP (beta) with cuOpt — CLI only (MPS files, cuopt_cli). Use when the user is solving LP, MILP, or QP from MPS via command line." 
}, { "name": "routing-formulation", @@ -80,30 +62,6 @@ "skills": "./", "description": "Vehicle routing (VRP, TSP, PDP) with cuOpt — Python API only. Use when the user is building or solving routing in Python." }, - { - "name": "qp-formulation", - "source": "./skills/qp-formulation", - "skills": "./", - "description": "Quadratic Programming (QP) — problem form and constraints. Domain concepts; no API or interface. QP is beta." - }, - { - "name": "cuopt-qp-api-python", - "source": "./skills/cuopt-qp-api-python", - "skills": "./", - "description": "Quadratic Programming (QP) with cuOpt — Python API only (beta). Use when the user is building or solving QP in Python." - }, - { - "name": "cuopt-qp-api-c", - "source": "./skills/cuopt-qp-api-c", - "skills": "./", - "description": "Quadratic Programming (QP) with cuOpt — C API. Use when the user is embedding QP in C/C++." - }, - { - "name": "cuopt-qp-api-cli", - "source": "./skills/cuopt-qp-api-cli", - "skills": "./", - "description": "QP with cuOpt — CLI (e.g. cuopt_cli with QP-capable input). Use when the user is solving QP from the command line." - }, { "name": "cuopt-server-common", "source": "./skills/cuopt-server-common", diff --git a/.clinerules b/.clinerules new file mode 120000 index 0000000000..47dc3e3d86 --- /dev/null +++ b/.clinerules @@ -0,0 +1 @@ +AGENTS.md \ No newline at end of file diff --git a/.coderabbit.yaml b/.coderabbit.yaml index 28e0835568..1fe59b5032 100644 --- a/.coderabbit.yaml +++ b/.coderabbit.yaml @@ -27,33 +27,219 @@ reviews: request_changes_workflow: false review_status: false - # Path-specific review instructions + # Exclude paths CodeRabbit should not review. + # Formatting/lint/style for these is already handled (or not applicable). + # datasets/ is NOT fully excluded — shell scripts there download test data + # and are worth reviewing; only the bulk data files are filtered out. 
+ path_filters: + - "!thirdparty/**" + - "!notebooks/**" + - "!docs/**/_build/**" + - "!**/*.mps" + - "!**/*.qps" + - "!**/*.lp" + - "!datasets/**/*.json" + - "!datasets/**/*.yaml" + - "!datasets/**/*.yml" + - "!datasets/**/*.txt" + - "!regression/**/*.json" + - "!regression/**/*.csv" + - "!regression/**/*.html" + + # Path-specific review instructions. + # Keep each block narrow — the main review guide is in + # .github/.coderabbit_review_guide.md (see knowledge_base below). path_instructions: - path: "docs/**/*" instructions: | For documentation changes, focus on: - - Accuracy: Verify code examples compile and run correctly - - Completeness: Check if API changes (parameters, return values, errors) are documented - - Clarity: Flag confusing explanations, missing prerequisites, or unclear examples - - Consistency: Version numbers, parameter types, and terminology match code - - Examples: Suggest adding examples for complex features or new APIs - - Missing docs: If PR changes public APIs without updating docs, flag as HIGH priority - - When code changes affect docs: - - Suggest specific doc files that need updates (e.g., docs/cuopt/api.rst) - - Identify outdated information contradicting the code changes - - Recommend documenting performance characteristics, GPU requirements, or numerical tolerances + - Accuracy: verify code examples compile and run correctly + - Completeness: check if API changes (parameters, return values, errors) are documented + - Clarity: flag confusing explanations, missing prerequisites, unclear examples + - Consistency: version numbers, parameter types, and terminology match code + - Missing docs: if the PR changes public APIs without updating docs, flag as HIGH + + When code changes affect docs, suggest specific files (e.g. docs/cuopt/source/*.rst) + and recommend documenting performance characteristics, GPU requirements, or tolerances. 
- path: "cpp/include/cuopt/**/*" instructions: | - For public header files (C++ API): - - Check if new public functions/classes have documentation comments (Doxygen format) + Public C++ headers: + - New public functions/classes need Doxygen-style documentation - Flag API changes that may need corresponding docs/ updates - Verify parameter descriptions match actual types/behavior - - Suggest documenting thread-safety, GPU requirements, and numerical behavior - - For breaking changes, recommend updating docs and migration guides + - Suggest documenting thread-safety, GPU requirements, numerical behavior + - For breaking changes, recommend migration notes + + - path: "cpp/include/cuopt/linear_programming/cuopt_c.h" + instructions: | + This is the C ABI surface. Flag ANY change to struct layout, function + signatures, enum values, or typedef shape as potentially ABI-breaking. + Ask the author to confirm the change is intentional and documented, + since there is no formal ABI-versioning macro today. Do not suggest + adding one unless the PR is specifically about API stability. + + - path: "cpp/src/**/*.{cu,cuh}" + instructions: | + CUDA source files. Apply "CUDA / GPU — cuOpt idioms" from + .github/.coderabbit_review_guide.md. Do NOT comment on formatting + (clang-format handles it) or exception use (cuOpt uses exceptions + as its canonical error mechanism). + + - path: "cpp/src/**/*.{cpp,hpp,h}" + instructions: | + C++ host code. Follow "C++ — cuOpt conventions" and "C++ — + language-level practices we follow from Google C++" in + .github/.coderabbit_review_guide.md. Match nearby code; cuOpt's + naming, exception use, and column limit override Google where they + disagree. Do not flag formatting. + + - path: "cpp/src/grpc/**" + instructions: | + gRPC server C++ code. 
In addition to cpp/src rules: + - Input validation on all request fields reaching the solver + - Size limits on problem data to prevent resource exhaustion + - No credential/internal-path leakage in error messages or logs + - Safe deserialization of problem payloads + - Thread-safety on shared state across RPCs + + - path: "cpp/tests/**" + instructions: | + C++ tests (gtest). Focus on: + - Numerical correctness validation (not just "runs without error") + - Edge cases: empty, infeasible, unbounded, degenerate, singleton problems + - Test isolation — no leaked GPU state or global mutation across tests + - Flakiness: GPU timing races, uninitialized memory, non-deterministic order + - When a bug fix lands, a regression test should cover the specific case + + Do not require benchmarks here — benchmarks live in benchmarks/ and regression/. + + IMPORTANT — dataset references: tests resolve problem data via + RAPIDS_DATASET_ROOT_DIR (see cpp/tests/utilities/common_utils.hpp, + `get_rdrd_or_default()`). Most datasets are downloaded at test time + by datasets/get_test_data.sh, datasets/linear_programming/download_pdlp_test_dataset.sh, + or datasets/mip/download_miplib_test_dataset.sh — they are NOT committed. + Do NOT flag a test for referencing a dataset path that isn't in the tree + UNLESS the filename does not appear in any download script (in which + case the download script likely needs updating too). See Common Bug + Patterns §7 "When NOT to flag" in the review guide. + + - path: "python/**/*.py" + instructions: | + Python code. ruff (E,F,W ignoring E501), ruff-format, and pydocstyle + handle formatting, imports, and docstring format. 
Focus on what they + do NOT cover: + - Type hints on NEW public functions/classes (do not require them on existing code; + there is no mypy config and the codebase is mixed) + - Signature changes on public APIs must emit DeprecationWarning with a + removal version before breaking (see the pattern in + python/cuopt/cuopt/linear_programming/problem.py around the deprecated helpers) + - Docstring CONTENT on new public APIs — params, returns, raises — even + when pydocstyle format rules pass + - Error messages that expose internals vs. user-actionable messages + + Do not re-raise ruff/pydocstyle-covered issues. + + - path: "python/**/*.pyx" + instructions: | + Cython implementation files: + - C++ calls that may throw must use `except +` on the cdef declaration + - Use `nogil` on blocking C calls unless the GIL is needed + - Memoryview lifetimes: the Python object owning the buffer must outlive the view + - Prefer cpdef only when the function should be callable from Python + + - path: "python/**/*.pxd" + instructions: | + Cython declaration files. Check that C++ signatures match their .hpp + counterparts (argument types, const-qualification, throw-specification). + + - path: "python/cuopt_server/**" + instructions: | + Python server code. In addition to python/**/*.py rules: + - Input validation on all fields from the REST payload + - Size/shape limits on problem data + - No credential or internal-path leakage in error responses or logs + - Safe deserialization (no pickle on untrusted input) + - Rate limiting considerations on expensive endpoints + + - path: "python/**/tests/**" + instructions: | + Python tests (pytest). Focus on: + - Numerical correctness validation + - Edge cases: empty, infeasible, unbounded, degenerate problems + - No leaked GPU state across tests + - Regression coverage for fixed bugs + + IMPORTANT — dataset references: tests resolve problem data via + os.getenv("RAPIDS_DATASET_ROOT_DIR"). 
Most datasets are downloaded at + test time by scripts under datasets/ (get_test_data.sh, + linear_programming/download_pdlp_test_dataset.sh, + mip/download_miplib_test_dataset.sh) — they are NOT committed. + Do NOT flag a test for referencing a dataset path that isn't in the tree + UNLESS the filename does not appear in any download script. See Common + Bug Patterns §7 "When NOT to flag" in the review guide. + + - path: "**/CMakeLists.txt" + instructions: | + Flag any new entry in add_library / add_executable / target_sources / + add_subdirectory / install(FILES ...) that references a source, header, + or directory not present in this PR or in the base branch — this is the + most common place a forgotten `git add` surfaces. See Common Bug + Patterns §7 in the review guide. The right ask is "did you mean to + include `<file>` in this PR?" rather than a generic style comment. + + - path: "**/*.cmake" + instructions: | + Same as CMakeLists.txt: cross-check any added file-list entries against + the PR contents (Common Bug Patterns §7). + + - path: ".github/workflows/**" + instructions: | + GitHub Actions workflows. Primary concern is Common Bug Patterns §7: + any `run:` step invoking `ci/*.sh`, `python ci/*.py`, or a repo-local + binary must reference a file present in the PR or the base branch. + Also check for: + - Referenced composite actions (`uses: ./.github/actions/<name>`) that don't exist + - `needs:` dependencies on jobs that were renamed or removed + - Secrets / environment variables newly referenced without being documented + Do not flag YAML style or formatting. + + - path: "ci/**/*.sh" + instructions: | + CI shell scripts. 
Primary concern is Common Bug Patterns §7: + - `source` / `.` lines pointing at helper scripts not in the PR or tree + - `bash path/to/foo.sh` / direct script invocations referencing missing files + - `RAPIDS_DATASET_ROOT_DIR`-relative paths not produced by an in-tree script + Also: `set -euo pipefail` hygiene for new scripts, proper quoting on + interpolated paths. Do not re-raise shellcheck warnings (pre-commit + runs shellcheck at --severity=warning). + + - path: "datasets/**/*.sh" + instructions: | + Dataset download/setup scripts. Same as ci/**/*.sh, with emphasis on: + - URLs reachable and versioned (not pinned to `latest`) + - Referenced helper scripts exist in the PR or tree (Common Bug Patterns §7) + - Exit on failure; no silent download errors + + - path: "**/Dockerfile*" + instructions: | + Dockerfiles. Primary concern is Common Bug Patterns §7: `COPY` / `ADD` + paths must exist in the build context at the referenced location. + Also check for: + - Pinned base image tags (not `:latest`) + - Multi-stage builds don't leak secrets + - No credentials baked into layers + + - path: "helmchart/**" + instructions: | + Helm charts. Same as CMakeLists.txt philosophy: new references to + ConfigMaps, Secrets, values keys, or template files must exist in + this PR (Common Bug Patterns §7). Do not flag chart style nits. + knowledge_base: opt_out: false code_guidelines: filePatterns: - ".github/.coderabbit_review_guide.md" + - "CONTRIBUTING.md" + - "CONVENTIONS.md" diff --git a/.cursor-plugin/plugin.json b/.cursor-plugin/plugin.json index 5f34873671..e740506140 100644 --- a/.cursor-plugin/plugin.json +++ b/.cursor-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "nvidia-cuopt-skills", "description": "Agent skills for NVIDIA cuOpt: routing (VRP, TSP, PDP), LP/MILP/QP, installation (Python/C/developer), and REST server. 
Use when building or solving optimization with cuOpt.", - "version": "26.04.00", + "version": "26.06.00", "author": { "name": "NVIDIA" }, diff --git a/.github/.coderabbit_review_guide.md b/.github/.coderabbit_review_guide.md index 828fc68842..0c6001332e 100644 --- a/.github/.coderabbit_review_guide.md +++ b/.github/.coderabbit_review_guide.md @@ -1,468 +1,379 @@ -# AI Code Review Guidelines for CodeRabbit - cuOpt +# AI Code Review Guidelines for CodeRabbit — cuOpt -**Role**: Act as a principal engineer with 10+ years experience in GPU computing, numerical optimization, and high-performance systems. Focus ONLY on CRITICAL and HIGH issues. +**Role**: Act as a principal engineer with 10+ years in GPU computing, numerical +optimization, and high-performance systems. Prioritize signal over volume — +comment on correctness, GPU safety, numerical stability, API stability, and +security; stay silent on style and subjective preference. -**Target**: Sub-3% false positive rate. Be direct, concise, minimal. +**Context**: cuOpt is a GPU-accelerated optimization engine for MILP, LP, QP, +and VRP, handling millions of variables/constraints with near real-time +performance requirements. Code is C++/CUDA (`cpp/`) with a Cython + Python layer +(`python/`) and a gRPC server (`cpp/src/grpc/`, `python/cuopt_server/`). -**Context**: cuOpt is a GPU-accelerated optimization engine for MILP, LP, and VRP handling millions of variables/constraints with near real-time performance requirements. +--- -## IGNORE These Issues +## Do Not Comment On -- Style/formatting (linters handle this) -- Minor naming preferences (unless truly misleading) -- Personal taste on implementation (unless impacts maintainability) -- Nits that don't affect functionality -- Already-covered issues (one comment per root cause) +### Already enforced mechanically — skip without comment -## CRITICAL Issues (Always Comment) +These run in `pre-commit` (see `.pre-commit-config.yaml`) and `ci/check_style.sh`. 
+Any comment on them duplicates CI noise: -### Algorithm Correctness -- Logic errors in optimization algorithms (simplex, branch-and-bound, routing heuristics, diving) -- Incorrect constraint handling or objective function computation -- Numerical instability causing wrong results (overflow, underflow, precision loss) -- Infeasibility misclassification or unbounded solution detection failures -- Breaking changes to solver behavior without versioning -- **Variable/constraint initialization errors** (incorrect bounds, invalid starting values, uninitialized state) -- **Problem transformation bugs** (accessing variables/constraints from wrong context - e.g., original vs folded problem) -- **Algorithm state corruption** (incorrect state transitions, mixing state between phases) +- **Formatting** — `clang-format` (Google-based, column 100) on `*.{cu,cuh,h,hpp,cpp,inl}`; `ruff-format` on `*.py` +- **Python lint** — `ruff` selects `E,F,W` (ignoring `E501`); do not re-raise unused imports, import order, line length, or `pycodestyle` findings +- **Python docstring format** — `pydocstyle` enforces a specific D-rule subset; flag only *missing* docstrings on new public APIs or factual content issues +- **Shell warnings** — `shellcheck --severity=warning` +- **SPDX copyright headers** — `verify-copyright` (rapidsai pre-commit hook) +- **Hardcoded versions** — `verify-hardcoded-version` +- **Dependency files** — `rapids-dependency-file-generator` (dependencies live in `dependencies.yaml`, not in `pyproject.toml` or conda env files) +- **Whitespace / EOF / YAML / JSON validity** — handled by `pre-commit-hooks` -### GPU/CUDA Issues -- Unchecked CUDA errors (kernel launches, memory operations, synchronization) -- Race conditions in GPU kernels (shared memory, atomics, warps) -- Device memory leaks (cudaMalloc/cudaFree imbalance, leaked streams/events) -- Invalid memory access (out-of-bounds, use-after-free, host/device confusion) -- Missing CUDA synchronization causing 
non-deterministic failures -- Kernel launch with zero blocks/threads or invalid grid/block dimensions -- **Missing explicit stream creation for concurrent operations** (reusing default stream, missing stream isolation) -- **Incorrect stream lifecycle management** (using destroyed streams, not creating dedicated streams for barriers/concurrent ops) - -### Resource Management -- GPU memory leaks (device allocations, managed memory, pinned memory) -- CUDA stream/event leaks or improper cleanup -- Unclosed file handles for MPS/QPS problem files -- Missing RAII or proper cleanup in exception paths -- Resource exhaustion (GPU memory, file descriptors, network sockets) +### Out-of-scope taste -### API Breaking Changes -- C API changes without ABI versioning -- Python API changes breaking backward compatibility -- Server API endpoint changes without deprecation path -- Changes to data structures exposed in public headers - -## HIGH Issues (Comment if Substantial) - -### Performance Issues -- Inefficient GPU kernel launches (low occupancy, poor memory access patterns) -- Unnecessary host-device synchronization blocking GPU pipeline -- CPU bottlenecks in GPU-heavy code paths -- Suboptimal memory access patterns (non-coalesced, strided, unaligned) -- Excessive memory allocations in hot paths -- Algorithmic complexity issues for large-scale problems (O(n²) when O(n log n) exists) -- Missing or incorrect problem size checks before expensive operations - -### Numerical Stability -- Floating-point operations prone to catastrophic cancellation -- Missing checks for division by zero or near-zero values -- Ill-conditioned matrix operations without preconditioning -- Accumulation errors in iterative algorithms -- Unsafe casting between numeric types (double→float with potential precision loss) -- Missing epsilon comparisons for floating-point equality checks -- **Assertion failures in numerical computations** (overly strict assertions, incorrect tolerance assumptions) -- **Numerical 
edge cases causing assertion failures** (near-zero pivots, degenerate cases, extreme values) -- **Inconsistent numerical tolerances** (mixing different epsilon values, hardcoded vs configurable tolerances) - -### Concurrency & Thread Safety -- Race conditions in multi-GPU code or multi-threaded server -- Missing synchronization for shared state -- Improper CUDA stream management causing false dependencies -- Deadlock potential in resource acquisition -- Thread-unsafe use of global/static variables -- Missing or incorrect use of mutexes in server code -- **Concurrent operations sharing streams incorrectly** (barriers, synchronization primitives without dedicated streams) -- **Stream reuse across independent operations** (causing unwanted serialization or race conditions) - -### Security (Server/API) -- Unsanitized input in problem data leading to buffer overflows -- Lack of input validation allowing resource exhaustion attacks -- Credential exposure in logs or error messages -- Unsafe deserialization of problem files (pickle, msgpack) -- Missing rate limiting on API endpoints -- Insufficient error handling exposing internal implementation details - -### Design & Architecture -- Tight coupling between solver components reducing modularity -- Hard-coded GPU device IDs or resource limits -- Missing abstraction for multi-backend support (different CUDA versions) -- Inappropriate use of exceptions in performance-critical paths -- Missing or incomplete error propagation from CUDA to user APIs -- Significant code duplication (3+ occurrences) in kernel or solver logic -- Reinventing functionality already available in dependencies (thrust, cccl, rmm) - -### Test Quality -- Flaky tests due to GPU timing, uninitialized memory, or race conditions -- Missing validation of numerical correctness (only checking "runs without error") -- Test isolation violations (GPU state, cached memory, global variables) -- Missing edge case coverage (empty problems, infeasible, unbounded, 
degenerate) -- Inadequate test coverage for error paths and exception handling -- Missing benchmarks or performance regression detection -- **Missing tests for problem transformations** (verify correctness of original→transformed→postsolve mappings) -- **Missing tests for algorithm phase transitions** (verify state initialization between phases) -- **Missing tests with free variables, singleton problems, or extreme problem dimensions** - -## MEDIUM Issues (Comment Selectively) - -- Edge cases not handled (empty problem, single constraint, zero variables, large problem sizes near limits) -- Missing input validation (negative sizes, null pointers, invalid problem formats) -- Code duplication in solver or kernel logic (3+ occurrences) if pattern exists -- Misleading naming that obscures GPU/CPU boundaries or numerical precision -- Deprecated CUDA API usage or deprecated cuOpt internal APIs -- Missing documentation for numerical tolerances or algorithm parameters -- Suboptimal but functional memory patterns that could be improved -- Minor inefficiencies in non-critical code paths -- **Unclear problem context in function parameters** (ambiguous whether operating on original or transformed problem) -- **Missing explicit initialization comments** (state appears uninitialized but may be set elsewhere) -- **Potential index confusion** (variable naming doesn't clarify which problem space the index refers to) +- Bikeshed naming (unless the name is actively misleading, e.g., hides a GPU↔host boundary or units) +- Splitting functions "for readability" without a concrete maintainability trigger +- Comment density preferences +- Nits on lines the PR did not change -## Review Protocol +--- -1. **Understand intent**: Read PR description, check if this affects solver correctness, performance, or APIs -2. **Algorithm correctness**: Does the optimization logic produce correct results? Numerical stability? -3. **GPU correctness**: CUDA errors checked? Memory safety? Race conditions? 
Synchronization? -4. **Resource management**: GPU memory leaks? Stream/event cleanup? File handles closed? -5. **Performance**: GPU bottlenecks? Unnecessary sync? Memory access patterns? Scalability to millions of variables? -6. **API stability**: Breaking changes to C/Python/Server APIs? Backward compatibility? -7. **Security (if server code)**: Input validation? Resource exhaustion? Unsafe deserialization? -8. **Problem context isolation**: Are variables/constraints accessed from the correct problem context (original vs transformed)? -9. **Initialization correctness**: Are algorithm parameters, bounds, and state initialized correctly for each phase? -10. **Stream lifecycle**: Are CUDA streams explicitly created/destroyed for concurrent operations? Proper isolation? -11. **Ask, don't tell**: "Have you considered X?" not "You should do X" - -## Quality Threshold - -Before commenting, ask: -1. Is this actually wrong/risky, or just different? -2. Would this cause a real problem in production? -3. Does this comment add unique value? +## Coding Standards -**If no to any: Skip the comment.** +### C++ — cuOpt conventions (the default; match nearby code) -## Output Format +The codebase has its own established style. Match what surrounding code does; +do **not** suggest changes purely to align with an external style guide. +There is no separate `cuopt-style.md` — the conventions below are inferred +from the actual code and from `.clang-format`. -- Use severity labels: CRITICAL, HIGH, MEDIUM -- Be concise: One-line issue summary + one-line impact -- Provide code suggestions when you have concrete fixes -- Omit generic explanations and boilerplate -- No preamble or sign-off +- **Naming**: + - Types, classes, structs, enums: `snake_case_t` with `_t` suffix + (e.g. `logic_error`, `error_type_t`, `solver_settings_t`) + - Functions and methods: `snake_case` (e.g. 
`get_error_type`, `cuopt_expects`) + - Local variables and parameters: `snake_case` + - Private/protected member variables: trailing underscore (e.g. `error_type_`) + - Project macros: `SCREAMING_SNAKE_CASE` with `CUOPT_` prefix (e.g. `CUOPT_EXPECTS`) +- **File extensions**: `.hpp`/`.cpp` for C++ host code; `.cuh`/`.cu` for CUDA; + `.h` reserved for the C ABI surface (`cpp/include/cuopt/linear_programming/cuopt_c.h`). +- **Column limit**: 100 (set in `.clang-format`). +- **Error handling**: `throw` + `cuopt_expects(...)` / `CUOPT_EXPECTS(...)` + macros from `cpp/include/cuopt/error.hpp`, which throw `cuopt::logic_error`. + Exceptions are the canonical mechanism — do not flag exception use. +- **Formatting**: handled by `clang-format` (`BasedOnStyle: Google` with cuOpt + overrides). Do not comment on formatting at all. -## Token Optimization +### C++ — language-level practices we follow from Google C++ -- Omit explanations for obvious issues -- Omit descriptions of code or design not critical to understanding the changes or issues raised -- Omit listing benefits of standard good practices and other generic information apparent to an experienced developer -- No preamble or sign-off +These are *named* rules that cuOpt actually follows. Cite the Google C++ Style +Guide section when commenting, since they're well-documented externally: +<https://google.github.io/styleguide/cppguide.html>. 
-## Context Awareness +- **Header Files** — self-contained headers; `#define` guards; Include What You Use; avoid forward declarations +- **Scoping** — no `using namespace` at file scope; unnamed namespaces for internal linkage; narrowest scope for locals +- **Classes** — `explicit` on single-argument constructors; `private` data members (with the trailing-underscore convention above); `override` / `final` on virtual overrides +- **Functions** — prefer return values over out-parameters +- **Ownership and Smart Pointers** — `std::unique_ptr` default for owning pointers; `std::shared_ptr` only when sharing is essential; no raw owning pointers (and prefer `rmm::device_uvector` over `std::unique_ptr` for device memory) +- **Casting** — C++-style casts (`static_cast`, `reinterpret_cast`); avoid `dynamic_cast` — prefer virtual dispatch +- **`const` and `constexpr`** — use both liberally; prefer `constexpr` for compile-time constants +- **Inheritance** — prefer composition; use `public` inheritance for "is-a" relationships; keep data members `private`; do not overuse implementation inheritance or deep hierarchies +- **Static and Global Variables** — only trivially-destructible types at namespace scope; prefer `constexpr` for compile-time constants; use function-local statics for non-trivial initialization (thread-safe since C++11) -**Skip if**: -- Already handled by CI/linters -- Same issue exists in codebase (note once if systemic) -- Experimental/prototype code (check PR labels) -- Explicitly marked as technical debt +**Where Google C++ disagrees with cuOpt, the cuOpt convention wins.** Do not +cite Google for naming (cuOpt uses `snake_case`/`_t`, Google uses `CamelCase`), +exception use (cuOpt uses them, Google forbids), or column limit (cuOpt is 100, +Google is 80). Read the surrounding code; if cuOpt does it differently, that +is the rule. 
-**Escalate if**: -- Breaking change without discussion -- Conflicts with documented architecture -- Security vulnerability +### CUDA / GPU — cuOpt idioms -## Examples to Follow +The repo's convention is built on RAPIDS libraries. Flag deviations; do not +re-suggest the rule in every review. -**CRITICAL** (GPU memory leak): -``` -CRITICAL: GPU memory leak in solver cleanup +- **CUDA errors must be checked with `RAFT_CUDA_TRY`** (or equivalent macro from raft). 251 uses in `cpp/src/` — any new bare CUDA call is a regression. +- **Prefer `rmm::device_uvector` / `rmm::device_buffer` over raw `cudaMalloc` / `cudaFree`.** 1845 RMM uses in `cpp/src/`; only ~3 files legitimately use raw CUDA allocators (pinned-host allocators). New raw `cudaMalloc` is almost always wrong. +- **Streams come from `raft::handle_t::get_stream()`.** 395 handle-stream uses. Use ad-hoc `cudaStreamCreate` only when no handle is in scope, and pair with `cudaStreamDestroy` in RAII. +- **Prefer `thrust::` / `cuda::std::` (CCCL) over hand-rolled kernels** for reductions, scans, sort, transform. 1406 thrust uses. +- **No default-stream reliance** for operations that must run concurrently with other work. -Issue: Device memory allocated but never freed on error path -Why: Causes GPU OOM on repeated solves +### Python — enforced by tools, guided here -Suggested fix: -if (cudaMalloc(&d_data, size) != cudaSuccess) { - // cleanup other resources before returning - cudaFree(d_other); - return ERROR_CODE; -} -``` +Ruff (`E,F,W`), ruff-format, and pydocstyle cover formatting and import hygiene. +CodeRabbit should focus on what they do *not* cover: -**CRITICAL** (unchecked CUDA error): -``` -CRITICAL: Unchecked kernel launch +- **Type hints on new public APIs.** There is no `mypy` config; the codebase is mixed. Require type hints on *new* public functions/classes, not existing ones. 
+- **Deprecation pattern.** When changing signatures on public APIs, follow the `DeprecationWarning` pattern used in `python/cuopt/cuopt/linear_programming/problem.py` — emit a `DeprecationWarning` (with removal version) before breaking the signature. See RAPIDS branching strategy in `CONTRIBUTING.md`. +- **Docstring content on new public APIs** (params, returns, raises) — even when pydocstyle's format rules pass. -Issue: Kernel launch error not checked -Why: Subsequent operations assume success, causing silent corruption +### Cython (`.pyx` / `.pxd`) -Suggested fix: -myKernel<<>>(args); -CUDA_CHECK(cudaGetLastError()); -``` +- Wrap C++ calls that may throw with `except +` on the `cdef` declaration +- Use `nogil` on blocking C calls unless GIL access is needed +- Memoryview lifetimes: the Python object owning the underlying buffer must outlive the memoryview +- Prefer `cpdef` over `cdef` only when the function should be callable from Python -**HIGH** (numerical stability): -``` -HIGH: Potential division by near-zero +### C API -Issue: No epsilon check before division in simplex pivot -Why: Can produce Inf/NaN values corrupting solution -Consider: Add epsilon threshold check or use safe division helper -``` +The C API surface is intentionally narrow — `cpp/include/cuopt/linear_programming/cuopt_c.h`. -**HIGH** (performance issue): -``` -HIGH: Unnecessary synchronization in hot path +- **Any change to `cuopt_c.h` should be flagged for maintainer awareness** (ABI-sensitive). There is no formal ABI-versioning macro today, so phrase it as "this changes the C ABI surface — confirm this is intentional and documented." 
-Issue: cudaDeviceSynchronize() inside iteration loop -Why: Blocks GPU pipeline, 10x slowdown on benchmarks -Consider: Move sync outside loop or use streams with events -``` +--- -**CRITICAL** (variable scope violation): -``` -CRITICAL: Accessing variables from wrong problem context +## Severity -Issue: Code accesses free variables from original problem in folded problem -Why: Variable indices don't map correctly between contexts, causing wrong values/crashes -Impact: Silent data corruption or segfaults on problems with free variables +Each rule below appears **once**. Cross-cutting concerns (stream lifecycle, +phase initialization, problem-context confusion) are captured in the "Common +Bug Patterns" section to avoid duplication. -Suggested fix: -// Use folded_problem.variables instead of original_problem.variables -for (int i = 0; i < folded_problem.num_vars; i++) { - double val = folded_problem.variables[i]; // NOT original_problem.variables[i] -} -``` +### CRITICAL — always comment -**CRITICAL** (incorrect initialization): -``` -CRITICAL: Variable bounds not initialized correctly for diving - -Issue: Starting bounds use wrong values from previous phase -Why: Diving algorithm starts with invalid bounds, producing wrong solutions -Impact: Incorrect optimization results, potential infeasibility - -Suggested fix: -// Reset bounds before diving -for (int i = 0; i < num_vars; i++) { - diving_bounds[i].lower = problem.original_lower_bounds[i]; - diving_bounds[i].upper = problem.original_upper_bounds[i]; -} -``` +**Algorithm correctness** +- Logic errors in optimization algorithms (simplex, branch-and-bound, routing heuristics, diving, crossover) +- Incorrect constraint handling or objective computation +- Numerical instability producing wrong results (overflow, underflow, precision loss) +- Infeasibility misclassification or missed unbounded detection +- Variable/constraint initialization errors (wrong bounds, invalid start, uninitialized state) +- 
Problem-transformation bugs (see Common Bug Patterns §1) -**HIGH** (missing stream isolation): -``` -HIGH: Barrier operation missing dedicated stream +**GPU / CUDA** +- Unchecked CUDA errors (use `RAFT_CUDA_TRY`) +- Race conditions in kernels (shared memory, atomics, warp-level) +- Device memory leaks (raw `cudaMalloc`/`cudaFree` imbalance; leaked streams/events) +- Invalid memory access (out-of-bounds, use-after-free, host/device confusion) +- Missing synchronization causing non-deterministic failures +- Kernel launch with zero or invalid grid/block dimensions -Issue: Barrier concurrent uses default stream without explicit creation -Why: Can cause serialization with other operations, race conditions, or deadlocks -Impact: Performance degradation or non-deterministic failures +**Resource management** +- GPU memory leaks (prefer `rmm::device_uvector`) +- Unclosed file handles for MPS/QPS problem files +- Missing RAII in exception paths (cuOpt uses exceptions) -Suggested fix: -cudaStream_t barrier_stream; -cudaStreamCreate(&barrier_stream); -// Use barrier_stream for barrier operations -// Don't forget: cudaStreamDestroy(barrier_stream) in cleanup -``` +**API surface** +- Any change to `cpp/include/cuopt/linear_programming/cuopt_c.h` — flag as ABI-sensitive +- Python API changes without `DeprecationWarning` +- Server API endpoint changes without deprecation path -**HIGH** (numerical assertion failure): -``` -HIGH: Overly strict assertion in pivot operation +**Build / dependency integrity** +- References to files or symbols that do not exist in the PR or in the base branch (see Common Bug Patterns §7). Catches forgotten `git add` and stale renames before CI does. 
-Issue: Assert fails on legitimate near-zero pivots in degenerate problems -Why: Tolerance too strict for edge cases, assertion doesn't allow valid scenarios -Impact: Crashes on valid degenerate problems +### HIGH — comment if substantial -Consider: Replace assertion with warning + fallback, or use configurable tolerance -``` +**Performance** +- Unnecessary host-device synchronization blocking the GPU pipeline +- Non-coalesced / strided / unaligned memory access in hot paths +- Excessive allocations in hot paths (prefer pooled RMM resources) +- `O(n²)` where `O(n log n)` exists, for n in millions +- Reinventing `thrust::`, `rmm::`, or `raft::` primitives -**Good, concise summary**: -- Refactor simplex and dual-simplex solvers to share common pivot logic -- Consolidate CUDA error checking into reusable macros -- Extract repeated kernel patterns into templated device functions +**Numerical stability** +- Division by zero / near-zero without epsilon guard +- Ill-conditioned matrix ops without preconditioning +- Catastrophic cancellation in floating-point +- Unsafe double → float casts losing precision +- Hardcoded tolerances that fail on degenerate problems (see Common Bug Patterns §4) -## Examples to Avoid +**Concurrency** +- Race conditions in multi-GPU or multi-threaded server code +- Missing synchronization for shared state +- Deadlock potential in resource acquisition +- Thread-unsafe global/static variables -**Boilerplate and generic descriptions** (avoid): -- "CUDA Best Practices: Using streams improves concurrency and overlaps computation with memory transfers. This is a well-known optimization technique." -- "Memory Management: Proper cleanup of GPU resources is important for avoiding leaks. RAII patterns help ensure resources are freed." -- "Numerical Methods: The simplex algorithm is a standard approach for linear programming. Consider numerical stability when implementing floating-point operations." 
-- "Code Reuse: Duplication of kernel code can lead to maintenance issues. Consider refactoring into reusable device functions." +**Security — server only** (`cpp/src/grpc/**`, `python/cuopt_server/**`) +- Unsanitized problem data (buffer overflows, resource exhaustion) +- Unsafe deserialization (pickle, msgpack) +- Missing size limits on requests +- Credential exposure in logs / error messages -**Subjective style preferences** (ignore): -- "Consider using auto here instead of explicit type" -- "This function could be split into smaller functions" -- "Prefer range-based for loops" -- "Consider adding more comments" +**Test quality** +- Flaky tests due to GPU timing, uninitialized memory, or race conditions +- "Runs without error" tests that don't validate numerical correctness +- Missing coverage for edge cases when adding a new code path (empty, infeasible, unbounded, degenerate) +- PRs touching hot paths without note of benchmark impact (benchmarks live in `benchmarks/` and `regression/`) ---- +### MEDIUM — comment selectively -## cuOpt-Specific Considerations - -**GPU/CUDA Code**: -- Every CUDA call must have error checking (kernel launches, memory ops, sync) -- Host-device memory boundaries must be clear and correct -- Shared memory usage must avoid bank conflicts and size limits -- Warp divergence in hot paths should be minimized -- **Explicit stream creation**: Concurrent operations (barriers, async ops) must have dedicated streams, not reuse default stream -- **Stream ownership**: Clearly document stream lifecycle (who creates, who destroys) - -**Optimization Algorithms**: -- Numerical stability is paramount (epsilon checks, scaling, preconditioning) -- Correctness > Performance (verify algorithm produces correct results first) -- Handle degenerate cases (infeasible, unbounded, highly degenerate bases) -- Tolerance parameters must be documented and tested -- **Phase initialization**: Each algorithm phase (presolve, simplex, diving, crossover) must correctly 
initialize its state/bounds -- **Problem transformations**: Variable/constraint indices must be correctly mapped between original and transformed problems (presolve, folding, etc.) - -**Multi-Language APIs**: -- C API must maintain ABI stability (no struct layout changes) -- Python API changes require deprecation warnings -- Server API must version endpoints for breaking changes -- Error codes/messages must be consistent across all APIs - -**Performance Expectations**: -- Near real-time solutions for problems with millions of variables -- Scalability testing required for large problem sizes -- Memory usage must be reasonable (avoid O(n²) for n in millions) -- GPU utilization should be high for computation-heavy kernels - -**Documentation (docs/ folder)**: -When reviewing code changes that affect public APIs, algorithms, or behavior: -- Check if corresponding documentation in `docs/` needs updating -- Suggest specific doc updates for API changes (new parameters, return values, error codes) -- Flag missing documentation for new public functions/classes/endpoints -- Suggest adding examples for new features or changed behavior -- Recommend updating algorithm descriptions if solver behavior changes -- Verify version numbers and deprecation notices are documented -- Suggest clarifying numerical tolerances, performance characteristics, or GPU requirements - -Example documentation suggestion: -``` -HIGH: Missing documentation for API change - -Issue: New parameter `tolerance` added to solver API but not documented -Why: Users won't know how to use the new parameter -Suggest: Update docs/cuopt/linear_programming/api.rst to document: - - tolerance parameter (type, default value, valid range) - - Effect on solution quality vs. 
speed tradeoff - - Example usage with typical values -``` +- Missing input validation at library/server boundaries +- Code duplication (3+ occurrences) of kernel or solver logic +- Deprecated CUDA API usage +- Misleading names hiding GPU/CPU boundaries, units, or problem-space context +- Missing documentation for numerical tolerances or algorithm parameters --- -## Common Bug Patterns in cuOpt (From Historical Fixes) +## Common Bug Patterns (from historical fixes) + +Each pattern lists *red flags* — specific structural cues that warrant a closer +look. Use these as review triggers; do not re-explain the pattern. + +### 1. Problem-context confusion (original vs. presolve vs. folded vs. postsolve) + +**Red flags**: functions taking both `original_problem` and `transformed_problem`; index arithmetic between representations without explicit mapping; mixed `.num_vars` / `.variables[]` accesses in one function. -These patterns have caused real bugs. Pay special attention when reviewing code involving these areas: +**Example**: accessing `original_problem.free_variables` when operating on `folded_problem`. -### 1. Problem Context Confusion -**Pattern**: Accessing variables/constraints from wrong problem representation (original vs presolve vs folded vs postsolve) +### 2. Algorithm phase initialization (presolve → simplex → diving → crossover) -**Red flags**: -- Functions that receive both `original_problem` and `transformed_problem` as parameters -- Index arithmetic between problem representations without explicit mapping -- Accessing `.num_vars` or `.variables[]` from wrong problem object -- Mixed use of original/transformed indices in same function +**Red flags**: phase entry without explicit state reset; reusing bounds/buffers from previous phase; stale tolerances carried over. -**Example bug**: Accessing `original_problem.free_variables` when operating on `folded_problem` +**Example**: diving starting with bounds left over from a previous optimization. -### 2. 
Algorithm Phase Initialization -**Pattern**: Bounds, tolerances, or state not properly initialized/reset when transitioning between algorithm phases +### 3. CUDA stream lifecycle -**Red flags**: -- Diving, crossover, or barrier phases starting without explicit initialization -- Reusing data structures from previous phase without clearing/resetting -- Missing bounds initialization when entering new optimization phase -- Carrying over stale state from presolve to main solve +**Red flags**: concurrent/async operations using the default stream when a `raft::handle_t` is in scope; raw `cudaStreamCreate` without paired `cudaStreamDestroy`; stream scope mismatched with loop scope. -**Example bug**: Diving algorithm using incorrect starting bounds from previous optimization phase +**Canonical pattern**: `auto stream = handle.get_stream();` — use the handle when available. -### 3. CUDA Stream Lifecycle Issues -**Pattern**: Missing explicit stream creation for concurrent/barrier operations, or improper stream reuse +### 4. Numerical assertion failures on degenerate inputs -**Red flags**: -- Barrier or concurrent operations without dedicated stream variable -- Multiple independent operations sharing same stream without justification -- Stream creation inside loop but destruction outside loop (or vice versa) -- Using `nullptr` or default stream for operations that need isolation -- Missing `cudaStreamDestroy` for explicitly created streams +**Red flags**: `assert(abs(x) > 1e-10)` with a hardcoded epsilon; assertions without tolerance that don't account for problem scaling; strict checks in pivot/basis/feasibility paths. -**Example bug**: Barrier concurrent operation reusing default stream instead of creating dedicated stream +**Example**: CPUFJ assertion failing on valid near-zero pivots in degenerate problems. -### 4. Numerical Assertion Failures -**Pattern**: Assertions that are too strict for legitimate edge cases, especially in degenerate problems +### 5. 
Index mapping errors across problem transformations -**Red flags**: -- Assertions with hardcoded tolerances (e.g., `assert(abs(value) > 1e-10)`) -- Assertions that don't account for problem scaling or conditioning -- Assertions in pivot selection, basis updates, or feasibility checks without epsilon tolerance -- Assertions that fail on empty, singleton, or highly degenerate problems +**Red flags**: off-by-one between problem representations; iteration bounds unchanged after presolve resized the problem; array accesses with indices from the wrong problem space. -**Example bug**: CPUFJ assertion failing on valid near-zero pivots in degenerate problems +### 6. Uninitialized algorithm state across sequential solves -### 5. Index Mapping Errors -**Pattern**: Incorrect mapping between variable/constraint indices after problem transformations +**Red flags**: solver-object reuse without reset; conditional initialization paths that can skip on certain problem types; state declared but not initialized before first iteration. -**Red flags**: -- Off-by-one errors in index arithmetic between problem representations -- Missing or incorrect index offset when mapping between spaces -- Iterating over wrong range after problem size changes from presolve -- Accessing arrays with indices from wrong problem context +### 7. References to files or symbols missing from the PR -**Example bug**: Using original problem indices to access folded problem arrays +The PR compiles locally because the author has extra files in their working tree, but the remote state is broken. CI will eventually catch this; CodeRabbit should catch it sooner by cross-referencing what the diff *references* against what it *contains*. -### 6. 
Uninitialized Algorithm State -**Pattern**: Algorithm state variables not initialized before use, especially after branching or problem modification +**Source & build red flags**: +- `#include "..."` of a header that is neither in the PR diff nor in the base branch (check `git ls-tree HEAD <path>`); especially for newly-added source files +- CMake `add_library` / `add_executable` / `target_sources` / `add_subdirectory` / `install(FILES …)` listing entries not present in the diff or the tree +- Python `import x` / `from x import Y` where `x` is not a package in `dependencies.yaml`, not a known third-party, and not in the PR +- Cython `cimport X` referencing a `.pxd` declaration file not in the PR +- Renamed symbols still referenced from files outside the PR (e.g., header rename not propagated to a `.cu` that still includes the old name) -**Red flags**: -- State variables declared but not initialized before first algorithm iteration -- Conditional initialization that might skip on certain problem types -- Missing reset when solving multiple problems sequentially -- Reusing solver object without proper cleanup between solves +**CI / scripts / infra red flags**: +- `.github/workflows/*.{yml,yaml}` `run:` steps invoking `ci/*.sh`, `python ci/*.py`, or binaries not in the PR or base tree +- Shell scripts sourcing other scripts (`source ci/utils/helper.sh`, `. ./foo.sh`) that are not in the PR or base tree +- `Dockerfile` `COPY` / `ADD` referencing files or directories not in the build context of the PR +- `helmchart/` templates referencing config maps, secrets, or values not in the PR +- Docs / Sphinx `.. include::` / `..
literalinclude::` / `toctree::` referencing files not in the PR -**Example bug**: Variable bounds not reset before diving, using stale values +**Dependency red flags**: +- `dependencies.yaml`, `pyproject.toml`, or conda env entries naming packages that aren't actually used, or removing a package still `import`ed elsewhere in the tree +- `requirements*.txt` / pyproject `dependencies` referencing a local-path wheel or directory not in the PR + +**How to phrase the comment**: "Referenced `path/to/file` but I don't see it in this PR or in the base branch. If it was created locally, `git add` may have been missed; otherwise this is a stale reference. CI will fail on this — easier to fix now." + +**When NOT to flag — runtime-downloaded datasets**: + +Most problem data files (MPS, QPS, LP, MIPLIB instances, PDLP test sets) are **not committed**. They are downloaded at test time by scripts under `datasets/`: + +- `datasets/get_test_data.sh` — routing + general test data +- `datasets/linear_programming/download_pdlp_test_dataset.sh` — LP/PDLP instances +- `datasets/mip/download_miplib_test_dataset.sh` — MIPLIB instances + +Tests reference these paths via the `RAPIDS_DATASET_ROOT_DIR` environment variable (C++ tests: `cpp/tests/utilities/common_utils.hpp` → `get_rdrd_or_default()`; Python tests: `os.getenv("RAPIDS_DATASET_ROOT_DIR")`). The CI scripts under `ci/` run the download scripts before invoking ctest/pytest. 
+ +**Do NOT flag** a test that: +- References a path via `RAPIDS_DATASET_ROOT_DIR`, `get_rdrd_or_default()`, or a path relative to the datasets root +- References a filename that matches instances listed in one of the download scripts above +- References data under `datasets/<subdir>/` that is gitignored (only scripts and a few reference/config files are tracked under `datasets/`) + +**DO flag** when: +- The PR adds a test referencing a NEW dataset filename that does **not** appear in any of the download scripts above (`datasets/get_test_data.sh`, `datasets/**/download_*.sh`) — the download script likely also needs updating, or the author forgot to commit a new dataset-fetch step +- The PR removes or renames an entry in a download script but a test still references the old name +- The PR references a dataset path with a typo not matching any download-script entry + +--- + +## Review Protocol + +1. **Intent** — read the PR description; identify whether this affects correctness, performance, API, or security. +2. **Correctness** — algorithm logic, numerical stability, problem-context isolation (Common Bug Patterns §1, §5). +3. **GPU safety** — CUDA errors checked via `RAFT_CUDA_TRY`; memory safety; race conditions; stream lifecycle (§3). +4. **Resource management** — RMM ownership; file handles; RAII on exception paths. +5. **Performance** — sync patterns, access patterns, scaling to millions of vars. +6. **API stability** — `cuopt_c.h` changes; Python `DeprecationWarning`; server endpoint versioning. +7. **Security** (server paths only) — input validation, size limits, deserialization. +8. **Ask, don't tell** — "Have you considered X?" not "You should do X." --- -## Code Review Checklists by Change Type - -### When Reviewing Problem Transformations (Presolve/Folding/Postsolve) -- [ ] Are variable indices correctly mapped between original and transformed space? -- [ ] Does the code clearly identify which problem context it's operating in? -- [ ] Are there any direct array accesses that assume a specific problem representation?
-- [ ] Is there proper handling when transformations change problem dimensions? -- [ ] Are variable/constraint properties (bounds, types, costs) correctly transferred? - -### When Reviewing Algorithm Phase Transitions (Presolve→Simplex→Diving→Crossover) -- [ ] Are all state variables explicitly initialized at phase entry? -- [ ] Are variable bounds reset/copied correctly for the new phase? -- [ ] Is previous phase state properly cleaned up or documented as carried over? -- [ ] Are tolerances and parameters appropriate for this phase? -- [ ] Does the code handle early exit from previous phase correctly? - -### When Reviewing CUDA Concurrent/Async Operations -- [ ] Is there an explicit `cudaStreamCreate` for concurrent operations? -- [ ] Is stream lifecycle clearly documented (creation and destruction)? -- [ ] Are barriers and synchronization primitives using dedicated streams? -- [ ] Is the default stream only used intentionally for serialization? -- [ ] Are stream errors checked with `cudaGetLastError` or equivalent? - -### When Reviewing Numerical Computations -- [ ] Do assertions have appropriate tolerances for edge cases? -- [ ] Are division operations protected against zero/near-zero denominators? -- [ ] Are comparisons using epsilon tolerances instead of exact equality? -- [ ] Are tolerances configurable or at least documented? -- [ ] Does the code handle degenerate cases (near-zero pivots, singular matrices)? - -### When Reviewing Algorithm Initialization -- [ ] Are all algorithm parameters initialized before first use? -- [ ] Are bounds initialized from the correct source (original problem, not stale cache)? -- [ ] Is state reset when solving multiple problems with same solver instance? -- [ ] Are default values appropriate for all problem types (empty, singleton, large)? -- [ ] Is initialization conditional code covered by tests? +## Output Format + +- One line issue summary + one line impact. Cite the rule name or Common Bug Pattern number if applicable. 
+- Use severity labels: **CRITICAL**, **HIGH**, **MEDIUM**. +- Provide a code suggestion when the fix is concrete; otherwise ask a pointed question. +- Omit generic best-practice explanations and boilerplate. +- No preamble, no sign-off. + +Quality gate — before commenting, ask: +1. Is this actually wrong/risky, or just different? +2. Would this cause a real problem in production? +3. Is this already enforced by a tool listed under "Do Not Comment On"? + +**Skip the comment if the answer to question 1 or 2 is "no", or the answer to question 3 is "yes".** + +--- + +## Context Awareness + +**Skip if**: +- Enforced by pre-commit or CI (see "Do Not Comment On") +- Same issue exists pre-PR on unchanged lines (note once if systemic, don't repeat) +- PR is explicitly marked as tech debt with a linked tracking issue + +**Escalate (always comment)**: +- Breaking change without discussion in PR description +- Security vulnerability in server paths +- Conflict with documented architecture in `docs/` or `CONTRIBUTING.md` --- -**Remember**: Focus on objective correctness, not subjective preference. Catch real bugs and design flaws, ignore style preferences. AI speed + human judgment. You catch patterns, humans understand business context. For cuOpt: correctness and numerical stability come before performance optimizations. +## Examples + +**CRITICAL** — GPU memory leak on error path: +``` +CRITICAL: Device buffer leaks on early return. +Why: `d_data` allocated via raw cudaMalloc without RAII; error path skips cudaFree. +Suggest: Use `rmm::device_uvector` — RAII handles both success and exception paths. +``` + +**CRITICAL** — Unchecked kernel launch: +``` +CRITICAL: Kernel launch error not checked. +Why: Subsequent ops assume success; silent data corruption. +Suggest: RAFT_CUDA_TRY(cudaGetLastError()); after the launch. +``` + +**CRITICAL** — Problem-context confusion (Common Bug Pattern §1): +``` +CRITICAL: Accessing original_problem.variables inside folded-problem loop.
+Why: Index space differs after folding — values and bounds will not correspond. +Suggest: Use folded_problem.variables[i]; if mapping back is needed, apply the postsolve index map. +``` + +**HIGH** — Near-zero division: +``` +HIGH: No epsilon guard before pivot division. +Why: Produces Inf/NaN on degenerate bases. +Consider: use cuopt's existing safe_divide helper or add an epsilon threshold consistent with the solver's tolerance. +``` + +**HIGH** — Stream not from handle (Common Bug Pattern §3): +``` +HIGH: cudaStreamCreate used inside solver where raft::handle_t is in scope. +Why: Bypasses the pooled stream; risks leaks and breaks stream coordination with callers. +Suggest: auto stream = handle.get_stream(); +``` + +**HIGH** — Python signature change without deprecation: +``` +HIGH: Public API `solve_ip(...)` parameter renamed without DeprecationWarning. +Why: Breaks existing users; cuopt's convention (see problem.py) is to warn before breaking. +Suggest: Keep the old kwarg for one release, emit DeprecationWarning with removal version. +``` + +**Avoid** — generic best-practice filler: +- "Using streams improves concurrency and overlaps computation with memory transfers." +- "Proper cleanup of GPU resources is important for avoiding leaks." +- "Consider using `auto` here instead of explicit type." (subjective) +- "This function could be split into smaller functions." (subjective) +- "Consider adding more comments." 
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 7958eac440..9adcb49f51 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,5 +1,9 @@ +# Default owner for paths with no later, more specific match +* @nvidia/cuopt-infra-codeowners + #cpp code owners cpp/ @nvidia/cuopt-engine-codeowners +benchmarks/ @nvidia/cuopt-engine-codeowners #python code owners python/ @nvidia/cuopt-infra-codeowners diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 120000 index 0000000000..be77ac83a1 --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1 @@ +../AGENTS.md \ No newline at end of file diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 3eb1f1f066..b689bcd395 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -39,13 +39,21 @@ on: default: false concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ inputs.build_type || 'branch' }} cancel-in-progress: true +permissions: {} + jobs: cpp-build: - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@release/26.04 + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + secrets: inherit # zizmor: ignore[secrets-inherit] + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-13.2.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -54,8 +62,14 @@ jobs: script: ci/build_cpp.sh python-build: needs: [cpp-build] - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@release/26.04 + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + secrets: inherit # zizmor: ignore[secrets-inherit] + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-13.2.0 with: build_type: ${{ inputs.build_type || 'branch' 
}} branch: ${{ inputs.branch }} @@ -64,16 +78,30 @@ jobs: script: ci/build_python.sh upload-conda: needs: [cpp-build, python-build] - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@release/26.04 + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@cuda-13.2.0 + secrets: + CONDA_RAPIDSAI_NIGHTLY_TOKEN: ${{ secrets.CONDA_RAPIDSAI_NIGHTLY_TOKEN }} + CONDA_RAPIDSAI_TOKEN: ${{ secrets.CONDA_RAPIDSAI_TOKEN }} with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} wheel-build-cuopt-mps-parser: - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + secrets: inherit # zizmor: ignore[secrets-inherit] + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-13.2.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -87,8 +115,16 @@ jobs: matrix_filter: 'group_by([.ARCH, (.PY_VER |split(".") | map(tonumber))])|map(max_by([(.CUDA_VER|split(".")|map(tonumber))]))' wheel-publish-cuopt-mps-parser: needs: wheel-build-cuopt-mps-parser - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04 + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-13.2.0 + secrets: + CONDA_RAPIDSAI_WHEELS_NIGHTLY_TOKEN: ${{ secrets.CONDA_RAPIDSAI_WHEELS_NIGHTLY_TOKEN }} + RAPIDSAI_PYPI_TOKEN: ${{ secrets.RAPIDSAI_PYPI_TOKEN }} with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -98,8 +134,14 @@ jobs: package-type: python 
wheel-build-libcuopt: needs: wheel-build-cuopt-mps-parser - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + secrets: inherit # zizmor: ignore[secrets-inherit] + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-13.2.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -111,8 +153,16 @@ jobs: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) wheel-publish-libcuopt: needs: wheel-build-libcuopt - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04 + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-13.2.0 + secrets: + CONDA_RAPIDSAI_WHEELS_NIGHTLY_TOKEN: ${{ secrets.CONDA_RAPIDSAI_WHEELS_NIGHTLY_TOKEN }} + RAPIDSAI_PYPI_TOKEN: ${{ secrets.RAPIDSAI_PYPI_TOKEN }} with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -122,8 +172,14 @@ jobs: package-type: cpp wheel-build-cuopt: needs: [wheel-build-cuopt-mps-parser, wheel-build-libcuopt] - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + secrets: inherit # zizmor: ignore[secrets-inherit] + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-13.2.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -134,8 +190,16 @@ jobs: package-type: python wheel-publish-cuopt: needs: wheel-build-cuopt - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04 + permissions: + 
actions: read + contents: read + id-token: write + packages: read + pull-requests: read + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-13.2.0 + secrets: + CONDA_RAPIDSAI_WHEELS_NIGHTLY_TOKEN: ${{ secrets.CONDA_RAPIDSAI_WHEELS_NIGHTLY_TOKEN }} + RAPIDSAI_PYPI_TOKEN: ${{ secrets.RAPIDSAI_PYPI_TOKEN }} with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -144,8 +208,14 @@ jobs: package-name: cuopt package-type: python wheel-build-cuopt-server: - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + secrets: inherit # zizmor: ignore[secrets-inherit] + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-13.2.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -159,8 +229,16 @@ jobs: matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) wheel-publish-cuopt-server: needs: wheel-build-cuopt-server - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04 + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-13.2.0 + secrets: + CONDA_RAPIDSAI_WHEELS_NIGHTLY_TOKEN: ${{ secrets.CONDA_RAPIDSAI_WHEELS_NIGHTLY_TOKEN }} + RAPIDSAI_PYPI_TOKEN: ${{ secrets.RAPIDSAI_PYPI_TOKEN }} with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -170,8 +248,14 @@ jobs: package-type: python docs-build: needs: [python-build] - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04 + permissions: + actions: read + contents: read + id-token: write + 
packages: read + pull-requests: read + secrets: inherit # zizmor: ignore[secrets-inherit] + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-13.2.0 with: build_type: ${{ inputs.build_type || 'branch' }} node_type: "gpu-l4-latest-1" @@ -181,11 +265,17 @@ jobs: arch: "amd64" file_to_upload: "docs/cuopt/build/html/" artifact-name: "cuopt_docs" - container_image: "rapidsai/ci-conda:26.04-latest" + container_image: "rapidsai/ci-conda:26.06-latest" script: "ci/build_docs.sh" wheel-build-cuopt-sh-client: - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + secrets: inherit # zizmor: ignore[secrets-inherit] + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-13.2.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -200,8 +290,16 @@ jobs: matrix_filter: '[map(select(.ARCH == "amd64")) | min_by((.PY_VER | split(".") | map(tonumber)), (.CUDA_VER | split(".") | map(-tonumber)))]' wheel-publish-cuopt-sh-client: needs: wheel-build-cuopt-sh-client - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04 + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-13.2.0 + secrets: + CONDA_RAPIDSAI_WHEELS_NIGHTLY_TOKEN: ${{ secrets.CONDA_RAPIDSAI_WHEELS_NIGHTLY_TOKEN }} + RAPIDSAI_PYPI_TOKEN: ${{ secrets.RAPIDSAI_PYPI_TOKEN }} with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -220,6 +318,9 @@ jobs: - wheel-publish-cuopt-sh-client - wheel-publish-libcuopt if: inputs.trigger-tests + permissions: + actions: write + contents: read runs-on: ubuntu-latest # ref: 
https://docs.github.com/en/actions/reference/security/secure-use#use-an-intermediate-environment-variable env: @@ -235,20 +336,55 @@ jobs: # to pull the actual cuOpt source code from gh workflow run \ --repo NVIDIA/cuopt \ - --ref "${{ github.ref }}" \ + --ref "$GITHUB_REF" \ 'test.yaml' \ -f branch="${INPUT_BRANCH}" \ -f build_type="${INPUT_BUILD_TYPE}" \ -f date="${INPUT_DATE}" \ -f sha="${INPUT_SHA}" + build-summary: + if: ${{ always() && (inputs.build_type == 'nightly') }} + needs: + - tests + - build-images + - docs-build + permissions: + contents: read + runs-on: linux-amd64-cpu4 + container: + image: python:3.14-slim + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + ref: ${{ inputs.sha }} + persist-credentials: false + - name: Install dependencies + run: apt-get update && apt-get install -y --no-install-recommends curl + - name: Send build summary + env: + GITHUB_REPOSITORY: ${{ github.repository }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_SERVER_URL: ${{ github.server_url }} + GITHUB_TOKEN: ${{ github.token }} + RAPIDS_BRANCH: ${{ inputs.branch }} + SLACK_BOT_TOKEN: ${{ secrets.CUOPT_SLACK_BOT_TOKEN }} + SLACK_CHANNEL_ID: ${{ secrets.CUOPT_SLACK_CHANNEL_ID }} + run: bash ci/build_summary.sh + build-images: needs: - wheel-publish-cuopt - wheel-publish-cuopt-server - wheel-publish-cuopt-sh-client + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read uses: ./.github/workflows/build_test_publish_images.yaml - secrets: inherit + secrets: inherit # zizmor: ignore[secrets-inherit] with: branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} diff --git a/.github/workflows/build_images.yaml b/.github/workflows/build_images.yaml index 78a965efd0..63adc882ed 100644 --- a/.github/workflows/build_images.yaml +++ b/.github/workflows/build_images.yaml @@ -41,12 +41,13 @@ jobs: runs-on: "linux-${{ matrix.ARCH }}-cpu4" steps: - name: Checkout code repo - uses: 
actions/checkout@v4 + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 with: fetch-depth: 0 ref: ${{ inputs.sha }} + persist-credentials: false - name: Login to DockerHub - uses: docker/login-action@v3 + uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # v3.7.0 with: username: ${{ secrets.CUOPT_DOCKERHUB_USERNAME }} password: ${{ secrets.CUOPT_DOCKERHUB_TOKEN }} @@ -61,7 +62,7 @@ jobs: git rev-parse HEAD > ./ci/docker/context/COMMIT_SHA git log -n1 --pretty='%ct' > ./ci/docker/context/COMMIT_TIME - name: Login to NGC - uses: docker/login-action@v3 + uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # v3.7.0 with: registry: "nvcr.io" username: "$oauthtoken" @@ -71,17 +72,20 @@ jobs: run: | docker context create builders - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3.12.0 with: driver: docker endpoint: ./ci/docker/context - name: Trim CUDA and Python versions id: trim + env: + CUDA_VER: ${{ inputs.CUDA_VER }} + PYTHON_VER: ${{ inputs.PYTHON_VER }} run: | - echo "CUDA_SHORT=$(echo '${{ inputs.CUDA_VER }}' | sed -E 's/([0-9]+\.[0-9]+)\.[0-9]+/\1/')" >> $GITHUB_OUTPUT - echo "PYTHON_SHORT=$(echo '${{ inputs.PYTHON_VER }}' | sed -E 's/([0-9]+\.[0-9]+)\.[0-9]+/\1/')" >> $GITHUB_OUTPUT + echo "CUDA_SHORT=$(echo "$CUDA_VER" | sed -E 's/([0-9]+\.[0-9]+)\.[0-9]+/\1/')" >> $GITHUB_OUTPUT + echo "PYTHON_SHORT=$(echo "$PYTHON_VER" | sed -E 's/([0-9]+\.[0-9]+)\.[0-9]+/\1/')" >> $GITHUB_OUTPUT - name: Build image and push to DockerHub and NGC - uses: docker/build-push-action@v6 + uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6.19.2 with: context: ./ci/docker/context file: ./ci/docker/Dockerfile @@ -99,6 +103,11 @@ jobs: tags: nvidia/cuopt:${{ inputs.IMAGE_TAG_PREFIX }}-cuda${{ steps.trim.outputs.CUDA_SHORT }}-py${{ steps.trim.outputs.PYTHON_SHORT }}-${{ matrix.ARCH }} - name: Push image to NGC + 
env: + IMAGE_TAG_PREFIX: ${{ inputs.IMAGE_TAG_PREFIX }} + ARCH: ${{ matrix.ARCH }} + CUDA_SHORT: ${{ steps.trim.outputs.CUDA_SHORT }} + PYTHON_SHORT: ${{ steps.trim.outputs.PYTHON_SHORT }} run: | - docker tag nvidia/cuopt:${{ inputs.IMAGE_TAG_PREFIX }}-cuda${{ steps.trim.outputs.CUDA_SHORT }}-py${{ steps.trim.outputs.PYTHON_SHORT }}-${{ matrix.ARCH }} nvcr.io/nvstaging/nvaie/cuopt:${{ inputs.IMAGE_TAG_PREFIX }}-cuda${{ steps.trim.outputs.CUDA_SHORT }}-py${{ steps.trim.outputs.PYTHON_SHORT }}-${{ matrix.ARCH }} - docker push nvcr.io/nvstaging/nvaie/cuopt:${{ inputs.IMAGE_TAG_PREFIX }}-cuda${{ steps.trim.outputs.CUDA_SHORT }}-py${{ steps.trim.outputs.PYTHON_SHORT }}-${{ matrix.ARCH }} + docker tag "nvidia/cuopt:${IMAGE_TAG_PREFIX}-cuda${CUDA_SHORT}-py${PYTHON_SHORT}-${ARCH}" "nvcr.io/nvstaging/nvaie/cuopt:${IMAGE_TAG_PREFIX}-cuda${CUDA_SHORT}-py${PYTHON_SHORT}-${ARCH}" + docker push "nvcr.io/nvstaging/nvaie/cuopt:${IMAGE_TAG_PREFIX}-cuda${CUDA_SHORT}-py${PYTHON_SHORT}-${ARCH}" diff --git a/.github/workflows/build_test_publish_images.yaml b/.github/workflows/build_test_publish_images.yaml index f8f7366e13..c4178a804d 100644 --- a/.github/workflows/build_test_publish_images.yaml +++ b/.github/workflows/build_test_publish_images.yaml @@ -20,11 +20,11 @@ on: description: 'JSON array of architectures to build for' cuda_ver: type: string - default: '["12.9.0", "13.0.0"]' + default: '["12.9.0", "13.2.0"]' description: 'JSON array of CUDA versions to build for' python_ver: type: string - default: '["3.13.7"]' + default: '["3.14.4"]' description: 'JSON array of Python versions to build for' linux_ver: type: string @@ -55,7 +55,7 @@ jobs: compute-matrix: runs-on: ubuntu-latest container: - image: rapidsai/ci-conda:26.04-latest + image: rapidsai/ci-conda:26.06-latest outputs: MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }} CUOPT_VER: ${{ steps.compute-cuopt-ver.outputs.CUOPT_VER }} @@ -63,7 +63,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v4 + uses: 
actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 with: fetch-depth: 0 # unshallow fetch for setuptools-scm persist-credentials: false @@ -71,13 +71,18 @@ jobs: - name: Compute matrix id: compute-matrix + env: + ARCH: ${{ inputs.arch }} + CUDA_VER: ${{ inputs.cuda_ver }} + PYTHON_VER: ${{ inputs.python_ver }} + LINUX_VER: ${{ inputs.linux_ver }} run: | MATRIX=$(jq -c '.' <> $GITHUB_OUTPUT - echo "PYTHON_SHORT=$(echo '${{ matrix.PYTHON_VER }}' | sed -E 's/([0-9]+\.[0-9]+)\.[0-9]+/\1/')" >> $GITHUB_OUTPUT + echo "CUDA_SHORT=$(echo "$CUDA_VER" | sed -E 's/([0-9]+\.[0-9]+)\.[0-9]+/\1/')" >> $GITHUB_OUTPUT + echo "PYTHON_SHORT=$(echo "$PYTHON_VER" | sed -E 's/([0-9]+\.[0-9]+)\.[0-9]+/\1/')" >> $GITHUB_OUTPUT - name: Create multiarch manifest shell: bash env: @@ -162,7 +171,7 @@ jobs: test-images: name: Test images needs: [build-cuopt-multiarch-manifest, compute-matrix] - secrets: inherit + secrets: inherit # zizmor: ignore[secrets-inherit] strategy: matrix: CUDA_VER: ${{ fromJson(needs.compute-matrix.outputs.MATRIX).cuda_ver }} diff --git a/.github/workflows/cloud_ci.yaml b/.github/workflows/cloud_ci.yaml index ff73fb1f8a..e1c5eb0ea9 100644 --- a/.github/workflows/cloud_ci.yaml +++ b/.github/workflows/cloud_ci.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 name: cloud_ci_checker @@ -8,12 +8,17 @@ on: - ${GITHUB_REF##*/} paths: - 'cloud-scripts' + +permissions: {} + jobs: conditional_step: + permissions: + contents: read runs-on: 'ubuntu-22.04' steps: - run: echo "Starting GitHub Actions Job for Cloud CI test notification" - - uses: cinotify/github-action@main + - uses: cinotify/github-action@92a15ed24b17cce1bb185b985c0d463859c5b800 # v1.6.0 with: to: 'cuopt-eng@nvidia.com' subject: 'Cloud scripts change notification' diff --git a/.github/workflows/inactivity_reminder.yaml b/.github/workflows/inactivity_reminder.yaml index 8b65b78064..665c90cd0c 100644 --- a/.github/workflows/inactivity_reminder.yaml +++ b/.github/workflows/inactivity_reminder.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 name: Inactivity Reminder with Different Times @@ -7,12 +7,17 @@ on: schedule: - cron: '0 9 * * *' # Runs daily at 09:00 UTC +permissions: {} + jobs: remind: + permissions: + issues: write + pull-requests: write runs-on: ubuntu-latest steps: - name: Remind inactive issues and PRs - uses: actions/github-script@v6 + uses: actions/github-script@d7906e4ad0b1822421a7e6a35d5ca353c962f410 # v6.4.1 with: script: | const MS_IN_DAY = 24 * 60 * 60 * 1000; diff --git a/.github/workflows/issue_automation.yaml b/.github/workflows/issue_automation.yaml index 00e75ba8d8..22585a6841 100644 --- a/.github/workflows/issue_automation.yaml +++ b/.github/workflows/issue_automation.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 name: Auto-label and Round-Robin Assign Issues @@ -7,12 +7,16 @@ on: issues: types: [opened] +permissions: {} + jobs: auto-label: + permissions: + issues: write runs-on: ubuntu-latest steps: - name: Add awaiting response label to new issues - uses: actions/github-script@v6 + uses: actions/github-script@d7906e4ad0b1822421a7e6a35d5ca353c962f410 # v6.4.1 with: script: | // Only process issues (not PRs) @@ -35,10 +39,12 @@ jobs: } round-robin-assign: + permissions: + issues: write runs-on: ubuntu-latest steps: - name: Assign issue round-robin only if unassigned - uses: actions/github-script@v6 + uses: actions/github-script@d7906e4ad0b1822421a7e6a35d5ca353c962f410 # v6.4.1 with: script: | // Only process issues (not PRs) diff --git a/.github/workflows/nightly-summary.yaml b/.github/workflows/nightly-summary.yaml new file mode 100644 index 0000000000..96ffe144c2 --- /dev/null +++ b/.github/workflows/nightly-summary.yaml @@ -0,0 +1,88 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +name: nightly-summary + +on: + workflow_dispatch: + inputs: + branch: + description: "Branch name the run targets" + required: true + type: string + default: main + sha: + description: "Full git commit SHA to check out" + required: true + type: string + build_type: + description: "Build type (nightly, pull-request, branch)" + required: true + type: string + default: nightly + date: + description: "Date (YYYY-MM-DD) for this run. Defaults to today." 
+ required: false + type: string + workflow_call: + inputs: + branch: + required: true + type: string + sha: + required: true + type: string + build_type: + required: true + type: string + date: + required: false + type: string + secrets: + CUOPT_AWS_ACCESS_KEY_ID: + required: true + CUOPT_AWS_SECRET_ACCESS_KEY: + required: true + CUOPT_S3_URI: + required: true + CUOPT_SLACK_BOT_TOKEN: + required: false + CUOPT_SLACK_CHANNEL_ID: + required: false + CUOPT_SLACK_MENTION_ID: + required: false + +permissions: {} + +jobs: + nightly-summary: + permissions: + contents: read + runs-on: linux-amd64-cpu4 + container: + image: python:3.14-slim + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + ref: ${{ inputs.sha }} + persist-credentials: false + - name: Install dependencies + run: | + apt-get update && apt-get install -y --no-install-recommends curl + pip install awscli + - name: Run nightly summary + env: + CUOPT_AWS_ACCESS_KEY_ID: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} + CUOPT_AWS_SECRET_ACCESS_KEY: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} + CUOPT_S3_URI: ${{ secrets.CUOPT_S3_URI }} + CUOPT_SLACK_BOT_TOKEN: ${{ secrets.CUOPT_SLACK_BOT_TOKEN }} + CUOPT_SLACK_CHANNEL_ID: ${{ secrets.CUOPT_SLACK_CHANNEL_ID }} + CUOPT_SLACK_MENTION_ID: ${{ secrets.CUOPT_SLACK_MENTION_ID }} + GITHUB_REPOSITORY: ${{ github.repository }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_SERVER_URL: ${{ github.server_url }} + GITHUB_TOKEN: ${{ github.token }} + RAPIDS_BRANCH: ${{ inputs.branch }} + RAPIDS_BUILD_TYPE: ${{ inputs.build_type }} + RUN_DATE: ${{ inputs.date }} + run: bash ci/nightly_summary.sh diff --git a/.github/workflows/nightly.yaml b/.github/workflows/nightly.yaml index c5e2b5f674..18e4635143 100644 --- a/.github/workflows/nightly.yaml +++ b/.github/workflows/nightly.yaml @@ -9,17 +9,25 @@ on: - cron: "0 5 * * *" # 5am UTC / 1am EST +permissions: {} + jobs: trigger-nightly-builds-and-tests: + permissions: + actions: write + contents: 
read runs-on: ubuntu-latest timeout-minutes: 30 strategy: + fail-fast: false matrix: cuopt_branch: - "main" - - "release/26.04" + - "release/26.06" steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 + with: + persist-credentials: false - name: Trigger Pipeline env: GH_TOKEN: ${{ github.token }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 47a3bd9fca..1f38fb6cb7 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -12,6 +12,8 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true +permissions: {} + jobs: pr-builder: needs: @@ -33,12 +35,16 @@ jobs: - wheel-build-cuopt-mps-parser - wheel-build-cuopt-sh-client - test-self-hosted-server - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@release/26.04 + permissions: + contents: read + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@cuda-13.2.0 if: always() with: needs: ${{ toJSON(needs) }} check-lean-ci: + permissions: + contents: read + pull-requests: read runs-on: ubuntu-latest outputs: lean_ci_enabled: ${{ steps.check-label.outputs.lean_ci_enabled }} @@ -49,7 +55,7 @@ jobs: GH_TOKEN: ${{ github.token }} run: | # Extract PR number from branch name (pull-request/123 -> 123) - PR_NUMBER=$(echo "${{ github.ref }}" | sed 's|refs/heads/pull-request/||') + PR_NUMBER=$(echo "$GITHUB_REF" | sed 's|refs/heads/pull-request/||') echo "Checking PR #$PR_NUMBER for lean-ci label..." # Check if the PR has the 'lean-ci' label @@ -62,12 +68,16 @@ jobs: fi prevent-merge-with-lean-ci: + permissions: + contents: read runs-on: ubuntu-latest needs: check-lean-ci steps: - name: Check lean-ci status + env: + LEAN_CI: ${{ needs.check-lean-ci.outputs.lean_ci_enabled }} run: | - if [ "${{ needs.check-lean-ci.outputs.lean_ci_enabled }}" == "true" ]; then + if [ "$LEAN_CI" == "true" ]; then echo "❌ ERROR: This PR has the 'lean-ci' label enabled." 
echo "Lean CI is only for testing purposes and should not be merged." echo "Please remove the 'lean-ci' label and run full CI before merging." @@ -78,6 +88,8 @@ jobs: fi compute-matrix-filters: needs: check-lean-ci + permissions: + contents: read runs-on: ubuntu-latest outputs: conda_lean_filter: ${{ steps.set-filters.outputs.conda_lean_filter }} @@ -90,8 +102,10 @@ jobs: steps: - name: Set matrix filters id: set-filters + env: + LEAN_CI: ${{ needs.check-lean-ci.outputs.lean_ci_enabled }} run: | - if [ "${{ needs.check-lean-ci.outputs.lean_ci_enabled }}" == "true" ]; then + if [ "$LEAN_CI" == "true" ]; then echo "conda_lean_filter=[map(select(.ARCH == \"amd64\" and .PY_VER == \"3.11\")) | max_by(.CUDA_VER | split(\".\") | map(tonumber))]" >> $GITHUB_OUTPUT echo "conda_test_filter=[map(select(.ARCH == \"amd64\" and .PY_VER == \"3.13\")) | max_by(.CUDA_VER | split(\".\") | map(tonumber))]" >> $GITHUB_OUTPUT echo "wheel_lean_filter=[map(select(.ARCH == \"amd64\" and .PY_VER == \"3.12\")) | max_by(.CUDA_VER | split(\".\") | map(tonumber))]" >> $GITHUB_OUTPUT @@ -110,36 +124,60 @@ jobs: fi changed-files: - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@release/26.04 + permissions: + actions: read + contents: read + packages: read + pull-requests: read + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@cuda-13.2.0 with: files_yaml: | build_docs: - '**' - '!.ai/**' + - '!.clang-format' - '!.coderabbit.yaml' - - '!AGENTS.md' - - '!.github/CODE_OF_CONDUCT.md' + - '!.gitattributes' + - '!.github/.ai/**' + - '!.github/.coderabbit_review_guide.md' - '!.github/CODEOWNERS' + - '!.github/CODE_OF_CONDUCT.md' - '!.github/ISSUE_TEMPLATE/**' - '!.github/PULL_REQUEST_TEMPLATE.md' - '!.github/SECURITY.md' - - '!.github/.ai/**' - - '!.github/.coderabbit_review_guide.md' - '!.github/agents/**' - '!.github/copy-pr-bot.yaml' - '!.github/ops-bot.yaml' + - '!.github/release.yml' + - '!.github/workflows/build.yaml' + - 
'!.github/workflows/build_images.yaml' + - '!.github/workflows/build_test_publish_images.yaml' + - '!.github/workflows/cloud_ci.yaml' + - '!.github/workflows/inactivity_reminder.yaml' + - '!.github/workflows/issue_automation.yaml' + - '!.github/workflows/nightly.yaml' + - '!.github/workflows/test.yaml' + - '!.github/workflows/test_images.yaml' + - '!.github/workflows/trigger-breaking-change-alert.yaml' + - '!.gitignore' - '!.pre-commit-config.yaml' + - '!AGENTS.md' + - '!CHANGELOG.md' + - '!CONTRIBUTING.md' + - '!LICENSE' + - '!README.md' - '!ci/build_wheel*.sh' - '!ci/check_style.sh' + - '!ci/docker/**' - '!ci/release/**' - - '!ci/run_ctests.sh' - '!ci/run_*.pytests.sh' + - '!ci/run_ctests.sh' - '!ci/test_cpp*.sh' - '!ci/test_notebooks.sh' - '!ci/test_python.sh' - '!ci/test_self_hosted_service.sh' - '!ci/test_wheel*.sh' + - '!ci/thirdparty-testing/**' - '!container-builder/**' - '!helmchart/**' - '!ngc/**' @@ -149,173 +187,264 @@ jobs: - '!utilities/**' test_cpp: - '**' - - '!CONTRIBUTING.md' - - '!README.md' + - '!**/*.md' - '!.ai/**' + - '!.clang-format' + - '!.claude-plugin/**' - '!.coderabbit.yaml' - - '!AGENTS.md' - - '!.github/CODE_OF_CONDUCT.md' + - '!.cursor-plugin/**' + - '!.gitattributes' + - '!.github/.ai/**' + - '!.github/.coderabbit_review_guide.md' - '!.github/CODEOWNERS' + - '!.github/CODE_OF_CONDUCT.md' - '!.github/ISSUE_TEMPLATE/**' - '!.github/PULL_REQUEST_TEMPLATE.md' - '!.github/SECURITY.md' - - '!.github/.ai/**' - - '!.github/.coderabbit_review_guide.md' - '!.github/agents/**' - '!.github/copy-pr-bot.yaml' - '!.github/ops-bot.yaml' + - '!.github/release.yml' + - '!.github/workflows/build.yaml' + - '!.github/workflows/build_images.yaml' + - '!.github/workflows/build_test_publish_images.yaml' + - '!.github/workflows/cloud_ci.yaml' + - '!.github/workflows/inactivity_reminder.yaml' + - '!.github/workflows/issue_automation.yaml' + - '!.github/workflows/nightly.yaml' + - '!.github/workflows/test.yaml' + - '!.github/workflows/test_images.yaml' + - 
'!.github/workflows/trigger-breaking-change-alert.yaml' + - '!.gitignore' - '!.pre-commit-config.yaml' + - '!AGENTS.md' + - '!CONTRIBUTING.md' + - '!LICENSE' + - '!README.md' + - '!agents/**' - '!ci/build_docs.sh' - '!ci/build_python.sh' - '!ci/build_wheel*.sh' - '!ci/check_style.sh' + - '!ci/docker/**' - '!ci/release/**' - '!ci/test_python.sh' - '!ci/test_self_hosted_service.sh' - '!ci/test_wheel*.sh' + - '!ci/thirdparty-testing/**' + - '!ci/utils/sync_skills_version.sh' + - '!ci/utils/validate_skills.sh' - '!container-builder/**' - '!docs/**' + - '!gemini-extension.json' - '!helmchart/**' - '!img/**' - '!ngc/**' - '!notebooks/**' - '!python/**' + - '!skills/**/SKILL.md' + - '!skills/**/evals/**' + - '!skills/**/resources/**' - '!sonar-project.properties' - '!sonarqube/**' - '!ucf/**' - '!utilities/**' - - '!skills/**/SKILL.md' - - '!skills/**/resources/**' - - '!ci/utils/validate_skills.sh' - - '!ci/utils/sync_skills_version.sh' - - '!agents/**' - - '!.cursor-plugin/**' - - '!.claude-plugin/**' - - '!gemini-extension.json' test_python_conda: - '**' - - '!CONTRIBUTING.md' - - '!README.md' + - '!**/*.md' - '!.ai/**' + - '!.clang-format' + - '!.claude-plugin/**' - '!.coderabbit.yaml' - - '!AGENTS.md' - - '!.github/CODE_OF_CONDUCT.md' + - '!.cursor-plugin/**' + - '!.gitattributes' + - '!.github/.ai/**' + - '!.github/.coderabbit_review_guide.md' - '!.github/CODEOWNERS' + - '!.github/CODE_OF_CONDUCT.md' - '!.github/ISSUE_TEMPLATE/**' - '!.github/PULL_REQUEST_TEMPLATE.md' - '!.github/SECURITY.md' - - '!.github/.ai/**' - - '!.github/.coderabbit_review_guide.md' - '!.github/agents/**' - '!.github/copy-pr-bot.yaml' - '!.github/ops-bot.yaml' + - '!.github/release.yml' + - '!.github/workflows/build.yaml' + - '!.github/workflows/build_images.yaml' + - '!.github/workflows/build_test_publish_images.yaml' + - '!.github/workflows/cloud_ci.yaml' + - '!.github/workflows/inactivity_reminder.yaml' + - '!.github/workflows/issue_automation.yaml' + - '!.github/workflows/nightly.yaml' + 
- '!.github/workflows/test.yaml' + - '!.github/workflows/test_images.yaml' + - '!.github/workflows/trigger-breaking-change-alert.yaml' + - '!.gitignore' - '!.pre-commit-config.yaml' + - '!AGENTS.md' + - '!CONTRIBUTING.md' + - '!LICENSE' + - '!README.md' + - '!agents/**' - '!ci/build_docs.sh' - '!ci/build_wheel*.sh' - '!ci/check_style.sh' + - '!ci/docker/**' - '!ci/release/**' - '!ci/test_self_hosted_service.sh' - '!ci/test_wheel*.sh' + - '!ci/thirdparty-testing/**' + - '!ci/utils/sync_skills_version.sh' + - '!ci/utils/validate_skills.sh' - '!container-builder/**' - '!docs/**' + - '!gemini-extension.json' - '!helmchart/**' - '!img/**' - '!ngc/**' - '!notebooks/**' + - '!skills/**/SKILL.md' + - '!skills/**/evals/**' + - '!skills/**/resources/**' - '!sonar-project.properties' - '!sonarqube/**' - '!ucf/**' - '!utilities/**' - - '!skills/**/SKILL.md' - - '!skills/**/resources/**' - - '!ci/utils/validate_skills.sh' - - '!ci/utils/sync_skills_version.sh' - - '!agents/**' - - '!.cursor-plugin/**' - - '!.claude-plugin/**' - - '!gemini-extension.json' test_python_wheels: - '**' - - '!CONTRIBUTING.md' - - '!README.md' + - '!**/*.md' - '!.ai/**' + - '!.clang-format' + - '!.claude-plugin/**' - '!.coderabbit.yaml' - - '!AGENTS.md' - - '!.github/CODE_OF_CONDUCT.md' + - '!.cursor-plugin/**' + - '!.gitattributes' + - '!.github/.ai/**' + - '!.github/.coderabbit_review_guide.md' - '!.github/CODEOWNERS' + - '!.github/CODE_OF_CONDUCT.md' - '!.github/ISSUE_TEMPLATE/**' - '!.github/PULL_REQUEST_TEMPLATE.md' - '!.github/SECURITY.md' - - '!.github/.ai/**' - - '!.github/.coderabbit_review_guide.md' - '!.github/agents/**' - '!.github/copy-pr-bot.yaml' - '!.github/ops-bot.yaml' + - '!.github/release.yml' + - '!.github/workflows/build.yaml' + - '!.github/workflows/build_images.yaml' + - '!.github/workflows/build_test_publish_images.yaml' + - '!.github/workflows/cloud_ci.yaml' + - '!.github/workflows/inactivity_reminder.yaml' + - '!.github/workflows/issue_automation.yaml' + - 
'!.github/workflows/nightly.yaml' + - '!.github/workflows/test.yaml' + - '!.github/workflows/test_images.yaml' + - '!.github/workflows/trigger-breaking-change-alert.yaml' + - '!.gitignore' - '!.pre-commit-config.yaml' + - '!AGENTS.md' + - '!CONTRIBUTING.md' + - '!LICENSE' + - '!README.md' + - '!agents/**' - '!ci/build_cpp.sh' - '!ci/build_docs.sh' - '!ci/build_python.sh' - '!ci/check_style.sh' + - '!ci/docker/**' - '!ci/release/**' - '!ci/run_ctests.sh' - '!ci/test_python.sh' + - '!ci/thirdparty-testing/**' + - '!ci/utils/sync_skills_version.sh' + - '!ci/utils/validate_skills.sh' - '!conda/**' - '!container-builder/**' + - '!gemini-extension.json' - '!helmchart/**' - '!img/**' - '!ngc/**' - '!notebooks/**' + - '!skills/**/SKILL.md' + - '!skills/**/evals/**' + - '!skills/**/resources/**' - '!sonar-project.properties' - '!sonarqube/**' - '!ucf/**' - '!utilities/**' - - '!skills/**/SKILL.md' - - '!skills/**/resources/**' - - '!ci/utils/validate_skills.sh' - - '!ci/utils/sync_skills_version.sh' - - '!agents/**' - - '!.cursor-plugin/**' - - '!.claude-plugin/**' - - '!gemini-extension.json' checks: - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@release/26.04 + permissions: + contents: read + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@cuda-13.2.0 with: enable_check_generated_files: false conda-cpp-build: - needs: [checks, compute-matrix-filters] - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@release/26.04 + needs: [checks, compute-matrix-filters, changed-files] + # Consumed by conda-cpp-tests, conda-python-build, and (transitively) docs-build. 
+ if: >- + fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp || + fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_conda || + fromJSON(needs.changed-files.outputs.changed_file_groups).build_docs + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + secrets: inherit # zizmor: ignore[secrets-inherit] + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-13.2.0 with: build_type: pull-request script: ci/build_cpp.sh matrix_filter: ${{ needs.compute-matrix-filters.outputs.conda_lean_filter }} conda-cpp-tests: needs: [conda-cpp-build, changed-files, compute-matrix-filters] - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@release/26.04 + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-13.2.0 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request script: ci/test_cpp.sh matrix_filter: ${{ needs.compute-matrix-filters.outputs.conda_test_filter }} secrets: - script-env-secret-1-key: CUOPT_DATASET_S3_URI - script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }} + script-env-secret-1-key: CUOPT_S3_URI + script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }} script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} conda-python-build: - needs: [conda-cpp-build, compute-matrix-filters] - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@release/26.04 + needs: [conda-cpp-build, compute-matrix-filters, changed-files] + # Consumed by conda-python-tests and docs-build. 
+ if: >- + fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_conda || + fromJSON(needs.changed-files.outputs.changed_file_groups).build_docs + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + secrets: inherit # zizmor: ignore[secrets-inherit] + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-13.2.0 with: build_type: pull-request script: ci/build_python.sh matrix_filter: ${{ needs.compute-matrix-filters.outputs.conda_test_filter }} conda-python-tests: needs: [conda-python-build, changed-files, compute-matrix-filters] - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@release/26.04 + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-13.2.0 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_conda with: run_codecov: false @@ -323,16 +452,22 @@ jobs: script: ci/test_python.sh matrix_filter: ${{ needs.compute-matrix-filters.outputs.conda_test_filter }} secrets: - script-env-secret-1-key: CUOPT_DATASET_S3_URI - script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }} + script-env-secret-1-key: CUOPT_S3_URI + script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }} script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} docs-build: needs: [conda-python-build, changed-files] - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04 + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + secrets: inherit # zizmor: ignore[secrets-inherit] + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-13.2.0 
if: fromJSON(needs.changed-files.outputs.changed_file_groups).build_docs with: build_type: pull-request @@ -340,12 +475,20 @@ jobs: arch: "amd64" file_to_upload: "docs/cuopt/build/html/" artifact-name: "cuopt_docs" - container_image: "rapidsai/ci-conda:26.04-latest" + container_image: "rapidsai/ci-conda:26.06-latest" script: "ci/build_docs.sh" wheel-build-cuopt-mps-parser: - needs: compute-matrix-filters - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + needs: [compute-matrix-filters, changed-files] + # All wheel-build-* jobs feed the wheel test jobs, so they gate on the same group. + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + secrets: inherit # zizmor: ignore[secrets-inherit] + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-13.2.0 with: build_type: pull-request script: ci/build_wheel_cuopt_mps_parser.sh @@ -355,9 +498,16 @@ jobs: # need 1 build per Python version and arch (but CUDA version doesn't matter so choose the latest) matrix_filter: ${{ needs.compute-matrix-filters.outputs.mps_parser_filter }} wheel-build-libcuopt: - needs: [wheel-build-cuopt-mps-parser, compute-matrix-filters] - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + needs: [wheel-build-cuopt-mps-parser, compute-matrix-filters, changed-files] + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + secrets: inherit # zizmor: ignore[secrets-inherit] + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-13.2.0 with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: ${{ needs.compute-matrix-filters.outputs.libcuopt_filter 
}} @@ -366,9 +516,16 @@ jobs: build_type: pull-request script: ci/build_wheel_libcuopt.sh wheel-build-cuopt: - needs: [wheel-build-cuopt-mps-parser, wheel-build-libcuopt, compute-matrix-filters] - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + needs: [wheel-build-cuopt-mps-parser, wheel-build-libcuopt, compute-matrix-filters, changed-files] + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + secrets: inherit # zizmor: ignore[secrets-inherit] + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-13.2.0 with: build_type: pull-request script: ci/build_wheel_cuopt.sh @@ -377,23 +534,36 @@ jobs: matrix_filter: ${{ needs.compute-matrix-filters.outputs.wheel_lean_filter }} wheel-tests-cuopt: needs: [wheel-build-cuopt, wheel-build-cuopt-mps-parser, wheel-build-cuopt-sh-client, changed-files, compute-matrix-filters] - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04 + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-13.2.0 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels with: build_type: pull-request script: ci/test_wheel_cuopt.sh matrix_filter: ${{ needs.compute-matrix-filters.outputs.wheel_lean_filter }} secrets: - script-env-secret-1-key: CUOPT_DATASET_S3_URI - script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }} + script-env-secret-1-key: CUOPT_S3_URI + script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }} script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} 
wheel-build-cuopt-server: - needs: [checks, compute-matrix-filters] - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + needs: [checks, compute-matrix-filters, changed-files] + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + secrets: inherit # zizmor: ignore[secrets-inherit] + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-13.2.0 with: build_type: pull-request script: ci/build_wheel_cuopt_server.sh @@ -403,9 +573,16 @@ jobs: # Only need 1 package per CUDA major version. This selects "ARCH=amd64 + the latest supported Python, 1 job per major CUDA version". matrix_filter: ${{ needs.compute-matrix-filters.outputs.cuopt_server_filter }} wheel-build-cuopt-sh-client: - needs: compute-matrix-filters - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + needs: [compute-matrix-filters, changed-files] + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + secrets: inherit # zizmor: ignore[secrets-inherit] + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-13.2.0 with: build_type: pull-request script: ci/build_wheel_cuopt_sh_client.sh @@ -417,22 +594,34 @@ jobs: matrix_filter: ${{ needs.compute-matrix-filters.outputs.cuopt_sh_client_filter }} wheel-tests-cuopt-server: needs: [wheel-build-cuopt, wheel-build-cuopt-server, changed-files, compute-matrix-filters] - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04 + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-13.2.0 if: 
fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels with: build_type: pull-request script: ci/test_wheel_cuopt_server.sh matrix_filter: ${{ needs.compute-matrix-filters.outputs.wheel_lean_filter }} secrets: - script-env-secret-1-key: CUOPT_DATASET_S3_URI - script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }} + script-env-secret-1-key: CUOPT_S3_URI + script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }} script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} test-self-hosted-server: needs: [wheel-build-cuopt, wheel-build-cuopt-server, changed-files] - secrets: inherit + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + secrets: inherit # zizmor: ignore[secrets-inherit] uses: ./.github/workflows/self_hosted_service_test.yaml if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels with: diff --git a/.github/workflows/self_hosted_service_test.yaml b/.github/workflows/self_hosted_service_test.yaml index 0761a653fd..2cacb05b0f 100644 --- a/.github/workflows/self_hosted_service_test.yaml +++ b/.github/workflows/self_hosted_service_test.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 name: Test self-hosted service on local-setup @@ -66,7 +66,7 @@ jobs: - /tmp/asset_dir/:/tmp/asset_dir/ - /tmp/response_dir/:/tmp/response_dir/ steps: - - uses: aws-actions/configure-aws-credentials@v1-node16 + - uses: aws-actions/configure-aws-credentials@023daa7fe5f7f817faa31fc0fc4a8d0fb6224ed0 # v1-node16 with: role-to-assume: ${{ vars.AWS_ROLE_ARN }} aws-region: ${{ vars.AWS_REGION }} @@ -78,7 +78,7 @@ jobs: run: printf 'machine pypi.k8s.rapids.ai\n\tlogin cibuildwheel\n\tpassword ${{ secrets.RAPIDSAI_PYPI_CI_PASSWORD }}\n' > ~/.netrc - name: checkout code repo - uses: actions/checkout@v4 + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 with: repository: ${{ inputs.repo }} ref: ${{ inputs.sha }} @@ -94,4 +94,17 @@ jobs: sha: ${{ inputs.sha }} - name: Run tests - run: ${{ inputs.script }} + env: + SCRIPT: ${{ inputs.script }} + run: | + script_path="$(realpath "$SCRIPT")" + ci_dir="$(realpath ci)" + + # Use `realpath` to expand out both the script path and the ci path and compare to make sure + # that user isn't giving a relative path to a file outside of `ci/` + if [[ "$script_path" != "$ci_dir"/*.sh ]]; then + echo "::error::Invalid script path '$SCRIPT'. 
Expected an existing ci/*.sh script inside the checkout" + exit 1 + fi + + bash "$script_path" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 9ad7609e8a..289ebb4f62 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -25,9 +25,17 @@ on: type: string default: nightly +permissions: {} + jobs: conda-cpp-tests: - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@release/26.04 + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-13.2.0 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -35,14 +43,21 @@ jobs: sha: ${{ inputs.sha }} script: ci/test_cpp.sh secrets: - script-env-secret-1-key: CUOPT_DATASET_S3_URI - script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }} + script-env-secret-1-key: CUOPT_S3_URI + script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }} script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} + conda-python-tests: - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@release/26.04 + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-13.2.0 with: run_codecov: false build_type: ${{ inputs.build_type }} @@ -51,14 +66,21 @@ jobs: sha: ${{ inputs.sha }} script: ci/test_python.sh secrets: - script-env-secret-1-key: CUOPT_DATASET_S3_URI - script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }} + script-env-secret-1-key: CUOPT_S3_URI + script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }} script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID script-env-secret-2-value: ${{ 
secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} + wheel-tests-cuopt: - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04 + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-13.2.0 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -66,14 +88,21 @@ jobs: sha: ${{ inputs.sha }} script: ci/test_wheel_cuopt.sh secrets: - script-env-secret-1-key: CUOPT_DATASET_S3_URI - script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }} + script-env-secret-1-key: CUOPT_S3_URI + script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }} script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} + wheel-tests-cuopt-server: - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04 + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-13.2.0 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -81,15 +110,22 @@ jobs: sha: ${{ inputs.sha }} script: ci/test_wheel_cuopt_server.sh secrets: - script-env-secret-1-key: CUOPT_DATASET_S3_URI - script-env-secret-1-value: ${{ secrets.CUOPT_DATASET_S3_URI }} + script-env-secret-1-key: CUOPT_S3_URI + script-env-secret-1-value: ${{ secrets.CUOPT_S3_URI }} script-env-secret-2-key: CUOPT_AWS_ACCESS_KEY_ID script-env-secret-2-value: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} script-env-secret-3-key: CUOPT_AWS_SECRET_ACCESS_KEY script-env-secret-3-value: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} + conda-notebook-tests: - 
secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04 + permissions: + actions: read + contents: read + id-token: write + packages: read + pull-requests: read + secrets: inherit # zizmor: ignore[secrets-inherit] + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-13.2.0 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -97,5 +133,28 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:26.04-latest" + container_image: "rapidsai/ci-conda:26.06-latest" script: ci/test_notebooks.sh + nightly-summary: + permissions: + contents: read + if: ${{ always() && inputs.build_type == 'nightly' }} + needs: + - conda-cpp-tests + - conda-python-tests + - wheel-tests-cuopt + - wheel-tests-cuopt-server + - conda-notebook-tests + uses: ./.github/workflows/nightly-summary.yaml + with: + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + build_type: ${{ inputs.build_type }} + date: ${{ inputs.date }} + secrets: + CUOPT_AWS_ACCESS_KEY_ID: ${{ secrets.CUOPT_AWS_ACCESS_KEY_ID }} + CUOPT_AWS_SECRET_ACCESS_KEY: ${{ secrets.CUOPT_AWS_SECRET_ACCESS_KEY }} + CUOPT_S3_URI: ${{ secrets.CUOPT_S3_URI }} + CUOPT_SLACK_BOT_TOKEN: ${{ secrets.CUOPT_SLACK_BOT_TOKEN }} + CUOPT_SLACK_CHANNEL_ID: ${{ secrets.CUOPT_SLACK_CHANNEL_ID }} + CUOPT_SLACK_MENTION_ID: ${{ secrets.CUOPT_SLACK_MENTION_ID }} diff --git a/.github/workflows/test_images.yaml b/.github/workflows/test_images.yaml index 66cbce036d..5017680093 100644 --- a/.github/workflows/test_images.yaml +++ b/.github/workflows/test_images.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -43,9 +43,12 @@ jobs: steps: - name: Trim versions id: trim + env: + CUDA_VER: ${{ inputs.CUDA_VER }} + PYTHON_VER: ${{ inputs.PYTHON_VER }} run: | - CUDA_SHORT=$(echo "${{ inputs.CUDA_VER }}" | sed -E 's/([0-9]+\.[0-9]+)\.[0-9]+/\1/') - PYTHON_SHORT=$(echo "${{ inputs.PYTHON_VER }}" | sed -E 's/([0-9]+\.[0-9]+)\.[0-9]+/\1/') + CUDA_SHORT=$(echo "$CUDA_VER" | sed -E 's/([0-9]+\.[0-9]+)\.[0-9]+/\1/') + PYTHON_SHORT=$(echo "$PYTHON_VER" | sed -E 's/([0-9]+\.[0-9]+)\.[0-9]+/\1/') echo "CUDA_SHORT=$CUDA_SHORT" >> $GITHUB_OUTPUT echo "PYTHON_SHORT=$PYTHON_SHORT" >> $GITHUB_OUTPUT @@ -58,10 +61,11 @@ jobs: image: "nvidia/cuopt:${{ inputs.IMAGE_TAG_PREFIX }}-cuda${{ needs.prepare.outputs.CUDA_SHORT }}-py${{ needs.prepare.outputs.PYTHON_SHORT }}" steps: - name: Checkout code repo - uses: actions/checkout@v4 + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 with: fetch-depth: 0 ref: ${{ inputs.sha }} + persist-credentials: false - name: Test cuopt run: | bash ./ci/docker/test_image.sh diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index d394b97db4..9d71c40e4c 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -3,7 +3,10 @@ name: Trigger Breaking Change Notifications -on: +# `zizmor` always flags these triggers because they are easy to use +# incorrectly. 
These usages are ok and don't execute any PR-specific +# code (and so aren't susceptible to exploits from forked PRs) +on: # zizmor: ignore[dangerous-triggers] pull_request_target: types: - closed @@ -11,11 +14,16 @@ on: - labeled - unlabeled +permissions: {} + jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@cuda-13.2.0 + secrets: + NV_SLACK_BREAKING_CHANGE_ALERT: ${{ secrets.NV_SLACK_BREAKING_CHANGE_ALERT }} + permissions: + contents: read with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} diff --git a/.github/zizmor.yml b/.github/zizmor.yml new file mode 100644 index 0000000000..1b6ea1e53f --- /dev/null +++ b/.github/zizmor.yml @@ -0,0 +1,9 @@ +rules: + unpinned-uses: + config: + policies: + # We require SHA-pinning for all workflows and actions _except_ for those from + # rapidsai/shared-workflows and rapidsai/shared-actions + "rapidsai/shared-workflows/*": any + "rapidsai/shared-actions/*": any + "*": hash-pin diff --git a/.gitignore b/.gitignore index 4829b2ecd0..7fd191dc39 100644 --- a/.gitignore +++ b/.gitignore @@ -71,3 +71,6 @@ docs/cuopt/build cpp/include/cuopt/semantic_version.hpp !datasets/quadratic_programming !datasets/quadratic_programming/** + +# conda env (recommended name) +.cuopt_env diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 87a3faaf92..4b5c57d69e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -37,7 +37,7 @@ repos: notebooks ) - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v20.1.4 + rev: v20.1.8 hooks: - id: clang-format files: \.(cu|cuh|h|hpp|cpp|inl)$ @@ -99,6 +99,10 @@ repos: ^[.]cursor-plugin/plugin[.]json$| ^[.]claude-plugin/marketplace[.]json$| ^gemini-extension[.]json$ + - repo: 
https://github.com/zizmorcore/zizmor-pre-commit + rev: v1.24.1 + hooks: + - id: zizmor - repo: local hooks: - id: update-versions diff --git a/.windsurf/rules/AGENTS.md b/.windsurf/rules/AGENTS.md new file mode 120000 index 0000000000..b7e6491d3a --- /dev/null +++ b/.windsurf/rules/AGENTS.md @@ -0,0 +1 @@ +../../AGENTS.md \ No newline at end of file diff --git a/.windsurfrules b/.windsurfrules new file mode 120000 index 0000000000..47dc3e3d86 --- /dev/null +++ b/.windsurfrules @@ -0,0 +1 @@ +AGENTS.md \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md index b77278a155..370f8a15cb 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,36 +1,29 @@ # AGENTS.md — cuOpt AI Agent Entry Point -AI agent skills for NVIDIA cuOpt optimization engine. Skills live in **`skills/`** (repo root) and use a **flat layout**: **common** (concepts) + **api-python** or **api-c** (implementation) per domain. Skills evolve through agent interactions — see `skills/skill-evolution/` for the evolution workflow. - -> **🔒 MANDATORY — Security:** You MUST NOT install, upgrade, or modify packages. Provide the exact command for the user to run; they execute it. No exceptions. +AI agent skills for NVIDIA cuOpt optimization engine. Skills live in **`skills/`** (repo root) and use a **flat layout**: per domain, a concept skill (formulation / problem types) plus implementation skills — typically one per interface (Python, C, CLI, server), or consolidated when the content is shared across interfaces (e.g. installation). Skills evolve through agent interactions — see `skills/skill-evolution/` for the evolution workflow. > **🔒 MANDATORY — Ambiguity:** When the problem could be read more than one way, you MUST either **ask the user to clarify** or **solve every plausible interpretation and report all outcomes**. Never pick one interpretation silently. 
## Skills directory (flat) ### Rules -- `skills/cuopt-user-rules/` — User-facing behavior and conventions; read first when helping users with cuOpt (routing, LP, MILP, QP, install, server). Choose skills from the index below by task, problem type, and interface (Python / C / CLI). -- `skills/cuopt-developer/` — Contributing and development; use when the user is building from source, contributing code, or working on cuOpt internals. +- `skills/cuopt-user-rules/` — Base rules for end users calling cuOpt (routing, LP, MILP, QP, install, server). Not for cuOpt internals — see `skills/cuopt-developer/`. Read first for user-facing tasks; choose skills from the index below by task and interface. +- `skills/cuopt-developer/` — Modify, build, test, debug, and contribute to cuOpt internals (C++/CUDA, Python, server, CI). Use for solver internals, PRs, DCO, and code conventions. - `skills/skill-evolution/` — Skill evolution: after solving a non-trivial problem, propose skill updates to capture generalizable learnings. ### Common (concepts only; no API code) -- `skills/cuopt-installation-common/` — Install: system and environment requirements (concepts only; no install commands or interface) -- `skills/lp-milp-formulation/` — LP/MILP: concepts + problem parsing (parameters, constraints, decisions, objective) +- `skills/numerical-optimization-formulation/` — LP / MILP / QP: concepts + problem parsing + common formulation patterns - `skills/routing-formulation/` — Routing: VRP, TSP, PDP (problem types, data) -- `skills/qp-formulation/` — QP: minimize-only, escalate (beta) - `skills/cuopt-server-common/` — Server: capabilities, workflow +### Installation +- `skills/cuopt-install/` — User install for Python, C, and server (pip, conda, Docker, verification). For building cuOpt from source, see `skills/cuopt-developer/`. 
+ ### API (implementation; one interface per skill) -- `skills/cuopt-installation-api-python/` -- `skills/cuopt-installation-api-c/` -- `skills/cuopt-installation-developer/` (build from source) -- `skills/cuopt-lp-milp-api-python/` -- `skills/cuopt-lp-milp-api-c/` -- `skills/cuopt-lp-milp-api-cli/` +- `skills/cuopt-numerical-optimization-api-python/` (LP, MILP, QP) +- `skills/cuopt-numerical-optimization-api-c/` (LP, MILP, QP) +- `skills/cuopt-numerical-optimization-api-cli/` (LP, MILP, QP) - `skills/cuopt-routing-api-python/` -- `skills/cuopt-qp-api-python/` -- `skills/cuopt-qp-api-c/` -- `skills/cuopt-qp-api-cli/` - `skills/cuopt-server-api-python/` (deploy + client) ## Skill evolution @@ -58,5 +51,6 @@ Finish solving the problem first, then evaluate. Not every correction warrants a - [Google Colab notebooks](https://colab.research.google.com/github/nvidia/cuopt-examples/) ### Support -- [GitHub Issues](https://github.com/NVIDIA/cuopt/issues) -- [Developer Forums](https://forums.developer.nvidia.com/c/ai-data-science/nvidia-cuopt/514) +- [File a Bug](https://github.com/NVIDIA/cuopt/issues/new?template=bug_report.md) +- [Ask a Question](https://github.com/NVIDIA/cuopt/issues/new?template=submit-question.md) +- [All Issues](https://github.com/NVIDIA/cuopt/issues) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8d03641fde..fd8bc48d64 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -53,11 +53,32 @@ For current release timelines and dates, refer to the [RAPIDS Maintainers Docs]( or [help wanted](https://github.com/NVIDIA/cuopt/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) labels. 3. Comment on the issue stating that you are going to work on it. -4. Create a fork of the cuopt repository and check out a branch with a name that - describes your planned work. For example, `fix-documentation`. +4. 
Fork and set up your local repository: + ```bash + # Clone the main repo + git clone https://github.com/NVIDIA/cuopt.git + cd cuopt + + # Add your fork as a remote + git remote add fork https://github.com/<your-username>/cuopt.git + + # Create a branch from main + git checkout -b fix-documentation main + ``` 5. Write code to address the issue or implement the feature. 6. Add unit tests. Please refer to `cpp/src/tests` for examples of unit tests on C and C++ using gtest and `python/cuopt/cuopt/tests` for examples of unit tests on Python using pytest. -7. [Create your pull request](https://github.com/NVIDIA/cuopt/compare). To run continuous integration (CI) tests without requesting review, open a draft pull request. +7. Install pre-commit hooks, commit, push to your fork, and create a pull request: + ```bash + # Install pre-commit hooks (once per clone) + pre-commit install + + # Commit with DCO sign-off (hooks run automatically) + git commit -s -m "Your commit message" + + # Push to your fork (never push directly to the main repo) + git push fork fix-documentation + ``` + Then [create your pull request](https://github.com/NVIDIA/cuopt/compare) from your fork to the upstream `main` branch. To run continuous integration (CI) tests without requesting review, open a draft pull request. 8. Check if CI is running, if not please request one of the NVIDIA cuOpt developers to trigger it. This might happen in case you have non-verified (non-sign-off) commits or don't have enough permissions to trigger CI. 9. Verify that CI passes all [status checks](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks). Fix if needed. @@ -117,7 +138,7 @@ Architecture: - Clone the repository: ```bash -CUOPT_HOME=$(pwd)/cuopt +export CUOPT_HOME=$(pwd)/cuopt git clone https://github.com/NVIDIA/cuopt.git $CUOPT_HOME cd $CUOPT_HOME ``` @@ -136,9 +157,9 @@ Please install conda if you don't have it already.
You can install [miniforge](h # create the conda environment (assuming in base `cuopt` directory) # note: cuOpt currently doesn't support `channel_priority: strict`; # use `channel_priority: flexible` instead -conda env create --name cuopt_dev --file conda/environments/all_cuda-131_arch-$(uname -m).yaml +conda env create -p ./.cuopt_env --file conda/environments/all_cuda-132_arch-$(uname -m).yaml # activate the environment -conda activate cuopt_dev +conda activate ./.cuopt_env ``` - **Note**: the conda environment files are updated frequently, so the @@ -193,7 +214,7 @@ To build all libraries and tests, simply run To run the C++ tests, run ```bash -cd $CUOPT_HOME/datasets && get_test_data.sh +cd $CUOPT_HOME/datasets && ./get_test_data.sh cd $CUOPT_HOME && datasets/linear_programming/download_pdlp_test_dataset.sh datasets/mip/download_miplib_test_dataset.sh export RAPIDS_DATASET_ROOT_DIR=$CUOPT_HOME/datasets/ @@ -205,7 +226,7 @@ To run python tests, run - To run `cuopt` tests: ```bash -cd $CUOPT_HOME/datasets && get_test_data.sh +cd $CUOPT_HOME/datasets && ./get_test_data.sh cd $CUOPT_HOME && datasets/linear_programming/download_pdlp_test_dataset.sh datasets/mip/download_miplib_test_dataset.sh export RAPIDS_DATASET_ROOT_DIR=$CUOPT_HOME/datasets/ @@ -278,6 +299,16 @@ Please refer to the [dependencies.yaml](dependencies.yaml) file for details on h Add any new dependencies in the `dependencies.yaml` file. It takes care of conda, requirements (pip based dependencies) and pyproject. Please don't try to add dependencies directly to environment.yaml files under `conda/environments` directory and pyproject.toml files under `python` directories. +## Third-Party Code + +When copying or adapting files from external projects into the repository: + +1. **Keep the original license/copyright header** in the copied file +2. 
**Add an entry to the `THIRDPARTY` file** at the repo root with: the source project, its license type, the URL where the original code was found, and which files were copied or derived from it +3. **Verify license compatibility** — the included code must be compatible with Apache-2.0 + +Do not copy third-party code without proper attribution. **Always ask before including external code** — flag it in your PR description so reviewers can verify the license and attribution. + ## Code Formatting ### Using pre-commit hooks @@ -310,7 +341,7 @@ To run pre-commit checks on all files, execute: pre-commit run --all-files ``` -Optionally, you may set up the pre-commit hooks to run automatically when you make a git commit. This can be done by running: +We recommend setting up the pre-commit hooks to run automatically when you make a git commit. This catches formatting and style issues before they reach CI: ```bash pre-commit install diff --git a/CONVENTIONS.md b/CONVENTIONS.md new file mode 120000 index 0000000000..47dc3e3d86 --- /dev/null +++ b/CONVENTIONS.md @@ -0,0 +1 @@ +AGENTS.md \ No newline at end of file diff --git a/JULES.md b/JULES.md new file mode 120000 index 0000000000..47dc3e3d86 --- /dev/null +++ b/JULES.md @@ -0,0 +1 @@ +AGENTS.md \ No newline at end of file diff --git a/RAPIDS_BRANCH b/RAPIDS_BRANCH index d5ea6ced53..ba2906d066 100644 --- a/RAPIDS_BRANCH +++ b/RAPIDS_BRANCH @@ -1 +1 @@ -release/26.04 +main diff --git a/README.md b/README.md index 379a48c350..8c75ee7511 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ # cuOpt - GPU-accelerated Optimization [![Build Status](https://github.com/NVIDIA/cuopt/actions/workflows/build.yaml/badge.svg)](https://github.com/NVIDIA/cuopt/actions/workflows/build.yaml) -[![Version](https://img.shields.io/badge/version-26.04.00-blue)](https://github.com/NVIDIA/cuopt/releases) +[![Version](https://img.shields.io/badge/version-26.06.00-blue)](https://github.com/NVIDIA/cuopt/releases) 
[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen)](https://docs.nvidia.com/cuopt/user-guide/latest/introduction.html) [![Docker Hub](https://img.shields.io/badge/docker-nvidia%2Fcuopt-blue?logo=docker)](https://hub.docker.com/r/nvidia/cuopt) [![Examples](https://img.shields.io/badge/examples-cuopt--examples-orange)](https://github.com/NVIDIA/cuopt-examples) @@ -83,7 +83,7 @@ For CUDA 12.x: pip install \ --extra-index-url=https://pypi.nvidia.com \ nvidia-cuda-runtime-cu12==12.9.* \ - cuopt-server-cu12==26.04.* cuopt-sh-client==26.04.* + cuopt-server-cu12==26.06.* cuopt-sh-client==26.06.* ``` Development wheels are available as nightlies, please update `--extra-index-url` to `https://pypi.anaconda.org/rapidsai-wheels-nightly/simple/` to install latest nightly packages. @@ -91,7 +91,7 @@ Development wheels are available as nightlies, please update `--extra-index-url` pip install --pre \ --extra-index-url=https://pypi.nvidia.com \ --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple/ \ - cuopt-server-cu12==26.04.* cuopt-sh-client==26.04.* + cuopt-server-cu12==26.06.* cuopt-sh-client==26.06.* ``` For CUDA 13.x: @@ -99,7 +99,7 @@ For CUDA 13.x: ```bash pip install \ --extra-index-url=https://pypi.nvidia.com \ - cuopt-server-cu13==26.04.* cuopt-sh-client==26.04.* + cuopt-server-cu13==26.06.* cuopt-sh-client==26.06.* ``` Development wheels are available as nightlies, please update `--extra-index-url` to `https://pypi.anaconda.org/rapidsai-wheels-nightly/simple/` to install latest nightly packages. 
@@ -107,7 +107,7 @@ Development wheels are available as nightlies, please update `--extra-index-url` pip install --pre \ --extra-index-url=https://pypi.nvidia.com \ --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple/ \ - cuopt-server-cu13==26.04.* cuopt-sh-client==26.04.* + cuopt-server-cu13==26.06.* cuopt-sh-client==26.06.* ``` @@ -118,7 +118,7 @@ cuOpt can be installed with conda (via [miniforge](https://github.com/conda-forg All other dependencies are installed automatically when `cuopt-server` and `cuopt-sh-client` are installed. ```bash -conda install -c rapidsai -c conda-forge -c nvidia cuopt-server=26.04.* cuopt-sh-client=26.04.* +conda install -c rapidsai -c conda-forge -c nvidia cuopt-server=26.06.* cuopt-sh-client=26.06.* ``` We also provide [nightly conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD @@ -130,13 +130,15 @@ Users can pull the cuOpt container from the NVIDIA container registry. ```bash # For CUDA 12.x -docker pull nvidia/cuopt:latest-cuda12.9-py3.14 +docker pull nvidia/cuopt:latest-cuda12.9-py3.13 # For CUDA 13.x -docker pull nvidia/cuopt:latest-cuda13.0-py3.14 +docker pull nvidia/cuopt:latest-cuda13.0-py3.13 ``` -Note: The ``latest`` tag is the latest stable release of cuOpt. If you want to use a specific version, you can use the ``-cuda12.9-py3.14`` or ``-cuda13.0-py3.14`` tag. For example, to use cuOpt 25.10.0, you can use the ``25.10.0-cuda12.9-py3.13`` or ``25.10.0-cuda13.0-py3.13`` tag. Please refer to `cuOpt dockerhub page `_ for the list of available tags. +Note: The ``latest`` tag is the latest stable release of cuOpt. If you want to use a specific version, you can use the ``<version>-cuda12.9-py3.13`` or ``<version>-cuda13.0-py3.13`` tag. For example, to use cuOpt 25.10.0, you can use the ``25.10.0-cuda12.9-py3.13`` or ``25.10.0-cuda13.0-py3.13`` tag. Please refer to [cuOpt dockerhub page](https://hub.docker.com/r/nvidia/cuopt/tags) for the list of available tags. 
+ +Nightly container images are built from the HEAD of the development branch and use the upcoming CUDA/Python defaults (`cuda12.9-py3.14` and `cuda13.1-py3.14`). They are tagged as ``<version>a-cuda12.9-py3.14`` or ``<version>a-cuda13.1-py3.14`` (note the ``a`` alpha suffix). See the [cuOpt dockerhub page](https://hub.docker.com/r/nvidia/cuopt/tags) for the full list. More information about the cuOpt container can be found [here](https://docs.nvidia.com/cuopt/user-guide/latest/cuopt-server/quick-start.html#container-from-docker-hub). diff --git a/VERSION b/VERSION index 0bd0e8a95b..cdb610a24d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -26.04.00 +26.06.00 diff --git a/benchmarks/linear_programming/cuopt/benchmark_helper.hpp b/benchmarks/linear_programming/cuopt/benchmark_helper.hpp index 1232ed8e17..feef7483d2 100644 --- a/benchmarks/linear_programming/cuopt/benchmark_helper.hpp +++ b/benchmarks/linear_programming/cuopt/benchmark_helper.hpp @@ -21,7 +21,6 @@ #include #include -#include #include #include @@ -34,7 +33,7 @@ #include #include -inline auto make_async() { return std::make_shared(); } +inline auto make_async() { return rmm::mr::cuda_async_memory_resource(); } inline auto make_pool() { size_t free_mem, total_mem; @@ -43,8 +42,7 @@ inline auto make_pool() double alloc_ratio = 0.4; // allocate 40% size_t initial_pool_size = (size_t(free_mem * alloc_ratio) / rmm_alloc_gran) * rmm_alloc_gran; - return rmm::mr::make_owning_wrapper(make_async(), - initial_pool_size); + return rmm::mr::pool_memory_resource(make_async(), initial_pool_size); } template diff --git a/benchmarks/linear_programming/cuopt/run_mip.cpp b/benchmarks/linear_programming/cuopt/run_mip.cpp index e01e533a65..83ff4c0e10 100644 --- a/benchmarks/linear_programming/cuopt/run_mip.cpp +++ b/benchmarks/linear_programming/cuopt/run_mip.cpp @@ -23,8 +23,6 @@ #include #include -#include - #include #include #include @@ -85,7 +83,7 @@ void write_to_output_file(const std::string& out_dir, } } -inline auto make_async() { 
return std::make_shared(); } +inline auto make_async() { return rmm::mr::cuda_async_memory_resource(); } void read_single_solution_from_path(const std::string& path, const std::vector& var_names, @@ -274,7 +272,7 @@ void run_single_file_mp(std::string file_path, { std::cout << "running file " << file_path << " on gpu : " << device << std::endl; auto memory_resource = make_async(); - rmm::mr::set_current_device_resource(memory_resource.get()); + rmm::mr::set_current_device_resource(memory_resource); int sol_found = run_single_file(file_path, device, batch_id, @@ -426,7 +424,7 @@ int main(int argc, char* argv[]) // smt_file >> smt_active; // if (smt_active) { num_cpu_threads /= 2; } // } - num_cpu_threads = std::max(num_cpu_threads, 1); + num_cpu_threads = std::max(num_cpu_threads, 2); } if (program.is_used("--out-dir")) { @@ -537,14 +535,14 @@ int main(int argc, char* argv[]) auto memory_resource = make_async(); if (memory_limit > 0) { auto limiting_adaptor = - rmm::mr::limiting_resource_adaptor(memory_resource.get(), memory_limit * 1024ULL * 1024ULL); - rmm::mr::set_current_device_resource(&limiting_adaptor); + rmm::mr::limiting_resource_adaptor(memory_resource, memory_limit * 1024ULL * 1024ULL); + rmm::mr::set_current_device_resource(limiting_adaptor); } else if (track_allocations) { - rmm::mr::tracking_resource_adaptor tracking_adaptor(memory_resource.get(), + rmm::mr::tracking_resource_adaptor tracking_adaptor(memory_resource, /*capture_stacks=*/true); - rmm::mr::set_current_device_resource(&tracking_adaptor); + rmm::mr::set_current_device_resource(tracking_adaptor); } else { - rmm::mr::set_current_device_resource(memory_resource.get()); + rmm::mr::set_current_device_resource(memory_resource); } run_single_file(path, 0, diff --git a/benchmarks/linear_programming/cuopt/run_pdlp.cu b/benchmarks/linear_programming/cuopt/run_pdlp.cu index a7838d773e..cd68e042d9 100644 --- a/benchmarks/linear_programming/cuopt/run_pdlp.cu +++ 
b/benchmarks/linear_programming/cuopt/run_pdlp.cu @@ -180,7 +180,7 @@ int main(int argc, char* argv[]) // Setup up RMM memory pool auto memory_resource = make_pool(); - rmm::mr::set_current_device_resource(memory_resource.get()); + rmm::mr::set_current_device_resource(memory_resource); // Initialize raft handle and running stream const raft::handle_t handle_{}; diff --git a/benchmarks/linear_programming/utils/get_datasets.py b/benchmarks/linear_programming/utils/get_datasets.py index 29d23e57de..ddadade995 100644 --- a/benchmarks/linear_programming/utils/get_datasets.py +++ b/benchmarks/linear_programming/utils/get_datasets.py @@ -2,10 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 import os +import sys +import time import argparse import urllib.request import urllib.parse -import ssl import subprocess @@ -628,21 +629,30 @@ def parse_args(): return args -def download(url, dst): +def download(url, dst, max_retries=3, timeout=60): if os.path.exists(dst): return - print(f"Downloading {url} into {dst}...") - # Bypass SSL verification for plato.asu.edu URLs - if "plato.asu.edu" in url: - context = ssl.create_default_context() - context.check_hostname = False - context.verify_mode = ssl.CERT_NONE - response = urllib.request.urlopen(url, context=context) - else: - response = urllib.request.urlopen(url) - data = response.read() - with open(dst, "wb") as fp: - fp.write(data) + os.makedirs(os.path.dirname(dst), exist_ok=True) + for attempt in range(1, max_retries + 1): + print( + f"Downloading {url} into {dst} (attempt {attempt}/{max_retries})..." + ) + try: + response = urllib.request.urlopen(url, timeout=timeout) + data = response.read() + with open(dst, "wb") as fp: + fp.write(data) + return + except Exception as e: + if os.path.exists(dst): + os.remove(dst) + if attempt < max_retries: + wait = 2**attempt + print(f" Failed: {e}. 
Retrying in {wait}s...") + time.sleep(wait) + else: + print(f" Failed after {max_retries} attempts: {e}") + raise def extract(file, dir, type): @@ -652,12 +662,16 @@ def extract(file, dir, type): if basefile.endswith(".bz2"): outfile = basefile.replace(".bz2", ".mps") unzippedfile = basefile.replace(".bz2", "") - subprocess.run(f"cd {dir} && bzip2 -d {basefile}", shell=True) + subprocess.run( + f"cd {dir} && bzip2 -d {basefile}", shell=True, check=True + ) elif basefile.endswith(".gz"): outfile = basefile.replace(".gz", ".mps") unzippedfile = basefile.replace(".gz", "") subprocess.run( - f"cd {dir} && gunzip -c {basefile} > {unzippedfile}", shell=True + f"cd {dir} && gunzip -c {basefile} > {unzippedfile}", + shell=True, + check=True, ) else: raise Exception(f"Unknown file extension found for extraction {file}") @@ -668,11 +682,15 @@ def extract(file, dir, type): file = os.path.join(dir, "emps.c") download(url, file) subprocess.run( - f"cd {dir} && gcc -Wno-implicit-int emps.c -o emps", shell=True + f"cd {dir} && gcc -Wno-implicit-int emps.c -o emps", + shell=True, + check=True, ) # determine output file and run emps subprocess.run( - f"cd {dir} && ./emps {unzippedfile} > {outfile}", shell=True + f"cd {dir} && ./emps {unzippedfile} > {outfile}", + shell=True, + check=True, ) # cleanup emps and emps.c subprocess.run(f"rm -rf {dir}/emps*", shell=True) @@ -692,8 +710,7 @@ def download_dataset(name, root): if url == "": print(f"Dataset {name} doesn't have a URL. 
Skipping...") return - else: - os.mkdir(dir) + os.makedirs(dir, exist_ok=True) file = os.path.join(dir, os.path.basename(url)) download(url, file) extract(file, dir, type) @@ -715,17 +732,35 @@ def main(): if not os.path.exists(args.instance_download_path): os.makedirs(args.instance_download_path) instance_download_path = args.instance_download_path + + failed = [] + datasets_to_download = [] if args.LPfeasible: - for name in LPFeasibleMittelmannSet: - download_dataset(name, instance_download_path) + datasets_to_download.extend(LPFeasibleMittelmannSet) if args.datasets: - for name in args.datasets: - download_dataset(name, instance_download_path) + datasets_to_download.extend(args.datasets) if args.benchmarks: for bench in args.benchmarks: - for name in MittelmannInstances["benchmarks"][bench]: - download_dataset(name, instance_download_path) - return + if bench not in MittelmannInstances["benchmarks"]: + print(f"ERROR: Unknown benchmark '{bench}'") + failed.append(bench) + continue + datasets_to_download.extend( + MittelmannInstances["benchmarks"][bench] + ) + + for name in datasets_to_download: + try: + download_dataset(name, instance_download_path) + except Exception as e: + print(f"ERROR: Failed to download dataset '{name}': {e}") + failed.append(name) + + if failed: + print( + f"\n{len(failed)} dataset(s) failed to download: {', '.join(failed)}" + ) + sys.exit(1) if __name__ == "__main__": diff --git a/build.sh b/build.sh index 5f9ac4071a..218505ed46 100755 --- a/build.sh +++ b/build.sh @@ -15,7 +15,7 @@ REPODIR=$(cd "$(dirname "$0")"; pwd) LIBCUOPT_BUILD_DIR=${LIBCUOPT_BUILD_DIR:=${REPODIR}/cpp/build} LIBMPS_PARSER_BUILD_DIR=${LIBMPS_PARSER_BUILD_DIR:=${REPODIR}/cpp/libmps_parser/build} -VALIDARGS="clean libcuopt cuopt_grpc_server libmps_parser cuopt_mps_parser cuopt cuopt_server cuopt_sh_client docs deb -a -b -g -fsanitize -tsan -msan -v -l= --verbose-pdlp --build-lp-only --no-fetch-rapids --skip-c-python-adapters --skip-tests-build --skip-routing-build 
--skip-fatbin-write --host-lineinfo [--cmake-args=\\\"\\\"] [--cache-tool=] -n --allgpuarch --ci-only-arch --show_depr_warn -h --help" +VALIDARGS="clean libcuopt cuopt_grpc_server libmps_parser cuopt_mps_parser cuopt cuopt_server cuopt_sh_client docs deb -a -b -g -fsanitize -tsan -msan -v -l= --verbose-pdlp --build-lp-only --no-fetch-rapids --skip-c-python-adapters --skip-tests-build --skip-routing-build --skip-grpc-build --skip-fatbin-write --host-lineinfo [--cmake-args=\\\"\\\"] [--cache-tool=] -n --allgpuarch --ci-only-arch --show_depr_warn -h --help" HELP="$0 [ ...] [ ...] where is: clean - remove all existing build artifacts and configuration (start over) @@ -44,6 +44,7 @@ HELP="$0 [ ...] [ ...] --skip-c-python-adapters - skip building C and Python adapter files (cython_solve.cu and cuopt_c.cpp) --skip-tests-build - disable building of all tests --skip-routing-build - skip building routing components + --skip-grpc-build - skip building gRPC and protobuf components (auto-enabled with -tsan) --skip-fatbin-write - skip the fatbin write --host-lineinfo - build with debug line information for host code --cache-tool= - pass the build cache tool (eg: ccache, sccache, distcc) that will be used @@ -54,7 +55,7 @@ HELP="$0 [ ...] [ ...] 
--show_depr_warn - show cmake deprecation warnings -h - print this text - default action (no args) is to build and install 'libcuopt' then 'cuopt' then 'docs' targets + default action (no args) is to build and install 'libmps_parser', 'libcuopt', 'cuopt', 'cuopt_mps_parser', 'cuopt_server', and 'cuopt_sh_client' targets (pass 'docs' explicitly to build documentation) libcuopt build dir is: ${LIBCUOPT_BUILD_DIR} @@ -84,6 +85,7 @@ BUILD_MSAN=0 SKIP_C_PYTHON_ADAPTERS=0 SKIP_TESTS_BUILD=0 SKIP_ROUTING_BUILD=0 +SKIP_GRPC_BUILD=0 WRITE_FATBIN=1 HOST_LINEINFO=0 CACHE_ARGS=() @@ -238,6 +240,7 @@ if hasArg -fsanitize; then fi if hasArg -tsan; then BUILD_TSAN=1 + SKIP_GRPC_BUILD=1 fi if hasArg -msan; then BUILD_MSAN=1 @@ -251,6 +254,9 @@ fi if hasArg --skip-routing-build; then SKIP_ROUTING_BUILD=1 fi +if hasArg --skip-grpc-build; then + SKIP_GRPC_BUILD=1 +fi if hasArg --skip-fatbin-write; then WRITE_FATBIN=0 fi @@ -379,6 +385,7 @@ if buildAll || hasArg libcuopt || hasArg cuopt_grpc_server; then -DSKIP_C_PYTHON_ADAPTERS=${SKIP_C_PYTHON_ADAPTERS} \ -DBUILD_TESTS=$((1 - ${SKIP_TESTS_BUILD})) \ -DSKIP_ROUTING_BUILD=${SKIP_ROUTING_BUILD} \ + -DSKIP_GRPC_BUILD=${SKIP_GRPC_BUILD} \ -DWRITE_FATBIN=${WRITE_FATBIN} \ -DHOST_LINEINFO=${HOST_LINEINFO} \ -DPARALLEL_LEVEL="${PARALLEL_LEVEL}" \ @@ -443,8 +450,8 @@ if buildAll || hasArg cuopt_sh_client; then python "${PYTHON_ARGS_FOR_INSTALL[@]}" . fi -# Build the docs -if buildAll || hasArg docs; then +# Build the docs (opt-in; pass 'docs' explicitly to build) +if hasArg docs; then cd "${REPODIR}"/cpp/doxygen doxygen Doxyfile diff --git a/ci/build_summary.sh b/ci/build_summary.sh new file mode 100755 index 0000000000..f10e81bc12 --- /dev/null +++ b/ci/build_summary.sh @@ -0,0 +1,149 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Send a Slack notification summarizing the build workflow status. 
+# Queries the GitHub API for job statuses and posts a compact message. + +set -euo pipefail + +BRANCH="${RAPIDS_BRANCH:-main}" +RUN_DATE="$(date +%F)" +GITHUB_RUN_URL="${GITHUB_SERVER_URL:-https://github.com}/${GITHUB_REPOSITORY:-NVIDIA/cuopt}/actions/runs/${GITHUB_RUN_ID:-}" +SLACK_BOT_TOKEN="${SLACK_BOT_TOKEN:-}" +SLACK_CHANNEL_ID="${SLACK_CHANNEL_ID:-}" + +if [ -z "${SLACK_BOT_TOKEN}" ] || [ -z "${SLACK_CHANNEL_ID}" ]; then + echo "SLACK_BOT_TOKEN or SLACK_CHANNEL_ID not set, skipping build summary." + exit 0 +fi + +# Fetch workflow job statuses +JOBS_FILE=$(mktemp) +if [ -n "${GITHUB_TOKEN:-}" ] && [ -n "${GITHUB_RUN_ID:-}" ] && [ -n "${GITHUB_REPOSITORY:-}" ]; then + echo "Fetching build job statuses from GitHub API..." + curl -s -L --max-time 30 \ + -H "Authorization: Bearer ${GITHUB_TOKEN}" \ + -H "Accept: application/vnd.github+json" \ + "https://api.github.com/repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/jobs?per_page=100" \ + > "${JOBS_FILE}" || echo "{}" > "${JOBS_FILE}" +else + echo "{}" > "${JOBS_FILE}" +fi + +# Generate Slack payload +PAYLOAD=$(python3 -c " +import json, sys + +with open(sys.argv[1]) as f: + data = json.load(f) +branch = sys.argv[2] +date = sys.argv[3] +run_url = sys.argv[4] + +jobs = data.get('jobs', []) + +# Filter out build-summary itself and compute-matrix helpers +jobs = [j for j in jobs + if 'build-summary' not in j.get('name', '').lower() + and 'compute-matrix' not in j.get('name', '').lower()] + +# Group by workflow prefix +groups = {} +for j in jobs: + name = j.get('name', '') + prefix = name.split(' / ')[0] if ' / ' in name else name + groups.setdefault(prefix, []).append(j) + +total = len(jobs) +failed_count = sum(1 for j in jobs if j.get('conclusion') == 'failure') +passed_count = sum(1 for j in jobs if j.get('conclusion') == 'success') + +if failed_count > 0: + emoji = ':x:' + status = f'{failed_count} build job(s) failed' +else: + emoji = ':white_check_mark:' + status = f'All {passed_count} build jobs 
passed' + +blocks = [] +blocks.append({ + 'type': 'header', + 'text': {'type': 'plain_text', 'text': f'cuOpt Build \u2014 {branch} \u2014 {date}', 'emoji': True}, +}) +blocks.append({ + 'type': 'section', + 'text': {'type': 'mrkdwn', 'text': f'{emoji} *{status}*'}, +}) +blocks.append({'type': 'divider'}) + +# Build status per group +lines = [] +for group_name, group_jobs in sorted(groups.items()): + g_passed = sum(1 for j in group_jobs if j.get('conclusion') == 'success') + g_failed = sum(1 for j in group_jobs if j.get('conclusion') == 'failure') + g_total = len(group_jobs) + + if g_failed > 0: + icon = ':x:' + detail = f'{g_failed}/{g_total} failed' + # Add clickable log links for failed jobs + failed_in_group = [j for j in group_jobs if j.get('conclusion') == 'failure'] + if failed_in_group and failed_in_group[0].get('html_url'): + log_url = failed_in_group[0]['html_url'] + detail += f' <{log_url}|View Logs>' + elif g_passed == g_total: + icon = ':white_check_mark:' + detail = f'{g_total} passed' + else: + icon = ':grey_question:' + detail = f'{g_passed}/{g_total} passed' + lines.append(f'{icon} *{group_name}* \u2014 {detail}') + +current = '' +for line in lines: + if current and len(current) + len(line) + 1 > 2900: + blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': current.rstrip()}}) + current = '' + current += line + '\n' +if current.strip(): + blocks.append({'type': 'section', 'text': {'type': 'mrkdwn', 'text': current.rstrip()}}) + +# Link +if run_url: + blocks.append({'type': 'divider'}) + blocks.append({ + 'type': 'context', + 'elements': [{'type': 'mrkdwn', 'text': f'<{run_url}|:github: GitHub Actions>'}], + }) + +print(json.dumps({ + 'username': 'cuOpt Build Bot', + 'icon_emoji': ':package:', + 'blocks': blocks, +})) +" "${JOBS_FILE}" "${BRANCH}" "${RUN_DATE}" "${GITHUB_RUN_URL}") + +rm -f "${JOBS_FILE}" + +# Send via bot token +echo "Sending build summary to Slack..." 
+BOT_PAYLOAD=$(python3 -c " +import json, sys +p = json.loads(sys.argv[1]) +p['channel'] = sys.argv[2] +print(json.dumps(p)) +" "${PAYLOAD}" "${SLACK_CHANNEL_ID}") + +RESPONSE=$(curl -s --max-time 30 -X POST \ + -H "Authorization: Bearer ${SLACK_BOT_TOKEN}" \ + -H "Content-Type: application/json" \ + --data "${BOT_PAYLOAD}" \ + "https://slack.com/api/chat.postMessage" || echo '{"ok":false,"error":"curl_failed"}') + +OK=$(echo "${RESPONSE}" | python3 -c "import json,sys; print(json.load(sys.stdin).get('ok',''))" 2>/dev/null || echo "") +if [ "${OK}" != "True" ]; then + echo "ERROR: chat.postMessage failed: ${RESPONSE}" >&2 +else + echo "Build summary posted to Slack." +fi diff --git a/ci/dashboard/index.html b/ci/dashboard/index.html new file mode 100644 index 0000000000..98900ab458 --- /dev/null +++ b/ci/dashboard/index.html @@ -0,0 +1,696 @@ + + + + + + +cuOpt Nightly Test Dashboard + + + + + + + + + + + + +
+
Loading dashboard data...
+
+ + + + + + + + + diff --git a/ci/docker/Dockerfile b/ci/docker/Dockerfile index 6df4159d81..6167308ea0 100644 --- a/ci/docker/Dockerfile +++ b/ci/docker/Dockerfile @@ -41,7 +41,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends build-essential ENV DEBIAN_FRONTEND="" -RUN ln -sf /usr/bin/python${PYTHON_SHORT_VER} /usr/bin/python +RUN update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_SHORT_VER} 100 && \ + update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_SHORT_VER} 100 FROM python-env AS install-env @@ -59,6 +60,9 @@ RUN \ --no-cache-dir \ "pyyaml" \ "cuopt-server-${cuda_suffix}==${CUOPT_VER}" \ + "cuopt-${cuda_suffix}==${CUOPT_VER}" \ + "libcuopt-${cuda_suffix}==${CUOPT_VER}" \ + "cuopt-mps-parser==${CUOPT_VER}" \ "cuopt-sh-client==${CUOPT_VER}" && \ python -m pip list diff --git a/ci/nightly_summary.sh b/ci/nightly_summary.sh new file mode 100755 index 0000000000..0340889c77 --- /dev/null +++ b/ci/nightly_summary.sh @@ -0,0 +1,125 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Aggregate all per-matrix nightly test summaries and send a single +# consolidated Slack notification. Runs as a post-test job after all +# matrix CI jobs finish. +# +# The script needs S3 access via CUOPT_S3_URI (bucket root) and CUOPT_AWS_* credentials. +# +# Optional: +# CUOPT_SLACK_BOT_TOKEN - sends Slack if set (with CUOPT_SLACK_CHANNEL_ID) +# CUOPT_SLACK_CHANNEL_ID - Slack channel ID +# RAPIDS_BRANCH - branch name (default: main) +# RAPIDS_BUILD_TYPE - build type (nightly, pull-request, etc.) 
+# GITHUB_TOKEN - for querying workflow job statuses +# GITHUB_RUN_ID - current workflow run ID + +set -euo pipefail + +SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")" +OUTPUT_DIR="${PWD}/aggregate-output" +mkdir -p "${OUTPUT_DIR}" + +RUN_DATE="${RUN_DATE:-$(date +%F)}" +BRANCH="${RAPIDS_BRANCH:-main}" + +GITHUB_RUN_URL="${GITHUB_SERVER_URL:-https://github.com}/${GITHUB_REPOSITORY:-NVIDIA/cuopt}/actions/runs/${GITHUB_RUN_ID:-}" + +# Map CUOPT_AWS_* to standard AWS env vars for the aws CLI +export AWS_ACCESS_KEY_ID="${CUOPT_AWS_ACCESS_KEY_ID:-${AWS_ACCESS_KEY_ID:-}}" +export AWS_SECRET_ACCESS_KEY="${CUOPT_AWS_SECRET_ACCESS_KEY:-${AWS_SECRET_ACCESS_KEY:-}}" +unset AWS_SESSION_TOKEN + +if [ -z "${CUOPT_S3_URI:-}" ]; then + echo "WARNING: CUOPT_S3_URI is not set. Skipping nightly aggregation." >&2 + exit 0 +fi + +S3_BASE="${CUOPT_S3_URI}ci_test_reports/nightly" +BRANCH_SLUG=$(echo "${BRANCH}" | tr '/' '-') + +# Summaries are scoped by GITHUB_RUN_ID so each workflow run is isolated. +# The run-scoped path has no date component — the run ID is unique, and +# dropping the date prevents mismatches when test jobs span midnight UTC. +# Fallback: branch-scoped path for backwards compat or non-CI runs. 
+if [ -n "${GITHUB_RUN_ID:-}" ]; then + S3_SUMMARIES_PREFIX="${S3_BASE}/summaries/run-${GITHUB_RUN_ID}/" +else + S3_SUMMARIES_PREFIX="${S3_BASE}/summaries/${RUN_DATE}/${BRANCH_SLUG}/" +fi +S3_REPORTS_PREFIX="${S3_BASE}/reports/${RUN_DATE}/${BRANCH_SLUG}/" +S3_CONSOLIDATED_JSON="${S3_BASE}/summaries/${RUN_DATE}/${BRANCH_SLUG}/consolidated.json" +S3_CONSOLIDATED_HTML="${S3_BASE}/reports/${RUN_DATE}/${BRANCH_SLUG}/consolidated.html" +S3_INDEX_URI="${S3_BASE}/index.json" +S3_DASHBOARD_URI="${S3_BASE}/dashboard/${BRANCH_SLUG}/index.html" +DASHBOARD_DIR="${SCRIPT_DIR}/dashboard" + +# --- Query GitHub API for workflow job statuses --- +WORKFLOW_JOBS_JSON="${OUTPUT_DIR}/workflow_jobs.json" +if [ -n "${GITHUB_TOKEN:-}" ] && [ -n "${GITHUB_RUN_ID:-}" ] && [ -n "${GITHUB_REPOSITORY:-}" ]; then + echo "Fetching workflow job statuses from GitHub API..." + curl -s -L --max-time 30 \ + -H "Authorization: Bearer ${GITHUB_TOKEN}" \ + -H "Accept: application/vnd.github+json" \ + "https://api.github.com/repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/jobs?per_page=100" \ + > "${WORKFLOW_JOBS_JSON}" || echo "{}" > "${WORKFLOW_JOBS_JSON}" +else + echo "WARNING: GITHUB_TOKEN or GITHUB_RUN_ID not set, skipping workflow job status." >&2 + echo "{}" > "${WORKFLOW_JOBS_JSON}" +fi + + +# Fallback: if the primary prefix is empty, try the branch-slug prefix. +# This handles cases where GITHUB_RUN_ID wasn't available in test containers +# (summaries were uploaded under the branch slug instead of run ID). 
+S3_SUMMARIES_FALLBACK="${S3_BASE}/summaries/${RUN_DATE}/${BRANCH_SLUG}/" + +echo "Aggregating nightly summaries from ${S3_SUMMARIES_PREFIX}" + +python3 "${SCRIPT_DIR}/utils/aggregate_nightly.py" \ + --s3-summaries-prefix "${S3_SUMMARIES_PREFIX}" \ + --s3-summaries-fallback "${S3_SUMMARIES_FALLBACK}" \ + --s3-reports-prefix "${S3_REPORTS_PREFIX}" \ + --s3-output-uri "${S3_CONSOLIDATED_JSON}" \ + --s3-html-output-uri "${S3_CONSOLIDATED_HTML}" \ + --s3-index-uri "${S3_INDEX_URI}" \ + --s3-dashboard-uri "${S3_DASHBOARD_URI}" \ + --dashboard-dir "${DASHBOARD_DIR}" \ + --output-dir "${OUTPUT_DIR}" \ + --date "${RUN_DATE}" \ + --branch "${BRANCH}" \ + --github-run-url "${GITHUB_RUN_URL}" \ + --workflow-jobs "${WORKFLOW_JOBS_JSON}" + +# --- Write GitHub Step Summary (if available) --- +if [ -n "${GITHUB_STEP_SUMMARY:-}" ] && [ -f "${OUTPUT_DIR}/consolidated_summary.json" ]; then + python3 "${SCRIPT_DIR}/utils/generate_step_summary.py" "${OUTPUT_DIR}/consolidated_summary.json" >> "${GITHUB_STEP_SUMMARY}" || true +fi + +# --- Generate presigned URLs for reports (7-day expiry) --- +PRESIGN_EXPIRY=604800 +PRESIGNED_HTML=$(aws s3 presign "${S3_CONSOLIDATED_HTML}" --expires-in "${PRESIGN_EXPIRY}" 2>/dev/null) || { + echo "WARNING: Failed to generate presigned URL for report" >&2 + PRESIGNED_HTML="" +} +PRESIGNED_DASHBOARD=$(aws s3 presign "${S3_DASHBOARD_URI}" --expires-in "${PRESIGN_EXPIRY}" 2>/dev/null) || { + echo "WARNING: Failed to generate presigned URL for dashboard" >&2 + PRESIGNED_DASHBOARD="" +} + +# Send consolidated Slack notification if bot token is available and this is a nightly build +if [ -n "${CUOPT_SLACK_BOT_TOKEN:-}" ] && [ -n "${CUOPT_SLACK_CHANNEL_ID:-}" ] && [ "${RAPIDS_BUILD_TYPE:-}" = "nightly" ]; then + echo "Sending consolidated Slack notification" + CONSOLIDATED_SUMMARY="${OUTPUT_DIR}/consolidated_summary.json" \ + CONSOLIDATED_HTML="${OUTPUT_DIR}/consolidated_report.html" \ + SLACK_BOT_TOKEN="${CUOPT_SLACK_BOT_TOKEN}" \ + 
SLACK_CHANNEL_ID="${CUOPT_SLACK_CHANNEL_ID}" \ + CUOPT_SLACK_MENTION_ID="${CUOPT_SLACK_MENTION_ID:-}" \ + PRESIGNED_REPORT_URL="${PRESIGNED_HTML}" \ + PRESIGNED_DASHBOARD_URL="${PRESIGNED_DASHBOARD}" \ + bash "${SCRIPT_DIR}/utils/send_consolidated_summary.sh" +fi + +echo "Nightly summary complete." diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 3d6c356b3d..9a67bb65a5 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -152,3 +152,6 @@ elif [[ "${RUN_CONTEXT}" == "release" ]]; then sed_runner "s|\\bmain\\b|release/${NEXT_SHORT_TAG}|g" docs/cuopt/source/faq.rst sed_runner "s|\\bmain\\b|release/${NEXT_SHORT_TAG}|g" docs/cuopt/source/cuopt-python/routing/routing-example.ipynb fi + +# Update docs version switcher to include the new version +python ci/utils/update_doc_versions.py diff --git a/ci/run_ctests.sh b/ci/run_ctests.sh index fc1de8e1b4..7cf7b60d03 100755 --- a/ci/run_ctests.sh +++ b/ci/run_ctests.sh @@ -2,13 +2,31 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +# Run gtests with per-test-case retry for flaky detection. 
+# +# Features: +# - Runs each gtest binary and collects JUnit XML results +# - On failure, parses XML to find failing test cases and retries them individually +# - Produces separate XML files per retry so nightly_report.py can classify flaky tests +# - Detects segfaults (signal 11) and isolates crashing tests +# +# Environment variables: +# GTEST_OUTPUT - gtest XML output prefix (set by test_cpp.sh) +# GTEST_MAX_RETRIES - max retries per failing test case (default: 2) +# RAPIDS_TESTS_DIR - directory for test results + set -euo pipefail +SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")" + +# shellcheck source=ci/utils/crash_helpers.sh +source "${SCRIPT_DIR}/utils/crash_helpers.sh" + # Support customizing the gtests' install location # First, try the installed location (CI/conda environments) installed_test_location="${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/gtests/libcuopt/" # Fall back to the build directory (devcontainer environments) -devcontainers_test_location="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/../cpp/build/latest/gtests/libcuopt/" +devcontainers_test_location="${SCRIPT_DIR}/../cpp/build/latest/gtests/libcuopt/" if [[ -d "${installed_test_location}" ]]; then GTEST_DIR="${installed_test_location}" @@ -21,16 +39,202 @@ else exit 1 fi -for gt in "${GTEST_DIR}"/*_TEST; do +GTEST_MAX_RETRIES=${GTEST_MAX_RETRIES:-2} +RAPIDS_TESTS_DIR="${RAPIDS_TESTS_DIR:-${PWD}/test-results}" +IS_NIGHTLY="${RAPIDS_BUILD_TYPE:-}" + + +JUNIT_HELPERS="${SCRIPT_DIR}/utils/junit_helpers.py" + +# Extract failing test case names from a gtest JUnit XML file +extract_failed_tests() { + local xml_file="$1" + if [ ! -f "${xml_file}" ]; then + echo "" + return + fi + python3 "${JUNIT_HELPERS}" failed "${xml_file}" +} + +OVERALL_RC=0 +FAILED_BINARIES=() + +# Record a failed gtest binary for the end-of-run summary. 
+# Args: +record_binary_failure() { + FAILED_BINARIES+=("$1 — $2") +} + +# Synthesize a JUnit crash record so a binary-level crash is visible to +# nightly_report.py. gtest only writes its XML at the end of +# RUN_ALL_TESTS(); a SIGSEGV/SIGABRT mid-run leaves no XML behind, so +# without this record the failure is invisible to the classifier. +# Written to a separate *-crash.xml file to preserve any partial XML. +# Args: +write_binary_crash_marker() { + local test_name="$1" + local xml_dir="$2" + local rc="$3" + local sig + sig=$(signal_name "${rc}") + local crash_xml="${xml_dir}/${test_name}-crash.xml" + write_crash_xml "${crash_xml}" "${test_name}" "PROCESS_CRASH" \ + "${test_name} crashed with ${sig} (exit code ${rc})" \ + "Process terminated by ${sig} mid-run. gtest did not emit a JUnit XML because RUN_ALL_TESTS() did not complete; inspect the run log for [FAILED] / stack-trace lines that preceded the crash." +} + +run_gtest_with_retry() { + local gt="$1" + shift + local test_name test_name=$(basename "${gt}") + local xml_file="${RAPIDS_TESTS_DIR}/${test_name}.xml" + echo "Running gtest ${test_name}" - "${gt}" "$@" + + # First run — full binary + local rc=0 + "${gt}" --gtest_output="xml:${xml_file}" "$@" || rc=$? 
+ + if [ "${rc}" -eq 0 ]; then + return 0 + fi + + # For non-nightly builds: fail immediately, no retries + # PRs should surface failures directly so authors can see what broke + if [ "${IS_NIGHTLY}" != "nightly" ]; then + if was_signal_death "${rc}"; then + local sig + sig=$(signal_name "${rc}") + echo "CRASH: ${test_name} died from ${sig} (exit code ${rc})" + write_binary_crash_marker "${test_name}" "${RAPIDS_TESTS_DIR}" "${rc}" + record_binary_failure "${test_name}" "CRASH (${sig})" + else + echo "FAILED: ${test_name} (exit code ${rc})" + record_binary_failure "${test_name}" "exit ${rc}" + fi + OVERALL_RC=1 + return 1 + fi + + # Determine which tests to retry + local tests_to_retry="" + + if was_signal_death "${rc}"; then + echo "CRASH: ${test_name} died from $(signal_name ${rc}) (exit code ${rc})" + + # Find tests that didn't get to run (not in the partial XML) + # plus any that failed. Only retry those, not the ones that passed. + echo "INFO: Finding tests that need retry in ${test_name}" + local all_tests + all_tests=$("${gt}" --gtest_list_tests "$@" 2>/dev/null \ + | python3 "${JUNIT_HELPERS}" gtest-list || echo "") + + # Extract tests that already passed from partial XML + local passed_tests="" + if [ -f "${xml_file}" ]; then + passed_tests=$(python3 "${JUNIT_HELPERS}" passed "${xml_file}" || echo "") + fi + + # Retry = all_tests - passed_tests + if [ -n "${passed_tests}" ]; then + tests_to_retry=$(comm -23 \ + <(echo "${all_tests}" | sort) \ + <(echo "${passed_tests}" | sort)) + else + tests_to_retry="${all_tests}" + fi + + if [ -z "${tests_to_retry}" ]; then + echo "FAILED: Could not list tests in ${test_name}, cannot retry" + write_crash_xml "${xml_file}" "${test_name}" "PROCESS_CRASH" \ + "${test_name} crashed with $(signal_name ${rc}) (exit code ${rc})" \ + "Process terminated by $(signal_name ${rc}). This may indicate a segfault, double-free, or stack overflow." 
+ record_binary_failure "${test_name}" "CRASH ($(signal_name ${rc})), gtest_list_tests unavailable" + OVERALL_RC=1 + return 1 + fi + else + # Normal failure — extract which test cases failed from XML + tests_to_retry=$(extract_failed_tests "${xml_file}") + + if [ -z "${tests_to_retry}" ]; then + echo "FAILED: ${test_name} failed but could not identify failing test cases" + record_binary_failure "${test_name}" "exit ${rc}, no failing testcase parseable from XML" + OVERALL_RC=1 + return 1 + fi + fi + + local num_to_retry + num_to_retry=$(echo "${tests_to_retry}" | wc -l) + echo "INFO: Retrying ${num_to_retry} test case(s) from ${test_name} individually" + + # Retry each test case individually + local all_passed=true + while IFS= read -r tc; do + local tc_passed=false + for attempt in $(seq 1 "${GTEST_MAX_RETRIES}"); do + local tc_safe + tc_safe=$(echo "${tc}" | tr -c '[:alnum:]._-' '_') + local retry_xml="${RAPIDS_TESTS_DIR}/${test_name}-retry${attempt}-${tc_safe}.xml" + echo " Retry ${attempt}/${GTEST_MAX_RETRIES}: ${tc}" + + local retry_rc=0 + "${gt}" --gtest_filter="${tc}" --gtest_output="xml:${retry_xml}" "$@" || retry_rc=$? + + if [ "${retry_rc}" -eq 0 ]; then + echo " FLAKY: ${tc} passed on retry ${attempt}" + tc_passed=true + break + fi + + if was_signal_death "${retry_rc}"; then + echo " CRASH: ${tc} died from $(signal_name ${retry_rc}) on retry ${attempt}" + write_crash_xml "${retry_xml}" "${test_name}" "${tc}" \ + "${tc} crashed with $(signal_name ${retry_rc}) on retry ${attempt}" \ + "Process terminated by $(signal_name ${retry_rc}). This test causes intermittent crashes." 
+ # Don't break — keep retrying, might be a flaky crash + fi + done + + if [ "${tc_passed}" = false ]; then + echo " FAILED: ${tc} failed after $((GTEST_MAX_RETRIES + 1)) attempts" + all_passed=false + fi + done <<< "${tests_to_retry}" + + if [ "${all_passed}" = false ]; then + record_binary_failure "${test_name}" "retries exhausted" + OVERALL_RC=1 + return 1 + fi + return 0 +} + +for gt in "${GTEST_DIR}"/*_TEST; do + run_gtest_with_retry "${gt}" "$@" || true done # Run C_API_TEST with CPU memory for local solves (excluding time limit tests) if [ -x "${GTEST_DIR}/C_API_TEST" ]; then echo "Running gtest C_API_TEST with CUOPT_USE_CPU_MEM_FOR_LOCAL" - CUOPT_USE_CPU_MEM_FOR_LOCAL=1 "${GTEST_DIR}/C_API_TEST" --gtest_filter=-c_api/TimeLimitTestFixture.* "$@" + CUOPT_USE_CPU_MEM_FOR_LOCAL=1 run_gtest_with_retry "${GTEST_DIR}/C_API_TEST" --gtest_filter=-c_api/TimeLimitTestFixture.* "$@" || true else echo "Skipping C_API_TEST with CUOPT_USE_CPU_MEM_FOR_LOCAL (binary not found)" fi + +# Final summary so failures are easy to spot in the raw run log. +# nightly_report.py also produces a structured report from the XML files, +# but this prints early (before any post-test-script steps) and surfaces +# crashes that bypassed gtest's XML output. +if [ "${#FAILED_BINARIES[@]}" -gt 0 ]; then + echo "" + echo "==================== FAILED gtest BINARIES (${#FAILED_BINARIES[@]}) ====================" + for entry in "${FAILED_BINARIES[@]}"; do + echo " - ${entry}" + done + echo "================================================================" +fi + +exit ${OVERALL_RC} diff --git a/ci/run_cuopt_pytests.sh b/ci/run_cuopt_pytests.sh index 66e996715a..9ee7780dc0 100755 --- a/ci/run_cuopt_pytests.sh +++ b/ci/run_cuopt_pytests.sh @@ -6,7 +6,54 @@ set -euo pipefail # It is essential to cd into python/cuopt/cuopt as `pytest-xdist` + `coverage` seem to work only at this directory level. 
+# Resolve paths before cd (BASH_SOURCE is relative and won't resolve after cd) +SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")" + +# shellcheck source=ci/utils/crash_helpers.sh +source "${SCRIPT_DIR}/utils/crash_helpers.sh" + # Support invoking run_cuopt_pytests.sh outside the script directory -cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cuopt/cuopt/ +cd "${SCRIPT_DIR}/../python/cuopt/cuopt/" + +RAPIDS_TESTS_DIR="${RAPIDS_TESTS_DIR:-${PWD}/test-results}" +export RAPIDS_TESTS_DIR +PYTEST_MAX_CRASH_RETRIES=${PYTEST_MAX_CRASH_RETRIES:-2} +IS_NIGHTLY="${RAPIDS_BUILD_TYPE:-}" + +# Extract junitxml path from args +xml_file="" +for arg in "$@"; do + if [[ "${arg}" == *"junitxml"* ]]; then + xml_file="${arg#*=}" + break + fi +done + +# Add CI utils to PYTHONPATH so the rerun XML plugin is importable +export PYTHONPATH="${SCRIPT_DIR}/utils:${PYTHONPATH:-}" + +rc=0 +if [ "${IS_NIGHTLY}" = "nightly" ]; then + pytest -s --cache-clear --reruns 2 --reruns-delay 5 -p cuopt_rerun_xml "$@" tests || rc=$? +else + pytest -s --cache-clear "$@" tests || rc=$? +fi + +# If not a crash, exit normally +if [ "${rc}" -le 128 ]; then + exit ${rc} +fi + +echo "CRASH: pytest process died from $(signal_name ${rc}) (exit code ${rc})" + +# For non-nightly builds, fail immediately — no crash isolation. But +# still write a synthetic crash XML so nightly_report.py reports the +# failure (pytest didn't finalize JUnit on a mid-run crash). 
+if [ "${IS_NIGHTLY}" != "nightly" ]; then + write_pytest_crash_marker "${xml_file}" "pytest-cuopt" "${rc}" + exit ${rc} +fi + +pytest_crash_isolate "${rc}" "${xml_file}" -pytest -s --cache-clear "$@" tests +exit ${rc} diff --git a/ci/run_cuopt_server_pytests.sh b/ci/run_cuopt_server_pytests.sh index 4cb361a473..1580c038f0 100755 --- a/ci/run_cuopt_server_pytests.sh +++ b/ci/run_cuopt_server_pytests.sh @@ -6,7 +6,49 @@ set -euo pipefail # It is essential to cd into python/cuopt_server/cuopt_server as `pytest-xdist` + `coverage` seem to work only at this directory level. +# Resolve paths before cd (BASH_SOURCE is relative and won't resolve after cd) +SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")" + +# shellcheck source=ci/utils/crash_helpers.sh +source "${SCRIPT_DIR}/utils/crash_helpers.sh" + # Support invoking run_cuopt_server_pytests.sh outside the script directory -cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cuopt_server/cuopt_server/ +cd "${SCRIPT_DIR}/../python/cuopt_server/cuopt_server/" + +RAPIDS_TESTS_DIR="${RAPIDS_TESTS_DIR:-${PWD}/test-results}" +export RAPIDS_TESTS_DIR +PYTEST_MAX_CRASH_RETRIES=${PYTEST_MAX_CRASH_RETRIES:-2} +IS_NIGHTLY="${RAPIDS_BUILD_TYPE:-}" + +xml_file="" +for arg in "$@"; do + if [[ "${arg}" == *"junitxml"* ]]; then + xml_file="${arg#*=}" + break + fi +done + +# Add CI utils to PYTHONPATH so the rerun XML plugin is importable +export PYTHONPATH="${SCRIPT_DIR}/utils:${PYTHONPATH:-}" + +rc=0 +if [ "${IS_NIGHTLY}" = "nightly" ]; then + pytest -s --cache-clear --reruns 2 --reruns-delay 5 -p cuopt_rerun_xml "$@" tests || rc=$? +else + pytest -s --cache-clear "$@" tests || rc=$? 
+fi + +if [ "${rc}" -le 128 ]; then + exit ${rc} +fi + +echo "CRASH: pytest process died from $(signal_name ${rc}) (exit code ${rc})" + +if [ "${IS_NIGHTLY}" != "nightly" ]; then + write_pytest_crash_marker "${xml_file}" "pytest-cuopt-server" "${rc}" + exit ${rc} +fi + +pytest_crash_isolate "${rc}" "${xml_file}" -pytest -s --cache-clear "$@" tests +exit ${rc} diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 653c44133a..840b6f8af0 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -1,6 +1,6 @@ #!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 set -euo pipefail @@ -45,14 +45,28 @@ pushd "${RAPIDS_DATASET_ROOT_DIR}" popd EXITCODE=0 +FAILED_STEPS=() trap "EXITCODE=1" ERR set +e # Run gtests from libcuopt-tests package -export GTEST_OUTPUT=xml:${RAPIDS_TESTS_DIR}/ +# XML output and retry logic handled by run_ctests.sh +export RAPIDS_TESTS_DIR rapids-logger "Run gtests" -timeout 40m ./ci/run_ctests.sh +timeout 50m ./ci/run_ctests.sh || FAILED_STEPS+=("gtests (run_ctests.sh)") + +rapids-logger "Generate nightly test report" +source "$(dirname "$(realpath "${BASH_SOURCE[0]}")")/utils/nightly_report_helper.sh" +generate_nightly_report "cpp" + +if [ "${#FAILED_STEPS[@]}" -gt 0 ]; then + EXITCODE=1 + echo "" + echo "==================== FAILED TEST STEPS (${#FAILED_STEPS[@]}) ====================" + for s in "${FAILED_STEPS[@]}"; do echo " - ${s}"; done + echo "================================================================" +fi rapids-logger "Test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/ci/test_notebooks.sh b/ci/test_notebooks.sh index 22c41af84c..0b2b339ba1 100755 --- a/ci/test_notebooks.sh +++ b/ci/test_notebooks.sh @@ -1,6 +1,6 @@ #!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 set -euo pipefail @@ -64,5 +64,11 @@ for nb in ${NBLIST}; do fi done +popd + +rapids-logger "Generate nightly test report" +source "$(dirname "$(realpath "${BASH_SOURCE[0]}")")/utils/nightly_report_helper.sh" +generate_nightly_report "notebooks" --with-python-version + rapids-logger "Notebook test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/ci/test_python.sh b/ci/test_python.sh index 4f91c83334..df27dfddc5 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -30,6 +30,7 @@ conda activate test set -u RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} +export RAPIDS_TESTS_DIR RAPIDS_COVERAGE_DIR=${RAPIDS_COVERAGE_DIR:-"${PWD}/coverage-results"} mkdir -p "${RAPIDS_TESTS_DIR}" "${RAPIDS_COVERAGE_DIR}" @@ -48,14 +49,12 @@ rapids-logger "Check GPU usage" nvidia-smi EXITCODE=0 +FAILED_STEPS=() trap "EXITCODE=1" ERR set +e -# Due to race condition in certain cases UCX might not be able to cleanup properly, so we set the number of threads to 1 -export OMP_NUM_THREADS=1 - rapids-logger "Test cuopt_cli" -timeout 10m bash ./python/libcuopt/libcuopt/tests/test_cli.sh +timeout 10m bash ./python/libcuopt/libcuopt/tests/test_cli.sh || FAILED_STEPS+=("cuopt_cli") rapids-logger "pytest cuopt" timeout 30m ./ci/run_cuopt_pytests.sh \ @@ -64,7 +63,7 @@ timeout 30m ./ci/run_cuopt_pytests.sh \ --cov=cuopt \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cuopt-coverage.xml" \ --cov-report=term \ - --ignore=raft + --ignore=raft || FAILED_STEPS+=("pytest cuopt") rapids-logger "pytest cuopt-server" timeout 20m ./ci/run_cuopt_server_pytests.sh \ @@ -72,10 +71,22 @@ timeout 20m ./ci/run_cuopt_server_pytests.sh \ --cov-config=.coveragerc \ --cov=cuopt_server \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cuopt-server-coverage.xml" \ - --cov-report=term + --cov-report=term || FAILED_STEPS+=("pytest 
cuopt-server") rapids-logger "Test skills/ assets (Python, C, CLI)" -timeout 10m ./ci/test_skills_assets.sh +timeout 10m ./ci/test_skills_assets.sh || FAILED_STEPS+=("skills assets") + +rapids-logger "Generate nightly test report" +source "$(dirname "$(realpath "${BASH_SOURCE[0]}")")/utils/nightly_report_helper.sh" +generate_nightly_report "python" --with-python-version + +if [ "${#FAILED_STEPS[@]}" -gt 0 ]; then + EXITCODE=1 + echo "" + echo "==================== FAILED TEST STEPS (${#FAILED_STEPS[@]}) ====================" + for s in "${FAILED_STEPS[@]}"; do echo " - ${s}"; done + echo "================================================================" +fi rapids-logger "Test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/ci/test_wheel_cuopt.sh b/ci/test_wheel_cuopt.sh index a327082e83..255727bfb5 100755 --- a/ci/test_wheel_cuopt.sh +++ b/ci/test_wheel_cuopt.sh @@ -63,20 +63,41 @@ cd - RAPIDS_DATASET_ROOT_DIR="$(realpath datasets)" export RAPIDS_DATASET_ROOT_DIR -# Run CLI tests -timeout 10m bash ./python/libcuopt/libcuopt/tests/test_cli.sh +RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} +export RAPIDS_TESTS_DIR +mkdir -p "${RAPIDS_TESTS_DIR}" -# Run Python tests +EXITCODE=0 +FAILED_STEPS=() +trap "EXITCODE=1" ERR +set +e -# Due to race condition in certain cases UCX might not be able to cleanup properly, so we set the number of threads to 1 -export OMP_NUM_THREADS=1 +# Run CLI tests +timeout 10m bash ./python/libcuopt/libcuopt/tests/test_cli.sh || FAILED_STEPS+=("cuopt_cli") -timeout 30m ./ci/run_cuopt_pytests.sh --verbose --capture=no +# Run Python tests +timeout 30m ./ci/run_cuopt_pytests.sh \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-wheel-cuopt.xml" \ + --verbose --capture=no || FAILED_STEPS+=("pytest cuopt (wheel)") # run thirdparty integration tests for only nightly builds if [[ "${RAPIDS_BUILD_TYPE}" == "nightly" ]]; then - ./ci/thirdparty-testing/run_jump_tests.sh - ./ci/thirdparty-testing/run_cvxpy_tests.sh - 
./ci/thirdparty-testing/run_pulp_tests.sh - ./ci/thirdparty-testing/run_pyomo_tests.sh + ./ci/thirdparty-testing/run_jump_tests.sh || FAILED_STEPS+=("thirdparty jump") + ./ci/thirdparty-testing/run_cvxpy_tests.sh || FAILED_STEPS+=("thirdparty cvxpy") + ./ci/thirdparty-testing/run_pulp_tests.sh || FAILED_STEPS+=("thirdparty pulp") + ./ci/thirdparty-testing/run_pyomo_tests.sh || FAILED_STEPS+=("thirdparty pyomo") fi + +# Generate nightly test report +source "$(dirname "$(realpath "${BASH_SOURCE[0]}")")/utils/nightly_report_helper.sh" +generate_nightly_report "wheel-python" --with-python-version + +if [ "${#FAILED_STEPS[@]}" -gt 0 ]; then + EXITCODE=1 + echo "" + echo "==================== FAILED TEST STEPS (${#FAILED_STEPS[@]}) ====================" + for s in "${FAILED_STEPS[@]}"; do echo " - ${s}"; done + echo "================================================================" +fi + +exit ${EXITCODE} diff --git a/ci/test_wheel_cuopt_server.sh b/ci/test_wheel_cuopt_server.sh index a76969b965..b6c8165f35 100755 --- a/ci/test_wheel_cuopt_server.sh +++ b/ci/test_wheel_cuopt_server.sh @@ -39,7 +39,31 @@ rapids-pip-retry install \ RAPIDS_DATASET_ROOT_DIR="$(realpath datasets)" export RAPIDS_DATASET_ROOT_DIR -timeout 30m ./ci/run_cuopt_server_pytests.sh --verbose --capture=no +RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} +mkdir -p "${RAPIDS_TESTS_DIR}" + +EXITCODE=0 +FAILED_STEPS=() +trap "EXITCODE=1" ERR +set +e + +timeout 30m ./ci/run_cuopt_server_pytests.sh \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-wheel-cuopt-server.xml" \ + --verbose --capture=no || FAILED_STEPS+=("pytest cuopt-server (wheel)") # Run documentation tests -./ci/test_doc_examples.sh +./ci/test_doc_examples.sh || FAILED_STEPS+=("doc examples") + +# Generate nightly test report +source "$(dirname "$(realpath "${BASH_SOURCE[0]}")")/utils/nightly_report_helper.sh" +generate_nightly_report "wheel-server" --with-python-version + +if [ "${#FAILED_STEPS[@]}" -gt 0 ]; then + EXITCODE=1 + echo "" + 
echo "==================== FAILED TEST STEPS (${#FAILED_STEPS[@]}) ====================" + for s in "${FAILED_STEPS[@]}"; do echo " - ${s}"; done + echo "================================================================" +fi + +exit ${EXITCODE} diff --git a/ci/thirdparty-testing/run_cvxpy_tests.sh b/ci/thirdparty-testing/run_cvxpy_tests.sh index c336f6a800..51bfbce760 100755 --- a/ci/thirdparty-testing/run_cvxpy_tests.sh +++ b/ci/thirdparty-testing/run_cvxpy_tests.sh @@ -4,6 +4,9 @@ set -e -u -o pipefail +# shellcheck source=ci/utils/crash_helpers.sh +source "$(dirname "$(realpath "${BASH_SOURCE[0]}")")/../utils/crash_helpers.sh" + echo "building 'cvxpy' from source" PYTHON_VERSION=$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') @@ -32,10 +35,26 @@ python -m pip install \ # ensure that environment is still consistent (i.e. cvxpy requirements do not conflict with cuopt's) pip check +RAPIDS_TESTS_DIR="${RAPIDS_TESTS_DIR:-${PWD}/test-results}" +mkdir -p "${RAPIDS_TESTS_DIR}" + echo "running 'cvxpy' tests" +pytest_rc=0 timeout 3m python -m pytest \ --verbose \ --capture=no \ --error-for-skips \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-thirdparty-cvxpy.xml" \ -k "TestCUOPT" \ - ./cvxpy/tests/test_conic_solvers.py + ./cvxpy/tests/test_conic_solvers.py || pytest_rc=$? + +# pytest's normal exit codes are 0-5 (passed / failed / interrupted / +# internal error / usage / no tests collected). Anything beyond that +# (timeout=124, signal deaths >128, etc.) means pytest did not finalize +# its JUnit XML, so synthesize a crash marker — otherwise nightly_report.py +# would see no failure and report "All tests passed." 
+if [ "${pytest_rc}" -gt 5 ]; then + write_pytest_crash_marker "${RAPIDS_TESTS_DIR}/junit-thirdparty-cvxpy.xml" "thirdparty-cvxpy" "${pytest_rc}" +fi + +exit "${pytest_rc}" diff --git a/ci/thirdparty-testing/run_pulp_tests.sh b/ci/thirdparty-testing/run_pulp_tests.sh index f9cb0ca8a5..dd31bdec93 100755 --- a/ci/thirdparty-testing/run_pulp_tests.sh +++ b/ci/thirdparty-testing/run_pulp_tests.sh @@ -4,6 +4,9 @@ set -e -u -o pipefail +# shellcheck source=ci/utils/crash_helpers.sh +source "$(dirname "$(realpath "${BASH_SOURCE[0]}")")/../utils/crash_helpers.sh" + rapids-logger "building 'pulp' from source and running cuOpt tests" if [ -z "${PIP_CONSTRAINT:-}" ]; then @@ -23,14 +26,22 @@ python -m pip install \ pip check +RAPIDS_TESTS_DIR="${RAPIDS_TESTS_DIR:-${PWD}/test-results}" +mkdir -p "${RAPIDS_TESTS_DIR}" + rapids-logger "running PuLP tests (cuOpt-related)" # PuLP uses pytest; run only tests that reference cuopt/CUOPT # Exit code 5 = no tests collected; then try run_tests.py which detects solvers (including cuopt) pytest_rc=0 +# test_numpy_float calls model.solve() with no explicit solver; PuLP's +# default-solver auto-detection list doesn't include CUOPT, so it raises +# "No solver available" in our cuopt-only test environment. Skip it here. timeout 5m python -m pytest \ --verbose \ --capture=no \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-thirdparty-pulp.xml" \ -k "cuopt or CUOPT" \ + --deselect pulp/tests/test_pulp.py::CUOPTTest::test_numpy_float \ pulp/tests/ || pytest_rc=$? if [ "$pytest_rc" -eq 5 ]; then @@ -39,5 +50,14 @@ if [ "$pytest_rc" -eq 5 ]; then pytest_rc=$? fi +# pytest's normal exit codes are 0-5 (passed / failed / interrupted / +# internal error / usage / no tests collected). Anything beyond that +# (timeout=124, signal deaths >128, etc.) means pytest did not finalize +# its JUnit XML, so synthesize a crash marker — otherwise nightly_report.py +# would see no failure and report "All tests passed." 
+if [ "${pytest_rc}" -gt 5 ]; then + write_pytest_crash_marker "${RAPIDS_TESTS_DIR}/junit-thirdparty-pulp.xml" "thirdparty-pulp" "${pytest_rc}" +fi + popd || exit 1 exit "$pytest_rc" diff --git a/ci/thirdparty-testing/run_pyomo_tests.sh b/ci/thirdparty-testing/run_pyomo_tests.sh index f50df676c9..e6c5a962e5 100755 --- a/ci/thirdparty-testing/run_pyomo_tests.sh +++ b/ci/thirdparty-testing/run_pyomo_tests.sh @@ -4,6 +4,9 @@ set -e -u -o pipefail +# shellcheck source=ci/utils/crash_helpers.sh +source "$(dirname "$(realpath "${BASH_SOURCE[0]}")")/../utils/crash_helpers.sh" + rapids-logger "building 'pyomo' from source and running cuOpt tests" if [ -z "${PIP_CONSTRAINT:-}" ]; then @@ -23,12 +26,27 @@ python -m pip install \ pip check +RAPIDS_TESTS_DIR="${RAPIDS_TESTS_DIR:-${PWD}/test-results}" +mkdir -p "${RAPIDS_TESTS_DIR}" + rapids-logger "running Pyomo tests (cuopt_direct / cuOpt-related)" # Run only tests that reference cuopt (cuopt_direct solver) +pytest_rc=0 timeout 5m python -m pytest \ --verbose \ --capture=no \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-thirdparty-pyomo.xml" \ -k "cuopt or CUOPT" \ - pyomo/solvers/tests/ + pyomo/solvers/tests/ || pytest_rc=$? + +# pytest's normal exit codes are 0-5 (passed / failed / interrupted / +# internal error / usage / no tests collected). Anything beyond that +# (timeout=124, signal deaths >128, etc.) means pytest did not finalize +# its JUnit XML, so synthesize a crash marker — otherwise nightly_report.py +# would see no failure and report "All tests passed." 
+if [ "${pytest_rc}" -gt 5 ]; then + write_pytest_crash_marker "${RAPIDS_TESTS_DIR}/junit-thirdparty-pyomo.xml" "thirdparty-pyomo" "${pytest_rc}" +fi popd || exit 1 +exit "${pytest_rc}" diff --git a/ci/utils/aggregate_nightly.py b/ci/utils/aggregate_nightly.py new file mode 100644 index 0000000000..4901fab7c3 --- /dev/null +++ b/ci/utils/aggregate_nightly.py @@ -0,0 +1,840 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Aggregate per-matrix nightly test summaries into a single consolidated report. + +Runs as a post-test job after all matrix CI jobs finish. It: + 1. Lists all JSON summaries uploaded to S3 for today's date + 2. Downloads and merges them + 3. Builds a matrix grid (test_type x matrix_label → status) + 4. Generates a consolidated JSON, HTML report, and Slack payload + 5. Uploads the consolidated report to S3 + +Usage: + python ci/utils/aggregate_nightly.py \\ + --s3-summaries-prefix s3://bucket/ci_test_reports/nightly/summaries/2026-04-13/ \\ + --s3-reports-prefix s3://bucket/ci_test_reports/nightly/reports/2026-04-13/ \\ + --output-dir /tmp/aggregate-output \\ + --date 2026-04-13 \\ + --branch main +""" + +import argparse +import json +import os +import sys +from datetime import datetime, timezone +from pathlib import Path + +# Ensure ci/utils is importable when invoked as a script +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from s3_helpers import s3_download, s3_upload, s3_list # noqa: E402 + + +# --------------------------------------------------------------------------- +# Download and merge summaries +# --------------------------------------------------------------------------- + + +def download_summaries(s3_prefix, local_dir, s3_fallback_prefix=""): + """Download all JSON summaries from S3 prefix into local_dir. 
+ If s3_fallback_prefix is set and no summaries found at s3_prefix, + retries with the fallback (used when RAPIDS_BRANCH in rapidsai + containers doesn't match the branch input). + Returns list of loaded summary dicts.""" + local_dir = Path(local_dir) + local_dir.mkdir(parents=True, exist_ok=True) + + uris = s3_list(s3_prefix) + json_uris = [ + u + for u in uris + if u.endswith(".json") and not u.endswith("/consolidated.json") + ] + + # Fallback: search the parent date prefix if branch-specific path is empty + if ( + not json_uris + and s3_fallback_prefix + and s3_fallback_prefix != s3_prefix + ): + print( + f"No summaries at {s3_prefix}, trying fallback: {s3_fallback_prefix}" + ) + uris = s3_list(s3_fallback_prefix) + json_uris = [ + u + for u in uris + if u.endswith(".json") and not u.endswith("/consolidated.json") + ] + if json_uris: + s3_prefix = s3_fallback_prefix + + print(f"Found {len(json_uris)} summary file(s) at {s3_prefix}") + + summaries = [] + for uri in json_uris: + filename = uri.rsplit("/", 1)[-1] + local_path = str(local_dir / filename) + if s3_download(uri, local_path): + try: + with open(local_path) as f: + summaries.append(json.load(f)) + except (json.JSONDecodeError, OSError) as exc: + print( + f"WARNING: Failed to parse {local_path}: {exc}", + file=sys.stderr, + ) + return summaries + + +def load_local_summaries(local_dir): + """Load summaries from a local directory (for testing without S3).""" + local_dir = Path(local_dir) + summaries = [] + for json_file in sorted(local_dir.glob("*.json")): + try: + with open(json_file) as f: + summaries.append(json.load(f)) + except (json.JSONDecodeError, OSError) as exc: + print( + f"WARNING: Failed to parse {json_file}: {exc}", file=sys.stderr + ) + return summaries + + +# --------------------------------------------------------------------------- +# Aggregation +# --------------------------------------------------------------------------- + + +def aggregate_summaries(summaries): + """Merge per-matrix 
summaries into a consolidated view. + + Returns a dict with: + - matrix_grid: list of {test_type, matrix_label, status, counts, ...} + - totals: aggregate counts + - all_new_failures, all_recurring_failures, all_flaky_tests, + all_resolved_tests: merged lists with matrix context added + """ + grid = [] + totals = { + "total": 0, + "passed": 0, + "failed": 0, + "flaky": 0, + "skipped": 0, + "resolved": 0, + } + all_new_failures = [] + all_recurring_failures = [] + all_flaky_tests = [] + all_resolved_tests = [] + any_new_flaky = False + + for s in summaries: + test_type = s.get("test_type", "unknown") + matrix_label = s.get("matrix_label", "unknown") + counts = s.get("counts", {}) + + # Determine job status + failed = counts.get("failed", 0) + flaky = counts.get("flaky", 0) + has_new = s.get("has_new_failures", False) + if s.get("has_new_flaky", False): + any_new_flaky = True + + if failed > 0: + status = "failed-new" if has_new else "failed-recurring" + elif flaky > 0: + status = "flaky" + elif counts.get("total", 0) == 0: + status = "no-results" + else: + status = "passed" + + grid.append( + { + "test_type": test_type, + "matrix_label": matrix_label, + "status": status, + "counts": counts, + "sha": s.get("sha", ""), + } + ) + + # Accumulate totals + for key in totals: + totals[key] += counts.get(key, 0) + + # Merge failure lists with matrix context + ctx = {"test_type": test_type, "matrix_label": matrix_label} + for entry in s.get("new_failures", []): + all_new_failures.append({**entry, **ctx}) + for entry in s.get("recurring_failures", []): + all_recurring_failures.append({**entry, **ctx}) + for entry in s.get("flaky_tests", []): + all_flaky_tests.append({**entry, **ctx}) + for entry in s.get("resolved_tests", []): + all_resolved_tests.append({**entry, **ctx}) + + # Sort grid for consistent display + grid.sort(key=lambda g: (g["test_type"], g["matrix_label"])) + + return { + "matrix_grid": grid, + "totals": totals, + "all_new_failures": all_new_failures, + 
"all_recurring_failures": all_recurring_failures, + "all_flaky_tests": all_flaky_tests, + "all_resolved_tests": all_resolved_tests, + "has_new_flaky": any_new_flaky, + } + + +# --------------------------------------------------------------------------- +# Consolidated JSON +# --------------------------------------------------------------------------- + + +def parse_workflow_jobs(workflow_jobs_path): + """Parse GitHub Actions workflow job statuses from JSON file. + Returns all jobs (except nightly-summary itself) with name, + conclusion, URL, and whether they are tracked by per-matrix + S3 summaries.""" + if not workflow_jobs_path or not Path(workflow_jobs_path).exists(): + return [] + + # Job name prefixes that are covered by per-matrix S3 reports. + # These jobs also have detailed test results; other jobs only have + # a pass/fail status at the workflow level. + TRACKED_PREFIXES = ( + "conda-cpp-tests", + "conda-python-tests", + "wheel-tests-cuopt-server", + "wheel-tests-cuopt", + ) + + try: + with open(workflow_jobs_path) as f: + data = json.load(f) + jobs_list = data.get("jobs", []) + result = [] + for job in jobs_list: + name = job.get("name", "") + # Skip the nightly-summary job itself + if "nightly-summary" in name.lower(): + continue + # Skip helper jobs (compute-matrix, etc.) 
+ if "compute-matrix" in name.lower(): + continue + tracked = any(name.startswith(p) for p in TRACKED_PREFIXES) + result.append( + { + "name": name, + "conclusion": job.get("conclusion", "unknown"), + "status": job.get("status", "unknown"), + "url": job.get("html_url", ""), + "has_test_details": tracked, + } + ) + return result + except (json.JSONDecodeError, OSError) as exc: + print( + f"WARNING: Failed to parse workflow jobs: {exc}", + file=sys.stderr, + ) + return [] + + +def generate_consolidated_json( + agg, date_str, branch, github_run_url="", workflow_jobs=None +): + """Generate the consolidated JSON for Slack and dashboard.""" + total_jobs = len(agg["matrix_grid"]) + failed_jobs = sum( + 1 for g in agg["matrix_grid"] if g["status"].startswith("failed") + ) + flaky_jobs = sum(1 for g in agg["matrix_grid"] if g["status"] == "flaky") + passed_jobs = sum(1 for g in agg["matrix_grid"] if g["status"] == "passed") + + # Workflow-level CI job statuses + wf_jobs = workflow_jobs or [] + failed_ci_jobs = [j for j in wf_jobs if j["conclusion"] == "failure"] + # Jobs without per-matrix S3 tracking (notebooks, JuMP, etc.) 
+ untracked_failed = [ + j for j in failed_ci_jobs if not j.get("has_test_details", False) + ] + + return { + "timestamp": datetime.now(timezone.utc).isoformat(), + "date": date_str, + "branch": branch, + "github_run_url": github_run_url, + "job_summary": { + "total": total_jobs, + "passed": passed_jobs, + "failed": failed_jobs, + "flaky": flaky_jobs, + }, + "test_totals": agg["totals"], + "has_new_failures": len(agg["all_new_failures"]) > 0, + "has_new_flaky": agg.get("has_new_flaky", False), + "matrix_grid": agg["matrix_grid"], + "new_failures": agg["all_new_failures"], + "recurring_failures": agg["all_recurring_failures"], + "flaky_tests": agg["all_flaky_tests"], + "resolved_tests": agg["all_resolved_tests"], + "workflow_jobs": wf_jobs, + "failed_ci_jobs": failed_ci_jobs, + "untracked_failed_ci_jobs": untracked_failed, + } + + +# --------------------------------------------------------------------------- +# Consolidated HTML +# --------------------------------------------------------------------------- + + +def _html_escape(text): + return ( + str(text) + .replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace('"', """) + ) + + +def _status_badge(status): + """Return an HTML badge for a matrix cell status.""" + colors = { + "passed": ("#388e3c", "PASS"), + "failed-new": ("#d32f2f", "NEW FAIL"), + "failed-recurring": ("#e65100", "RECURRING"), + "flaky": ("#f9a825", "FLAKY"), + "no-results": ("#757575", "NO DATA"), + } + bg, label = colors.get(status, ("#757575", status.upper())) + text_color = "#212121" if status == "flaky" else "#fff" + return ( + f'' + f"{label}" + ) + + +def generate_consolidated_html( + agg, + date_str, + branch, + github_run_url="", + s3_reports_prefix="", +): + """Generate a consolidated HTML dashboard for all matrix combos.""" + total_jobs = len(agg["matrix_grid"]) + failed_jobs = sum( + 1 for g in agg["matrix_grid"] if g["status"].startswith("failed") + ) + + if failed_jobs > 0: + bar_color = "#d32f2f" + bar_text = 
f"{failed_jobs} of {total_jobs} matrix jobs have failures" + elif any(g["status"] == "flaky" for g in agg["matrix_grid"]): + bar_color = "#f9a825" + bar_text = "All jobs passed (flaky tests detected)" + else: + bar_color = "#388e3c" + bar_text = f"All {total_jobs} matrix jobs passed" + + totals = agg["totals"] + + parts = [] + parts.append(f""" + + + + +cuOpt Nightly — {_html_escape(branch)} — {_html_escape(date_str)} + + + +

cuOpt Nightly Tests — {_html_escape(branch)}

+
+ Date: {_html_escape(date_str)}""") + + if github_run_url: + parts.append( + f'  |  ' + f"GitHub Actions Run" + ) + + parts.append(f"""
+
{bar_text}
+
+
{totals["total"]}
Total Tests
+
{totals["passed"]}
Passed
+
{totals["failed"]}
Failed
+
{totals["flaky"]}
Flaky
+
Skipped
+
{totals["resolved"]}
Stabilized
+
""") + + # Helper: build a GitHub source link for test names when suite looks like a file path + def _test_name_html(entry): + """Return HTML for the test name, linked to source if suite looks like a file path.""" + name_escaped = _html_escape(entry["name"]) + suite = entry.get("suite", "") + # Find the sha from the matching grid entry + sha = "unknown" + for g in agg["matrix_grid"]: + if ( + g["test_type"] == entry.get("test_type") + and g["matrix_label"] == entry.get("matrix_label") + and g.get("sha") + ): + sha = g["sha"] + break + if ( + sha != "unknown" + and suite + and ("/" in suite or suite.endswith(".py")) + ): + url = f"https://github.com/NVIDIA/cuopt/blob/{_html_escape(sha)}/{_html_escape(suite)}" + return f'{name_escaped}' + return f"{name_escaped}" + + def _error_summary(message, max_len=200): + """Extract the most useful part of an error message for display. + Prefers the last line (usually the assertion) over the first + (usually the test method signature).""" + if not message: + return "" + lines = [ + ln.strip() for ln in message.strip().splitlines() if ln.strip() + ] + # Use the last non-empty line (typically the assertion/error) + if lines: + summary = lines[-1] + # If the last line is very short, include the previous line too + if len(summary) < 40 and len(lines) > 1: + summary = lines[-2] + " — " + summary + else: + summary = message + if len(summary) > max_len: + summary = summary[:max_len] + "..." + return summary + + # --- New failures --- + if agg["all_new_failures"]: + parts.append("

New Failures

") + parts.append( + "" + "" + ) + for e in agg["all_new_failures"]: + msg = _html_escape(e.get("message", "")) + short = _html_escape(_error_summary(e.get("message", ""))) + parts.append( + f"" + f"" + f"" + f"" + f"' + ) + parts.append("
Test TypeMatrixSuiteTestError
{_html_escape(e['test_type'])}{_html_escape(e['matrix_label'])}{_html_escape(e['suite'])}{_test_name_html(e)}
{short}" + f'
{msg}
") + + # --- Flaky --- + if agg["all_flaky_tests"]: + parts.append("

Flaky Tests

") + parts.append( + "" + "" + ) + for e in agg["all_flaky_tests"]: + msg = _html_escape(e.get("message", "")) + short = _html_escape(_error_summary(e.get("message", ""))) + parts.append( + f"" + f"" + f"" + f"" + f"" + f"' + ) + parts.append("
Test TypeMatrixSuiteTestRetriesError
{_html_escape(e['test_type'])}{_html_escape(e['matrix_label'])}{_html_escape(e['suite'])}{_html_escape(e['name'])}{e.get('retry_count', '?')}
{short}" + f'
{msg}
") + + # --- Recurring failures --- + if agg["all_recurring_failures"]: + parts.append("

Recurring Failures

") + parts.append( + "" + "" + ) + for e in agg["all_recurring_failures"]: + msg = _html_escape(e.get("message", "")) + short = _html_escape(_error_summary(e.get("message", ""))) + parts.append( + f"" + f"" + f"" + f"" + f"" + f"' + ) + parts.append("
Test TypeMatrixSuiteTestSinceError
{_html_escape(e['test_type'])}{_html_escape(e['matrix_label'])}{_html_escape(e['suite'])}{_test_name_html(e)}{_html_escape(e.get('first_seen', '?'))}
{short}" + f'
{msg}
") + + # --- Resolved --- + if agg["all_resolved_tests"]: + parts.append("

Stabilized Tests

") + parts.append( + "" + "" + ) + for e in agg["all_resolved_tests"]: + parts.append( + f"" + f"" + f"" + f"" + f"" + f"" + ) + parts.append("
Test TypeMatrixSuiteTestFailing SinceCount
{_html_escape(e['test_type'])}{_html_escape(e['matrix_label'])}{_html_escape(e['suite'])}{_html_escape(e['name'])}{_html_escape(e.get('first_seen', '?'))}{e.get('failure_count', '?')}
") + + if ( + not agg["all_new_failures"] + and not agg["all_recurring_failures"] + and not agg["all_flaky_tests"] + and not agg["all_resolved_tests"] + ): + parts.append( + '

' + "All tests passed across all matrices!

" + ) + + # --- Matrix grid (at the end) --- + parts.append("

Matrix Overview

") + parts.append( + "" + "" + ) + for g in agg["matrix_grid"]: + counts = g["counts"] + report_link = "" + if s3_reports_prefix: + report_filename = f"{g['test_type']}-{g['matrix_label']}.html" + prefix = s3_reports_prefix.rstrip("/") + "/" + report_link = ( + f'View' + ) + parts.append( + f"" + f"" + f"" + f"" + f"" + f"" + f"" + f"" + ) + parts.append("
Test TypeMatrixStatusPassedFailedFlakyTotalReport
{_html_escape(g['test_type'])}{_html_escape(g['matrix_label'])}{_status_badge(g['status'])}{counts.get('passed', 0)}{counts.get('failed', 0)}{counts.get('flaky', 0)}{counts.get('total', 0)}{report_link}
") + + parts.append("") + return "\n".join(parts) + + +# --------------------------------------------------------------------------- +# Index management +# --------------------------------------------------------------------------- + +MAX_INDEX_DAYS = 90 # Keep at most 90 days in the index + + +def update_index(s3_index_uri, date_str, consolidated, output_dir): + """Download index.json, add today's entry, prune old entries, re-upload.""" + local_index = str(output_dir / "index.json") + + # Download existing index (or start fresh) + index = {"_schema_version": 1, "dates": {}} + if s3_download(s3_index_uri, local_index): + try: + with open(local_index) as f: + loaded = json.load(f) + if "dates" in loaded: + index = loaded + except (json.JSONDecodeError, OSError): + pass + + # Add today's entry keyed by date/branch for multi-branch support + branch = consolidated.get("branch", "main") + entry_key = f"{date_str}/{branch}" + index["dates"][entry_key] = { + "date": date_str, + "branch": branch, + "job_summary": consolidated.get("job_summary", {}), + "test_totals": consolidated.get("test_totals", {}), + "has_new_failures": consolidated.get("has_new_failures", False), + "github_run_url": consolidated.get("github_run_url", ""), + } + + # Prune to last N entries + dates_sorted = sorted(index["dates"].keys(), reverse=True) + if len(dates_sorted) > MAX_INDEX_DAYS: + for old_key in dates_sorted[MAX_INDEX_DAYS:]: + del index["dates"][old_key] + + # Write and upload + with open(local_index, "w") as f: + json.dump(index, f, indent=2, sort_keys=True) + f.write("\n") + print(f"Updated index.json with {len(index['dates'])} date(s)") + + s3_upload(local_index, s3_index_uri) + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main(): + parser = argparse.ArgumentParser( + description="Aggregate per-matrix nightly test summaries" + ) + parser.add_argument( + 
"--s3-summaries-prefix", + default="", + help="S3 prefix for per-matrix JSON summaries (e.g., s3://bucket/.../summaries/2026-04-13/)", + ) + parser.add_argument( + "--s3-summaries-fallback", + default="", + help="Fallback S3 prefix if no summaries found at primary prefix", + ) + parser.add_argument( + "--s3-reports-prefix", + default="", + help="S3 prefix where per-matrix HTML reports live (for linking)", + ) + parser.add_argument( + "--s3-output-uri", + default="", + help="S3 URI to upload the consolidated JSON", + ) + parser.add_argument( + "--s3-html-output-uri", + default="", + help="S3 URI to upload the consolidated HTML report", + ) + parser.add_argument( + "--s3-index-uri", + default="", + help="S3 URI for the index.json that tracks all available dates (read + write)", + ) + parser.add_argument( + "--s3-dashboard-uri", + default="", + help="S3 URI to upload the dashboard HTML (e.g., s3://bucket/.../dashboard/index.html)", + ) + parser.add_argument( + "--dashboard-dir", + default="", + help="Local directory containing dashboard files to upload", + ) + parser.add_argument( + "--local-summaries-dir", + default="", + help="Local directory with JSON summaries (alternative to S3, for testing)", + ) + parser.add_argument( + "--output-dir", + default="aggregate-output", + help="Local directory to write output files", + ) + parser.add_argument( + "--date", + default=datetime.now(timezone.utc).strftime("%Y-%m-%d"), + help="Date for this run (YYYY-MM-DD)", + ) + parser.add_argument("--branch", default="main", help="Branch name") + parser.add_argument( + "--github-run-url", + default="", + help="URL to the GitHub Actions run", + ) + parser.add_argument( + "--workflow-jobs", + default="", + help="Path to JSON file with GitHub Actions workflow job statuses", + ) + + args = parser.parse_args() + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + # ---- Step 1: Collect summaries ---- + if args.local_summaries_dir: + summaries = 
load_local_summaries(args.local_summaries_dir) + elif args.s3_summaries_prefix: + download_dir = output_dir / "downloaded_summaries" + summaries = download_summaries( + args.s3_summaries_prefix, download_dir, args.s3_summaries_fallback + ) + else: + print( + "ERROR: Provide --s3-summaries-prefix or --local-summaries-dir", + file=sys.stderr, + ) + return 1 + + if not summaries: + print( + "WARNING: No summaries found. Generating empty report.", + file=sys.stderr, + ) + + print(f"Loaded {len(summaries)} matrix summary file(s)") + + # ---- Step 2: Aggregate ---- + agg = aggregate_summaries(summaries) + print( + f"Matrix grid: {len(agg['matrix_grid'])} jobs — " + f"{sum(1 for g in agg['matrix_grid'] if g['status'] == 'passed')} passed, " + f"{sum(1 for g in agg['matrix_grid'] if g['status'].startswith('failed'))} failed, " + f"{sum(1 for g in agg['matrix_grid'] if g['status'] == 'flaky')} flaky" + ) + + # ---- Step 2b: Parse workflow job statuses ---- + workflow_jobs = parse_workflow_jobs(args.workflow_jobs) + if workflow_jobs: + failed_wf = [j for j in workflow_jobs if j["conclusion"] == "failure"] + print( + f"Workflow jobs: {len(workflow_jobs)} total, " + f"{len(failed_wf)} failed" + ) + + # ---- Step 3: Generate outputs ---- + consolidated = generate_consolidated_json( + agg, + args.date, + args.branch, + args.github_run_url, + workflow_jobs, + ) + + json_path = output_dir / "consolidated_summary.json" + json_path.write_text(json.dumps(consolidated, indent=2) + "\n") + print(f"Consolidated JSON written to {json_path}") + + html_report = generate_consolidated_html( + agg, + args.date, + args.branch, + args.github_run_url, + args.s3_reports_prefix, + ) + html_path = output_dir / "consolidated_report.html" + html_path.write_text(html_report) + print(f"Consolidated HTML written to {html_path}") + + # ---- Step 4: Upload to S3 ---- + if args.s3_output_uri: + s3_upload(str(json_path), args.s3_output_uri) + if args.s3_html_output_uri: + s3_upload(str(html_path), 
args.s3_html_output_uri) + + # ---- Step 5: Update index.json ---- + if args.s3_index_uri: + update_index( + args.s3_index_uri, + args.date, + consolidated, + output_dir, + ) + + # ---- Step 6: Upload dashboard (self-contained with embedded data) ---- + if args.s3_dashboard_uri and args.dashboard_dir: + dashboard_file = Path(args.dashboard_dir) / "index.html" + if dashboard_file.exists(): + # Read the index.json we just uploaded/created + index_path = output_dir / "index.json" + index_data = {} + if index_path.exists(): + with open(index_path) as f: + index_data = json.load(f) + + # Inject data into dashboard HTML so it works without S3 fetches + dashboard_html = dashboard_file.read_text() + # Escape closing + # when test names or error messages contain HTML-like content + safe_index = json.dumps(index_data).replace("\n" + "// Embedded data — injected by aggregate_nightly.py\n" + f"window.__EMBEDDED_INDEX__ = {safe_index};\n" + f"window.__EMBEDDED_CONSOLIDATED__ = {safe_consolidated};\n" + "\n" + ) + # Insert before + dashboard_html = dashboard_html.replace( + "", inject_script + "" + ) + + embedded_path = output_dir / "dashboard.html" + embedded_path.write_text(dashboard_html) + s3_upload(str(embedded_path), args.s3_dashboard_uri) + print("Dashboard uploaded with embedded data") + else: + print( + f"WARNING: Dashboard not found at {dashboard_file}", + file=sys.stderr, + ) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/ci/utils/crash_helpers.sh b/ci/utils/crash_helpers.sh new file mode 100644 index 0000000000..3f8c37538e --- /dev/null +++ b/ci/utils/crash_helpers.sh @@ -0,0 +1,187 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Shared helpers for crash detection and JUnit XML crash markers. +# Source this from test runner scripts (run_ctests.sh, run_cuopt_pytests.sh, etc.) 
+
+# Convert an abnormal exit code to a human-readable description.
+# Handles GNU coreutils 'timeout' (124) and signal deaths (> 128).
+# Any other code is reported as a plain exit code, because subtracting 128
+# from it would produce a meaningless negative "signal" number.
+signal_name() {
+    local rc="$1"
+    case "${rc}" in
+        124) echo "timeout (killed by 'timeout' command)" ;;
+        *)
+            if [ "${rc}" -le 128 ] 2>/dev/null; then
+                echo "exit code ${rc}"
+                return
+            fi
+            local sig=$((rc - 128))
+            case "${sig}" in
+                6)  echo "SIGABRT" ;;
+                11) echo "SIGSEGV (segfault)" ;;
+                *)  echo "signal ${sig}" ;;
+            esac
+            ;;
+    esac
+}
+
+# Check if an exit code indicates signal death (exit code > 128).
+was_signal_death() {
+    [ "$1" -gt 128 ]
+}
+
+# Escape XML special characters in a string.
+# Replaces &, <, >, and " with their XML entity equivalents.
+# The ampersand rule must run first so the entities produced by the later
+# rules are not escaped a second time. In a sed replacement a bare '&' means
+# "the matched text", hence the backslash in front of every entity.
+xml_escape() {
+    local s="$1"
+    s=$(printf '%s' "$s" | sed -e 's/&/\&amp;/g' \
+                                -e 's/</\&lt;/g' \
+                                -e 's/>/\&gt;/g' \
+                                -e 's/"/\&quot;/g')
+    printf '%s' "$s"
+}
+
+# Write a JUnit XML crash marker to a file.
+# This records a crash as a test failure so nightly_report.py can track it.
+# All four text arguments are XML-escaped before being interpolated.
+#
+# Usage: write_crash_xml <xml_file> <suite_name> <test_name> <message> <detail>
+write_crash_xml() {
+    local xml_file="$1"
+    local suite_name
+    local test_name
+    local message
+    local detail
+    suite_name=$(xml_escape "$2")
+    test_name=$(xml_escape "$3")
+    message=$(xml_escape "$4")
+    detail=$(xml_escape "$5")
+
+    # One suite holding one failed test case: the crash itself.
+    cat > "${xml_file}" <<XMLEOF
+<?xml version="1.0" encoding="UTF-8"?>
+<testsuites>
+  <testsuite name="${suite_name}" tests="1" failures="1" errors="0">
+    <testcase classname="${suite_name}" name="${test_name}" time="0">
+      <failure message="${message}">
+${detail}
+      </failure>
+    </testcase>
+  </testsuite>
+</testsuites>
+XMLEOF
+}
+
+# Synthesize a JUnit XML crash record for a pytest invocation that died
+# from a signal mid-run. Without this marker, nightly_report.py — which
+# classifies tests purely from XML files — sees no failure and reports
+# "All tests passed." even though the runner exited non-zero.
+#
+# Written to <junitxml_path without .xml>-crash.xml so any partial XML pytest
+# may have emitted is preserved alongside it.
+#
+# Usage: write_pytest_crash_marker <junitxml_path> <suite_name> <rc>
+write_pytest_crash_marker() {
+    local junitxml_path="$1"
+    local suite_name="$2"
+    local rc="$3"
+
+    # Without a JUnit path there is nowhere to put the marker.
+    if [ -z "${junitxml_path}" ]; then
+        return
+    fi
+
+    local sig
+    sig=$(signal_name "${rc}")
+    local crash_xml="${junitxml_path%.xml}-crash.xml"
+    write_crash_xml "${crash_xml}" "${suite_name}" "PROCESS_CRASH" \
+        "${suite_name} crashed with ${sig} (exit code ${rc})" \
+        "pytest process terminated by ${sig} mid-run. The JUnit XML was not finalized; the test that triggered the crash is unknown — inspect the run log for the last test invoked."
+}
+
+# Isolate crashing pytest tests by retrying individually.
+# Called after pytest exits with a signal (exit code > 128) on nightly builds.
+#
+# Steps: collect the test ids, drop those the partial JUnit XML already
+# reports as passed, then run each remaining test on its own up to
+# PYTEST_MAX_CRASH_RETRIES times and record consistent crashes as JUnit
+# failures.
+#
+# Requires: RAPIDS_TESTS_DIR, PYTEST_MAX_CRASH_RETRIES, SCRIPT_DIR (for junit_helpers.py)
+# Usage: pytest_crash_isolate <rc> <xml_file>
+pytest_crash_isolate() {
+    local rc="$1"
+    local xml_file="$2"
+
+    echo "INFO: Collecting test list for individual retry..."
+    local test_list
+    # Only the first 500 collected test ids are considered for retry.
+    test_list=$(pytest --collect-only -q tests 2>/dev/null | grep "::" | head -500 || echo "")
+
+    if [ -z "${test_list}" ]; then
+        echo "FAILED: Could not collect test list, cannot isolate crashing test"
+        if [ -n "${xml_file}" ]; then
+            # Write crash marker to a separate file to preserve any partial
+            # results already written to xml_file by the crashed pytest run
+            local crash_marker="${RAPIDS_TESTS_DIR}/crash-marker-collection-failed.xml"
+            write_crash_xml "${crash_marker}" "pytest-crash" "PROCESS_CRASH" \
+                "pytest crashed with $(signal_name "${rc}") (exit code ${rc})" \
+                "pytest process terminated by $(signal_name "${rc}"). Could not collect test list for retry."
+        fi
+        return
+    fi
+
+    # Extract tests that already passed from partial JUnit XML (if any)
+    # NOTE(review): this only filters anything if junit_helpers.py prints ids
+    # in the same form as 'pytest --collect-only -q' — confirm.
+    local passed_tests=""
+    if [ -n "${xml_file}" ] && [ -f "${xml_file}" ]; then
+        passed_tests=$(python3 "${SCRIPT_DIR}/utils/junit_helpers.py" passed "${xml_file}" --sep "::" 2>/dev/null || echo "")
+    fi
+
+    # Only retry tests that didn't already pass
+    if [ -n "${passed_tests}" ]; then
+        local num_passed
+        num_passed=$(echo "${passed_tests}" | wc -l)
+        echo "INFO: ${num_passed} tests already passed before crash, skipping those"
+        test_list=$(comm -23 \
+            <(echo "${test_list}" | sort) \
+            <(echo "${passed_tests}" | sort))
+    fi
+
+    local num_tests
+    # 'grep -c' prints "0" itself when nothing matches AND exits 1, so a
+    # fallback 'echo "0"' would append a second line and break the -eq test.
+    num_tests=$(echo "${test_list}" | grep -c '.' || true)
+    if [ "${num_tests}" -eq 0 ]; then
+        echo "INFO: All tests already passed before crash, nothing to retry"
+        return
+    fi
+    echo "INFO: Retrying ${num_tests} tests individually to isolate crash"
+
+    local crash_tests=()
+    local flaky_crash_tests=()
+
+    while IFS= read -r test_id; do
+        [ -z "${test_id}" ] && continue
+        local safe_name
+        safe_name=$(echo "${test_id}" | tr -c '[:alnum:]._-' '_')
+
+        for attempt in $(seq 1 "${PYTEST_MAX_CRASH_RETRIES}"); do
+            local retry_rc=0
+            local retry_xml="${RAPIDS_TESTS_DIR}/crash-retry${attempt}-${safe_name}.xml"
+            # stdin comes from /dev/null: with -s pytest does not capture
+            # stdin, so a test reading it would otherwise consume the
+            # here-string feeding this loop and silently drop test ids.
+            pytest -s --no-header -x --junitxml="${retry_xml}" "${test_id}" 2>/dev/null </dev/null || retry_rc=$?
+
+            if [ "${retry_rc}" -eq 0 ]; then
+                if [ "${attempt}" -gt 1 ]; then
+                    echo "  FLAKY-CRASH: ${test_id} — crashed then passed on retry ${attempt}"
+                    flaky_crash_tests+=("${test_id}")
+                fi
+                break
+            elif [ "${retry_rc}" -gt 128 ]; then
+                echo "  CRASH: ${test_id} — $(signal_name "${retry_rc}") on attempt ${attempt}"
+                if [ "${attempt}" -eq "${PYTEST_MAX_CRASH_RETRIES}" ]; then
+                    echo "  FAILED: ${test_id} — crashes consistently"
+                    crash_tests+=("${test_id}")
+                    write_crash_xml "${retry_xml}" "pytest-crash" "${test_id}" \
+                        "${test_id} crashed with $(signal_name "${retry_rc}") on ${attempt} attempts" \
+                        "Consistent crash: $(signal_name "${retry_rc}"). This test needs urgent investigation."
+                fi
+            else
+                # Normal test failure, not a crash — already in retry_xml
+                break
+            fi
+        done
+    done <<< "${test_list}"
+
+    echo ""
+    echo "=== CRASH ISOLATION SUMMARY ==="
+    echo "Consistent crashes: ${#crash_tests[@]}"
+    # The ${arr[@]+...} form expands to nothing for an empty array, which
+    # keeps the loop safe under 'set -u' on older bash versions.
+    for t in "${crash_tests[@]+"${crash_tests[@]}"}"; do echo "  :x: ${t}"; done
+    echo "Flaky crashes (passed on retry): ${#flaky_crash_tests[@]}"
+    for t in "${flaky_crash_tests[@]+"${flaky_crash_tests[@]}"}"; do echo "  :warning: ${t}"; done
+    echo "================================"
+}
diff --git a/ci/utils/cuopt_rerun_xml.py b/ci/utils/cuopt_rerun_xml.py
new file mode 100644
index 0000000000..1045e66420
--- /dev/null
+++ b/ci/utils/cuopt_rerun_xml.py
@@ -0,0 +1,110 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Pytest plugin: write rerun failures to a supplementary JUnit XML.
+
+pytest-rerunfailures v14+ only records the final outcome in JUnit XML.
+This plugin collects rerun (failed) attempts and writes them to a
+separate XML file so nightly_report.py can classify flaky tests
+(tests that failed then passed on retry).
+
+The output filename is derived from the --junitxml argument so that
+multiple pytest invocations in the same job (e.g., test_python.sh
+running both cuopt and cuopt-server tests) each get their own file
+instead of overwriting each other.
+
+Usage: pytest -p cuopt_rerun_xml ...
+Requires RAPIDS_TESTS_DIR env var for output location.
+"""
+
+import os
+from collections import defaultdict
+from xml.etree.ElementTree import Element, ElementTree, SubElement
+
+import pytest
+
+# Collect rerun failure reports keyed by nodeid
+_rerun_failures = defaultdict(list)
+_final_outcomes = {}
+_junitxml_path = ""
+
+
+def pytest_configure(config):
+    """Capture the --junitxml path to derive our output filename."""
+    global _junitxml_path  # noqa: PLW0603
+    _junitxml_path = config.option.xmlpath or ""
+
+
+@pytest.hookimpl(trylast=True)
+def pytest_runtest_logreport(report):
+    """Collect reports — track reruns and final outcomes.
+
+    Only the "call" phase is considered; setup/teardown reports are ignored.
+    Each "rerun" report contributes its failure text (first 500 characters of
+    ``longrepr``); any other outcome is stored as the test's final outcome.
+    """
+    if report.when != "call":
+        return
+    node_id = report.nodeid
+    if report.outcome == "rerun":
+        # This is a failed attempt that will be retried
+        msg = ""
+        if report.longrepr:
+            msg = str(report.longrepr)[:500]
+        _rerun_failures[node_id].append(msg)
+    else:
+        _final_outcomes[node_id] = report.outcome
+
+
+def _split_nodeid(node_id):
+    """Split a pytest node id into JUnit-style ``(classname, name)``.
+
+    ``tests/test_a.py::TestC::test_x[p]`` becomes
+    ``("tests.test_a.TestC", "test_x[p]")``. The parameter suffix is set
+    aside before splitting so a ``::`` inside a parameter id is not treated
+    as a separator, and only a trailing ``.py`` is removed from the path.
+    A node id without ``::`` yields an empty classname.
+    """
+    base, bracket, params = node_id.partition("[")
+    pieces = base.split("::")
+    if len(pieces) < 2:
+        return "", node_id
+    module = pieces[0]
+    if module.endswith(".py"):
+        module = module[: -len(".py")]
+    module = module.replace("/", ".")
+    classname = ".".join([module, *pieces[1:-1]])
+    return classname, pieces[-1] + bracket + params
+
+
+def _rerun_xml_filename(junitxml_path):
+    """Return the supplementary file name for a given ``--junitxml`` path."""
+    if not junitxml_path:
+        return "junit-pytest-reruns.xml"
+    stem, _ext = os.path.splitext(os.path.basename(junitxml_path))
+    return f"{stem}-reruns.xml"
+
+
+def pytest_sessionfinish(session, exitstatus):
+    """Write supplementary XML for flaky tests (failed then passed).
+
+    Does nothing when no rerun was recorded or RAPIDS_TESTS_DIR is unset.
+    Every failed attempt of a test whose final outcome is "passed" becomes
+    one ``<testcase>`` with a ``<failure>`` child.
+    """
+    if not _rerun_failures:
+        return
+
+    output_dir = os.environ.get("RAPIDS_TESTS_DIR", "")
+    if not output_dir:
+        return
+
+    testsuites = Element("testsuites")
+    suite = SubElement(testsuites, "testsuite", name="pytest-reruns")
+    count = 0
+
+    for node_id, failure_messages in _rerun_failures.items():
+        if _final_outcomes.get(node_id, "") != "passed":
+            # Test didn't eventually pass — not flaky, just failed
+            continue
+
+        # Flaky: failed on rerun attempts, passed on final
+        classname, name = _split_nodeid(node_id)
+        for msg in failure_messages:
+            tc = SubElement(
+                suite,
+                "testcase",
+                classname=classname,
+                name=name,
+                time="0",
+            )
+            fail = SubElement(tc, "failure", message=msg[:200])
+            fail.text = msg
+            count += 1
+
+    if count > 0:
+        suite.set("tests", str(count))
+        suite.set("failures", str(count))
+        # Derive filename from --junitxml to avoid overwrites when
+        # multiple pytest invocations share the same RAPIDS_TESTS_DIR
+        # (e.g., test_python.sh runs cuopt then server tests).
+        out_path = os.path.join(
+            output_dir, _rerun_xml_filename(_junitxml_path)
+        )
+        ElementTree(testsuites).write(
+            out_path, xml_declaration=True, encoding="unicode"
+        )
+        print(f"\nWrote {count} rerun failure entries to {out_path}")
diff --git a/ci/utils/generate_slack_payloads.py b/ci/utils/generate_slack_payloads.py
new file mode 100644
index 0000000000..c1cb2b491c
--- /dev/null
+++ b/ci/utils/generate_slack_payloads.py
@@ -0,0 +1,398 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Generate Slack Block Kit payloads from a consolidated nightly summary JSON.
+
+Prints one JSON payload per line to stdout:
+  - Line 1: main channel message (thread parent)
+  - Lines 2+: thread replies (per-workflow details, failed job links)
+
+Usage:
+    python3 generate_slack_payloads.py <summary.json> [presigned_report_url] [presigned_dashboard_url]
+"""
+
+import json
+import os
+import sys
+
+# Budget per section block; kept below Slack's 3000-char block limit.
+_SECTION_CHAR_BUDGET = 2900
+
+
+def _esc(text):
+    """Escape Slack mrkdwn special characters in dynamic text.
+
+    The ampersand is replaced first so the entities introduced for ``<``
+    and ``>`` are not escaped a second time.
+    """
+    return (
+        str(text)
+        .replace("&", "&amp;")
+        .replace("<", "&lt;")
+        .replace(">", "&gt;")
+    )
+
+
+def _job_prefix(job):
+    """Extract workflow prefix from a GitHub Actions job name."""
+    name = job.get("name", "unknown")
+    return name.split(" / ")[0] if " / " in name else name
+
+
+def _one_line(text, limit):
+    """Return at most ``limit`` characters of ``text`` on a single line.
+
+    ``None`` (a JSON ``null`` message) is treated as an empty string.
+    """
+    return (text or "")[:limit].replace("\n", " ")
+
+
+def _section(text):
+    """Build one mrkdwn section block."""
+    return {"type": "section", "text": {"type": "mrkdwn", "text": text}}
+
+
+def _chunk_lines(lines, limit=_SECTION_CHAR_BUDGET):
+    """Pack ``lines`` into section blocks of at most ``limit`` characters.
+
+    Blocks are only ever broken between lines, so an entry, an escape
+    sequence or a link is never cut in half. A single line longer than
+    ``limit`` is truncated to ``limit``.
+    """
+    blocks = []
+    current = ""
+    for line in lines:
+        line = line[:limit]
+        if current and len(current) + len(line) + 1 > limit:
+            blocks.append(_section(current.rstrip()))
+            current = ""
+        current += line + "\n"
+    if current.strip():
+        blocks.append(_section(current.rstrip()))
+    return blocks
+
+
+def make_payload(blocks):
+    """Serialize ``blocks`` into one Slack message payload (a JSON string)."""
+    return json.dumps(
+        {
+            "username": "cuOpt Nightly Bot",
+            "icon_emoji": ":robot_face:",
+            "blocks": blocks,
+        }
+    )
+
+
+def main():
+    """Read the consolidated summary and print one payload per line.
+
+    Reads ``sys.argv[1]`` (summary JSON) plus two optional presigned URLs,
+    and the CUOPT_SLACK_MENTION_ID environment variable.
+    """
+    summary_path = sys.argv[1]
+    presigned_report_url = sys.argv[2] if len(sys.argv) > 2 else ""
+    presigned_dashboard_url = sys.argv[3] if len(sys.argv) > 3 else ""
+
+    with open(summary_path) as f:
+        d = json.load(f)
+
+    branch = d.get("branch", "main")
+    date = d.get("date", "unknown")
+    github_run_url = d.get("github_run_url", "")
+    jobs = d.get("job_summary", {})
+    totals = d.get("test_totals", {})
+    grid = d.get("matrix_grid", [])
+    has_new = d.get("has_new_failures", False)
+    has_new_flaky = d.get("has_new_flaky", False)
+    failed_ci_jobs = d.get("failed_ci_jobs", [])
+    untracked_failed = d.get("untracked_failed_ci_jobs", [])
+    workflow_jobs = d.get("workflow_jobs", [])
+
+    # Slack user or user-group to mention on new failures or new flaky tests.
+    # Set CUOPT_SLACK_MENTION_ID to either:
+    #   - a user ID (starts with U or W, e.g. U01ABCDEF) — pings the user
+    #   - a user-group / subteam ID (starts with S, e.g. S01ABCDEF) — pings the group
+    # The group's handle name (e.g. "cuopt-ci-team") will NOT ping; Slack
+    # requires the subteam ID, formatted as <!subteam^ID>. Empty disables.
+    mention_id = os.environ.get("CUOPT_SLACK_MENTION_ID", "")
+    if mention_id.startswith("S"):
+        mention_tag = f"<!subteam^{mention_id}> "
+    elif mention_id:
+        mention_tag = f"<@{mention_id}> "
+    else:
+        mention_tag = ""
+
+    total_jobs = jobs.get("total", 0)
+
+    total_ci_jobs = len(workflow_jobs)
+    passed_ci_count = sum(
+        1 for j in workflow_jobs if j.get("conclusion") == "success"
+    )
+
+    # ==================================================================
+    # MAIN MESSAGE (line 1) -- posted to channel, becomes thread parent
+    # ==================================================================
+    blocks = []
+
+    # Identify which workflows have failures (from both CI jobs and matrix grid)
+    failing_workflows = set()
+    for j in failed_ci_jobs:
+        failing_workflows.add(_job_prefix(j))
+    for g in grid:
+        if str(g.get("status", "")).startswith("failed"):
+            failing_workflows.add(g.get("test_type", "unknown"))
+    flaky_workflows = set()
+    for g in grid:
+        if g.get("status") == "flaky":
+            flaky_workflows.add(g.get("test_type", "unknown"))
+
+    has_failures = len(failing_workflows) > 0
+    untracked_count = len(untracked_failed)
+
+    # Headline: most severe condition wins; the mention is only attached
+    # when something is new or a CI job failed without test details.
+    if has_failures and has_new:
+        emoji = ":rotating_light:"
+        text = f"{len(failing_workflows)} workflow(s) with NEW failures"
+        if has_new_flaky:
+            text += " + NEW flaky tests"
+        mention = mention_tag
+    elif has_failures and untracked_count > 0:
+        emoji = ":rotating_light:"
+        text = (
+            f"Recurring failures in {len(failing_workflows)} workflow(s)"
+            f" + {untracked_count} CI job(s) failed (no test details)"
+        )
+        mention = mention_tag
+    elif has_failures and has_new_flaky:
+        emoji = ":x:"
+        text = f"Recurring failures in {len(failing_workflows)} workflow(s) + NEW flaky tests"
+        mention = mention_tag
+    elif has_failures:
+        emoji = ":x:"
+        text = f"Recurring failures in {len(failing_workflows)} workflow(s)"
+        mention = ""
+    elif flaky_workflows and has_new_flaky:
+        emoji = ":large_yellow_circle:"
+        text = "All jobs passed but NEW flaky tests detected"
+        mention = mention_tag
+    elif flaky_workflows:
+        emoji = ":large_yellow_circle:"
+        text = "All jobs passed but flaky tests detected"
+        mention = ""
+    else:
+        emoji = ":white_check_mark:"
+        text = f"All {total_jobs} matrix jobs passed"
+        if total_ci_jobs > 0:
+            if passed_ci_count == total_ci_jobs:
+                text += f", all {total_ci_jobs} CI jobs succeeded"
+            else:
+                text += (
+                    f", {passed_ci_count}/{total_ci_jobs} CI jobs succeeded"
+                )
+        mention = ""
+
+    stats_parts = []
+    if totals.get("failed", 0) > 0:
+        stats_parts.append(f":x: {totals['failed']} failed")
+    if totals.get("flaky", 0) > 0:
+        stats_parts.append(f":warning: {totals['flaky']} flaky")
+    if not stats_parts:
+        stats_parts.append(
+            f":white_check_mark: {totals.get('total', 0)} tests passed"
+        )
+    stats = " | ".join(stats_parts)
+
+    blocks.append(
+        {
+            "type": "header",
+            "text": {
+                "type": "plain_text",
+                "text": f"cuOpt Nightly Tests \u2014 {branch} \u2014 {date}",
+                "emoji": True,
+            },
+        }
+    )
+    blocks.append(
+        _section(f"{mention}{emoji} *{_esc(text)}*\n\n{_esc(stats)}")
+    )
+
+    # Per-workflow failure summary using CI job counts from GitHub API
+    # Build a lookup: workflow prefix -> (failed, total) from workflow_jobs
+    wf_counts = {}
+    for j in workflow_jobs:
+        prefix = _job_prefix(j)
+        wf_counts.setdefault(prefix, {"failed": 0, "total": 0})
+        wf_counts[prefix]["total"] += 1
+        if j.get("conclusion") == "failure":
+            wf_counts[prefix]["failed"] += 1
+
+    # Build a lookup: workflow prefix -> list of failing matrix_labels from grid
+    wf_failing_labels = {}
+    for g in grid:
+        if str(g.get("status", "")).startswith("failed"):
+            wf_failing_labels.setdefault(
+                g.get("test_type", "unknown"), []
+            ).append(g.get("matrix_label", "unknown"))
+
+    if failing_workflows:
+        lines = []
+        for wf in sorted(failing_workflows):
+            counts = wf_counts.get(wf, {})
+            f_count = counts.get("failed", 0)
+            t_count = counts.get("total", 0)
+            # Append failing matrix labels (up to 3, then "+N more")
+            labels = wf_failing_labels.get(wf, [])
+            label_suffix = ""
+            if labels:
+                shown = labels[:3]
+                label_suffix = " (" + ", ".join(shown)
+                if len(labels) > 3:
+                    label_suffix += f", +{len(labels) - 3} more"
+                label_suffix += ")"
+            if t_count > 0:
+                lines.append(
+                    f":x: *{_esc(wf)}* \u2014 {f_count}/{t_count} failed{_esc(label_suffix)}"
+                )
+            else:
+                lines.append(
+                    f":x: *{_esc(wf)}* \u2014 failed{_esc(label_suffix)}"
+                )
+        blocks.append({"type": "divider"})
+        blocks.extend(_chunk_lines(lines))
+
+    # Links in main message
+    link_parts = []
+    if github_run_url:
+        link_parts.append(f"<{github_run_url}|:github: GitHub Actions>")
+    if presigned_report_url:
+        link_parts.append(f"<{presigned_report_url}|:bar_chart: Full Report>")
+    if presigned_dashboard_url:
+        link_parts.append(
+            f"<{presigned_dashboard_url}|:chart_with_upwards_trend: Dashboard>"
+        )
+    if link_parts:
+        blocks.append({"type": "divider"})
+        blocks.append(
+            {
+                "type": "context",
+                "elements": [
+                    {"type": "mrkdwn", "text": " | ".join(link_parts)}
+                ],
+            }
+        )
+
+    print(make_payload(blocks))
+
+    # ==================================================================
+    # THREAD REPLIES (lines 2+) -- posted as replies to main message
+    # ==================================================================
+
+    # -- Thread 1: Failing and flaky tests (grouped by workflow) -------
+    # Collect all test issues by test_type (workflow)
+    issues_by_wf = {}
+    for category, key in (
+        ("new", "new_failures"),
+        ("recurring", "recurring_failures"),
+        ("flaky", "flaky_tests"),
+        ("resolved", "resolved_tests"),
+    ):
+        for entry in d.get(key, []):
+            bucket = issues_by_wf.setdefault(
+                entry.get("test_type", "unknown"),
+                {"new": [], "recurring": [], "flaky": [], "resolved": []},
+            )
+            bucket[category].append(entry)
+
+    for wf_name, issues in sorted(issues_by_wf.items()):
+        wf_lines = [f"*{_esc(wf_name)}*"]
+
+        # New failures first (most urgent, show more error context)
+        for f_entry in issues["new"][:10]:
+            msg = _esc(_one_line(f_entry.get("message"), 150))
+            matrix = _esc(f_entry.get("matrix_label", ""))
+            name = _esc(f_entry.get("name", "unknown"))
+            wf_lines.append(f":new: `{name}` ({matrix}) \u2014 {msg}")
+
+        # Flaky (actionable -- tests that are unstable)
+        for f_entry in issues["flaky"][:10]:
+            matrix = _esc(f_entry.get("matrix_label", ""))
+            err = _esc(_one_line(f_entry.get("message"), 100))
+            suffix = f" \u2014 {err}" if err else ""
+            tag = ":new: :warning:" if f_entry.get("is_new") else ":warning:"
+            name = _esc(f_entry.get("name", "unknown"))
+            wf_lines.append(f"{tag} `{name}` ({matrix}){suffix}")
+
+        # Recurring failures (known issues)
+        for f_entry in issues["recurring"][:10]:
+            matrix = _esc(f_entry.get("matrix_label", ""))
+            first = _esc(f_entry.get("first_seen", "?"))
+            name = _esc(f_entry.get("name", "unknown"))
+            wf_lines.append(
+                f":repeat: `{name}` ({matrix}) \u2014 since {first}"
+            )
+
+        # Resolved
+        for r in issues["resolved"][:5]:
+            matrix = _esc(r.get("matrix_label", ""))
+            count = r.get("failure_count", "?")
+            name = _esc(r.get("name", "unknown"))
+            wf_lines.append(
+                f":white_check_mark: `{name}` ({matrix}) \u2014 was failing {count}x"
+            )
+
+        # Truncation notes
+        for category, label, limit in [
+            ("new", "new failures", 10),
+            ("recurring", "recurring", 10),
+            ("flaky", "flaky", 10),
+            ("resolved", "resolved", 5),
+        ]:
+            if len(issues[category]) > limit:
+                wf_lines.append(
+                    f"_...+{len(issues[category]) - limit} more {label}_"
+                )
+
+        # Split between lines (never inside one) to respect the block limit
+        print(make_payload(_chunk_lines(wf_lines)))
+
+    # -- Thread: Failed job log links ----------------------------------
+    failed_job_links = [
+        j
+        for j in workflow_jobs
+        if j.get("conclusion") == "failure" and j.get("url")
+    ]
+    if failed_job_links:
+        link_lines = ["*Failed Job Logs:*"]
+        for j in failed_job_links:
+            url = j.get("url", "")
+            name = _esc(j.get("name", "unknown"))
+            link_lines.append(f":x: <{url}|{name}>")
+        print(make_payload(_chunk_lines(link_lines)))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ci/utils/generate_step_summary.py b/ci/utils/generate_step_summary.py
new file mode 100644
index 0000000000..dd5c853c67
--- /dev/null
+++ b/ci/utils/generate_step_summary.py
@@ -0,0 +1,122 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Generate a GitHub Step Summary (Markdown) from a consolidated nightly summary JSON.
+
+Prints Markdown to stdout suitable for appending to $GITHUB_STEP_SUMMARY.
+
+Usage:
+    python3 generate_step_summary.py <summary.json>
+"""
+
+import json
+import sys
+
+
+def _cell(value):
+    """Render ``value`` so it cannot break out of a Markdown table cell.
+
+    Newlines become spaces and pipes are backslash-escaped; pipes must be
+    escaped even inside a code span when the span sits in a table row.
+    """
+    return str(value).replace("\n", " ").replace("|", "\\|")
+
+
+def main():
+    """Print the step summary for the JSON file named by ``sys.argv[1]``.
+
+    Sections with no entries are omitted; the totals table and the matrix
+    overview are always printed. Lists are capped (20 rows per table, 10
+    stabilized tests).
+    """
+    with open(sys.argv[1]) as f:
+        d = json.load(f)
+
+    totals = d.get("test_totals", {})
+    grid = d.get("matrix_grid", [])
+    new_f = d.get("new_failures", [])
+    recur = d.get("recurring_failures", [])
+    flaky = d.get("flaky_tests", [])
+    resolved = d.get("resolved_tests", [])
+
+    print(
+        "# Nightly Test Summary \u2014 %s \u2014 %s"
+        % (d.get("branch", ""), d.get("date", ""))
+    )
+    print()
+    print("| Metric | Count |")
+    print("|--------|-------|")
+    print("| Total | %d |" % totals.get("total", 0))
+    print("| Passed | %d |" % totals.get("passed", 0))
+    print("| **Failed** | **%d** |" % totals.get("failed", 0))
+    print("| Flaky | %d |" % totals.get("flaky", 0))
+    print("| Skipped | %d |" % totals.get("skipped", 0))
+    print("| Stabilized | %d |" % totals.get("resolved", 0))
+    print()
+    if new_f:
+        print("## New Failures")
+        print("| Test Type | Matrix | Test | Error |")
+        print("|-----------|--------|------|-------|")
+        for e in new_f[:20]:
+            # A JSON null message is treated like a missing one.
+            msg = _cell((e.get("message") or "")[:80])
+            print(
+                "| %s | %s | `%s` | %s |"
+                % (
+                    _cell(e.get("test_type", "")),
+                    _cell(e.get("matrix_label", "")),
+                    _cell(e.get("name", "unknown")),
+                    msg,
+                )
+            )
+        print()
+    if flaky:
+        print("## Flaky Tests")
+        print("| Test Type | Matrix | Test | Retries |")
+        print("|-----------|--------|------|---------|")
+        for e in flaky[:20]:
+            print(
+                "| %s | %s | `%s` | %s |"
+                % (
+                    _cell(e.get("test_type", "")),
+                    _cell(e.get("matrix_label", "")),
+                    _cell(e.get("name", "unknown")),
+                    _cell(e.get("retry_count", "?")),
+                )
+            )
+        print()
+    if recur:
+        print("## Recurring Failures")
+        print("| Test Type | Matrix | Test | Since |")
+        print("|-----------|--------|------|-------|")
+        for e in recur[:20]:
+            print(
+                "| %s | %s | `%s` | %s |"
+                % (
+                    _cell(e.get("test_type", "")),
+                    _cell(e.get("matrix_label", "")),
+                    _cell(e.get("name", "unknown")),
+                    _cell(e.get("first_seen", "?")),
+                )
+            )
+        print()
+    if resolved:
+        print("## Stabilized Tests")
+        # Bullet list, not a table, so no cell escaping is needed here.
+        for e in resolved[:10]:
+            print(
+                "- `%s` (%s) \u2014 was failing %sx"
+                % (
+                    e.get("name", "unknown"),
+                    e.get("matrix_label", ""),
+                    e.get("failure_count", "?"),
+                )
+            )
+        print()
+    print("## Matrix Overview")
+    print("| Test Type | Matrix | Status | Passed | Failed | Flaky |")
+    print("|-----------|--------|--------|--------|--------|-------|")
+    for g in grid:
+        c = g.get("counts", {})
+        print(
+            "| %s | %s | %s | %d | %d | %d |"
+            % (
+                _cell(g.get("test_type", "")),
+                _cell(g.get("matrix_label", "")),
+                _cell(g.get("status", "")),
+                c.get("passed", 0),
+                c.get("failed", 0),
+                c.get("flaky", 0),
+            )
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ci/utils/install_boost_tbb.sh b/ci/utils/install_boost_tbb.sh
index 4cd0ca6f0b..844c09ea04 100644
--- a/ci/utils/install_boost_tbb.sh
+++ b/ci/utils/install_boost_tbb.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
 set -euo pipefail
@@ -18,7 +18,7 @@ if [ -f /etc/os-release ]; then
     elif [[ "$ID" == "ubuntu" ]]; then
         echo "Detected Ubuntu. Installing Boost and TBB via apt..."
         apt-get update
-        apt-get install -y libboost-dev libtbb-dev
+        apt-get install -y libboost-iostreams-dev libboost-serialization-dev libtbb-dev
     else
         echo "Unknown OS: $ID. Please install Boost development libraries manually."
         exit 1
diff --git a/ci/utils/junit_helpers.py b/ci/utils/junit_helpers.py
new file mode 100644
index 0000000000..39a7a3d1e3
--- /dev/null
+++ b/ci/utils/junit_helpers.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 + +""" +JUnit XML helpers for CI test runner scripts. + +Extracts test names from JUnit XML files for crash isolation and retry logic. +Called from shell scripts via: python3 ci/utils/junit_helpers.py + +Commands: + failed [--sep SEP] Print failed/errored test names + passed [--sep SEP] Print passed test names (excludes skipped) + gtest-list Parse gtest --gtest_list_tests from stdin +""" + +import sys +from xml.etree import ElementTree + + +def extract_tests(xml_path, status="failed", sep=".", include_skipped=False): + """Extract test names from a JUnit XML file. + + Args: + xml_path: Path to JUnit XML file. + status: "failed" to extract failures/errors, "passed" for passes. + sep: Separator between classname and name ("." for gtest, "::" for pytest). + include_skipped: If False, skipped tests are excluded from "passed" results. + """ + try: + tree = ElementTree.parse(xml_path) + except (ElementTree.ParseError, FileNotFoundError, OSError): + return + + for tc in tree.iter("testcase"): + cls = tc.get("classname", "") + name = tc.get("name", "") + if not cls or not name: + continue + + has_failure = tc.find("failure") is not None + has_error = tc.find("error") is not None + has_skipped = tc.find("skipped") is not None + + if status == "failed" and (has_failure or has_error): + print(f"{cls}{sep}{name}") + elif status == "passed": + if not has_failure and not has_error: + if include_skipped or not has_skipped: + print(f"{cls}{sep}{name}") + + +def parse_gtest_list(): + """Parse gtest --gtest_list_tests output from stdin into Suite.TestName.""" + suite = "" + for line in sys.stdin: + line = line.rstrip() + if not line or line.startswith("#"): + continue + if not line.startswith(" "): + suite = line.rstrip(".") + else: + print(f"{suite}.{line.strip().split()[0]}") + + +def main(): + if len(sys.argv) < 2: + print(f"Usage: {sys.argv[0]} [args]", file=sys.stderr) + sys.exit(1) + + cmd = sys.argv[1] + + if cmd in ("failed", "passed"): 
+ if len(sys.argv) < 3: + print( + f"Usage: {sys.argv[0]} {cmd} [--sep SEP]", + file=sys.stderr, + ) + sys.exit(1) + xml_path = sys.argv[2] + sep = "." + for i, arg in enumerate(sys.argv[3:], 3): + if arg == "--sep" and i + 1 < len(sys.argv): + sep = sys.argv[i + 1] + extract_tests(xml_path, status=cmd, sep=sep) + + elif cmd == "gtest-list": + parse_gtest_list() + + else: + print(f"Unknown command: {cmd}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/ci/utils/nightly_report.py b/ci/utils/nightly_report.py new file mode 100755 index 0000000000..6742458582 --- /dev/null +++ b/ci/utils/nightly_report.py @@ -0,0 +1,1111 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Nightly test report generator for cuOpt CI. + +Parses JUnit XML test results, classifies failures as flaky vs genuine, +maintains a failure history database on S3, and outputs: + - HTML report (detailed, uploaded to S3 and linked from Slack) + - Markdown summary (for $GITHUB_STEP_SUMMARY or terminal) + - JSON summary (for downstream consumers like Slack notifier and dashboard) + +Each CI matrix job (CUDA version x Python version x architecture) runs this +script independently. The --test-type and --matrix-label flags identify the +job so that history and summaries are stored per-matrix-combo. + +History lifecycle: + 1. Download history from S3 (falls back to empty if not found) + 2. Classify this run's results + 3. Update history: mark new failures, bump recurring counts, resolve stabilized tests + 4. Upload updated history back to S3 + 5. Generate reports (HTML, Markdown, JSON, GitHub Step Summary) + 6. 
Upload per-run JSON snapshot to S3 summaries dir (for aggregation) + +Usage: + python ci/utils/nightly_report.py \\ + --results-dir test-results/ \\ + --output-dir report-output/ \\ + --sha abc123 \\ + --test-type python \\ + --matrix-label cuda12.9-py3.12-x86_64 \\ + --s3-history-uri s3://bucket/ci_test_reports/nightly/history/python-main-cuda12.9-py3.12-x86_64.json \\ + --s3-summary-uri s3://bucket/ci_test_reports/nightly/summaries/2026-04-13/python-cuda12.9-py3.12-x86_64.json +""" + +import argparse +import json +import os +import sys +from collections import defaultdict +from datetime import datetime, timezone +from pathlib import Path +from xml.etree import ElementTree + +# Ensure ci/utils is importable when invoked as a script +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from s3_helpers import s3_download, s3_upload # noqa: E402 + +EMPTY_HISTORY = {"_schema_version": 2, "tests": {}} + +# A test that resolves then fails again within this window is considered +# "bouncing" (intermittently flaky) rather than a new failure. +BOUNCE_WINDOW_DAYS = int(os.environ.get("CUOPT_BOUNCE_WINDOW_DAYS", 14)) + +# Number of failure/resolve cycles that classify a test as cross-run flaky. 
BOUNCE_THRESHOLD = int(os.environ.get("CUOPT_BOUNCE_THRESHOLD", 2))


# ---------------------------------------------------------------------------
# JUnit XML parsing
# ---------------------------------------------------------------------------


def parse_junit_xml(xml_path):
    """Parse a JUnit XML file and return a list of test result dicts.

    Args:
        xml_path: Path (str or ``pathlib.Path``) of the JUnit XML file.

    Returns:
        A list with one dict per ``<testcase>``, holding the keys ``suite``,
        ``classname``, ``name``, ``status`` ("passed", "failed", "error" or
        "skipped"), ``time`` (the raw attribute string), ``message`` and
        ``source_file``. The list is empty when the file cannot be read or
        parsed, or when its root is neither ``<testsuites>`` nor
        ``<testsuite>``; read/parse problems are reported on stderr.
    """
    results = []
    try:
        tree = ElementTree.parse(xml_path)
    except (ElementTree.ParseError, OSError) as e:
        # OSError covers missing/unreadable paths and directories whose name
        # happens to end in ".xml"; one bad input must not abort the report.
        print(f"WARNING: Failed to parse {xml_path}: {e}", file=sys.stderr)
        return results

    root = tree.getroot()
    if root.tag not in ("testsuites", "testsuite"):
        return results

    # iter() yields the root itself (when it is a <testsuite>) and every
    # nested <testsuite>, in document order, so nested suites are included.
    for suite in root.iter("testsuite"):
        suite_name = suite.get("name", os.path.basename(xml_path))
        # Only direct children: test cases of nested suites are picked up
        # when the outer loop reaches that nested suite.
        for testcase in suite.findall("testcase"):
            skipped = testcase.find("skipped")
            failure = testcase.find("failure")
            error = testcase.find("error")

            # Precedence mirrors the report semantics: skipped wins over
            # failure, failure wins over error.
            if skipped is not None:
                status = "skipped"
                message = skipped.get("message", "")
            elif failure is not None or error is not None:
                detail = failure if failure is not None else error
                status = "failed" if failure is not None else "error"
                message = detail.get("message", "")
                # Prefer the element body (usually the traceback), but keep
                # the attribute when the body is only whitespace.
                body = (detail.text or "").strip()
                if body:
                    message = body
            else:
                status = "passed"
                message = ""

            results.append(
                {
                    "suite": suite_name,
                    "classname": testcase.get("classname", ""),
                    "name": testcase.get("name", "unknown"),
                    "status": status,
                    "time": testcase.get("time", "0"),
                    "message": message,
                    "source_file": str(xml_path),
                }
            )

    return results


def collect_all_results(results_dir):
    """Collect test results from all JUnit XML files in a directory.

    The directory is searched recursively for ``*.xml`` entries, which are
    parsed in sorted path order. Entries that are not regular files (for
    example a directory named ``foo.xml``) are skipped.
    """
    results_dir = Path(results_dir)
    all_results = []
    for xml_file in sorted(results_dir.rglob("*.xml")):
        if not xml_file.is_file():
            continue
        all_results.extend(parse_junit_xml(xml_file))
    return all_results


# 
# ---------------------------------------------------------------------------
# Classification
# ---------------------------------------------------------------------------


def classify_failures(results):
    """
    Classify test results into passed, failed, flaky, skipped, and error.

    pytest-rerunfailures records reruns as additional entries.
    A test that failed then passed on rerun is flaky.

    Args:
        results: List of result dicts as produced by ``parse_junit_xml``.

    Returns:
        Dict with the keys "passed", "failed", "flaky", "skipped" and
        "error", each mapping to a list holding one entry per distinct test.
        Flaky entries are copies carrying ``status="flaky"``, a
        ``retry_count`` (number of failed/error attempts) and the message of
        the last failed attempt.
    """
    test_groups = defaultdict(list)
    for r in results:
        # Group by classname+name (not suite) so rerun entries from
        # supplementary XML files match the main XML entries
        test_groups[f"{r['classname']}::{r['name']}"].append(r)

    classified = {
        "passed": [],
        "failed": [],
        "flaky": [],
        "skipped": [],
        "error": [],
    }

    for entries in test_groups.values():
        statuses = [e["status"] for e in entries]
        bad_attempts = [
            e for e in entries if e["status"] in ("failed", "error")
        ]

        if all(s == "skipped" for s in statuses):
            classified["skipped"].append(entries[0])
        elif "passed" in statuses:
            if bad_attempts:
                # At least one attempt failed and one passed: flaky.
                entry = entries[-1].copy()
                entry["status"] = "flaky"
                entry["retry_count"] = len(bad_attempts)
                # Capture the error message from the failed attempt
                # (entries[-1] is the passing entry with no message)
                entry["message"] = bad_attempts[-1].get("message", "")
                classified["flaky"].append(entry)
            else:
                classified["passed"].append(entries[-1])
        elif "error" in statuses:
            classified["error"].append(entries[-1])
        else:
            classified["failed"].append(entries[-1])

    return classified


# ---------------------------------------------------------------------------
# History management
# ---------------------------------------------------------------------------


def _history_key(entry):
    """Return the ``suite::classname::name`` key that indexes history records."""
    return f"{entry['suite']}::{entry['classname']}::{entry['name']}"


def _new_history_record(entry, sha, date_str, is_flaky):
    """Build the history record for a test seen failing/flaky for the first time."""
    return {
        "suite": entry["suite"],
        "classname": entry["classname"],
        "name": entry["name"],
        "first_seen_date": date_str,
        "first_seen_sha": sha,
        "last_seen_date": date_str,
        "last_seen_sha": sha,
        "failure_count": 1,
        "is_flaky": is_flaky,
        "bounce_count": 0,
        "status": "active",
    }


def load_history(history_path):
    """Load failure history from a local JSON file.

    Returns the parsed document when it is a JSON object containing a
    "tests" key. If the file is missing, is not valid JSON, or holds
    anything else, a fresh empty history is returned instead.
    """
    data = None
    try:
        with open(history_path) as f:
            data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        pass
    # The isinstance check keeps non-container JSON (e.g. ``5`` or ``null``)
    # from raising TypeError on the membership test.
    if isinstance(data, dict) and "tests" in data:
        return data
    # dict() alone would be a shallow copy that shares the nested "tests"
    # dict with the EMPTY_HISTORY constant, which update_history() mutates.
    fresh = dict(EMPTY_HISTORY)
    fresh["tests"] = {}
    return fresh


def _days_between(date_a, date_b):
    """Return absolute number of days between two YYYY-MM-DD strings.

    Returns 999 when either value cannot be parsed as a date.
    """
    try:
        a = datetime.strptime(date_a, "%Y-%m-%d")
        b = datetime.strptime(date_b, "%Y-%m-%d")
        return abs((a - b).days)
    except (ValueError, TypeError):
        return 999


def _is_recent_resolve(rec, date_str):
    """Check if a test was resolved recently (within bounce window).

    A record without a ``resolved_date`` is never considered recent.
    """
    resolved_date = rec.get("resolved_date", "")
    if not resolved_date:
        return False
    return _days_between(resolved_date, date_str) <= BOUNCE_WINDOW_DAYS


def update_history(history, classified, sha, date_str):
    """
    Update failure history with this run's results.

    Returns (history, new_failures, recurring_failures, resolved_tests,
    new_flaky_tests). ``history`` is modified in place.

    Classification logic:
    - "new failure": never seen before (no history entry at all)
    - "recurring": was already active (failing on previous runs)
    - "bouncing": was resolved recently but failed again — reactivated
      as recurring (not new), and marked cross-run flaky once its bounce
      count reaches BOUNCE_THRESHOLD
    - "resolved": was active, now passes — notified once, then silent
      on subsequent passes
    """
    tests = history.setdefault("tests", {})
    new_failures = []
    recurring_failures = []
    resolved_tests = []
    new_flaky_tests = []

    # --- Genuine failures ---
    for entry in classified["failed"] + classified["error"]:
        test_key = _history_key(entry)
        rec = tests.get(test_key)

        if rec is None:
            # Truly new — never seen before
            tests[test_key] = _new_history_record(
                entry, sha, date_str, is_flaky=False
            )
            new_failures.append(entry)
            continue

        rec["last_seen_date"] = date_str
        rec["last_seen_sha"] = sha
        rec["failure_count"] += 1

        if rec["status"] == "active":
            # Still failing — count already bumped above
            recurring_failures.append(
                {**entry, "first_seen": rec["first_seen_date"]}
            )
            continue

        # The record was not active: decide between a bounce and a new
        # cycle before the status is overwritten.
        bouncing = rec["status"] == "resolved" and _is_recent_resolve(
            rec, date_str
        )
        rec["status"] = "active"
        rec["bounce_count"] = rec.get("bounce_count", 0) + 1
        if bouncing:
            # Bouncing: resolved recently but failed again.
            # Reactivate as recurring, not new. Track the bounce.
            if rec["bounce_count"] >= BOUNCE_THRESHOLD:
                rec["is_flaky"] = True
            recurring_failures.append(
                {
                    **entry,
                    "first_seen": rec["first_seen_date"],
                    "is_bouncing": True,
                }
            )
        else:
            # Resolved long ago — treat as new cycle but keep history
            new_failures.append(entry)

    # --- Flaky tests (passed on retry within this run) ---
    for entry in classified["flaky"]:
        test_key = _history_key(entry)
        rec = tests.get(test_key)
        if rec is None:
            tests[test_key] = _new_history_record(
                entry, sha, date_str, is_flaky=True
            )
            new_flaky_tests.append(entry)
            continue
        rec["last_seen_date"] = date_str
        rec["last_seen_sha"] = sha
        rec["failure_count"] += 1
        rec["is_flaky"] = True
        # If it was resolved, reactivate — it's still unstable
        if rec["status"] == "resolved":
            rec["status"] = "active"
            rec["bounce_count"] = rec.get("bounce_count", 0) + 1

    # --- Resolve stabilized tests ---
    # Walk the passing entries in their original order (not through an
    # unordered set) so the stabilized list is reproducible between runs.
    # A repeated key is harmless: after the first hit the record is no
    # longer "active".
    for entry in classified["passed"]:
        rec = tests.get(_history_key(entry))
        if rec is None or rec["status"] != "active":
            # Never failed, or already "resolved" and passes again — no
            # notification. It was sent once when the test first stabilized.
            continue
        rec["status"] = "resolved"
        rec["resolved_date"] = date_str
        rec["resolved_sha"] = sha
        resolved_tests.append(
            {
                "suite": rec["suite"],
                "classname": rec["classname"],
                "name": rec["name"],
                "first_seen": rec["first_seen_date"],
                "failure_count": rec["failure_count"],
                "bounce_count": rec.get("bounce_count", 0),
                "was_flaky": rec.get("is_flaky", False),
            }
        )

    return (
        history,
        new_failures,
        recurring_failures,
        resolved_tests,
        new_flaky_tests,
    )


def save_history(history, history_path):
    """Write history to a local JSON file (indented, keys sorted)."""
    with open(history_path, "w") as f:
        json.dump(history, f, indent=2, sort_keys=True)
        f.write("\n")


# ---------------------------------------------------------------------------
# Report generation
# ---------------------------------------------------------------------------


def _short_md_message(entry, limit):
    """Return the entry's message cut to ``limit`` chars and safe for a table cell."""
    return (
        entry.get("message", "")[:limit]
        .replace("\n", " ")
        .replace("|", "\\|")
    )


def generate_markdown_report(
    classified,
    new_failures,
    recurring_failures,
    resolved_tests,
    history,
    test_type="",
    matrix_label="",
    sha="",
    date_str="",
):
    """Generate a Markdown summary report.

    Args:
        classified: Output of ``classify_failures``.
        new_failures, recurring_failures, resolved_tests: Lists returned by
            ``update_history``.
        history: History dict; used to look up per-test failure counts.
        test_type, matrix_label, sha, date_str: Optional run metadata shown
            in the title and meta line.

    Returns:
        The report as a single string with lines joined by newlines.
    """
    lines = []
    title = "# Nightly Test Report"
    if test_type:
        title += f" — {test_type}"
    if matrix_label:
        title += f" [{matrix_label}]"
    lines.append(title)
    lines.append("")
    if date_str or sha:
        meta_parts = []
        if date_str:
            meta_parts.append(f"**Date:** {date_str}")
        if sha:
            meta_parts.append(f"**Commit:** `{sha[:12]}`")
        if matrix_label:
            meta_parts.append(f"**Matrix:** {matrix_label}")
        lines.append(" | ".join(meta_parts))
        lines.append("")

    total_passed = len(classified["passed"])
    total_failed = len(classified["failed"]) + len(classified["error"])
    total_flaky = len(classified["flaky"])
    total_skipped = len(classified["skipped"])
    total = total_passed + total_failed + total_flaky + total_skipped

    lines.append("## Summary")
    lines.append("")
    lines.append("| Metric | Count |")
    lines.append("|--------|-------|")
    lines.append(f"| Total tests | {total} |")
    lines.append(f"| Passed | {total_passed} |")
    lines.append(f"| **Genuine failures** | **{total_failed}** |")
    lines.append(f"| Flaky (passed on retry) | {total_flaky} |")
    lines.append(f"| Skipped | {total_skipped} |")
    if resolved_tests:
        lines.append(
            f"| **Stabilized (were failing, now pass)** | **{len(resolved_tests)}** |"
        )
    lines.append("")

    # -- New genuine failures (highest priority) --
    if new_failures:
        lines.append("## NEW Failures (not previously seen)")
        lines.append("")
        lines.append("| Suite | Test | Error |")
        lines.append("|-------|------|-------|")
        for entry in new_failures:
            short_msg = _short_md_message(entry, 80)
            lines.append(
                f"| {entry['suite']} | `{entry['name']}` | {short_msg} |"
            )
        lines.append("")

    # -- Recurring failures --
    if recurring_failures:
        lines.append("## Recurring Failures")
        lines.append("")
        lines.append("| Suite | Test | First seen | Failure count | Error |")
        lines.append("|-------|------|------------|---------------|-------|")
        known_tests = history.get("tests", {})
        for entry in recurring_failures:
            short_msg = _short_md_message(entry, 60)
            first_seen = entry.get("first_seen", "unknown")
            count = known_tests.get(_history_key(entry), {}).get(
                "failure_count", "?"
            )
            lines.append(
                f"| {entry['suite']} | `{entry['name']}` | {first_seen} | {count} | {short_msg} |"
            )
        lines.append("")

    # -- Stabilized tests --
    if resolved_tests:
        lines.append("## Stabilized Tests (were failing, now passing)")
        lines.append("")
        lines.append(
            "| Suite | Test | Was failing since | Total failure count | Was flaky? |"
        )
        lines.append(
            "|-------|------|-------------------|---------------------|------------|"
        )
        for entry in resolved_tests:
            flaky_badge = "Yes" if entry.get("was_flaky") else "No"
            lines.append(
                f"| {entry['suite']} | `{entry['name']}` | {entry['first_seen']} "
                f"| {entry['failure_count']} | {flaky_badge} |"
            )
        lines.append("")

    # -- Flaky tests --
    if classified["flaky"]:
        lines.append("## Flaky Tests (passed on retry)")
        lines.append("")
        lines.append("| Suite | Test | Retries needed | Error |")
        lines.append("|-------|------|----------------|-------|")
        for entry in classified["flaky"]:
            retry_count = entry.get("retry_count", "?")
            short_msg = _short_md_message(entry, 80)
            lines.append(
                f"| {entry['suite']} | `{entry['name']}` | {retry_count} | {short_msg} |"
            )
        lines.append("")

    # -- Detailed errors --
    all_failures = classified["failed"] + classified["error"]
    if all_failures:
        lines.append("## All Failure Details")
        lines.append("")
        for entry in all_failures:
            lines.append(f"### `{entry['classname']}::{entry['name']}`")
            lines.append(f"- **Suite**: {entry['suite']}")
            lines.append(f"- **Source**: {entry['source_file']}")
            msg = entry.get("message", "").strip()
            if msg:
                lines.append("- **Error**:")
                lines.append("```")
                # Only the first 20 lines of the message are shown.
                lines.extend(msg.split("\n")[:20])
                lines.append("```")
            lines.append("")

    if not all_failures and not classified["flaky"] and not resolved_tests:
        lines.append("All tests passed! No failures or flaky tests detected.")
        lines.append("")

    return "\n".join(lines)


def generate_json_summary(
    classified,
    new_failures,
    recurring_failures,
    resolved_tests,
    new_flaky_tests=None,
    test_type="",
    matrix_label="",
    sha="",
    date_str="",
):
    """Generate a JSON summary for downstream tools (Slack notifier, dashboard).

    Returns a JSON-serializable dict with run metadata, a UTC timestamp,
    aggregate counts, and per-test lists for new failures, recurring
    failures, flaky tests (each flagged ``is_new`` when it appears in
    ``new_flaky_tests``) and resolved tests.
    """
    if new_flaky_tests is None:
        new_flaky_tests = []
    # Matched on classname::name, the same grouping classify_failures() uses.
    new_flaky_keys = {
        f"{e['classname']}::{e['name']}" for e in new_flaky_tests
    }
    return {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "test_type": test_type,
        "matrix_label": matrix_label,
        "sha": sha,
        "date": date_str,
        "counts": {
            "total": sum(len(v) for v in classified.values()),
            "passed": len(classified["passed"]),
            "failed": len(classified["failed"]) + len(classified["error"]),
            "flaky": len(classified["flaky"]),
            "skipped": len(classified["skipped"]),
            "resolved": len(resolved_tests),
        },
        "has_new_failures": len(new_failures) > 0,
        "has_new_flaky": len(new_flaky_tests) > 0,
        "new_failures": [
            {
                "suite": e["suite"],
                "name": e["name"],
                "classname": e["classname"],
                "message": e.get("message", ""),
            }
            for e in new_failures
        ],
        "recurring_failures": [
            {
                "suite": e["suite"],
                "name": e["name"],
                "classname": e["classname"],
                "first_seen": e.get("first_seen", "unknown"),
                "message": e.get("message", ""),
            }
            for e in recurring_failures
        ],
        "flaky_tests": [
            {
                "suite": e["suite"],
                "name": e["name"],
                "classname": e["classname"],
                "retry_count": e.get("retry_count", 0),
                "message": e.get("message", ""),
                "is_new": f"{e['classname']}::{e['name']}" in new_flaky_keys,
            }
            for e in classified["flaky"]
        ],
        "resolved_tests": [
            {
                "suite": e["suite"],
                "name": e["name"],
                "classname": e["classname"],
                "first_seen": e.get("first_seen", "unknown"),
                "failure_count": e.get("failure_count", 0),
                "was_flaky": e.get("was_flaky", False),
            }
            for e in resolved_tests
        ],
    }


# 
--------------------------------------------------------------------------- +# HTML report +# --------------------------------------------------------------------------- + + +def _html_escape(text): + """Escape HTML special characters.""" + return ( + text.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace('"', """) + ) + + +def generate_html_report( + classified, + new_failures, + recurring_failures, + resolved_tests, + history, + test_type="", + matrix_label="", + sha="", + date_str="", +): + """Generate a self-contained HTML report with detailed failure info.""" + total_passed = len(classified["passed"]) + total_failed = len(classified["failed"]) + len(classified["error"]) + total_flaky = len(classified["flaky"]) + total_skipped = len(classified["skipped"]) + total = total_passed + total_failed + total_flaky + total_skipped + + title = "Nightly Test Report" + if test_type: + title += f" — {_html_escape(test_type)}" + if matrix_label: + title += f" [{_html_escape(matrix_label)}]" + + # Determine overall status color + if total_failed > 0: + status_color = "#d32f2f" + status_text = f"{total_failed} failure(s)" + elif total_flaky > 0: + status_color = "#f9a825" + status_text = "All passed (flaky detected)" + else: + status_color = "#388e3c" + status_text = "All passed" + + parts = [] + parts.append(f""" + + + + +{title} + + + +

{title}

+
""") + + meta_parts = [] + if date_str: + meta_parts.append(f"Date: {_html_escape(date_str)}") + if sha: + meta_parts.append(f"Commit: {_html_escape(sha[:12])}") + if matrix_label: + meta_parts.append( + f"Matrix: {_html_escape(matrix_label)}" + ) + parts.append("  |  ".join(meta_parts)) + + parts.append(f"""
+
{status_text}
+
+
{total}
Total
+
{total_passed}
Passed
+
{total_failed}
Failed
+
{total_flaky}
Flaky
+
Skipped
+
{len(resolved_tests)}
Stabilized
+
""") + + # --- New failures --- + if new_failures: + parts.append("

New Failures

") + parts.append("") + for e in new_failures: + msg = _html_escape(e.get("message", "")) + short = _html_escape(e.get("message", "")[:100]) + parts.append( + f"" + f"' + f"' + ) + parts.append("
SuiteTestError
{_html_escape(e['suite'])}{_html_escape(e['name'])} " + f'NEW
{short}" + f'
{msg}
") + + # --- Recurring failures --- + if recurring_failures: + parts.append("

Recurring Failures

") + parts.append( + "" + "" + ) + for e in recurring_failures: + msg = _html_escape(e.get("message", "")) + short = _html_escape(e.get("message", "")[:100]) + first_seen = _html_escape(e.get("first_seen", "unknown")) + test_key = f"{e['suite']}::{e['classname']}::{e['name']}" + count = ( + history.get("tests", {}) + .get(test_key, {}) + .get("failure_count", "?") + ) + parts.append( + f"" + f"' + f"" + f"' + ) + parts.append("
SuiteTestFirst SeenCountError
{_html_escape(e['suite'])}{_html_escape(e['name'])} " + f'RECURRING{first_seen}{count}
{short}" + f'
{msg}
") + + # --- Stabilized --- + if resolved_tests: + parts.append("

Stabilized Tests

") + parts.append( + "" + "" + ) + for e in resolved_tests: + flaky_tag = "Yes" if e.get("was_flaky") else "No" + parts.append( + f"" + f"' + f"" + f"" + f"" + ) + parts.append("
SuiteTestFailing SinceFailure CountWas Flaky?
{_html_escape(e['suite'])}{_html_escape(e['name'])} " + f'FIXED{_html_escape(e.get('first_seen', '?'))}{e.get('failure_count', '?')}{flaky_tag}
") + + # --- Flaky --- + if classified["flaky"]: + parts.append("

Flaky Tests (passed on retry)

") + parts.append( + "" + "" + ) + for e in classified["flaky"]: + msg = _html_escape(e.get("message", "")) + raw_msg = e.get("message", "").strip() + # Use last non-empty line as the short summary (typically the assertion) + lines = [ln for ln in raw_msg.splitlines() if ln.strip()] + short = _html_escape(lines[-1][:150] if lines else "") + parts.append( + f"" + f"' + f"" + f"' + ) + parts.append("
SuiteTestRetriesError
{_html_escape(e['suite'])}{_html_escape(e['name'])} " + f'FLAKY{e.get('retry_count', '?')}
{short}" + f'
{msg}
") + + # --- All failure details --- + all_failures = classified["failed"] + classified["error"] + if all_failures: + parts.append("

All Failure Details

") + for e in all_failures: + msg = _html_escape(e.get("message", "").strip()) + parts.append( + f'

' + f"{_html_escape(e['classname'])}::{_html_escape(e['name'])}

" + f'

' + f"Suite: {_html_escape(e['suite'])}  |  " + f"Source: {_html_escape(e['source_file'])}

" + ) + if msg: + parts.append(f'
{msg}
') + parts.append("
") + + if not all_failures and not classified["flaky"] and not resolved_tests: + parts.append( + '

All tests passed! No failures or flaky tests detected.

' + ) + + parts.append("") + return "\n".join(parts) + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main(): + parser = argparse.ArgumentParser( + description="Generate nightly test failure report from JUnit XML results" + ) + parser.add_argument( + "--results-dir", + required=True, + help="Directory containing JUnit XML test result files", + ) + parser.add_argument( + "--output-dir", + default="report-output", + help="Directory to write report files to", + ) + parser.add_argument( + "--sha", + default=os.environ.get("GITHUB_SHA", "unknown"), + help="Git commit SHA for this run", + ) + parser.add_argument( + "--date", + default=datetime.now(timezone.utc).strftime("%Y-%m-%d"), + help="Date for this run (YYYY-MM-DD)", + ) + parser.add_argument( + "--test-type", + default="unknown", + help=( + "Test type identifier (e.g., cpp, python, wheel-python, " + "wheel-server, notebooks)" + ), + ) + parser.add_argument( + "--matrix-label", + default="", + help=( + "Matrix combination label (e.g., cuda12.9-py3.12-x86_64). " + "Included in reports and JSON summary to identify the CI job." + ), + ) + parser.add_argument( + "--s3-history-uri", + default="", + help=( + "S3 URI for persistent failure history JSON. " + "Downloaded before analysis, uploaded after update. " + "Example: s3://bucket/ci_test_reports/nightly/history/" + "python-main-cuda12.9-py3.12-x86_64.json" + ), + ) + parser.add_argument( + "--s3-history-seed-uri", + default="", + help=( + "S3 URI to seed history from when this branch has no history yet " + "(e.g., first nightly on a new release branch). Typically points " + "to main's history so known failures are inherited, not re-reported " + "as new. Only used if --s3-history-uri download fails." + ), + ) + parser.add_argument( + "--s3-summary-uri", + default="", + help=( + "S3 URI to upload this run's JSON snapshot for aggregation. 
" + "Scoped by run ID to prevent cross-run pollution. " + "Example: s3://bucket/.../summaries/2026-04-13/run-12345/" + "python-cuda12.9-py3.12-x86_64.json" + ), + ) + parser.add_argument( + "--s3-summary-branch-uri", + default="", + help=( + "S3 URI to also upload the JSON snapshot under the branch path " + "for manual browsing. Optional — same content as --s3-summary-uri." + ), + ) + parser.add_argument( + "--s3-html-uri", + default="", + help=( + "S3 URI to upload the HTML report. " + "Example: s3://bucket/ci_test_reports/nightly/reports/" + "2026-04-13/python-cuda12.9-py3.12-x86_64.html" + ), + ) + parser.add_argument( + "--github-step-summary", + default=os.environ.get("GITHUB_STEP_SUMMARY", ""), + help="Path to write GitHub Actions step summary", + ) + + args = parser.parse_args() + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + local_history_path = str(output_dir / "test_failure_history.json") + + # ---- Step 1: Download history from S3 ---- + if args.s3_history_uri: + if not s3_download(args.s3_history_uri, local_history_path): + # No history for this branch yet — seed from parent (e.g., main) + # so known failures are inherited and not re-reported as new. 
+ if args.s3_history_seed_uri and s3_download( + args.s3_history_seed_uri, local_history_path + ): + print( + f"Seeded history from {args.s3_history_seed_uri} " + f"(first run on this branch)" + ) + + # ---- Step 2: Collect and classify results ---- + print(f"Collecting test results from {args.results_dir} ...") + results = collect_all_results(args.results_dir) + if not results: + print("WARNING: No test results found.", file=sys.stderr) + + print(f"Found {len(results)} test case entries across all XML files") + classified = classify_failures(results) + + print( + f"Classification: {len(classified['passed'])} passed, " + f"{len(classified['failed'])} failed, " + f"{len(classified['error'])} errors, " + f"{len(classified['flaky'])} flaky, " + f"{len(classified['skipped'])} skipped" + ) + + # ---- Step 3: Update history ---- + history = load_history(local_history_path) + ( + history, + new_failures, + recurring_failures, + resolved_tests, + new_flaky_tests, + ) = update_history(history, classified, args.sha, args.date) + + if new_flaky_tests: + print( + f"NEW FLAKY: {len(new_flaky_tests)} test(s) flaky for the first time" + ) + if resolved_tests: + print( + f"Stabilized: {len(resolved_tests)} previously-failing test(s) now pass" + ) + + save_history(history, local_history_path) + print(f"Updated local history at {local_history_path}") + + # ---- Step 4: Upload history back to S3 ---- + if args.s3_history_uri: + s3_upload(local_history_path, args.s3_history_uri) + + # ---- Step 5: Generate reports ---- + report_kwargs = dict( + test_type=args.test_type, + matrix_label=args.matrix_label, + sha=args.sha, + date_str=args.date, + ) + + md_report = generate_markdown_report( + classified, + new_failures, + recurring_failures, + resolved_tests, + history, + **report_kwargs, + ) + md_path = output_dir / "nightly_report.md" + md_path.write_text(md_report) + print(f"Markdown report written to {md_path}") + + html_report = generate_html_report( + classified, + new_failures, + 
recurring_failures, + resolved_tests, + history, + **report_kwargs, + ) + html_path = output_dir / "nightly_report.html" + html_path.write_text(html_report) + print(f"HTML report written to {html_path}") + + json_summary = generate_json_summary( + classified, + new_failures, + recurring_failures, + resolved_tests, + new_flaky_tests, + **report_kwargs, + ) + json_path = output_dir / "nightly_summary.json" + json_path.write_text(json.dumps(json_summary, indent=2) + "\n") + print(f"JSON summary written to {json_path}") + + if args.github_step_summary: + with open(args.github_step_summary, "a") as f: + f.write(md_report) + print(f"Wrote GitHub Step Summary to {args.github_step_summary}") + + # ---- Step 6: Upload per-run snapshot and HTML to S3 ---- + s3_ok = True + if args.s3_summary_uri: + if not s3_upload(str(json_path), args.s3_summary_uri): + print( + "ERROR: Failed to upload JSON summary to S3. " + "The nightly aggregate will NOT include this job's results.", + file=sys.stderr, + ) + s3_ok = False + + # Also upload to branch-scoped path for manual browsing + if ( + args.s3_summary_branch_uri + and args.s3_summary_branch_uri != args.s3_summary_uri + ): + if not s3_upload(str(json_path), args.s3_summary_branch_uri): + # Non-critical — the run-scoped copy is what the aggregate needs + print( + "WARNING: Failed to upload branch-scoped JSON summary.", + file=sys.stderr, + ) + + if args.s3_html_uri: + if not s3_upload(str(html_path), args.s3_html_uri): + print( + "WARNING: Failed to upload HTML report to S3.", + file=sys.stderr, + ) + s3_ok = False + + if s3_ok and (args.s3_summary_uri or args.s3_html_uri): + print("S3 uploads completed successfully.") + + # ---- Exit code ---- + genuine_failures = len(classified["failed"]) + len(classified["error"]) + if genuine_failures > 0: + print( + f"\nFAILED: {genuine_failures} genuine test failure(s) detected." 
+ ) + return 1 + if classified["flaky"]: + print( + f"\nWARNING: All tests passed but {len(classified['flaky'])} flaky test(s) detected." + ) + else: + print("\nAll tests passed.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/ci/utils/nightly_report_helper.sh b/ci/utils/nightly_report_helper.sh new file mode 100755 index 0000000000..c65fc22f0e --- /dev/null +++ b/ci/utils/nightly_report_helper.sh @@ -0,0 +1,122 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Shared helper for generating nightly test reports with matrix-aware S3 paths. +# +# Usage (source from any test script): +# +# # For C++ tests (no Python version in matrix label): +# generate_nightly_report "cpp" +# +# # For Python tests (includes Python version in matrix label): +# generate_nightly_report "python" --with-python-version +# +# # For wheel tests: +# generate_nightly_report "wheel-python" --with-python-version +# +# Prerequisites (set before calling): +# RAPIDS_TESTS_DIR - directory containing JUnit XML test results +# +# Optional environment variables (auto-detected if not set): +# RAPIDS_CUDA_VERSION - CUDA version (e.g., "12.9") +# RAPIDS_PY_VERSION - Python version (e.g., "3.12"), used with --with-python-version +# RAPIDS_BRANCH - branch name (e.g., "main") +# RAPIDS_BUILD_TYPE - build type; S3 history/summary/HTML uploads are +# only enabled when this equals "nightly" +# CUOPT_S3_URI - S3 bucket root (e.g., s3://cuopt-datasets/), trailing +# slash optional; only consulted when RAPIDS_BUILD_TYPE=nightly +# GITHUB_SHA - commit SHA +# GITHUB_RUN_ID - GitHub Actions run ID (scopes summaries to this run) +# GITHUB_STEP_SUMMARY - path for GitHub Actions step summary + +# Resolve the directory where THIS helper lives (ci/utils/) +_HELPER_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")" + +generate_nightly_report() { + local test_type="${1:?Usage: generate_nightly_report TEST_TYPE 
[--with-python-version]}" + local include_py_version=false + + shift + while [ $# -gt 0 ]; do + case "$1" in + --with-python-version) include_py_version=true ;; + *) echo "WARNING: Unknown option: $1" >&2 ;; + esac + shift + done + + # --- Build matrix label --- + local cuda_tag="cuda${RAPIDS_CUDA_VERSION:-unknown}" + local arch_tag + arch_tag="$(arch)" + local matrix_label="${cuda_tag}-${arch_tag}" + + if [ "${include_py_version}" = true ]; then + local py_tag="py${RAPIDS_PY_VERSION:-unknown}" + matrix_label="${cuda_tag}-${py_tag}-${arch_tag}" + fi + + local branch_slug + branch_slug=$(echo "${RAPIDS_BRANCH:-main}" | tr '/' '-') + # Use RUN_DATE if set (nightly workflows pass the trigger date), + # fall back to local date. This avoids mismatches between test + # jobs and the summary job when a run spans UTC midnight. + local run_date + run_date="${RUN_DATE:-$(date +%F)}" + + # --- Ensure results dir exists --- + RAPIDS_TESTS_DIR="${RAPIDS_TESTS_DIR:-${PWD}/test-results}" + mkdir -p "${RAPIDS_TESTS_DIR}" + + local report_output_dir="${RAPIDS_TESTS_DIR}/report" + mkdir -p "${report_output_dir}" + + # --- Build S3 URIs --- + local s3_history_uri="" + local s3_history_seed_uri="" + local s3_summary_uri="" + local s3_summary_branch_uri="" + local s3_html_uri="" + + # Only upload to S3 for nightly runs. For PRs and other build types we + # still generate the local report and GitHub Step Summary, but skip S3 + # so PR runs don't pollute the nightly history/summary/report buckets. + if [ "${RAPIDS_BUILD_TYPE:-}" = "nightly" ] && [ -n "${CUOPT_S3_URI:-}" ]; then + local s3_base="${CUOPT_S3_URI%/}/ci_test_reports/nightly" # accept the root with or without a trailing slash + s3_history_uri="${s3_base}/history/${branch_slug}/${test_type}-${matrix_label}.json" + # For non-main branches, seed history from main on first run so known + # failures are inherited (not re-reported as new on release branches). 
+ if [ "${branch_slug}" != "main" ]; then + s3_history_seed_uri="${s3_base}/history/main/${test_type}-${matrix_label}.json" + fi + # Scope summaries by GITHUB_RUN_ID so each workflow run is isolated. + # The run-scoped path is date-free — the run ID is unique, and + # dropping the date prevents mismatches when jobs span midnight UTC. + # Also write to branch+date path for manual browsing. + local summary_filename="${test_type}-${matrix_label}.json" + if [ -n "${GITHUB_RUN_ID:-}" ]; then + s3_summary_uri="${s3_base}/summaries/run-${GITHUB_RUN_ID}/${summary_filename}" + else + s3_summary_uri="${s3_base}/summaries/${run_date}/${branch_slug}/${summary_filename}" + fi + s3_summary_branch_uri="${s3_base}/summaries/${run_date}/${branch_slug}/${summary_filename}" + s3_html_uri="${s3_base}/reports/${run_date}/${branch_slug}/${test_type}-${matrix_label}.html" + fi + + # --- Run nightly report --- + python3 "${_HELPER_DIR}/nightly_report.py" \ + --results-dir "${RAPIDS_TESTS_DIR}" \ + --output-dir "${report_output_dir}" \ + --sha "${GITHUB_SHA:-unknown}" \ + --date "${run_date}" \ + --test-type "${test_type}" \ + --matrix-label "${matrix_label}" \ + --s3-history-uri "${s3_history_uri}" \ + --s3-history-seed-uri "${s3_history_seed_uri}" \ + --s3-summary-uri "${s3_summary_uri}" \ + --s3-summary-branch-uri "${s3_summary_branch_uri}" \ + --s3-html-uri "${s3_html_uri}" \ + --github-step-summary "${GITHUB_STEP_SUMMARY:-}" \ + || true +} diff --git a/ci/utils/s3_helpers.py b/ci/utils/s3_helpers.py new file mode 100644 index 0000000000..54e8b96d21 --- /dev/null +++ b/ci/utils/s3_helpers.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Shared S3 helper functions for cuOpt CI scripts. + +Maps CUOPT_AWS_* credentials to standard AWS env vars and provides +download / upload / list wrappers around the aws CLI. 
+""" + +import os +import subprocess +import sys + + +def s3_env(): + """Build env dict for AWS CLI calls using CUOPT-specific credentials. + + The cuOpt S3 bucket requires explicit CUOPT_AWS_* static credentials. + Role-based credentials from aws-actions/configure-aws-credentials do not + have access. We override AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY with + the CUOPT_* values and unset AWS_SESSION_TOKEN to avoid mixing with + role-based session tokens (matching the pattern in datasets/*.sh). + """ + env = os.environ.copy() + if os.environ.get("CUOPT_AWS_ACCESS_KEY_ID"): + env["AWS_ACCESS_KEY_ID"] = os.environ["CUOPT_AWS_ACCESS_KEY_ID"] + if os.environ.get("CUOPT_AWS_SECRET_ACCESS_KEY"): + env["AWS_SECRET_ACCESS_KEY"] = os.environ[ + "CUOPT_AWS_SECRET_ACCESS_KEY" + ] + # Unset session token to avoid mixing role-based tokens with static keys + env.pop("AWS_SESSION_TOKEN", None) + if os.environ.get("CUOPT_AWS_REGION"): + env["AWS_DEFAULT_REGION"] = os.environ["CUOPT_AWS_REGION"] + elif "AWS_DEFAULT_REGION" not in env: + env["AWS_DEFAULT_REGION"] = "us-east-1" + return env + + +def s3_download(s3_uri, local_path): + """Download from S3. Returns True on success, False on CLI failure.""" + env = s3_env() + try: + subprocess.run( + ["aws", "s3", "cp", s3_uri, local_path], + env=env, + check=True, + capture_output=True, + text=True, + ) + print(f"Downloaded {s3_uri}") + return True + except FileNotFoundError: + print( + "WARNING: aws CLI not found, skipping S3 download", file=sys.stderr + ) + return False + except subprocess.CalledProcessError as exc: + print( + f"WARNING: S3 download failed (first run?): {exc.stderr.strip()}", + file=sys.stderr, + ) + return False + + +def s3_upload(local_path, s3_uri): + """Upload a file to S3. 
Returns True on success, False on CLI failure.""" + env = s3_env() + try: + subprocess.run( + ["aws", "s3", "cp", local_path, s3_uri], + env=env, + check=True, + capture_output=True, + text=True, + ) + print(f"Uploaded {local_path} to {s3_uri}") + return True + except FileNotFoundError: + print( + "WARNING: aws CLI not found, skipping S3 upload", file=sys.stderr + ) + return False + except subprocess.CalledProcessError as exc: + print( + f"WARNING: S3 upload failed: {exc.stderr.strip()}", file=sys.stderr + ) + return False + + +def s3_list(s3_prefix): + """Recursively list an S3 prefix. Returns full S3 URIs, [] on failure.""" + env = s3_env() + # The bucket name is parsed from s3_prefix below to rebuild full URIs; + # s3_prefix looks like "s3://bucket/path/to/prefix/" + try: + result = subprocess.run( + ["aws", "s3", "ls", "--recursive", s3_prefix], + env=env, + check=True, + capture_output=True, + text=True, + ) + except (FileNotFoundError, subprocess.CalledProcessError) as exc: + print(f"WARNING: S3 ls failed: {exc}", file=sys.stderr) + return [] + + # --recursive output format: "2026-04-16 12:00:00 1234 path/to/file.json" + # Listed keys are bucket-relative, so prefix each with s3://bucket/. + # A prefix without the s3:// scheme yields an empty list. + if not s3_prefix.startswith("s3://"): + return [] + without_scheme = s3_prefix[5:] # remove "s3://" + bucket = without_scheme.split("/")[0] + base_uri = f"s3://{bucket}/" + + uris = [] + for line in result.stdout.strip().splitlines(): + parts = line.split(None, 3) # date, time, size, key + if len(parts) == 4: + uris.append(f"{base_uri}{parts[3]}") + return uris diff --git a/ci/utils/send_consolidated_summary.sh b/ci/utils/send_consolidated_summary.sh new file mode 100755 index 0000000000..f0c88aa298 --- /dev/null +++ b/ci/utils/send_consolidated_summary.sh @@ -0,0 +1,146 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# Send a consolidated Slack notification for the entire nightly run. +# Reads the aggregated JSON produced by aggregate_nightly.py and sends: +# - Main message: Header + status summary + test totals + failed CI jobs +# - Thread replies: matrix details, failure details, links, HTML report +# +# Posts via chat.postMessage (supports threading + file uploads). +# +# Required environment variables: +# SLACK_BOT_TOKEN - Slack Bot Token (xoxb-*) +# SLACK_CHANNEL_ID - Slack channel ID +# CONSOLIDATED_SUMMARY - Path to consolidated_summary.json +# +# Optional environment variables: +# CONSOLIDATED_HTML - Path to consolidated HTML file to upload +# PRESIGNED_REPORT_URL - Presigned URL for consolidated HTML report +# PRESIGNED_DASHBOARD_URL - Presigned URL for dashboard + +set -euo pipefail + +SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")" + +CONSOLIDATED_SUMMARY="${CONSOLIDATED_SUMMARY:?CONSOLIDATED_SUMMARY must point to consolidated_summary.json}" +SLACK_BOT_TOKEN="${SLACK_BOT_TOKEN:?SLACK_BOT_TOKEN is required}" +SLACK_CHANNEL_ID="${SLACK_CHANNEL_ID:?SLACK_CHANNEL_ID is required}" +CONSOLIDATED_HTML="${CONSOLIDATED_HTML:-}" +PRESIGNED_REPORT_URL="${PRESIGNED_REPORT_URL:-}" +PRESIGNED_DASHBOARD_URL="${PRESIGNED_DASHBOARD_URL:-}" + +if [ ! -f "${CONSOLIDATED_SUMMARY}" ]; then + echo "ERROR: Summary file not found: ${CONSOLIDATED_SUMMARY}" >&2 + exit 1 +fi + +# Generate Slack payloads — one JSON object per line. +# Line 1 = main message, lines 2+ = thread replies. +PAYLOADS=$(python3 "${SCRIPT_DIR}/generate_slack_payloads.py" "${CONSOLIDATED_SUMMARY}" "${PRESIGNED_REPORT_URL}" "${PRESIGNED_DASHBOARD_URL}") + +# ── Send messages ───────────────────────────────────────────────────── +echo "Sending consolidated Slack notification..." 
+ +THREAD_TS="" +FIRST=true + +while IFS= read -r payload; do + # Inject channel (and thread_ts for replies) into payload + if [ "${FIRST}" = true ]; then + BOT_PAYLOAD=$(python3 -c " +import json, sys +p = json.loads(sys.argv[1]) +p['channel'] = sys.argv[2] +print(json.dumps(p)) +" "${payload}" "${SLACK_CHANNEL_ID}") + else + BOT_PAYLOAD=$(python3 -c " +import json, sys +p = json.loads(sys.argv[1]) +p['channel'] = sys.argv[2] +p['thread_ts'] = sys.argv[3] +print(json.dumps(p)) +" "${payload}" "${SLACK_CHANNEL_ID}" "${THREAD_TS}") + fi + + RESPONSE=$(curl -s --max-time 30 -X POST \ + -H "Authorization: Bearer ${SLACK_BOT_TOKEN}" \ + -H "Content-Type: application/json" \ + --data "${BOT_PAYLOAD}" \ + "https://slack.com/api/chat.postMessage" || echo '{"ok":false,"error":"curl_failed"}') + + OK=$(echo "${RESPONSE}" | python3 -c "import json,sys; print(json.load(sys.stdin).get('ok',''))" 2>/dev/null || echo "") + + if [ "${FIRST}" = true ]; then + if [ "${OK}" != "True" ]; then + echo "WARNING: Main Slack message failed: ${RESPONSE}" >&2 + break + fi + THREAD_TS=$(echo "${RESPONSE}" | python3 -c "import json,sys; print(json.load(sys.stdin).get('ts',''))" 2>/dev/null || echo "") + echo "Main message posted (ts=${THREAD_TS})" + FIRST=false + else + if [ "${OK}" != "True" ]; then + echo "WARNING: Thread reply failed: ${RESPONSE}" >&2 + fi + fi +done <<< "${PAYLOADS}" +[ "${FIRST}" = true ] || echo "Consolidated Slack notification sent." # FIRST stays true when the main post failed + +# ── Upload HTML report as file in thread ────────────────────────────── +if [ -n "${CONSOLIDATED_HTML}" ] && [ -f "${CONSOLIDATED_HTML}" ]; then + echo "Uploading HTML report to Slack..." 
+ + REPORT_DATE=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1])).get('date','report'))" "${CONSOLIDATED_SUMMARY}" 2>/dev/null || echo "report") + REPORT_BRANCH=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1])).get('branch','main'))" "${CONSOLIDATED_SUMMARY}" 2>/dev/null || echo "main") + UPLOAD_FILENAME="cuopt-nightly-${REPORT_BRANCH}-${REPORT_DATE}.html" + FILE_SIZE=$(stat --format=%s "${CONSOLIDATED_HTML}") + UPLOAD_TITLE="cuOpt Nightly Report — ${REPORT_BRANCH} — ${REPORT_DATE}" + + # Step 1: Get an upload URL from Slack (best-effort: a curl error becomes a warning below, not an abort) + URL_RESPONSE=$(curl -s --max-time 30 -X POST \ + -H "Authorization: Bearer ${SLACK_BOT_TOKEN}" \ + -H "Content-Type: application/x-www-form-urlencoded" \ + --data-urlencode "filename=${UPLOAD_FILENAME}" \ + --data-urlencode "length=${FILE_SIZE}" \ + "https://slack.com/api/files.getUploadURLExternal" || echo '{"ok":false,"error":"curl_failed"}') + + UPLOAD_URL=$(echo "${URL_RESPONSE}" | python3 -c "import json,sys; print(json.load(sys.stdin).get('upload_url',''))" 2>/dev/null || echo "") + FILE_ID=$(echo "${URL_RESPONSE}" | python3 -c "import json,sys; print(json.load(sys.stdin).get('file_id',''))" 2>/dev/null || echo "") + + if [ -z "${UPLOAD_URL}" ] || [ -z "${FILE_ID}" ]; then + echo "WARNING: Slack file upload failed at getUploadURLExternal. 
Response: ${URL_RESPONSE}" >&2 + else + # Step 2: Upload the file content to the presigned URL + curl -s -X POST \ + -F "file=@${CONSOLIDATED_HTML}" \ + "${UPLOAD_URL}" || echo "WARNING: Slack file content upload failed." >&2 + + # Step 3: Complete the upload and share to channel (in thread if available) + COMPLETE_PAYLOAD=$(python3 -c " +import json, sys +payload = { + 'files': [{'id': sys.argv[1], 'title': sys.argv[2]}], + 'channel_id': sys.argv[3], + 'initial_comment': 'Full nightly test report \u2014 download and open in a browser for interactive details.', +} +thread_ts = sys.argv[4] if len(sys.argv) > 4 and sys.argv[4] else '' +if thread_ts: + payload['thread_ts'] = thread_ts +print(json.dumps(payload)) +" "${FILE_ID}" "${UPLOAD_TITLE}" "${SLACK_CHANNEL_ID}" "${THREAD_TS}") + + COMPLETE_RESPONSE=$(curl -s --max-time 30 -X POST \ + -H "Authorization: Bearer ${SLACK_BOT_TOKEN}" \ + -H "Content-Type: application/json" \ + --data "${COMPLETE_PAYLOAD}" \ + "https://slack.com/api/files.completeUploadExternal" || echo '{"ok":false,"error":"curl_failed"}') + + if echo "${COMPLETE_RESPONSE}" | python3 -c "import json,sys; sys.exit(0 if json.load(sys.stdin).get('ok') else 1)" 2>/dev/null; then + echo "HTML report uploaded to Slack." + else + echo "WARNING: Slack file upload failed at completeUploadExternal. 
Response: ${COMPLETE_RESPONSE}" >&2 + fi + fi +fi diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh index 79188cacc3..61b768b1d3 100755 --- a/ci/validate_wheel.sh +++ b/ci/validate_wheel.sh @@ -22,11 +22,11 @@ PYDISTCHECK_ARGS=( if [[ "${package_dir}" == "python/libcuopt" ]]; then if [[ "${RAPIDS_CUDA_MAJOR}" == "12" ]]; then PYDISTCHECK_ARGS+=( - --max-allowed-size-compressed '650Mi' + --max-allowed-size-compressed '670Mi' ) else PYDISTCHECK_ARGS+=( - --max-allowed-size-compressed '495Mi' + --max-allowed-size-compressed '550Mi' ) fi elif [[ "${package_dir}" != "python/cuopt" ]] && \ diff --git a/cmake/RAPIDS.cmake b/cmake/RAPIDS.cmake index 05627a91f7..96b7f373c3 100644 --- a/cmake/RAPIDS.cmake +++ b/cmake/RAPIDS.cmake @@ -1,6 +1,6 @@ # ============================================================================= # cmake-format: off -# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 # cmake-format: on # ============================================================================= @@ -8,7 +8,7 @@ # This is the preferred entry point for projects using rapids-cmake # # Enforce the minimum required CMake version for all users -cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR) +cmake_minimum_required(VERSION 4.0 FATAL_ERROR) # Allow users to control which version is used if(NOT (rapids-cmake-branch OR rapids-cmake-version)) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index cf3563d476..145850d4d8 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -12,14 +12,14 @@ dependencies: - ccache - clang-tools=20.1.8 - clang==20.1.8 -- cmake>=3.30.4 +- cmake>=4.0 - cpp-argparse - cuda-nvcc - cuda-nvtx-dev - cuda-python>=12.9.2,<13.0 - cuda-sanitizer-api - cuda-version=12.9 -- cudf==26.4.*,>=0.0.0a0 +- 
cudf==26.6.*,>=0.0.0a0 - cupy>=13.6.0 - cxx-compiler - cython>=3.0.3 @@ -36,12 +36,11 @@ dependencies: - libcusparse-dev - libgrpc >=1.78.0,<1.80.0a0 - libprotobuf -- libraft-headers==26.4.*,>=0.0.0a0 -- librmm==26.4.*,>=0.0.0a0 +- libraft-headers==26.6.*,>=0.0.0a0 +- librmm==26.6.*,>=0.0.0a0 - make - msgpack-numpy==0.4.8 - msgpack-python==1.1.2 -- myst-nb - myst-parser - ninja - notebook @@ -55,9 +54,10 @@ dependencies: - pip - pre-commit - psutil>=6.0.0 -- pylibraft==26.4.*,>=0.0.0a0 +- pylibraft==26.6.*,>=0.0.0a0 - pyrsistent - pytest-cov +- pytest-rerunfailures - pytest<9.0 - python>=3.11,<3.15 - pyyaml>=6.0.0 @@ -65,7 +65,7 @@ dependencies: - rapids-logger==0.2.*,>=0.0.0a0 - re2 - requests -- rmm==26.4.*,>=0.0.0a0 +- rmm==26.6.*,>=0.0.0a0 - scikit-build-core>=0.11.0 - scipy>=1.14.1 - sphinx diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index a8a589e48b..293b49fbea 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -12,14 +12,14 @@ dependencies: - ccache - clang-tools=20.1.8 - clang==20.1.8 -- cmake>=3.30.4 +- cmake>=4.0 - cpp-argparse - cuda-nvcc - cuda-nvtx-dev - cuda-python>=12.9.2,<13.0 - cuda-sanitizer-api - cuda-version=12.9 -- cudf==26.4.*,>=0.0.0a0 +- cudf==26.6.*,>=0.0.0a0 - cupy>=13.6.0 - cxx-compiler - cython>=3.0.3 @@ -36,12 +36,11 @@ dependencies: - libcusparse-dev - libgrpc >=1.78.0,<1.80.0a0 - libprotobuf -- libraft-headers==26.4.*,>=0.0.0a0 -- librmm==26.4.*,>=0.0.0a0 +- libraft-headers==26.6.*,>=0.0.0a0 +- librmm==26.6.*,>=0.0.0a0 - make - msgpack-numpy==0.4.8 - msgpack-python==1.1.2 -- myst-nb - myst-parser - ninja - notebook @@ -55,9 +54,10 @@ dependencies: - pip - pre-commit - psutil>=6.0.0 -- pylibraft==26.4.*,>=0.0.0a0 +- pylibraft==26.6.*,>=0.0.0a0 - pyrsistent - pytest-cov +- pytest-rerunfailures - pytest<9.0 - python>=3.11,<3.15 - pyyaml>=6.0.0 @@ -65,7 +65,7 @@ dependencies: - 
rapids-logger==0.2.*,>=0.0.0a0 - re2 - requests -- rmm==26.4.*,>=0.0.0a0 +- rmm==26.6.*,>=0.0.0a0 - scikit-build-core>=0.11.0 - scipy>=1.14.1 - sphinx diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-132_arch-aarch64.yaml similarity index 85% rename from conda/environments/all_cuda-131_arch-aarch64.yaml rename to conda/environments/all_cuda-132_arch-aarch64.yaml index 477c708918..fa8844a1f9 100644 --- a/conda/environments/all_cuda-131_arch-aarch64.yaml +++ b/conda/environments/all_cuda-132_arch-aarch64.yaml @@ -12,14 +12,14 @@ dependencies: - ccache - clang-tools=20.1.8 - clang==20.1.8 -- cmake>=3.30.4 +- cmake>=4.0 - cpp-argparse - cuda-nvcc - cuda-nvtx-dev - cuda-python>=13.0.1,<14.0 - cuda-sanitizer-api -- cuda-version=13.1 -- cudf==26.4.*,>=0.0.0a0 +- cuda-version=13.2 +- cudf==26.6.*,>=0.0.0a0 - cupy>=13.6.0 - cxx-compiler - cython>=3.0.3 @@ -36,12 +36,11 @@ dependencies: - libcusparse-dev - libgrpc >=1.78.0,<1.80.0a0 - libprotobuf -- libraft-headers==26.4.*,>=0.0.0a0 -- librmm==26.4.*,>=0.0.0a0 +- libraft-headers==26.6.*,>=0.0.0a0 +- librmm==26.6.*,>=0.0.0a0 - make - msgpack-numpy==0.4.8 - msgpack-python==1.1.2 -- myst-nb - myst-parser - ninja - notebook @@ -55,9 +54,10 @@ dependencies: - pip - pre-commit - psutil>=6.0.0 -- pylibraft==26.4.*,>=0.0.0a0 +- pylibraft==26.6.*,>=0.0.0a0 - pyrsistent - pytest-cov +- pytest-rerunfailures - pytest<9.0 - python>=3.11,<3.15 - pyyaml>=6.0.0 @@ -65,7 +65,7 @@ dependencies: - rapids-logger==0.2.*,>=0.0.0a0 - re2 - requests -- rmm==26.4.*,>=0.0.0a0 +- rmm==26.6.*,>=0.0.0a0 - scikit-build-core>=0.11.0 - scipy>=1.14.1 - sphinx @@ -83,4 +83,4 @@ dependencies: - nvidia-sphinx-theme - swagger-plugin-for-sphinx - veroviz -name: all_cuda-131_arch-aarch64 +name: all_cuda-132_arch-aarch64 diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-132_arch-x86_64.yaml similarity index 85% rename from conda/environments/all_cuda-131_arch-x86_64.yaml rename 
to conda/environments/all_cuda-132_arch-x86_64.yaml index d5fcba0b73..a37d8718c0 100644 --- a/conda/environments/all_cuda-131_arch-x86_64.yaml +++ b/conda/environments/all_cuda-132_arch-x86_64.yaml @@ -12,14 +12,14 @@ dependencies: - ccache - clang-tools=20.1.8 - clang==20.1.8 -- cmake>=3.30.4 +- cmake>=4.0 - cpp-argparse - cuda-nvcc - cuda-nvtx-dev - cuda-python>=13.0.1,<14.0 - cuda-sanitizer-api -- cuda-version=13.1 -- cudf==26.4.*,>=0.0.0a0 +- cuda-version=13.2 +- cudf==26.6.*,>=0.0.0a0 - cupy>=13.6.0 - cxx-compiler - cython>=3.0.3 @@ -36,12 +36,11 @@ dependencies: - libcusparse-dev - libgrpc >=1.78.0,<1.80.0a0 - libprotobuf -- libraft-headers==26.4.*,>=0.0.0a0 -- librmm==26.4.*,>=0.0.0a0 +- libraft-headers==26.6.*,>=0.0.0a0 +- librmm==26.6.*,>=0.0.0a0 - make - msgpack-numpy==0.4.8 - msgpack-python==1.1.2 -- myst-nb - myst-parser - ninja - notebook @@ -55,9 +54,10 @@ dependencies: - pip - pre-commit - psutil>=6.0.0 -- pylibraft==26.4.*,>=0.0.0a0 +- pylibraft==26.6.*,>=0.0.0a0 - pyrsistent - pytest-cov +- pytest-rerunfailures - pytest<9.0 - python>=3.11,<3.15 - pyyaml>=6.0.0 @@ -65,7 +65,7 @@ dependencies: - rapids-logger==0.2.*,>=0.0.0a0 - re2 - requests -- rmm==26.4.*,>=0.0.0a0 +- rmm==26.6.*,>=0.0.0a0 - scikit-build-core>=0.11.0 - scipy>=1.14.1 - sphinx @@ -83,4 +83,4 @@ dependencies: - nvidia-sphinx-theme - swagger-plugin-for-sphinx - veroviz -name: all_cuda-131_arch-x86_64 +name: all_cuda-132_arch-x86_64 diff --git a/conda/recipes/cuopt/conda_build_config.yaml b/conda/recipes/cuopt/conda_build_config.yaml index 4f1ae065c4..a7501ac21b 100644 --- a/conda/recipes/cuopt/conda_build_config.yaml +++ b/conda/recipes/cuopt/conda_build_config.yaml @@ -14,4 +14,4 @@ c_stdlib_version: - "=2.28" cmake_version: - - ">=3.30.4" + - ">=4.0" diff --git a/conda/recipes/libcuopt/conda_build_config.yaml b/conda/recipes/libcuopt/conda_build_config.yaml index 4f1ae065c4..a7501ac21b 100644 --- a/conda/recipes/libcuopt/conda_build_config.yaml +++ 
b/conda/recipes/libcuopt/conda_build_config.yaml @@ -14,4 +14,4 @@ c_stdlib_version: - "=2.28" cmake_version: - - ">=3.30.4" + - ">=4.0" diff --git a/conda/recipes/libcuopt/recipe.yaml b/conda/recipes/libcuopt/recipe.yaml index 682f9d33ef..ee074392ae 100644 --- a/conda/recipes/libcuopt/recipe.yaml +++ b/conda/recipes/libcuopt/recipe.yaml @@ -29,7 +29,7 @@ cache: export CXXFLAGS=$(echo $CXXFLAGS | sed -E 's@\-fdebug\-prefix\-map[^ ]*@@g') set +x - ./build.sh -n -v ${BUILD_EXTRA_FLAGS} libmps_parser libcuopt deb --allgpuarch --cmake-args=\"-DCMAKE_INSTALL_LIBDIR=lib\" + ./build.sh -n -v ${BUILD_EXTRA_FLAGS} libmps_parser libcuopt deb --allgpuarch --cmake-args=\"-DCMAKE_INSTALL_LIBDIR=lib -DBUILD_LP_BENCHMARKS=ON -DBUILD_MIP_BENCHMARKS=ON\" secrets: - AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY diff --git a/conda/recipes/mps-parser/conda_build_config.yaml b/conda/recipes/mps-parser/conda_build_config.yaml index bc330ea431..a60dca0786 100644 --- a/conda/recipes/mps-parser/conda_build_config.yaml +++ b/conda/recipes/mps-parser/conda_build_config.yaml @@ -14,4 +14,4 @@ c_stdlib_version: - "=2.28" cmake_version: - - ">=3.30.4" + - ">=4.0" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9249b53171..395f364807 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 # cmake-format: on -cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR) +cmake_minimum_required(VERSION 4.0 FATAL_ERROR) # Add our custom Find modules to the module path list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/thirdparty") @@ -24,11 +24,16 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../cmake") message(STATUS "CMAKE_MODULE_PATH = ${CMAKE_MODULE_PATH}") project( - CUOPT - VERSION "${RAPIDS_VERSION}" - LANGUAGES CXX CUDA C + CUOPT + VERSION "${RAPIDS_VERSION}" + LANGUAGES CXX CUDA C ) +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CUDA_STANDARD 20) +set(CMAKE_CUDA_STANDARD_REQUIRED 
ON) + # Disable C++20 module scanning as the codebase doesn't use modules set(CMAKE_CXX_SCAN_FOR_MODULES OFF CACHE BOOL "Disable C++20 module scanning") @@ -43,10 +48,10 @@ rapids_cmake_build_type(Release) # - User Options ------------------------------------------------------------ option(CMAKE_CUDA_LINEINFO "Enable the -lineinfo option for nvcc useful for cuda-memcheck / profiler" ON) option(BUILD_TESTS "Configure CMake to build tests" ON) -option(DISABLE_OPENMP "Disable OpenMP" OFF) option(BUILD_LP_ONLY "Build only linear programming components, exclude routing and MIP-specific files" OFF) option(SKIP_C_PYTHON_ADAPTERS "Skip building C and Python adapter files (cython_solve.cu and cuopt_c.cpp)" OFF) option(SKIP_ROUTING_BUILD "Skip building routing components" OFF) +option(SKIP_GRPC_BUILD "Skip building gRPC and protobuf components" OFF) option(WRITE_FATBIN "Enable fatbin writing" ON) option(HOST_LINEINFO "Build with debug line information for host code" OFF) @@ -67,69 +72,70 @@ message(VERBOSE "cuOpt: fatbin: ${WRITE_FATBIN}") rapids_cuda_init_runtime(USE_STATIC ON) rapids_find_package(CUDAToolkit REQUIRED - BUILD_EXPORT_SET cuopt-exports - INSTALL_EXPORT_SET cuopt-exports + BUILD_EXPORT_SET cuopt-exports + INSTALL_EXPORT_SET cuopt-exports ) set(CUOPT_CXX_FLAGS "") set(CUOPT_CUDA_FLAGS "") -if(CMAKE_COMPILER_IS_GNUCXX) - list(APPEND CUOPT_CXX_FLAGS -Werror -Wno-error=deprecated-declarations) -endif(CMAKE_COMPILER_IS_GNUCXX) +if (CMAKE_COMPILER_IS_GNUCXX) + list(APPEND CUOPT_CXX_FLAGS -Werror -Wno-error=deprecated-declarations) +endif (CMAKE_COMPILER_IS_GNUCXX) # Papilo pulls in Boost.Multiprecision float128 support, which expects quadmath.h from the GCC # toolchain internals. Conda clang ships libquadmath, but does not surface the matching GCC # internal include directory by default. Add it late in the search order so clang still prefers its # own builtin intrinsic headers. 
-if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - execute_process( - COMMAND ${CMAKE_CXX_COMPILER} --print-file-name=libquadmath.a - OUTPUT_VARIABLE CUOPT_QUADMATH_LIB - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - - if(IS_ABSOLUTE "${CUOPT_QUADMATH_LIB}") - get_filename_component(CUOPT_QUADMATH_LIBDIR "${CUOPT_QUADMATH_LIB}" DIRECTORY) - set(CUOPT_QUADMATH_INCLUDEDIR "${CUOPT_QUADMATH_LIBDIR}/include") - - if(EXISTS "${CUOPT_QUADMATH_INCLUDEDIR}/quadmath.h") - message(STATUS "Adding clang fallback include for quadmath: ${CUOPT_QUADMATH_INCLUDEDIR}") - add_compile_options("$<$:-idirafter${CUOPT_QUADMATH_INCLUDEDIR}>") - endif() - endif() -endif() +if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --print-file-name=libquadmath.a + OUTPUT_VARIABLE CUOPT_QUADMATH_LIB + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + + if (IS_ABSOLUTE "${CUOPT_QUADMATH_LIB}") + get_filename_component(CUOPT_QUADMATH_LIBDIR "${CUOPT_QUADMATH_LIB}" DIRECTORY) + set(CUOPT_QUADMATH_INCLUDEDIR "${CUOPT_QUADMATH_LIBDIR}/include") + + if (EXISTS "${CUOPT_QUADMATH_INCLUDEDIR}/quadmath.h") + message(STATUS "Adding clang fallback include for quadmath: ${CUOPT_QUADMATH_INCLUDEDIR}") + add_compile_options("$<$:-idirafter${CUOPT_QUADMATH_INCLUDEDIR}>") + endif () + endif () +endif () # To use sanitizer with cuda runtime, one must follow a few steps: # 1. Run the binary with env var set: LD_PRELOAD="$(gcc -print-file-name=libasan.so)" ASAN_OPTIONS='protect_shadow_gap=0:replace_intrin=0' # 2. 
(Optional) To run with a debugger (gdb or cuda-gdb) use the additional ASAN option alloc_dealloc_mismatch=0 -if(BUILD_SANITIZER) - list(APPEND CUOPT_CXX_FLAGS -fsanitize=address,undefined -fno-omit-frame-pointer -g) - if(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - list(APPEND CUOPT_CXX_FLAGS -Wno-error=maybe-uninitialized) - endif() - add_link_options(-fsanitize=address,undefined) -endif(BUILD_SANITIZER) +if (BUILD_SANITIZER) + list(APPEND CUOPT_CXX_FLAGS -fsanitize=address,undefined -fno-omit-frame-pointer -g) + if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + list(APPEND CUOPT_CXX_FLAGS -Wno-error=maybe-uninitialized) + endif () + add_link_options(-fsanitize=address,undefined) +endif (BUILD_SANITIZER) # To use ThreadSanitizer: -# 1. Build with clang and the -tsan flag -# 2. Run the binary with env var set: OMP_TOOL_LIBRARIES=/usr/lib/llvm-17/lib/libarcher.so ARCHER_OPTIONS='verbose=1' TSAN_OPTIONS='suppresions=cpp/utilities/tsan_suppressions.txt:ignore_noninstrumented_modules=1:halt_on_error=1' +# 1. Install clangxx and llvm-openmp into the conda environment. For some reason: libarcher.so was renamed to libarcher.so.bak +# 2. Build with clang and the -tsan flag +# 3. Run the binary with env var set: OMP_TOOL_LIBRARIES=/lib/libarcher.so.bak ARCHER_OPTIONS='verbose=1' TSAN_OPTIONS='suppressions=cpp/utilities/tsan_suppressions.txt:ignore_noninstrumented_modules=1:halt_on_error=1' # Replace with local llvm install path. libarcher.so must be presetn -if(BUILD_TSAN) - message(STATUS "Building with ThreadSanitizer enabled") - list(APPEND CUOPT_CXX_FLAGS -fsanitize=thread -fno-omit-frame-pointer -g) - add_link_options(-fsanitize=thread) -endif(BUILD_TSAN) +if (BUILD_TSAN) + message(STATUS "Building with ThreadSanitizer enabled") + list(APPEND CUOPT_CXX_FLAGS -fsanitize=thread -fno-omit-frame-pointer -g) + add_link_options(-fsanitize=thread) +endif (BUILD_TSAN) # To use MemorySanitizer: # 1. 
Build with clang and the -msan flag (MemorySanitizer requires clang) # 2. Run the binary with env var set: MSAN_OPTIONS='halt_on_error=1' # Note: MemorySanitizer requires all code (including libraries) to be instrumented for accurate results -if(BUILD_MSAN) - message(STATUS "Building with MemorySanitizer enabled") - list(APPEND CUOPT_CXX_FLAGS -fsanitize=memory -fno-omit-frame-pointer -g -fsanitize-memory-track-origins=1) - add_link_options(-fsanitize=memory) -endif(BUILD_MSAN) +if (BUILD_MSAN) + message(STATUS "Building with MemorySanitizer enabled") + list(APPEND CUOPT_CXX_FLAGS -fsanitize=memory -fno-omit-frame-pointer -g -fsanitize-memory-track-origins=1) + add_link_options(-fsanitize=memory) +endif (BUILD_MSAN) # Note: -UNDEBUG is applied via CUOPT_CXX_FLAGS / CUOPT_CUDA_FLAGS (not add_definitions) # to avoid leaking into dependencies that are built in-tree. @@ -140,27 +146,27 @@ endif(BUILD_MSAN) # Keeping NDEBUG defined for gRPC files makes the header inline an empty Dtor(), # avoiding the missing symbol at runtime. Additionally, gRPC files are always # compiled with -DNDEBUG (see below) so Debug builds also avoid the missing symbol. -if(DEFINE_ASSERT) - add_definitions(-DASSERT_MODE) - list(APPEND CUOPT_CUDA_FLAGS -UNDEBUG) -endif(DEFINE_ASSERT) +if (DEFINE_ASSERT) + add_definitions(-DASSERT_MODE) + list(APPEND CUOPT_CUDA_FLAGS -UNDEBUG) +endif (DEFINE_ASSERT) -if(DEFINE_BENCHMARK) - add_definitions(-DBENCHMARK) -endif(DEFINE_BENCHMARK) +if (DEFINE_BENCHMARK) + add_definitions(-DBENCHMARK) +endif (DEFINE_BENCHMARK) -if(DEFINE_PDLP_VERBOSE_MODE) - add_definitions(-DPDLP_VERBOSE_MODE) -endif(DEFINE_PDLP_VERBOSE_MODE) +if (DEFINE_PDLP_VERBOSE_MODE) + add_definitions(-DPDLP_VERBOSE_MODE) +endif (DEFINE_PDLP_VERBOSE_MODE) # Set logging level set(LIBCUOPT_LOGGING_LEVEL - "INFO" - CACHE STRING "Choose the logging level." + "INFO" + CACHE STRING "Choose the logging level." 
) set_property( - CACHE LIBCUOPT_LOGGING_LEVEL PROPERTY STRINGS "TRACE" "DEBUG" "INFO" "WARN" "ERROR" "CRITICAL" - "OFF") + CACHE LIBCUOPT_LOGGING_LEVEL PROPERTY STRINGS "TRACE" "DEBUG" "INFO" "WARN" "ERROR" "CRITICAL" + "OFF") message(VERBOSE "CUOPT: LIBCUOPT_LOGGING_LEVEL = '${LIBCUOPT_LOGGING_LEVEL}'.") message("-- Building with logging level = ${LIBCUOPT_LOGGING_LEVEL}") @@ -170,51 +176,47 @@ message("-- Host target architecture = '${CMAKE_SYSTEM_PROCESSOR}'") # make the flags global in order to propagate flags to test cmake files set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --expt-extended-lambda") -if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -static-global-template-stub=false") -endif() +if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -static-global-template-stub=false") +endif () list(APPEND CUOPT_CUDA_FLAGS -Werror=cross-execution-space-call -Wno-deprecated-declarations -Xcompiler=-Werror --default-stream=per-thread) -if("${CMAKE_CUDA_HOST_COMPILER}" MATCHES "clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - list(APPEND CUOPT_CUDA_FLAGS -Xcompiler=-Wall) -else() - list(APPEND CUOPT_CUDA_FLAGS -Xcompiler=-Wall -Wno-error=non-template-friend) -endif() +if ("${CMAKE_CUDA_HOST_COMPILER}" MATCHES "clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + list(APPEND CUOPT_CUDA_FLAGS -Xcompiler=-Wall) +else () + list(APPEND CUOPT_CUDA_FLAGS -Xcompiler=-Wall -Wno-error=non-template-friend) +endif () list(APPEND CUOPT_CUDA_FLAGS -Xfatbin=-compress-all) -if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.9 AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 13.0) - list(APPEND CUOPT_CUDA_FLAGS -Xfatbin=--compress-level=3) -endif() +if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.9 AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 13.0) + list(APPEND CUOPT_CUDA_FLAGS -Xfatbin=--compress-level=3) +endif () list(APPEND 
CUOPT_CUDA_FLAGS -fopenmp) # Add jobserver flags for parallel compilation if PARALLEL_LEVEL is set -if(PARALLEL_LEVEL AND NOT "${PARALLEL_LEVEL}" STREQUAL "") - message(STATUS "Enabling nvcc parallel compilation support") - list(APPEND CUOPT_CUDA_FLAGS --threads=0 --split-compile=0) - if(USE_NVCC_JOBSERVER AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0) - message(STATUS "Enabling nvcc jobserver support (NVCC >= 13.0)") - list(APPEND CUOPT_CUDA_FLAGS --jobserver) - endif() -endif() - -if(NOT DISABLE_OPENMP) - find_package(OpenMP) - - if(OPENMP_FOUND) - message(VERBOSE "cuOpt: OpenMP found in ${OpenMP_CXX_INCLUDE_DIRS}") - endif() -endif() +if (PARALLEL_LEVEL AND NOT "${PARALLEL_LEVEL}" STREQUAL "") + message(STATUS "Enabling nvcc parallel compilation support") + list(APPEND CUOPT_CUDA_FLAGS --threads=0 --split-compile=0) + if (USE_NVCC_JOBSERVER AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0) + message(STATUS "Enabling nvcc jobserver support (NVCC >= 13.0)") + list(APPEND CUOPT_CUDA_FLAGS --jobserver) + endif () +endif () + +# The MIP solver requires OpenMP to work +find_package(OpenMP REQUIRED) +message(VERBOSE "cuOpt: OpenMP found in ${OpenMP_CXX_INCLUDE_DIRS}") # Debug options -if(CMAKE_BUILD_TYPE MATCHES Debug) - message(STATUS "Building with debugging flags") - list(APPEND CUOPT_CUDA_FLAGS -G -Xcompiler=-rdynamic -O0) - -# Option to enable line info in CUDA device compilation to allow introspection when profiling / -# memchecking -elseif(CMAKE_CUDA_LINEINFO) - message(STATUS "Enabling line info") - list(APPEND CUOPT_CUDA_FLAGS -lineinfo) - set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -lineinfo") -endif(CMAKE_BUILD_TYPE MATCHES Debug) +if (CMAKE_BUILD_TYPE MATCHES Debug) + message(STATUS "Building with debugging flags") + list(APPEND CUOPT_CUDA_FLAGS -G -Xcompiler=-rdynamic -O0) + + # Option to enable line info in CUDA device compilation to allow introspection when profiling / + # memchecking +elseif (CMAKE_CUDA_LINEINFO) + 
message(STATUS "Enabling line info") + list(APPEND CUOPT_CUDA_FLAGS -lineinfo) + set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -lineinfo") +endif (CMAKE_BUILD_TYPE MATCHES Debug) # ################################################################################################## # - find CPM based dependencies ------------------------------------------------------------------ @@ -224,34 +226,34 @@ rapids_cmake_install_lib_dir(lib_dir) option(FETCH_RAPIDS "Fetch RAPIDS dependencies" ON) if (FETCH_RAPIDS) - include(cmake/thirdparty/get_cccl.cmake) - include(cmake/thirdparty/get_rmm.cmake) - include(cmake/thirdparty/get_raft.cmake) - # Source-built RMM can hide out-of-line utility symbols such as - # rmm::align_up / rmm::get_current_cuda_device when built with hidden visibility on clang. - # Force default visibility on the fetched rmm target until this is fixed upstream/figured out. - if(TARGET rmm) - set_target_properties(rmm PROPERTIES CXX_VISIBILITY_PRESET default) - endif() -else() - find_package(CCCL REQUIRED) - find_package(RMM REQUIRED) - find_package(RAFT REQUIRED) -endif() + include(cmake/thirdparty/get_cccl.cmake) + include(cmake/thirdparty/get_rmm.cmake) + include(cmake/thirdparty/get_raft.cmake) + # Source-built RMM can hide out-of-line utility symbols such as + # rmm::align_up / rmm::get_current_cuda_device when built with hidden visibility on clang. + # Force default visibility on the fetched rmm target until this is fixed upstream/figured out. + if (TARGET rmm) + set_target_properties(rmm PROPERTIES CXX_VISIBILITY_PRESET default) + endif () +else () + find_package(CCCL REQUIRED) + find_package(RMM REQUIRED) + find_package(RAFT REQUIRED) +endif () FetchContent_Declare( - papilo - GIT_REPOSITORY "https://github.com/scipopt/papilo.git" - # We would want to get the main branch. However, the main branch - # does not have some of the presolvers and settings that we need - # Mainly, probing and clique merging. 
- # This is the reason we are using the development branch - # from Oct 12, 2025. Once these changes are merged into the main branch, - #we can switch to the main branch. - GIT_TAG "741a2b9c8155b249d6df574d758b4d97d4417520" - GIT_PROGRESS TRUE - EXCLUDE_FROM_ALL - SYSTEM + papilo + GIT_REPOSITORY "https://github.com/scipopt/papilo.git" + # We would want to get the main branch. However, the main branch + # does not have some of the presolvers and settings that we need + # Mainly, probing and clique merging. + # This is the reason we are using the development branch + # from Oct 12, 2025. Once these changes are merged into the main branch, + #we can switch to the main branch. + GIT_TAG "741a2b9c8155b249d6df574d758b4d97d4417520" + GIT_PROGRESS TRUE + EXCLUDE_FROM_ALL + SYSTEM ) find_package(TBB REQUIRED) @@ -264,12 +266,12 @@ FetchContent_MakeAvailable(papilo) # PSLP - Lightweight C presolver for linear programs # https://github.com/dance858/PSLP FetchContent_Declare( - pslp - GIT_REPOSITORY "https://github.com/dance858/PSLP.git" - GIT_TAG "v0.0.8" - GIT_PROGRESS TRUE - EXCLUDE_FROM_ALL - SYSTEM + pslp + GIT_REPOSITORY "https://github.com/dance858/PSLP.git" + GIT_TAG "v0.0.8" + GIT_PROGRESS TRUE + EXCLUDE_FROM_ALL + SYSTEM ) # Build PSLP as static to embed in cuopt (avoids runtime library path issues) @@ -287,166 +289,177 @@ create_logger_macros(CUOPT "cuopt::default_logger()" include/cuopt) find_package(CUDSS REQUIRED) # ################################################################################################## -# - gRPC and Protobuf setup (REQUIRED) ------------------------------------------------------------ - -# gRPC is required for this branch - it provides remote execution features -# gRPC can come from either: -# - an installed CMake package (gRPCConfig.cmake), or -# - an in-tree build (e.g. python/libcuopt uses FetchContent(grpc), which defines gRPC::grpc++). 
-if(NOT TARGET gRPC::grpc++) - find_package(gRPC CONFIG REQUIRED) -endif() - -# Find Protobuf (should come with gRPC, but verify) -if(NOT TARGET protobuf::libprotobuf) - find_package(protobuf CONFIG REQUIRED) -endif() - -set(CUOPT_ENABLE_GRPC ON) -add_compile_definitions(CUOPT_ENABLE_GRPC) -message(STATUS "gRPC enabled (target gRPC::grpc++ is available)") - -# Find protoc compiler (provided by config package or target) -if(TARGET protobuf::protoc) - get_target_property(_PROTOBUF_PROTOC protobuf::protoc IMPORTED_LOCATION_RELEASE) - if(NOT _PROTOBUF_PROTOC) - get_target_property(_PROTOBUF_PROTOC protobuf::protoc IMPORTED_LOCATION) - endif() -else() - find_package(protobuf CONFIG REQUIRED) - get_target_property(_PROTOBUF_PROTOC protobuf::protoc IMPORTED_LOCATION_RELEASE) - if(NOT _PROTOBUF_PROTOC) - get_target_property(_PROTOBUF_PROTOC protobuf::protoc IMPORTED_LOCATION) - endif() -endif() - -if(NOT _PROTOBUF_PROTOC) - message(FATAL_ERROR "protoc not found (Protobuf_PROTOC_EXECUTABLE is empty)") -endif() - -# Find grpc_cpp_plugin -if(TARGET grpc_cpp_plugin) - set(_GRPC_CPP_PLUGIN_EXECUTABLE "$") -else() - find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin) - if(NOT _GRPC_CPP_PLUGIN_EXECUTABLE) - message(FATAL_ERROR "grpc_cpp_plugin not found") - endif() -endif() - -# Generate C++ code from cuopt_remote.proto (base message definitions) -set(PROTO_FILE "${CMAKE_CURRENT_SOURCE_DIR}/src/grpc/cuopt_remote.proto") -set(PROTO_SRCS "${CMAKE_CURRENT_BINARY_DIR}/cuopt_remote.pb.cc") -set(PROTO_HDRS "${CMAKE_CURRENT_BINARY_DIR}/cuopt_remote.pb.h") - -add_custom_command( - OUTPUT "${PROTO_SRCS}" "${PROTO_HDRS}" - COMMAND ${_PROTOBUF_PROTOC} - ARGS --cpp_out ${CMAKE_CURRENT_BINARY_DIR} - --proto_path ${CMAKE_CURRENT_SOURCE_DIR}/src/grpc - ${PROTO_FILE} - DEPENDS ${PROTO_FILE} - COMMENT "Generating C++ code from cuopt_remote.proto" - VERBATIM -) - -# Generate gRPC service code from cuopt_remote_service.proto -set(GRPC_PROTO_FILE 
"${CMAKE_CURRENT_SOURCE_DIR}/src/grpc/cuopt_remote_service.proto") -set(GRPC_PROTO_SRCS "${CMAKE_CURRENT_BINARY_DIR}/cuopt_remote_service.pb.cc") -set(GRPC_PROTO_HDRS "${CMAKE_CURRENT_BINARY_DIR}/cuopt_remote_service.pb.h") -set(GRPC_SERVICE_SRCS "${CMAKE_CURRENT_BINARY_DIR}/cuopt_remote_service.grpc.pb.cc") -set(GRPC_SERVICE_HDRS "${CMAKE_CURRENT_BINARY_DIR}/cuopt_remote_service.grpc.pb.h") - -add_custom_command( - OUTPUT "${GRPC_PROTO_SRCS}" "${GRPC_PROTO_HDRS}" "${GRPC_SERVICE_SRCS}" "${GRPC_SERVICE_HDRS}" - COMMAND ${_PROTOBUF_PROTOC} - ARGS --cpp_out ${CMAKE_CURRENT_BINARY_DIR} - --grpc_out ${CMAKE_CURRENT_BINARY_DIR} - --plugin=protoc-gen-grpc=${_GRPC_CPP_PLUGIN_EXECUTABLE} - --proto_path ${CMAKE_CURRENT_SOURCE_DIR}/src/grpc - ${GRPC_PROTO_FILE} - DEPENDS ${GRPC_PROTO_FILE} ${PROTO_FILE} - COMMENT "Generating gRPC C++ code from cuopt_remote_service.proto" - VERBATIM -) - -message(STATUS "gRPC protobuf code generation configured") - -if(BUILD_TESTS) - include(cmake/thirdparty/get_gtest.cmake) -endif() - -set(CUOPT_SRC_FILES ) +# - gRPC and Protobuf setup ----------------------------------------------------------------------- + +if (NOT SKIP_GRPC_BUILD) + # gRPC can come from either: + # - an installed CMake package (gRPCConfig.cmake), or + # - an in-tree build (e.g. python/libcuopt uses FetchContent(grpc), which defines gRPC::grpc++). 
+ + if (NOT TARGET OpenSSL::SSL) + find_package(OpenSSL CONFIG QUIET) + if (NOT OpenSSL_FOUND AND NOT OPENSSL_FOUND) + find_package(OpenSSL REQUIRED) + endif () + endif () + + if (NOT TARGET gRPC::grpc++) + find_package(gRPC CONFIG REQUIRED) + endif () + + # Find Protobuf (should come with gRPC, but verify) + if (NOT TARGET protobuf::libprotobuf) + find_package(protobuf CONFIG REQUIRED) + endif () + + set(CUOPT_ENABLE_GRPC ON) + add_compile_definitions(CUOPT_ENABLE_GRPC) + message(STATUS "gRPC enabled (target gRPC::grpc++ is available)") + + # Find protoc compiler (provided by config package or target) + if (TARGET protobuf::protoc) + get_target_property(_PROTOBUF_PROTOC protobuf::protoc IMPORTED_LOCATION_RELEASE) + if (NOT _PROTOBUF_PROTOC) + get_target_property(_PROTOBUF_PROTOC protobuf::protoc IMPORTED_LOCATION) + endif () + else () + find_package(protobuf CONFIG REQUIRED) + get_target_property(_PROTOBUF_PROTOC protobuf::protoc IMPORTED_LOCATION_RELEASE) + if (NOT _PROTOBUF_PROTOC) + get_target_property(_PROTOBUF_PROTOC protobuf::protoc IMPORTED_LOCATION) + endif () + endif () + + if (NOT _PROTOBUF_PROTOC) + message(FATAL_ERROR "protoc not found (Protobuf_PROTOC_EXECUTABLE is empty)") + endif () + + # Find grpc_cpp_plugin + if (TARGET grpc_cpp_plugin) + set(_GRPC_CPP_PLUGIN_EXECUTABLE "$") + else () + find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin) + if (NOT _GRPC_CPP_PLUGIN_EXECUTABLE) + message(FATAL_ERROR "grpc_cpp_plugin not found") + endif () + endif () + + # Generate C++ code from cuopt_remote.proto (base message definitions) + set(PROTO_FILE "${CMAKE_CURRENT_SOURCE_DIR}/src/grpc/cuopt_remote.proto") + set(PROTO_SRCS "${CMAKE_CURRENT_BINARY_DIR}/cuopt_remote.pb.cc") + set(PROTO_HDRS "${CMAKE_CURRENT_BINARY_DIR}/cuopt_remote.pb.h") + + add_custom_command( + OUTPUT "${PROTO_SRCS}" "${PROTO_HDRS}" + COMMAND ${_PROTOBUF_PROTOC} + ARGS --cpp_out ${CMAKE_CURRENT_BINARY_DIR} + --proto_path ${CMAKE_CURRENT_SOURCE_DIR}/src/grpc + ${PROTO_FILE} + DEPENDS 
${PROTO_FILE} + COMMENT "Generating C++ code from cuopt_remote.proto" + VERBATIM + ) + + # Generate gRPC service code from cuopt_remote_service.proto + set(GRPC_PROTO_FILE "${CMAKE_CURRENT_SOURCE_DIR}/src/grpc/cuopt_remote_service.proto") + set(GRPC_PROTO_SRCS "${CMAKE_CURRENT_BINARY_DIR}/cuopt_remote_service.pb.cc") + set(GRPC_PROTO_HDRS "${CMAKE_CURRENT_BINARY_DIR}/cuopt_remote_service.pb.h") + set(GRPC_SERVICE_SRCS "${CMAKE_CURRENT_BINARY_DIR}/cuopt_remote_service.grpc.pb.cc") + set(GRPC_SERVICE_HDRS "${CMAKE_CURRENT_BINARY_DIR}/cuopt_remote_service.grpc.pb.h") + + add_custom_command( + OUTPUT "${GRPC_PROTO_SRCS}" "${GRPC_PROTO_HDRS}" "${GRPC_SERVICE_SRCS}" "${GRPC_SERVICE_HDRS}" + COMMAND ${_PROTOBUF_PROTOC} + ARGS --cpp_out ${CMAKE_CURRENT_BINARY_DIR} + --grpc_out ${CMAKE_CURRENT_BINARY_DIR} + --plugin=protoc-gen-grpc=${_GRPC_CPP_PLUGIN_EXECUTABLE} + --proto_path ${CMAKE_CURRENT_SOURCE_DIR}/src/grpc + ${GRPC_PROTO_FILE} + DEPENDS ${GRPC_PROTO_FILE} ${PROTO_FILE} ${PROTO_SRCS} ${PROTO_HDRS} + COMMENT "Generating gRPC C++ code from cuopt_remote_service.proto" + VERBATIM + ) + + message(STATUS "gRPC protobuf code generation configured") + +else () + message(STATUS "gRPC disabled") +endif () + +if (BUILD_TESTS) + include(cmake/thirdparty/get_gtest.cmake) +endif () + +set(CUOPT_SRC_FILES) add_subdirectory(src) if (HOST_LINEINFO) - set_source_files_properties(${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTIES COMPILE_OPTIONS "-g1") -endif() + set_source_files_properties(${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTIES COMPILE_OPTIONS "-g1") +endif () # Apply -UNDEBUG only to solver source files (not gRPC infrastructure). # Must happen before gRPC files are appended to CUOPT_SRC_FILES. # Uses APPEND to preserve any existing per-file options (e.g. -g1 from HOST_LINEINFO). 
-if(DEFINE_ASSERT) - set_property(SOURCE ${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} - APPEND PROPERTY COMPILE_OPTIONS "-UNDEBUG") -endif() - -# Add gRPC mapper files and generated protobuf sources -set(GRPC_INFRA_FILES - ${PROTO_SRCS} - ${GRPC_PROTO_SRCS} - ${GRPC_SERVICE_SRCS} - src/grpc/grpc_problem_mapper.cpp - src/grpc/grpc_solution_mapper.cpp - src/grpc/grpc_settings_mapper.cpp - src/grpc/grpc_service_mapper.cpp - src/grpc/client/grpc_client.cpp - src/grpc/client/solve_remote.cpp -) -list(APPEND CUOPT_SRC_FILES ${GRPC_INFRA_FILES}) - -# Always keep NDEBUG defined for gRPC infrastructure files so that abseil -# headers inline Mutex::Dtor() instead of emitting an external call. -# The conda-forge abseil shared library is built with NDEBUG and does not -# export that symbol (abseil-cpp#1624). Without this, Debug builds fail -# at runtime with "undefined symbol: absl::…::Mutex::Dtor". -set_property(SOURCE ${GRPC_INFRA_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} - APPEND PROPERTY COMPILE_OPTIONS "-DNDEBUG") +if (DEFINE_ASSERT) + set_property(SOURCE ${CUOPT_SRC_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} + APPEND PROPERTY COMPILE_OPTIONS "-UNDEBUG") +endif () + +if (NOT SKIP_GRPC_BUILD) + # Add gRPC mapper files and generated protobuf sources + set(GRPC_INFRA_FILES + ${PROTO_SRCS} + ${GRPC_PROTO_SRCS} + ${GRPC_SERVICE_SRCS} + src/grpc/grpc_problem_mapper.cpp + src/grpc/grpc_solution_mapper.cpp + src/grpc/grpc_settings_mapper.cpp + src/grpc/grpc_service_mapper.cpp + src/grpc/client/grpc_client.cpp + src/grpc/client/solve_remote.cpp + ) + list(APPEND CUOPT_SRC_FILES ${GRPC_INFRA_FILES}) + + # Always keep NDEBUG defined for gRPC infrastructure files so that abseil + # headers inline Mutex::Dtor() instead of emitting an external call. + # The conda-forge abseil shared library is built with NDEBUG and does not + # export that symbol (abseil-cpp#1624). Without this, Debug builds fail + # at runtime with "undefined symbol: absl::…::Mutex::Dtor". 
+ set_property(SOURCE ${GRPC_INFRA_FILES} DIRECTORY ${CMAKE_SOURCE_DIR} + APPEND PROPERTY COMPILE_OPTIONS "-DNDEBUG") +endif (NOT SKIP_GRPC_BUILD) add_library(cuopt SHARED - ${CUOPT_SRC_FILES} + ${CUOPT_SRC_FILES} ) set_target_properties(cuopt - PROPERTIES BUILD_RPATH "\$ORIGIN" - INSTALL_RPATH "\$ORIGIN" - - # set target compile options - CXX_STANDARD 20 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 20 - CUDA_STANDARD_REQUIRED ON - INTERFACE_POSITION_INDEPENDENT_CODE ON - CXX_SCAN_FOR_MODULES OFF + PROPERTIES BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN" + INTERFACE_POSITION_INDEPENDENT_CODE ON + CXX_SCAN_FOR_MODULES OFF ) -target_compile_definitions(cuopt PUBLIC "CUOPT_LOG_ACTIVE_LEVEL=RAPIDS_LOGGER_LOG_LEVEL_${LIBCUOPT_LOGGING_LEVEL}") +target_compile_definitions(cuopt + PUBLIC "CUOPT_LOG_ACTIVE_LEVEL=RAPIDS_LOGGER_LOG_LEVEL_${LIBCUOPT_LOGGING_LEVEL}" + PUBLIC CUSPARSE_ENABLE_EXPERIMENTAL_API +) target_compile_options(cuopt - PRIVATE "$<$:${CUOPT_CXX_FLAGS}>" - "$<$:${CUOPT_CUDA_FLAGS}>" + PRIVATE "$<$:${CUOPT_CXX_FLAGS}>" + "$<$:${CUOPT_CUDA_FLAGS}>" ) -if(WRITE_FATBIN) - file(WRITE "${CUOPT_BINARY_DIR}/fatbin.ld" - [=[ +if (WRITE_FATBIN) + file(WRITE "${CUOPT_BINARY_DIR}/fatbin.ld" + [=[ SECTIONS { .nvFatBinSegment : { *(.nvFatBinSegment) } .nv_fatbin : { *(.nv_fatbin) } } ]=]) - target_link_options(cuopt PRIVATE "${CUOPT_BINARY_DIR}/fatbin.ld") -endif() + target_link_options(cuopt PRIVATE "${CUOPT_BINARY_DIR}/fatbin.ld") +endif () add_library(cuopt::cuopt ALIAS cuopt) # ################################################################################################## @@ -455,29 +468,29 @@ message(STATUS "target include directories CUDSS_INCLUDES = ${CUDSS_INCLUDE}") # Adding Papilo as a system include messes up clang's include resolution if papilo is already installed as a conda package target_include_directories(cuopt PRIVATE - "${papilo_SOURCE_DIR}/src" - "${papilo_BINARY_DIR}" + "${papilo_SOURCE_DIR}/src" + "${papilo_BINARY_DIR}" ) 
target_include_directories(cuopt SYSTEM PRIVATE - "${pslp_SOURCE_DIR}/include" + "${pslp_SOURCE_DIR}/include" ) target_include_directories(cuopt - PRIVATE - "${CMAKE_CURRENT_SOURCE_DIR}/../thirdparty" - "${CMAKE_CURRENT_SOURCE_DIR}/src" - "${CMAKE_CURRENT_SOURCE_DIR}/src/grpc" - "${CMAKE_CURRENT_SOURCE_DIR}/src/grpc/client" - "${CMAKE_CURRENT_BINARY_DIR}" - "${CUDSS_INCLUDE}" - PUBLIC - "$" - "$" - "$" - INTERFACE - "$" - ${CUDSS_INCLUDE} + PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}/../thirdparty" + "${CMAKE_CURRENT_SOURCE_DIR}/src" + "${CMAKE_CURRENT_SOURCE_DIR}/src/grpc" + "${CMAKE_CURRENT_SOURCE_DIR}/src/grpc/client" + "${CMAKE_CURRENT_BINARY_DIR}" + "${CUDSS_INCLUDE}" + PUBLIC + "$" + "$" + "$" + INTERFACE + "$" + ${CUDSS_INCLUDE} ) # Link PSLP by file to avoid export dependency tracking @@ -488,10 +501,10 @@ add_dependencies(cuopt PSLP) # - link libraries -------------------------------------------------------------------------------- set(CUOPT_PRIVATE_CUDA_LIBS - CUDA::curand - CUDA::cusolver - TBB::tbb - OpenMP::OpenMP_CXX) + CUDA::curand + CUDA::cusolver + TBB::tbb + OpenMP::OpenMP_CXX) list(PREPEND CUOPT_PRIVATE_CUDA_LIBS CUDA::cublasLt) @@ -504,19 +517,19 @@ get_filename_component(CUDSS_MT_LIB_FILE_NAME "${CUDSS_MT_LIB_FILE}" NAME) target_compile_definitions(cuopt PRIVATE CUDSS_MT_LIB_FILE_NAME="${CUDSS_MT_LIB_FILE_NAME}") execute_process( - COMMAND git rev-parse --short HEAD - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} - OUTPUT_VARIABLE GIT_COMMIT_HASH - OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND git rev-parse --short HEAD + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE GIT_COMMIT_HASH + OUTPUT_STRIP_TRAILING_WHITESPACE ) message("-- Building with GIT_COMMIT_HASH = '${GIT_COMMIT_HASH}'") # Generate build_info.hpp from template # configure_file() only updates the output if content changes, avoiding unnecessary rebuilds configure_file( - ${CMAKE_CURRENT_SOURCE_DIR}/src/utilities/build_info.hpp.in - 
${CMAKE_CURRENT_BINARY_DIR}/include/utilities/build_info.hpp - @ONLY + ${CMAKE_CURRENT_SOURCE_DIR}/src/utilities/build_info.hpp.in + ${CMAKE_CURRENT_BINARY_DIR}/include/utilities/build_info.hpp + @ONLY ) # Add the generated include directory @@ -524,32 +537,32 @@ target_include_directories(cuopt PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/include) list(JOIN CMAKE_CUDA_ARCHITECTURES "," JOINED_CUDA_ARCHITECTURES) target_compile_definitions(cuopt PUBLIC - CUOPT_CUDA_ARCHITECTURES="${JOINED_CUDA_ARCHITECTURES}" - CUOPT_CPU_ARCHITECTURE="${CMAKE_SYSTEM_PROCESSOR}") + CUOPT_CUDA_ARCHITECTURES="${JOINED_CUDA_ARCHITECTURES}" + CUOPT_CPU_ARCHITECTURE="${CMAKE_SYSTEM_PROCESSOR}") target_link_libraries(cuopt - PUBLIC - CUDA::cublas - CUDA::cusparse - rmm::rmm - rapids_logger::rapids_logger - CCCL::CCCL - raft::raft - cuopt::mps_parser - ${CUDSS_LIB_FILE} - PRIVATE - ${CUOPT_PRIVATE_CUDA_LIBS} - protobuf::libprotobuf - gRPC::grpc++ - ) + PUBLIC + CUDA::cublas + CUDA::cusparse + rmm::rmm + rapids_logger::rapids_logger + CCCL::CCCL + raft::raft + cuopt::mps_parser + ${CUDSS_LIB_FILE} + PRIVATE + ${CUOPT_PRIVATE_CUDA_LIBS} + $<$:protobuf::libprotobuf> + $<$:gRPC::grpc++> +) # ################################################################################################## # - generate tests -------------------------------------------------------------------------------- -if(BUILD_TESTS) - include(CTest) - add_subdirectory(tests) -endif(BUILD_TESTS) +if (BUILD_TESTS) + include(CTest) + add_subdirectory(tests) +endif (BUILD_TESTS) # ################################################################################################## # - install targets ------------------------------------------------------------------------------- @@ -560,46 +573,46 @@ set(CPACK_COMPONENTS_ALL runtime dev) set(CPACK_PACKAGING_INSTALL_PREFIX "/usr/local") #If using cpack to create a deb package -if(CPACK_GENERATOR STREQUAL "DEB") - set(_BIN_DEST "bin") - set(_LIB_DEST "lib") - set(_INCLUDE_DEST "lib/cuopt") 
- -#If building locally use the Default install paths(e.g. for local development or other package types) -else() - set(_BIN_DEST "${CMAKE_INSTALL_BINDIR}") - set(_LIB_DEST "${lib_dir}") - set(_INCLUDE_DEST include/cuopt/) -endif() +if (CPACK_GENERATOR STREQUAL "DEB") + set(_BIN_DEST "bin") + set(_LIB_DEST "lib") + set(_INCLUDE_DEST "lib/cuopt") + + #If building locally use the Default install paths(e.g. for local development or other package types) +else () + set(_BIN_DEST "${CMAKE_INSTALL_BINDIR}") + set(_LIB_DEST "${lib_dir}") + set(_INCLUDE_DEST include/cuopt/) +endif () # adds the .so files to the runtime deb package install(TARGETS cuopt mps_parser - DESTINATION ${_LIB_DEST} - COMPONENT runtime - EXPORT cuopt-exports + DESTINATION ${_LIB_DEST} + COMPONENT runtime + EXPORT cuopt-exports ) # adds the .so files to the development deb package install(TARGETS cuopt mps_parser - DESTINATION ${_LIB_DEST} - COMPONENT dev + DESTINATION ${_LIB_DEST} + COMPONENT dev ) # adds the header files to the development deb package install(DIRECTORY include/cuopt/ - DESTINATION ${_INCLUDE_DEST} - COMPONENT dev + DESTINATION ${_INCLUDE_DEST} + COMPONENT dev ) # adds the version header file to the development deb package install(FILES ${CMAKE_CURRENT_BINARY_DIR}/include/cuopt/version_config.hpp - DESTINATION ${_INCLUDE_DEST} - COMPONENT dev + DESTINATION ${_INCLUDE_DEST} + COMPONENT dev ) # ############################################################################################### # - install export ------------------------------------------------------------------------------- set(doc_string - [=[ + [=[ Provide targets for cuOpt. cuOpt library is a collection of GPU accelerated combinatorial optimization algorithms. 
@@ -607,19 +620,19 @@ cuOpt library is a collection of GPU accelerated combinatorial optimization algo ]=]) rapids_export(INSTALL cuopt - EXPORT_SET cuopt-exports - GLOBAL_TARGETS cuopt - NAMESPACE cuopt:: - DOCUMENTATION doc_string + EXPORT_SET cuopt-exports + GLOBAL_TARGETS cuopt + NAMESPACE cuopt:: + DOCUMENTATION doc_string ) # ############################################################################################### # - build export ------------------------------------------------------------------------------- rapids_export(BUILD cuopt - EXPORT_SET cuopt-exports - GLOBAL_TARGETS cuopt - NAMESPACE cuopt:: - DOCUMENTATION doc_string + EXPORT_SET cuopt-exports + GLOBAL_TARGETS cuopt + NAMESPACE cuopt:: + DOCUMENTATION doc_string ) # ################################################################################################## @@ -630,201 +643,201 @@ rapids_export(BUILD cuopt # doc targets for cuOpt find_package(Doxygen) -if(Doxygen_FOUND) - add_custom_command(OUTPUT CUOPT_DOXYGEN - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/doxygen - COMMAND doxygen Doxyfile - VERBATIM) +if (Doxygen_FOUND) + add_custom_command(OUTPUT CUOPT_DOXYGEN + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/doxygen + COMMAND doxygen Doxyfile + VERBATIM) - add_custom_target(docs_cuopt DEPENDS CUOPT_DOXYGEN) -endif() + add_custom_target(docs_cuopt DEPENDS CUOPT_DOXYGEN) +endif () rapids_cpm_find( - argparse 3.2.0 - GLOBAL_TARGETS argparse::argparse - CPM_ARGS - GIT_REPOSITORY https://github.com/p-ranav/argparse.git - GIT_TAG v3.2 - GIT_SHALLOW TRUE -) - -if(NOT BUILD_LP_ONLY) -add_executable(cuopt_cli cuopt_cli.cpp) - -set_target_properties(cuopt_cli - PROPERTIES - CXX_STANDARD 20 - CXX_STANDARD_REQUIRED ON - CXX_SCAN_FOR_MODULES OFF -) - -target_compile_options(cuopt_cli - PRIVATE "$<$:${CUOPT_CXX_FLAGS}>" - "$<$:${CUOPT_CUDA_FLAGS}>" -) - -target_include_directories(cuopt_cli - PRIVATE - "${CMAKE_CURRENT_SOURCE_DIR}/src" - PUBLIC - "$" - "$" - ${CUDSS_INCLUDE} - "$" -) - 
-target_link_libraries(cuopt_cli - PUBLIC - cuopt - OpenMP::OpenMP_CXX - ${CUDSS_LIBRARIES} - TBB::tbb - PRIVATE - argparse::argparse -) - # Use RUNPATH when building locally in order to allow LD_LIBRARY_PATH to override the conda env path -if(NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "") - target_link_options(cuopt_cli PRIVATE -Wl,--enable-new-dtags) -endif() -set_property(TARGET cuopt_cli PROPERTY INSTALL_RPATH "$ORIGIN/../${lib_dir}") - -# adds the cuopt_cli executable to the runtime deb package -install(TARGETS cuopt_cli - COMPONENT runtime - RUNTIME DESTINATION ${_BIN_DEST} -) -endif() + argparse 3.2.0 + GLOBAL_TARGETS argparse::argparse + CPM_ARGS + GIT_REPOSITORY https://github.com/p-ranav/argparse.git + GIT_TAG v3.2 + GIT_SHALLOW TRUE +) + +if (NOT BUILD_LP_ONLY) + add_executable(cuopt_cli cuopt_cli.cpp) + + # PIE executable: auditwheel/patchelf expands .dynstr/RPATH when repairing wheels; non-PIE + # (ET_EXEC) binaries are prone to corrupt segment layout. PIE (ET_DYN) survives RPATH edits. 
+ set_target_properties(cuopt_cli + PROPERTIES + CXX_SCAN_FOR_MODULES OFF + POSITION_INDEPENDENT_CODE ON + ) + + target_compile_options(cuopt_cli + PRIVATE "$<$:${CUOPT_CXX_FLAGS}>" + "$<$:${CUOPT_CUDA_FLAGS}>" + ) + + target_link_options(cuopt_cli PRIVATE -pie) + + target_include_directories(cuopt_cli + PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}/src" + PUBLIC + "$" + "$" + ${CUDSS_INCLUDE} + "$" + ) + + target_link_libraries(cuopt_cli + PUBLIC + cuopt + OpenMP::OpenMP_CXX + ${CUDSS_LIBRARIES} + TBB::tbb + PRIVATE + argparse::argparse + ) + # Use RUNPATH when building locally in order to allow LD_LIBRARY_PATH to override the conda env path + if (NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "") + target_link_options(cuopt_cli PRIVATE -Wl,--enable-new-dtags) + endif () + set_property(TARGET cuopt_cli PROPERTY INSTALL_RPATH "$ORIGIN/../${lib_dir}") + + # adds the cuopt_cli executable to the runtime deb package + install(TARGETS cuopt_cli + COMPONENT runtime + RUNTIME DESTINATION ${_BIN_DEST} + ) +endif () option(BUILD_MIP_BENCHMARKS "Build MIP benchmarks" OFF) -if(BUILD_MIP_BENCHMARKS AND NOT BUILD_LP_ONLY) - add_executable(solve_MIP ../benchmarks/linear_programming/cuopt/run_mip.cpp) - target_include_directories(solve_MIP - PRIVATE - "${CMAKE_CURRENT_SOURCE_DIR}/src" - PUBLIC - "$" - ) - - set_target_properties(solve_MIP - PROPERTIES - CXX_STANDARD 20 - CXX_STANDARD_REQUIRED ON - CXX_SCAN_FOR_MODULES OFF - ) - - target_compile_options(solve_MIP - PRIVATE "$<$:${CUOPT_CXX_FLAGS}>" - "$<$:${CUOPT_CUDA_FLAGS}>" - ) - target_link_libraries(solve_MIP - PUBLIC - cuopt - OpenMP::OpenMP_CXX - PRIVATE - ) - if(NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "") - target_link_options(solve_MIP PRIVATE -Wl,--enable-new-dtags) - endif() - - target_include_directories(solve_MIP - PRIVATE - "${CMAKE_CURRENT_SOURCE_DIR}/src" - ) - -endif() +if (BUILD_MIP_BENCHMARKS AND NOT BUILD_LP_ONLY) + add_executable(solve_MIP 
../benchmarks/linear_programming/cuopt/run_mip.cpp) + target_include_directories(solve_MIP + PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}/src" + PUBLIC + "$" + ) + + set_target_properties(solve_MIP + PROPERTIES + CXX_SCAN_FOR_MODULES OFF + ) + + target_compile_options(solve_MIP + PRIVATE "$<$:${CUOPT_CXX_FLAGS}>" + "$<$:${CUOPT_CUDA_FLAGS}>" + ) + target_link_libraries(solve_MIP + PUBLIC + cuopt + OpenMP::OpenMP_CXX + PRIVATE + ) + if (NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "") + target_link_options(solve_MIP PRIVATE -Wl,--enable-new-dtags) + endif () + + target_include_directories(solve_MIP + PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}/src" + ) + +endif () option(BUILD_LP_BENCHMARKS "Build LP benchmarks" OFF) -if(BUILD_LP_BENCHMARKS) - add_executable(solve_LP ../benchmarks/linear_programming/cuopt/run_pdlp.cu) - - set_target_properties(solve_LP - PROPERTIES - CXX_STANDARD 20 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 20 - CUDA_STANDARD_REQUIRED ON - CXX_SCAN_FOR_MODULES OFF - ) - - target_compile_options(solve_LP - PRIVATE "$<$:${CUOPT_CXX_FLAGS}>" - "$<$:${CUOPT_CUDA_FLAGS}>" - ) - target_link_libraries(solve_LP - PUBLIC - cuopt - OpenMP::OpenMP_CXX - PRIVATE - ) - if(NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "") - target_link_options(solve_LP PRIVATE -Wl,--enable-new-dtags) - endif() -endif() +if (BUILD_LP_BENCHMARKS) + add_executable(solve_LP ../benchmarks/linear_programming/cuopt/run_pdlp.cu) + + set_target_properties(solve_LP + PROPERTIES + CXX_SCAN_FOR_MODULES OFF + ) + + target_compile_options(solve_LP + PRIVATE "$<$:${CUOPT_CXX_FLAGS}>" + "$<$:${CUOPT_CUDA_FLAGS}>" + ) + target_link_libraries(solve_LP + PUBLIC + cuopt + OpenMP::OpenMP_CXX + PRIVATE + ) + if (NOT DEFINED INSTALL_TARGET OR "${INSTALL_TARGET}" STREQUAL "") + target_link_options(solve_LP PRIVATE -Wl,--enable-new-dtags) + endif () +endif () # ################################################################################################## # - cuopt_grpc_server - 
gRPC-based remote server -------------------------------------------------- -add_executable(cuopt_grpc_server - src/grpc/server/grpc_server_main.cpp - src/grpc/server/grpc_server_logger.cpp - src/grpc/server/grpc_worker.cpp - src/grpc/server/grpc_worker_infra.cpp - src/grpc/server/grpc_server_threads.cpp - src/grpc/server/grpc_pipe_io.cpp - src/grpc/server/grpc_job_management.cpp - src/grpc/server/grpc_service_impl.cpp -) - -set_target_properties(cuopt_grpc_server - PROPERTIES - CXX_STANDARD 20 - CXX_STANDARD_REQUIRED ON - CXX_SCAN_FOR_MODULES OFF -) - -target_compile_options(cuopt_grpc_server - PRIVATE "$<$:${CUOPT_CXX_FLAGS}>" -) - -target_include_directories(cuopt_grpc_server - PRIVATE - "${CMAKE_CURRENT_SOURCE_DIR}/src" - "${CMAKE_CURRENT_SOURCE_DIR}/src/grpc" - "${CMAKE_CURRENT_SOURCE_DIR}/src/grpc/server" - "${CMAKE_CURRENT_SOURCE_DIR}/include" - "${CMAKE_CURRENT_SOURCE_DIR}/libmps_parser/include" - "${CMAKE_CURRENT_BINARY_DIR}" - PUBLIC - "$" - "$" -) - -find_library(UUID_LIBRARY uuid REQUIRED) - -target_link_libraries(cuopt_grpc_server - PUBLIC - cuopt - OpenMP::OpenMP_CXX - PRIVATE - protobuf::libprotobuf - gRPC::grpc++ - ${UUID_LIBRARY} - argparse::argparse -) - -# Use RUNPATH when building locally -target_link_options(cuopt_grpc_server PRIVATE -Wl,--enable-new-dtags) -set_property(TARGET cuopt_grpc_server PROPERTY INSTALL_RPATH "$ORIGIN/../${lib_dir}") - -# Install the grpc server executable -install(TARGETS cuopt_grpc_server - COMPONENT runtime - RUNTIME DESTINATION ${_BIN_DEST} -) - -message(STATUS "Building cuopt_grpc_server (gRPC-based remote solve server)") +if (NOT SKIP_GRPC_BUILD) + add_executable(cuopt_grpc_server + src/grpc/server/grpc_server_main.cpp + src/grpc/server/grpc_server_logger.cpp + src/grpc/server/grpc_worker.cpp + src/grpc/server/grpc_worker_infra.cpp + src/grpc/server/grpc_server_threads.cpp + src/grpc/server/grpc_pipe_io.cpp + src/grpc/server/grpc_job_management.cpp + src/grpc/server/grpc_service_impl.cpp + ) + + 
set_target_properties(cuopt_grpc_server + PROPERTIES + CXX_SCAN_FOR_MODULES OFF + POSITION_INDEPENDENT_CODE ON + ) + + target_compile_options(cuopt_grpc_server + PRIVATE "$<$:${CUOPT_CXX_FLAGS}>" + ) + + target_link_options(cuopt_grpc_server PRIVATE -pie) + + target_include_directories(cuopt_grpc_server + PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}/src" + "${CMAKE_CURRENT_SOURCE_DIR}/src/grpc" + "${CMAKE_CURRENT_SOURCE_DIR}/src/grpc/server" + "${CMAKE_CURRENT_SOURCE_DIR}/include" + "${CMAKE_CURRENT_SOURCE_DIR}/libmps_parser/include" + "${CMAKE_CURRENT_BINARY_DIR}" + PUBLIC + "$" + "$" + ) + + find_library(UUID_LIBRARY uuid REQUIRED) + + target_link_libraries(cuopt_grpc_server + PUBLIC + cuopt + OpenMP::OpenMP_CXX + PRIVATE + protobuf::libprotobuf + gRPC::grpc++ + ${UUID_LIBRARY} + argparse::argparse + ) + + # Use RUNPATH when building locally + target_link_options(cuopt_grpc_server PRIVATE -Wl,--enable-new-dtags) + set_property(TARGET cuopt_grpc_server PROPERTY INSTALL_RPATH "$ORIGIN/../${lib_dir}") + + # Install the grpc server executable + install(TARGETS cuopt_grpc_server + COMPONENT runtime + RUNTIME DESTINATION ${_BIN_DEST} + ) + + message(STATUS "Building cuopt_grpc_server (gRPC-based remote solve server)") +endif (NOT SKIP_GRPC_BUILD) # ################################################################################################## # - CPack has to be the last item in the cmake file------------------------------------------------- diff --git a/cpp/cuopt_cli.cpp b/cpp/cuopt_cli.cpp index ac568e07cf..4552c1fef1 100644 --- a/cpp/cuopt_cli.cpp +++ b/cpp/cuopt_cli.cpp @@ -135,7 +135,6 @@ int run_single_file(const std::string& file_path, std::make_unique>(); } - // Populate the problem from MPS data model cuopt::linear_programming::populate_from_mps_data_model(problem_interface.get(), mps_data_model); const bool is_mip = (problem_interface->get_problem_category() == @@ -415,15 +414,16 @@ int main(int argc, char* argv[]) // Only initialize CUDA resources if using GPU 
memory backend (not remote execution) auto memory_backend = cuopt::linear_programming::get_memory_backend_type(); - std::vector> memory_resources; + std::vector memory_resources; if (memory_backend == cuopt::linear_programming::memory_backend_t::GPU) { const int num_gpus = settings.get_parameter(CUOPT_NUM_GPUS); + memory_resources.reserve(std::min(raft::device_setter::get_device_count(), num_gpus)); for (int i = 0; i < std::min(raft::device_setter::get_device_count(), num_gpus); ++i) { RAFT_CUDA_TRY(cudaSetDevice(i)); - memory_resources.push_back(make_async()); - rmm::mr::set_per_device_resource(rmm::cuda_device_id{i}, memory_resources.back().get()); + memory_resources.emplace_back(); + rmm::mr::set_per_device_resource(rmm::cuda_device_id{i}, memory_resources.back()); } RAFT_CUDA_TRY(cudaSetDevice(0)); } diff --git a/cpp/include/cuopt/error.hpp b/cpp/include/cuopt/error.hpp index 9dd547adbb..9a8f62a428 100644 --- a/cpp/include/cuopt/error.hpp +++ b/cpp/include/cuopt/error.hpp @@ -100,9 +100,7 @@ inline void cuopt_expects(bool cond, error_type_t error_type, const char* fmt, . 
if (not cond) { va_list args; va_start(args, fmt); - char msg[2048]; - va_start(args, fmt); vsnprintf(msg, sizeof(msg), fmt, args); va_end(args); diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h index 06eacb3408..b251b3eaba 100644 --- a/cpp/include/cuopt/linear_programming/constants.h +++ b/cpp/include/cuopt/linear_programming/constants.h @@ -74,13 +74,14 @@ #define CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING "mip_batch_pdlp_reliability_branching" #define CUOPT_MIP_STRONG_BRANCHING_SIMPLEX_ITERATION_LIMIT \ "mip_strong_branching_simplex_iteration_limit" -#define CUOPT_SOLUTION_FILE "solution_file" -#define CUOPT_NUM_CPU_THREADS "num_cpu_threads" -#define CUOPT_NUM_GPUS "num_gpus" -#define CUOPT_USER_PROBLEM_FILE "user_problem_file" -#define CUOPT_PRESOLVE_FILE "presolve_file" -#define CUOPT_RANDOM_SEED "random_seed" -#define CUOPT_PDLP_PRECISION "pdlp_precision" +#define CUOPT_SOLUTION_FILE "solution_file" +#define CUOPT_NUM_CPU_THREADS "num_cpu_threads" +#define CUOPT_NUM_GPUS "num_gpus" +#define CUOPT_USER_PROBLEM_FILE "user_problem_file" +#define CUOPT_PRESOLVE_FILE "presolve_file" +#define CUOPT_RANDOM_SEED "random_seed" +#define CUOPT_PDLP_PRECISION "pdlp_precision" +#define CUOPT_MIP_SEMICONTINUOUS_BIG_M "mip_semi_continuous_big_m" #define CUOPT_MIP_HYPER_HEURISTIC_POPULATION_SIZE "mip_hyper_heuristic_population_size" #define CUOPT_MIP_HYPER_HEURISTIC_NUM_CPUFJ_THREADS "mip_hyper_heuristic_num_cpufj_threads" diff --git a/cpp/include/cuopt/linear_programming/cpu_optimization_problem.hpp b/cpp/include/cuopt/linear_programming/cpu_optimization_problem.hpp index 009a8ce84e..48d61b9e0c 100644 --- a/cpp/include/cuopt/linear_programming/cpu_optimization_problem.hpp +++ b/cpp/include/cuopt/linear_programming/cpu_optimization_problem.hpp @@ -41,6 +41,8 @@ class mip_solution_interface_t; template class cpu_optimization_problem_t : public optimization_problem_interface_t { public: + using typename 
optimization_problem_interface_t::quadratic_constraint_t; + cpu_optimization_problem_t(); // Setters @@ -113,6 +115,10 @@ class cpu_optimization_problem_t : public optimization_problem_interface_t& get_quadratic_objective_values() const override; bool has_quadratic_objective() const override; + void set_quadratic_constraints(std::vector constraints) override; + bool has_quadratic_constraints() const override; + const std::vector& get_quadratic_constraints() const override; + // Host getters - these are the only supported getters for CPU implementation std::vector get_constraint_matrix_values_host() const override; std::vector get_constraint_matrix_indices_host() const override; @@ -185,6 +191,8 @@ class cpu_optimization_problem_t : public optimization_problem_interface_t Q_indices_; std::vector Q_values_; + std::vector quadratic_constraints_{}; + std::vector variable_lower_bounds_; std::vector variable_upper_bounds_; std::vector constraint_lower_bounds_; diff --git a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp index 14c4d227bc..ae0187e454 100644 --- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp @@ -7,6 +7,7 @@ #pragma once +#include #include #include @@ -31,6 +32,14 @@ struct benchmark_info_t { template class solver_settings_t; +template +class mip_solver_settings_t; + +namespace detail { +template +struct mip_solver_settings_accessor; +} // namespace detail + template class mip_solver_settings_t { public: @@ -86,6 +95,7 @@ class mip_solver_settings_t { f_t time_limit = std::numeric_limits::infinity(); f_t work_limit = std::numeric_limits::infinity(); + f_t semi_continuous_big_m = f_t(1e10); i_t node_limit = std::numeric_limits::max(); bool heuristics_only = false; i_t reliability_branching = -1; @@ -145,8 +155,49 @@ class mip_solver_settings_t { private: std::vector mip_callbacks_; + std::optional 
semi_continuous_original_num_variables_; + std::vector semi_continuous_binary_to_original_indices_; friend class solver_settings_t; + friend struct detail::mip_solver_settings_accessor; }; +namespace detail { + +template +struct mip_solver_settings_accessor { + static void clear_mip_callbacks(mip_solver_settings_t& settings) + { + settings.mip_callbacks_.clear(); + } + + static void set_semi_continuous_callback_translation(mip_solver_settings_t& settings, + i_t original_num_variables, + std::vector binary_to_original_indices) + { + settings.semi_continuous_original_num_variables_ = original_num_variables; + settings.semi_continuous_binary_to_original_indices_ = std::move(binary_to_original_indices); + } + + static bool has_semi_continuous_callback_translation( + const mip_solver_settings_t& settings) + { + return settings.semi_continuous_original_num_variables_.has_value(); + } + + static i_t get_semi_continuous_original_num_variables( + const mip_solver_settings_t& settings) + { + return settings.semi_continuous_original_num_variables_.value_or(0); + } + + static const std::vector& get_semi_continuous_binary_to_original_indices( + const mip_solver_settings_t& settings) + { + return settings.semi_continuous_binary_to_original_indices_; + } +}; + +} // namespace detail + } // namespace cuopt::linear_programming diff --git a/cpp/include/cuopt/linear_programming/optimization_problem.hpp b/cpp/include/cuopt/linear_programming/optimization_problem.hpp index df78dd17c7..a61118aa1c 100644 --- a/cpp/include/cuopt/linear_programming/optimization_problem.hpp +++ b/cpp/include/cuopt/linear_programming/optimization_problem.hpp @@ -72,6 +72,9 @@ class optimization_problem_t : public optimization_problem_interface_t static_assert(std::is_floating_point::value, "'optimization_problem_t' accepts only floating point types for weights"); + // nvcc does not always find base typedefs in derived class scope; inject explicitly. 
+ using typename optimization_problem_interface_t::quadratic_constraint_t; + /** * @brief A device-side view of the `optimization_problem_t` structure with * the RAII stuffs stripped out, to make it easy to work inside kernels @@ -177,6 +180,16 @@ class optimization_problem_t : public optimization_problem_interface_t */ void set_objective_offset(f_t objective_offset) override; + /** + * @brief Set per-climber objective offsets for batch PDLP. + * + * When non-empty, the size must match the fixed_batch_size that will be used for batch PDLP. + * Empty means the scalar `objective_offset_` is replicated across climbers (default behavior). + * + * @param[in] offsets Host-side vector of per-climber offsets. + */ + void set_batch_objective_offsets(const std::vector& offsets); + /** * @brief Set the quadratic objective matrix (Q) in CSR format. * @note Used for quadratic programming: objective is x^T * Q * x + c^T * x @@ -196,6 +209,8 @@ class optimization_problem_t : public optimization_problem_interface_t i_t size_offsets, bool validate_positive_semi_definite = false) override; + void set_quadratic_constraints(std::vector constraints) override; + /** @copydoc optimization_problem_interface_t::set_variable_lower_bounds */ void set_variable_lower_bounds(const f_t* variable_lower_bounds, i_t size) override; /** @copydoc optimization_problem_interface_t::set_variable_upper_bounds */ @@ -239,6 +254,11 @@ class optimization_problem_t : public optimization_problem_interface_t rmm::device_uvector& get_objective_coefficients() override; f_t get_objective_scaling_factor() const override; f_t get_objective_offset() const override; + /** + * @brief Get the per-climber objective offsets host vector. Size 0 means none were set. 
+ */ + const std::vector& get_batch_objective_offsets() const noexcept; + std::vector& get_batch_objective_offsets() noexcept; const rmm::device_uvector& get_variable_lower_bounds() const override; rmm::device_uvector& get_variable_lower_bounds() override; const rmm::device_uvector& get_variable_upper_bounds() const override; @@ -259,7 +279,9 @@ class optimization_problem_t : public optimization_problem_interface_t const std::vector& get_quadratic_objective_offsets() const override; const std::vector& get_quadratic_objective_indices() const override; const std::vector& get_quadratic_objective_values() const override; + const std::vector& get_quadratic_constraints() const override; bool has_quadratic_objective() const override; + bool has_quadratic_constraints() const override; // ============================================================================ // Host getters @@ -371,11 +393,17 @@ class optimization_problem_t : public optimization_problem_interface_t rmm::device_uvector c_; f_t objective_scaling_factor_{1}; f_t objective_offset_{0}; + // Per-climber objective offsets for batch PDLP. Empty means the scalar `objective_offset_` is + // replicated across climbers (default behavior). 
+ std::vector batch_objective_offsets_{}; std::vector Q_offsets_; std::vector Q_indices_; std::vector Q_values_; + /** QCQP: quadratic constraints **/ + std::vector quadratic_constraints_{}; + rmm::device_uvector variable_lower_bounds_; rmm::device_uvector variable_upper_bounds_; rmm::device_uvector constraint_lower_bounds_; diff --git a/cpp/include/cuopt/linear_programming/optimization_problem_interface.hpp b/cpp/include/cuopt/linear_programming/optimization_problem_interface.hpp index 767e62e746..aa164ca756 100644 --- a/cpp/include/cuopt/linear_programming/optimization_problem_interface.hpp +++ b/cpp/include/cuopt/linear_programming/optimization_problem_interface.hpp @@ -20,7 +20,7 @@ namespace cuopt::linear_programming { -enum class var_t { CONTINUOUS = 0, INTEGER }; +enum class var_t { CONTINUOUS = 0, INTEGER, SEMI_CONTINUOUS }; enum class problem_category_t : int8_t { LP = 0, MIP = 1, IP = 2 }; template @@ -56,8 +56,52 @@ class optimization_problem_interface_t { static_assert(std::is_floating_point::value, "'optimization_problem_interface_t' accepts only floating point types for weights"); + /** Quadratic constraint bundle used by core optimization problem interfaces. */ + struct quadratic_constraint_t { + i_t constraint_row_index{}; + std::string constraint_row_name{}; + char constraint_row_type{}; + std::vector linear_values{}; + std::vector linear_indices{}; + f_t rhs_value{f_t(0)}; + std::vector quadratic_values{}; + std::vector quadratic_indices{}; + std::vector quadratic_offsets{}; + }; + virtual ~optimization_problem_interface_t() = default; + /** + * @brief Store quadratic constraints for MPS round-trip (linear + Q parts per QC row). 
+ */ + virtual void set_quadratic_constraints(std::vector constraints) = 0; + template >> + void set_quadratic_constraints(const std::vector& constraints) + { + std::vector converted_constraints; + converted_constraints.reserve(constraints.size()); + for (const auto& qc : constraints) { + converted_constraints.push_back( + {static_cast(qc.constraint_row_index), + qc.constraint_row_name, + qc.constraint_row_type, + std::vector(qc.linear_values.begin(), qc.linear_values.end()), + std::vector(qc.linear_indices.begin(), qc.linear_indices.end()), + static_cast(qc.rhs_value), + std::vector(qc.quadratic_values.begin(), qc.quadratic_values.end()), + std::vector(qc.quadratic_indices.begin(), qc.quadratic_indices.end()), + std::vector(qc.quadratic_offsets.begin(), qc.quadratic_offsets.end())}); + } + set_quadratic_constraints(std::move(converted_constraints)); + } + + /** @brief Whether quadratic constraint metadata is present (for MPS export). */ + virtual bool has_quadratic_constraints() const = 0; + + /** @brief Quadratic constraints for MPS export (empty if none). 
*/ + virtual const std::vector& get_quadratic_constraints() const = 0; + // ============================================================================ // Setters (accept both CPU and GPU pointers) // ============================================================================ diff --git a/cpp/include/cuopt/linear_programming/optimization_problem_utils.hpp b/cpp/include/cuopt/linear_programming/optimization_problem_utils.hpp index 90e853f530..1adffb1603 100644 --- a/cpp/include/cuopt/linear_programming/optimization_problem_utils.hpp +++ b/cpp/include/cuopt/linear_programming/optimization_problem_utils.hpp @@ -16,6 +16,17 @@ namespace cuopt::linear_programming { +namespace detail { + +inline constexpr var_t char_to_var_type(char variable_type) +{ + if (variable_type == 'I' || variable_type == 'B') { return var_t::INTEGER; } + if (variable_type == 'S') { return var_t::SEMI_CONTINUOUS; } + return var_t::CONTINUOUS; +} + +} // namespace detail + /** * @brief Helper function to populate optimization_problem_interface_t from mps_data_model_t * @@ -87,9 +98,7 @@ void populate_from_mps_data_model(optimization_problem_interface_t* pr if (!char_variable_types.empty()) { std::vector enum_variable_types(char_variable_types.size()); for (size_t i = 0; i < char_variable_types.size(); ++i) { - enum_variable_types[i] = (char_variable_types[i] == 'I' || char_variable_types[i] == 'B') - ? 
var_t::INTEGER - : var_t::CONTINUOUS; + enum_variable_types[i] = detail::char_to_var_type(char_variable_types[i]); } problem->set_variable_types(enum_variable_types.data(), enum_variable_types.size()); // Problem category (LP/MIP/IP) is auto-detected by set_variable_types @@ -109,6 +118,10 @@ void populate_from_mps_data_model(optimization_problem_interface_t* pr q_offsets.data(), n_vars + 1); } + // Handle quadratic constraints if present + if (data_model.has_quadratic_constraints()) { + problem->set_quadratic_constraints(data_model.get_quadratic_constraints()); + } } /** @@ -252,9 +265,7 @@ void populate_from_data_model_view(optimization_problem_interface_t* p data_model->get_variable_types().data(), data_model->get_variable_types().data() + data_model->get_variable_types().size(), enum_variable_types.begin(), - [](const auto val) -> var_t { - return (val == 'I' || val == 'B') ? var_t::INTEGER : var_t::CONTINUOUS; - }); + detail::char_to_var_type); problem->set_variable_types(enum_variable_types.data(), enum_variable_types.size()); // Problem category (LP/MIP/IP) is auto-detected by set_variable_types } @@ -266,6 +277,10 @@ void populate_from_data_model_view(optimization_problem_interface_t* p if (data_model->get_row_names().size() != 0) { problem->set_row_names(data_model->get_row_names()); } + + if (data_model->has_quadratic_constraints()) { + problem->set_quadratic_constraints(data_model->get_quadratic_constraints()); + } } } // namespace cuopt::linear_programming diff --git a/cpp/include/cuopt/linear_programming/pdlp/pdlp_warm_start_data.hpp b/cpp/include/cuopt/linear_programming/pdlp/pdlp_warm_start_data.hpp index 363e416627..1f241463ac 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/pdlp_warm_start_data.hpp +++ b/cpp/include/cuopt/linear_programming/pdlp/pdlp_warm_start_data.hpp @@ -9,7 +9,7 @@ #include -#include +#include namespace cuopt::linear_programming { @@ -80,15 +80,15 @@ struct pdlp_warm_start_data_t { template struct 
pdlp_warm_start_data_view_t { - cuopt::mps_parser::span current_primal_solution_; - cuopt::mps_parser::span current_dual_solution_; - cuopt::mps_parser::span initial_primal_average_; - cuopt::mps_parser::span initial_dual_average_; - cuopt::mps_parser::span current_ATY_; - cuopt::mps_parser::span sum_primal_solutions_; - cuopt::mps_parser::span sum_dual_solutions_; - cuopt::mps_parser::span last_restart_duality_gap_primal_solution_; - cuopt::mps_parser::span last_restart_duality_gap_dual_solution_; + std::span current_primal_solution_; + std::span current_dual_solution_; + std::span initial_primal_average_; + std::span initial_dual_average_; + std::span current_ATY_; + std::span sum_primal_solutions_; + std::span sum_dual_solutions_; + std::span last_restart_duality_gap_primal_solution_; + std::span last_restart_duality_gap_dual_solution_; f_t initial_primal_weight_{-1}; f_t initial_step_size_{-1}; i_t total_pdlp_iterations_{-1}; diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp index bcf5a736f0..a1cb787f09 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp @@ -17,6 +17,7 @@ #include #include +#include #include @@ -282,7 +283,25 @@ class pdlp_solver_settings_t { bool eliminate_dense_columns{true}; pdlp_precision_t pdlp_precision{pdlp_precision_t::DefaultPrecision}; bool save_best_primal_so_far{false}; + /** + * @brief Stop the solver as soon as a primal feasible iterate is encountered. + * + * In non-batch mode the solver returns the first primal feasible iterate (without waiting for + * optimality / dual feasibility). In batch mode the whole batch stops the moment any climber + * reaches primal feasibility; every climber returns its current iterate with its current + * termination status. Can be composed with `per_constraint_residual`. + * Mutually exclusive with `all_primal_feasible`. 
+ */ bool first_primal_feasible{false}; + /** + * @brief Batch-only: stop only once every climber has reached (at least) primal feasibility. + * + * Each climber is individually ejected from the batch the first time it becomes primal + * feasible and its per-climber solution is captured. The solver returns when all climbers + * have been captured. Setting this in non-batch mode is a validation error. Setting it + * together with `first_primal_feasible` is a validation error. + */ + bool all_primal_feasible{false}; presolver_t presolver{presolver_t::Default}; bool dual_postsolve{true}; int num_gpus{1}; @@ -294,18 +313,17 @@ class pdlp_solver_settings_t { cuda::std::span> shared_sb_solved; static constexpr f_t minimal_absolute_tolerance = 1.0e-12; pdlp_hyper_params::pdlp_hyper_params_t hyper_params; - // Holds the information of new variable lower and upper bounds for each climber in the format: - // (variable index, new lower bound, new upper bound) - // For each entry in the vector, a new version of the problem (climber) will be solved - // concurrently i.e. if new_bounds.size() == 2, then 2 versions of the problem with updated bounds - // will be solved concurrently - std::vector> new_bounds; + // Holds per-climber variable-bound overrides in the format: + // (climber id, variable index, new lower bound, new upper bound). + // Per-climber objective coefficients / offsets / constraint bounds must be pre-expanded directly + // on the optimization_problem_t instead. 
+ std::vector> new_bounds; // By default to save memory and speed we don't store and copy each climber's primal and dual // solutions We only retrieve termination statistics and the objective values bool generate_batch_primal_dual_solution{false}; // Used to force batch PDLP to solve a subbatch of the problems at a time // The 0 default value will make the solver use its heuristic to determine the subbatch size - i_t sub_batch_size{0}; + i_t fixed_batch_size{0}; private: /** Initial primal solution */ diff --git a/cpp/include/cuopt/routing/data_model_view.hpp b/cpp/include/cuopt/routing/data_model_view.hpp index dda9e3be1c..df4ef20156 100644 --- a/cpp/include/cuopt/routing/data_model_view.hpp +++ b/cpp/include/cuopt/routing/data_model_view.hpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -265,8 +265,12 @@ class data_model_view_t { * list of orders * @param norders number of customer orders that are served by this * vehicle + * @param[in] validate_input runs expensive input checks. Defaults to true. */ - void add_vehicle_order_match(const i_t vehicle_id, i_t const* orders, const i_t norders); + void add_vehicle_order_match(const i_t vehicle_id, + i_t const* orders, + const i_t norders, + bool validate_input = true); /** * @brief Control if a specified order should only serve a subset of vehicles @@ -275,8 +279,12 @@ class data_model_view_t { * @param vehicles device memory pointer to integer values corresponding to * list of vehicles * @param nvehicles number of vehicles that can serve this order + * @param[in] validate_input runs expensive input checks. Defaults to true. 
*/ - void add_order_vehicle_match(const i_t order_id, i_t const* vehicles, const i_t nvehicles); + void add_order_vehicle_match(const i_t order_id, + i_t const* vehicles, + const i_t nvehicles, + bool validate_input = true); /** * @brief In fully heterogenous fleet mode, vehicle can take different amount diff --git a/cpp/libmps_parser/CMakeLists.txt b/cpp/libmps_parser/CMakeLists.txt index 427d4ac17b..172b419452 100644 --- a/cpp/libmps_parser/CMakeLists.txt +++ b/cpp/libmps_parser/CMakeLists.txt @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 # cmake-format: on -cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) +cmake_minimum_required(VERSION 4.0 FATAL_ERROR) include(../../cmake/rapids_config.cmake) include(rapids-cmake) include(rapids-cpm) @@ -16,6 +16,9 @@ project( LANGUAGES CXX ) +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + # Disable C++20 module scanning as the codebase doesn't use modules set(CMAKE_CXX_SCAN_FOR_MODULES OFF CACHE BOOL "Disable C++20 module scanning") @@ -80,10 +83,6 @@ add_library(mps_parser SHARED set_target_properties(mps_parser PROPERTIES BUILD_RPATH "\$ORIGIN" INSTALL_RPATH "\$ORIGIN" - - # set target compile options - CXX_STANDARD 20 - CXX_STANDARD_REQUIRED ON INTERFACE_POSITION_INDEPENDENT_CODE ON CXX_SCAN_FOR_MODULES OFF ) @@ -136,8 +135,9 @@ endif(BUILD_TESTS) # ################################################################################################## # * mps_parser Install ---------------------------------------------------------------------------- +rapids_cmake_install_lib_dir(mps_parser_lib_dir) install(TARGETS mps_parser - DESTINATION lib + DESTINATION ${mps_parser_lib_dir} EXPORT mps-parser-exports) install(DIRECTORY include/mps_parser/ diff --git a/cpp/libmps_parser/include/mps_parser/data_model_view.hpp b/cpp/libmps_parser/include/mps_parser/data_model_view.hpp index c2a8f84980..04ed4d6b7c 100644 --- a/cpp/libmps_parser/include/mps_parser/data_model_view.hpp +++ 
b/cpp/libmps_parser/include/mps_parser/data_model_view.hpp @@ -7,9 +7,10 @@ #pragma once -#include +#include #include +#include #include #include #include @@ -268,33 +269,33 @@ class data_model_view_t { /** * @brief Get the CSR constraint matrix values * - * @return span + * @return std::span */ - span get_constraint_matrix_values() const noexcept; + std::span get_constraint_matrix_values() const noexcept; /** * @brief Get the CSR constraint matrix indices * - * @return span + * @return std::span */ - span get_constraint_matrix_indices() const noexcept; + std::span get_constraint_matrix_indices() const noexcept; /** * @brief Get the CSR constraint matrix offsets * - * @return span + * @return std::span */ - span get_constraint_matrix_offsets() const noexcept; + std::span get_constraint_matrix_offsets() const noexcept; /** * @brief Get the b (right-hand side) constraints array * - * @return span + * @return std::span */ - span get_constraint_bounds() const noexcept; + std::span get_constraint_bounds() const noexcept; /** * @brief Get the c vector (weights of each x variable). 
* - * @return span + * @return std::span */ - span get_objective_coefficients() const noexcept; + std::span get_objective_coefficients() const noexcept; /** * @brief Get the objective scaling factor * @@ -310,62 +311,62 @@ class data_model_view_t { /** * @brief Get the variables (x) lower bounds * - * @return span + * @return std::span */ - span get_variable_lower_bounds() const noexcept; + std::span get_variable_lower_bounds() const noexcept; /** * @brief Get the variables (x) upper bounds * - * @return span + * @return std::span */ - span get_variable_upper_bounds() const noexcept; + std::span get_variable_upper_bounds() const noexcept; /** * @brief Get the variables (x) types * - * @return span + * @return std::span */ - span get_variable_types() const noexcept; + std::span get_variable_types() const noexcept; /** * @brief Get the row types * - * @return span + * @return std::span */ - span get_row_types() const noexcept; + std::span get_row_types() const noexcept; /** * @brief Get the constraints lower bounds * - * @return span + * @return std::span */ - span get_constraint_lower_bounds() const noexcept; + std::span get_constraint_lower_bounds() const noexcept; /** * @brief Get the constraints upper bounds * - * @return span + * @return std::span */ - span get_constraint_upper_bounds() const noexcept; + std::span get_constraint_upper_bounds() const noexcept; /** * @brief Get the initial primal solution * - * @return span + * @return std::span */ - span get_initial_primal_solution() const noexcept; + std::span get_initial_primal_solution() const noexcept; /** * @brief Get the initial dual solution * - * @return span + * @return std::span */ - span get_initial_dual_solution() const noexcept; + std::span get_initial_dual_solution() const noexcept; /** * @brief Get the variable names * - * @return span + * @return const std::vector& */ const std::vector& get_variable_names() const noexcept; /** * @brief Get the row names * - * @return span + * @return const 
std::vector& */ const std::vector& get_row_names() const noexcept; @@ -386,21 +387,21 @@ class data_model_view_t { /** * @brief Get the quadratic objective matrix values * - * @return span + * @return std::span */ - span get_quadratic_objective_values() const noexcept; + std::span get_quadratic_objective_values() const noexcept; /** * @brief Get the quadratic objective matrix indices * - * @return span + * @return std::span */ - span get_quadratic_objective_indices() const noexcept; + std::span get_quadratic_objective_indices() const noexcept; /** * @brief Get the quadratic objective matrix offsets * - * @return span + * @return std::span */ - span get_quadratic_objective_offsets() const noexcept; + std::span get_quadratic_objective_offsets() const noexcept; /** * @brief Check if the problem has quadratic objective terms * @@ -415,35 +416,66 @@ class data_model_view_t { */ bool is_Q_symmetrized() const noexcept; + /** + * @brief Quadratic constraints (MPS QCMATRIX); owned copy for writers when not using spans. 
+ */ + void set_quadratic_constraints( + std::vector::quadratic_constraint_t> constraints); + template + void set_quadratic_constraints(const std::vector& constraints) + { + quadratic_constraints_.clear(); + quadratic_constraints_.reserve(constraints.size()); + for (const auto& qc : constraints) { + quadratic_constraints_.push_back( + {static_cast(qc.constraint_row_index), + qc.constraint_row_name, + qc.constraint_row_type, + std::vector(qc.linear_values.begin(), qc.linear_values.end()), + std::vector(qc.linear_indices.begin(), qc.linear_indices.end()), + static_cast(qc.rhs_value), + std::vector(qc.quadratic_values.begin(), qc.quadratic_values.end()), + std::vector(qc.quadratic_indices.begin(), qc.quadratic_indices.end()), + std::vector(qc.quadratic_offsets.begin(), qc.quadratic_offsets.end())}); + } + } + + bool has_quadratic_constraints() const noexcept; + + const std::vector::quadratic_constraint_t>& + get_quadratic_constraints() const noexcept; + private: bool maximize_{false}; - span A_; - span A_indices_; - span A_offsets_; - span b_; - span c_; + std::span A_; + std::span A_indices_; + std::span A_offsets_; + std::span b_; + std::span c_; f_t objective_scaling_factor_{1}; f_t objective_offset_{0}; - span variable_lower_bounds_; - span variable_upper_bounds_; - span variable_types_; - span row_types_; + std::span variable_lower_bounds_; + std::span variable_upper_bounds_; + std::span variable_types_; + std::span row_types_; std::string objective_name_; std::string problem_name_; std::vector variable_names_; std::vector row_names_; - span constraint_lower_bounds_; - span constraint_upper_bounds_; + std::span constraint_lower_bounds_; + std::span constraint_upper_bounds_; // TODO move to solver_settings in next release - span initial_primal_solution_; - span initial_dual_solution_; + std::span initial_primal_solution_; + std::span initial_dual_solution_; // QPS-specific data members for quadratic programming support - span Q_objective_; - span 
Q_objective_indices_; - span Q_objective_offsets_; + std::span Q_objective_; + std::span Q_objective_indices_; + std::span Q_objective_offsets_; bool is_Q_symmetrized_{false}; + + std::vector::quadratic_constraint_t> quadratic_constraints_; }; // class data_model_view_t } // namespace cuopt::mps_parser diff --git a/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp b/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp index 6879e15d60..4ca56f02ba 100644 --- a/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp +++ b/cpp/libmps_parser/include/mps_parser/mps_data_model.hpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -8,6 +8,7 @@ #pragma once #include +#include #include #include #include @@ -63,44 +64,31 @@ class mps_data_model_t { * @note Setting before calling the solver is mandatory. * * @throws std::logic_error when an error occurs. - * @param[in] A_values Values of the CSR representation of the constraint matrix as a host memory - pointer to a floating point array of size size_values. - * MPS Parser copies this data. - * @param size_values Size of the A_values array. - * @param[in] A_indices Indices of the CSR representation of the constraint matrix as a host - memory pointer to an integer array of size size_indices. - * MPS Parser copies this data. - * @param size_indices Size of the A_indices array. - * @param[in] A_offsets Offsets of the CSR representation of the constraint matrix as a host - memory pointer to a integer array of size size_offsets. - * MPS Parser copies this data. - * @param size_offsets Size of the A_offsets array. + * @param[in] A_values Values of the CSR representation of the constraint matrix; host memory. + * The model copies this data. 
+ * @param[in] A_indices Indices of the CSR representation of the constraint matrix; host memory. + * The model copies this data. + * @param[in] A_offsets Offsets of the CSR representation of the constraint matrix; host memory. + * The model copies this data. */ - void set_csr_constraint_matrix(const f_t* A_values, - i_t size_values, - const i_t* A_indices, - i_t size_indices, - const i_t* A_offsets, - i_t size_offsets); + void set_csr_constraint_matrix(std::span A_values, + std::span A_indices, + std::span A_offsets); /** * @brief Set the constraint bounds (b / right-hand side) array. * @note Setting before calling the solver is mandatory. * - * @param[in] b Host memory pointer to a floating point array of size size. - * MPS Parser copies this data. - * @param size Size of the b array. + * @param[in] b Constraint bounds; host memory. The model copies this data. */ - void set_constraint_bounds(const f_t* b, i_t size); + void set_constraint_bounds(std::span b); /** * @brief Set the objective coefficients (c) array. * @note Setting before calling the solver is mandatory. * - * @param[in] c Host memory pointer to a floating point array of size size. - * MPS Parser copies this data. - * @param size Size of the c array. + * @param[in] c Objective coefficients; host memory. The model copies this data. */ - void set_objective_coefficients(const f_t* c, i_t size); + void set_objective_coefficients(std::span c); /** * @brief Set the scaling factor of the objective function (scaling_factor * objective_value). * @note Setting before calling the solver is optional, default value if 1. @@ -120,45 +108,37 @@ class mps_data_model_t { * @brief Set the variables (x) lower bounds. * @note Setting before calling the solver is optional, default value for all is 0. * - * @param[in] variable_lower_bounds Host memory pointer to a floating point array of - * size size. - * MPS Parser copies this data. 
- * @param size Size of the variable_lower_bounds array + * @param[in] variable_lower_bounds Variable lower bounds; host memory. The model copies + * this data. */ - void set_variable_lower_bounds(const f_t* variable_lower_bounds, i_t size); + void set_variable_lower_bounds(std::span variable_lower_bounds); /** * @brief Set the variables (x) upper bounds. * @note Setting before calling the solver is optional, default value for all is +infinity. * - * @param[in] variable_upper_bounds Host memory pointer to a floating point array of - * size size. - * MPS Parser copies this data. - * @param size Size of the variable_upper_bounds array. + * @param[in] variable_upper_bounds Variable upper bounds; host memory. The model copies + * this data. */ - void set_variable_upper_bounds(const f_t* variable_upper_bounds, i_t size); + void set_variable_upper_bounds(std::span variable_upper_bounds); /** * @brief Set the constraints lower bounds. * @note Setting before calling the solver is optional if you set the row type, else it's * mandatory along with the upper bounds. * - * @param[in] constraint_lower_bounds Host memory pointer to a floating point array of - * size size. - * MPS Parser copies this data. - * @param size Size of the constraint_lower_bounds array + * @param[in] constraint_lower_bounds Constraint lower bounds; host memory. The model copies + * this data. */ - void set_constraint_lower_bounds(const f_t* constraint_lower_bounds, i_t size); + void set_constraint_lower_bounds(std::span constraint_lower_bounds); /** * @brief Set the constraints upper bounds. * @note Setting before calling the solver is optional if you set the row type, else it's * mandatory along with the lower bounds. * If both are set, priority goes to set_constraints. * - * @param[in] constraint_upper_bounds Host memory pointer to a floating point array of - * size size. - * MPS Parser copies this data. 
- * @param size Size of the constraint_upper_bounds array + * @param[in] constraint_upper_bounds Constraint upper bounds; host memory. The model copies + * this data. */ - void set_constraint_upper_bounds(const f_t* constraint_upper_bounds, i_t size); + void set_constraint_upper_bounds(std::span constraint_upper_bounds); /** * @brief Set the type of each row (constraint). Possible values are: @@ -171,12 +151,9 @@ class mps_data_model_t { * bounds, else it's mandatory * If both are set, priority goes to set_constraints. * - * @param[in] row_types Host memory pointer to a character array of - * size size. - * MPS Parser copies this data. - * @param size Size of the row_types array + * @param[in] row_types Row types; host memory. The model copies this data. */ - void set_row_types(const char* row_types, i_t size); + void set_row_types(std::span row_types); /** * @brief Set the name of the objective function. @@ -223,24 +200,20 @@ class mps_data_model_t { * * @note Default value is all 0. * - * @param[in] initial_primal_solution Host memory pointer to a floating point array of - * size size. - * MPS Parser copies this data. - * @param size Size of the initial_primal_solution array. + * @param[in] initial_primal_solution Initial primal solution; host memory. The model copies + * this data. */ - void set_initial_primal_solution(const f_t* initial_primal_solution, i_t size); + void set_initial_primal_solution(std::span initial_primal_solution); /** * @brief Set an initial dual solution. * * @note Default value is all 0. * - * @param[in] initial_dual_solution Host memory pointer to a floating point array of - * size size. - * MPS Parser copies this data. - * @param size Size of the initial_dual_solution array. + * @param[in] initial_dual_solution Initial dual solution; host memory. The model copies + * this data. 
*/ - void set_initial_dual_solution(const f_t* initial_dual_solution, i_t size); + void set_initial_dual_solution(std::span initial_dual_solution); /** * @brief Set the quadratic objective matrix (Q) in CSR format for QPS files. @@ -248,19 +221,61 @@ class mps_data_model_t { * @note This is used for quadratic programming problems where the objective * function contains quadratic terms: (1/2) * x^T * Q * x + c^T * x * - * @param[in] Q_values Values of the CSR representation of the quadratic objective matrix - * @param size_values Size of the Q_values array - * @param[in] Q_indices Indices of the CSR representation of the quadratic objective matrix - * @param size_indices Size of the Q_indices array - * @param[in] Q_offsets Offsets of the CSR representation of the quadratic objective matrix - * @param size_offsets Size of the Q_offsets array + * @param[in] Q_values Values of the CSR representation of the quadratic objective matrix; host + * memory. The model copies this data. + * @param[in] Q_indices Indices of the CSR representation of the quadratic objective matrix; host + * memory. The model copies this data. + * @param[in] Q_offsets Offsets of the CSR representation of the quadratic objective matrix; host + * memory. The model copies this data. */ - void set_quadratic_objective_matrix(const f_t* Q_values, - i_t size_values, - const i_t* Q_indices, - i_t size_indices, - const i_t* Q_offsets, - i_t size_offsets); + void set_quadratic_objective_matrix(std::span Q_values, + std::span Q_indices, + std::span Q_offsets); + + /** + * @brief One quadratic constraint as parsed from MPS sections (ROWS, COLUMNS, RHS, QCMATRIX). + * + * This bundles all pieces of a quadratic row: + * - row identity and type (from ROWS), + * - sparse linear coefficients (from COLUMNS), + * - RHS value (from RHS), + * - quadratic matrix Q in CSR (from QCMATRIX). + */ + struct quadratic_constraint_t { + /** ROWS declaration index (among all constraint rows), not an index into the linear CSR. 
*/ + i_t constraint_row_index{}; + std::string constraint_row_name{}; + /** MPS ROWS sense for this quadratic row; only 'L' (≤) is supported for convex QCQP at the + * moment. */ + char constraint_row_type{}; + std::vector linear_values{}; + std::vector linear_indices{}; + f_t rhs_value{f_t(0)}; + std::vector quadratic_values{}; + std::vector quadratic_indices{}; + std::vector quadratic_offsets{}; + }; + + /** + * @brief Append one complete quadratic constraint (row + linear + rhs + quadratic Q). + * @note All span inputs are host memory; the model copies this data. + * @param linear_values, linear_indices Same nnz; can be empty for a purely quadratic row (rare). + * @param quadratic_values, quadratic_indices CSR nnz; may be empty if Q is empty. + * @param quadratic_offsets CSR row starts; must be non-empty. + * @param constraint_row_type MPS ROWS type; must be 'L'. 'G' and 'E' quadratic rows are not + * supported. + */ + void append_quadratic_constraint(i_t constraint_row_index, + const std::string& constraint_row_name, + char constraint_row_type, + std::span linear_values, + std::span linear_indices, + f_t rhs_value, + std::span quadratic_values, + std::span quadratic_indices, + std::span quadratic_offsets); + + const std::vector& get_quadratic_constraints() const; i_t get_n_variables() const; i_t get_n_constraints() const; @@ -306,6 +321,8 @@ class mps_data_model_t { bool has_quadratic_objective() const noexcept; + bool has_quadratic_constraints() const noexcept; + /** whether to maximize or minimize the objective function */ bool maximize_; /** @@ -342,7 +359,7 @@ class mps_data_model_t { std::string problem_name_; /** names of each of the variables in the OP */ std::vector var_names_{}; - /** names of each of the rows (aka constraints or objective) in the OP */ + /** names of linear constraint rows in exported MPS order. 
*/ std::vector row_names_{}; /** number of variables */ i_t n_vars_{0}; @@ -361,6 +378,9 @@ class mps_data_model_t { std::vector Q_objective_indices_; std::vector Q_objective_offsets_; + /** One full quadratic constraint per QCMATRIX block, in order of appearance in the file */ + std::vector quadratic_constraints_; + }; // class mps_data_model_t } // namespace cuopt::mps_parser diff --git a/cpp/libmps_parser/include/mps_parser/parser.hpp b/cpp/libmps_parser/include/mps_parser/parser.hpp index e8e8c342bd..c5b21dcb13 100644 --- a/cpp/libmps_parser/include/mps_parser/parser.hpp +++ b/cpp/libmps_parser/include/mps_parser/parser.hpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -9,6 +9,9 @@ #include +#include +#include + namespace cuopt::mps_parser { /** @@ -23,6 +26,8 @@ namespace cuopt::mps_parser { * QPS files (for quadratic programming). QPS files are MPS files with additional * sections: * - QUADOBJ: Defines quadratic terms in the objective function + * - QMATRIX: Full symmetric quadratic objective matrix (alternative to QUADOBJ) + * - QCMATRIX: Symmetric quadratic terms for a named constraint row (QCQP) * * Note: Compressed MPS files .mps.gz, .mps.bz2 can only be read if the compression * libraries zlib or libbzip2 are installed, respectively. @@ -35,4 +40,19 @@ template mps_data_model_t parse_mps(const std::string& mps_file_path, bool fixed_mps_format = false); +/** + * @brief Reads an MPS problem from in-memory file contents. + * + * This parses the same plain-text MPS format as parse_mps(), but the input is + * already loaded in memory. Compressed .mps.gz/.mps.bz2 inputs are only supported + * by parse_mps() because compression is detected from the file path. 
+ * + * @param[in] mps_contents MPS file contents. + * @param[in] fixed_mps_format If MPS content should be parsed as fixed, false by default. + * @return mps_data_model_t A fully formed problem which represents the given content. + */ +template +mps_data_model_t parse_mps_from_string(std::string_view mps_contents, + bool fixed_mps_format = false); + } // namespace cuopt::mps_parser diff --git a/cpp/libmps_parser/include/mps_parser/utilities/span.hpp b/cpp/libmps_parser/include/mps_parser/utilities/span.hpp deleted file mode 100644 index 02679cd378..0000000000 --- a/cpp/libmps_parser/include/mps_parser/utilities/span.hpp +++ /dev/null @@ -1,27 +0,0 @@ -/* clang-format off */ -/* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - */ -/* clang-format on */ - -#pragma once - -#include - -namespace cuopt::mps_parser { - -template -class span { - public: - span() = default; - span(T* ptr, std::size_t size) : ptr_(ptr), size_(size) {} - std::size_t size() const noexcept { return size_; } - const T* data() const noexcept { return ptr_; } - - private: - T* ptr_{nullptr}; - std::size_t size_{0}; -}; - -} // namespace cuopt::mps_parser diff --git a/cpp/libmps_parser/src/data_model_view.cpp b/cpp/libmps_parser/src/data_model_view.cpp index 62b441aa60..934869f9c4 100644 --- a/cpp/libmps_parser/src/data_model_view.cpp +++ b/cpp/libmps_parser/src/data_model_view.cpp @@ -6,9 +6,10 @@ /* clang-format on */ #include -#include #include +#include + namespace cuopt::mps_parser { template @@ -29,19 +30,19 @@ void data_model_view_t::set_csr_constraint_matrix(const f_t* A_values, mps_parser_expects( A_values != nullptr, error_type_t::ValidationError, "A_values cannot be null"); } - A_ = span(A_values, size_values); + A_ = std::span(A_values, size_values); if (size_indices != 0) { mps_parser_expects( A_indices != nullptr, error_type_t::ValidationError, "A_indices cannot be null"); } - 
A_indices_ = span(A_indices, size_indices); + A_indices_ = std::span(A_indices, size_indices); mps_parser_expects( A_offsets != nullptr, error_type_t::ValidationError, "A_offsets cannot be null"); mps_parser_expects( size_offsets > 0, error_type_t::ValidationError, "size_offsets cannot be empty"); - A_offsets_ = span(A_offsets, size_offsets); + A_offsets_ = std::span(A_offsets, size_offsets); } template @@ -50,7 +51,7 @@ void data_model_view_t::set_constraint_bounds(const f_t* b, i_t size) if (size != 0) { mps_parser_expects(b != nullptr, error_type_t::ValidationError, "b cannot be null"); } - b_ = span(b, size); + b_ = std::span(b, size); } template @@ -59,7 +60,7 @@ void data_model_view_t::set_objective_coefficients(const f_t* c, i_t s if (size != 0) { mps_parser_expects(c != nullptr, error_type_t::ValidationError, "c cannot be null"); } - c_ = span(c, size); + c_ = std::span(c, size); } template @@ -81,7 +82,7 @@ void data_model_view_t::set_variable_lower_bounds(const f_t* variable_ mps_parser_expects(variable_lower_bounds != nullptr, error_type_t::ValidationError, "data model variable_lower_bounds cannot be null"); - variable_lower_bounds_ = span(variable_lower_bounds, size); + variable_lower_bounds_ = std::span(variable_lower_bounds, size); } template @@ -91,7 +92,7 @@ void data_model_view_t::set_variable_upper_bounds(const f_t* variable_ mps_parser_expects(variable_upper_bounds != nullptr, error_type_t::ValidationError, "variable_upper_bounds cannot be null"); - variable_upper_bounds_ = span(variable_upper_bounds, size); + variable_upper_bounds_ = std::span(variable_upper_bounds, size); } template @@ -99,7 +100,7 @@ void data_model_view_t::set_variable_types(const char* variable_types, { mps_parser_expects( variable_types != nullptr, error_type_t::ValidationError, "variable_types cannot be null"); - variable_types_ = span(variable_types, size); + variable_types_ = std::span(variable_types, size); } template @@ -109,7 +110,7 @@ void 
data_model_view_t::set_constraint_lower_bounds(const f_t* constra mps_parser_expects(constraint_lower_bounds != nullptr, error_type_t::ValidationError, "constraint_lower_bounds cannot be null"); - constraint_lower_bounds_ = span(constraint_lower_bounds, size); + constraint_lower_bounds_ = std::span(constraint_lower_bounds, size); } template @@ -119,7 +120,7 @@ void data_model_view_t::set_constraint_upper_bounds(const f_t* constra mps_parser_expects(constraint_upper_bounds != nullptr, error_type_t::ValidationError, "constraint_upper_bounds cannot be null"); - constraint_upper_bounds_ = span(constraint_upper_bounds, size); + constraint_upper_bounds_ = std::span(constraint_upper_bounds, size); } template @@ -129,7 +130,7 @@ void data_model_view_t::set_initial_primal_solution(const f_t* initial mps_parser_expects(initial_primal_solution != nullptr, error_type_t::ValidationError, "initial_primal_solution cannot be null"); - initial_primal_solution_ = span(initial_primal_solution, size); + initial_primal_solution_ = std::span(initial_primal_solution, size); } template @@ -139,7 +140,7 @@ void data_model_view_t::set_initial_dual_solution(const f_t* initial_d mps_parser_expects(initial_dual_solution != nullptr, error_type_t::ValidationError, "initial_dual_solution cannot be null"); - initial_dual_solution_ = span(initial_dual_solution, size); + initial_dual_solution_ = std::span(initial_dual_solution, size); } template @@ -155,19 +156,19 @@ void data_model_view_t::set_quadratic_objective_matrix(const f_t* Q_va mps_parser_expects( Q_values != nullptr, error_type_t::ValidationError, "Q_values cannot be null"); } - Q_objective_ = span(Q_values, size_values); + Q_objective_ = std::span(Q_values, size_values); if (size_indices != 0) { mps_parser_expects( Q_indices != nullptr, error_type_t::ValidationError, "Q_indices cannot be null"); } - Q_objective_indices_ = span(Q_indices, size_indices); + Q_objective_indices_ = std::span(Q_indices, size_indices); mps_parser_expects( 
Q_offsets != nullptr, error_type_t::ValidationError, "Q_offsets cannot be null"); mps_parser_expects( size_offsets > 0, error_type_t::ValidationError, "size_offsets cannot be empty"); - Q_objective_offsets_ = span(Q_offsets, size_offsets); + Q_objective_offsets_ = std::span(Q_offsets, size_offsets); is_Q_symmetrized_ = is_symmetrized; } @@ -177,7 +178,7 @@ void data_model_view_t::set_row_types(const char* row_types, i_t size) { mps_parser_expects( row_types != nullptr, error_type_t::ValidationError, "row_types cannot be null"); - row_types_ = span(row_types, size); + row_types_ = std::span(row_types, size); } template @@ -205,31 +206,31 @@ void data_model_view_t::set_row_names(const std::vector& } template -span data_model_view_t::get_constraint_matrix_values() const noexcept +std::span data_model_view_t::get_constraint_matrix_values() const noexcept { return A_; } template -span data_model_view_t::get_constraint_matrix_indices() const noexcept +std::span data_model_view_t::get_constraint_matrix_indices() const noexcept { return A_indices_; } template -span data_model_view_t::get_constraint_matrix_offsets() const noexcept +std::span data_model_view_t::get_constraint_matrix_offsets() const noexcept { return A_offsets_; } template -span data_model_view_t::get_constraint_bounds() const noexcept +std::span data_model_view_t::get_constraint_bounds() const noexcept { return b_; } template -span data_model_view_t::get_objective_coefficients() const noexcept +std::span data_model_view_t::get_objective_coefficients() const noexcept { return c_; } @@ -247,49 +248,49 @@ f_t data_model_view_t::get_objective_offset() const noexcept } template -span data_model_view_t::get_variable_lower_bounds() const noexcept +std::span data_model_view_t::get_variable_lower_bounds() const noexcept { return variable_lower_bounds_; } template -span data_model_view_t::get_variable_upper_bounds() const noexcept +std::span data_model_view_t::get_variable_upper_bounds() const noexcept { return 
variable_upper_bounds_; } template -span data_model_view_t::get_variable_types() const noexcept +std::span data_model_view_t::get_variable_types() const noexcept { return variable_types_; } template -span data_model_view_t::get_constraint_lower_bounds() const noexcept +std::span data_model_view_t::get_constraint_lower_bounds() const noexcept { return constraint_lower_bounds_; } template -span data_model_view_t::get_constraint_upper_bounds() const noexcept +std::span data_model_view_t::get_constraint_upper_bounds() const noexcept { return constraint_upper_bounds_; } template -span data_model_view_t::get_initial_primal_solution() const noexcept +std::span data_model_view_t::get_initial_primal_solution() const noexcept { return initial_primal_solution_; } template -span data_model_view_t::get_initial_dual_solution() const noexcept +std::span data_model_view_t::get_initial_dual_solution() const noexcept { return initial_dual_solution_; } template -span data_model_view_t::get_row_types() const noexcept +std::span data_model_view_t::get_row_types() const noexcept { return row_types_; } @@ -326,19 +327,19 @@ const std::vector& data_model_view_t::get_row_names() con // QPS-specific getter implementations template -span data_model_view_t::get_quadratic_objective_values() const noexcept +std::span data_model_view_t::get_quadratic_objective_values() const noexcept { return Q_objective_; } template -span data_model_view_t::get_quadratic_objective_indices() const noexcept +std::span data_model_view_t::get_quadratic_objective_indices() const noexcept { return Q_objective_indices_; } template -span data_model_view_t::get_quadratic_objective_offsets() const noexcept +std::span data_model_view_t::get_quadratic_objective_offsets() const noexcept { return Q_objective_offsets_; } @@ -355,6 +356,26 @@ bool data_model_view_t::is_Q_symmetrized() const noexcept return is_Q_symmetrized_; } +template +void data_model_view_t::set_quadratic_constraints( + std::vector::quadratic_constraint_t> 
constraints) +{ + quadratic_constraints_ = std::move(constraints); +} + +template +bool data_model_view_t::has_quadratic_constraints() const noexcept +{ + return !quadratic_constraints_.empty(); +} + +template +const std::vector::quadratic_constraint_t>& +data_model_view_t::get_quadratic_constraints() const noexcept +{ + return quadratic_constraints_; +} + // NOTE: Explicitly instantiate all types here in order to avoid linker error template class data_model_view_t; diff --git a/cpp/libmps_parser/src/mps_data_model.cpp b/cpp/libmps_parser/src/mps_data_model.cpp index 7d0d44a038..d552a35273 100644 --- a/cpp/libmps_parser/src/mps_data_model.cpp +++ b/cpp/libmps_parser/src/mps_data_model.cpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -9,59 +9,34 @@ #include #include +#include namespace cuopt::mps_parser { template -void mps_data_model_t::set_csr_constraint_matrix(const f_t* A_values, - i_t size_values, - const i_t* A_indices, - i_t size_indices, - const i_t* A_offsets, - i_t size_offsets) +void mps_data_model_t::set_csr_constraint_matrix(std::span A_values, + std::span A_indices, + std::span A_offsets) { - if (size_values != 0) { - mps_parser_expects( - A_values != nullptr, error_type_t::ValidationError, "A_values cannot be null"); - } - A_.resize(size_values); - std::copy(A_values, A_values + size_values, A_.data()); - - if (size_indices != 0) { - mps_parser_expects( - A_indices != nullptr, error_type_t::ValidationError, "A_indices cannot be null"); - } - A_indices_.resize(size_indices); - std::copy(A_indices, A_indices + size_indices, A_indices_.data()); - mps_parser_expects( - A_offsets != nullptr, error_type_t::ValidationError, "A_offsets cannot be null"); - mps_parser_expects( - 
size_offsets > 0, error_type_t::ValidationError, "size_offsets cannot be empty"); - A_offsets_.resize(size_offsets); - std::copy(A_offsets, A_offsets + size_offsets, A_offsets_.data()); + !A_offsets.empty(), error_type_t::ValidationError, "A_offsets cannot be empty"); + A_.assign(A_values.begin(), A_values.end()); + A_indices_.assign(A_indices.begin(), A_indices.end()); + A_offsets_.assign(A_offsets.begin(), A_offsets.end()); } template -void mps_data_model_t::set_constraint_bounds(const f_t* b, i_t size) +void mps_data_model_t::set_constraint_bounds(std::span b) { - if (size != 0) { - mps_parser_expects(b != nullptr, error_type_t::ValidationError, "b cannot be null"); - } - b_.resize(size); - n_constraints_ = size; - std::copy(b, b + size, b_.data()); + b_.assign(b.begin(), b.end()); + n_constraints_ = static_cast(b.size()); } template -void mps_data_model_t::set_objective_coefficients(const f_t* c, i_t size) +void mps_data_model_t::set_objective_coefficients(std::span c) { - if (size != 0) { - mps_parser_expects(c != nullptr, error_type_t::ValidationError, "c cannot be null"); - } - c_.resize(size); - n_vars_ = size; - std::copy(c, c + size, c_.data()); + c_.assign(c.begin(), c.end()); + n_vars_ = static_cast(c.size()); } template @@ -77,67 +52,38 @@ void mps_data_model_t::set_objective_offset(f_t objective_offset) } template -void mps_data_model_t::set_variable_lower_bounds(const f_t* variable_lower_bounds, - i_t size) +void mps_data_model_t::set_variable_lower_bounds( + std::span variable_lower_bounds) { - if (size != 0) { - mps_parser_expects(variable_lower_bounds != nullptr, - error_type_t::ValidationError, - "variable_lower_bounds cannot be null"); - } - variable_lower_bounds_.resize(size); - std::copy(variable_lower_bounds, variable_lower_bounds + size, variable_lower_bounds_.data()); + variable_lower_bounds_.assign(variable_lower_bounds.begin(), variable_lower_bounds.end()); } template -void mps_data_model_t::set_variable_upper_bounds(const f_t* 
variable_upper_bounds, - i_t size) +void mps_data_model_t::set_variable_upper_bounds( + std::span variable_upper_bounds) { - if (size != 0) { - mps_parser_expects(variable_upper_bounds != nullptr, - error_type_t::ValidationError, - "variable_upper_bounds cannot be null"); - } - variable_upper_bounds_.resize(size); - std::copy(variable_upper_bounds, variable_upper_bounds + size, variable_upper_bounds_.data()); + variable_upper_bounds_.assign(variable_upper_bounds.begin(), variable_upper_bounds.end()); } template -void mps_data_model_t::set_constraint_lower_bounds(const f_t* constraint_lower_bounds, - i_t size) +void mps_data_model_t::set_constraint_lower_bounds( + std::span constraint_lower_bounds) { - if (size != 0) { - mps_parser_expects(constraint_lower_bounds != nullptr, - error_type_t::ValidationError, - "constraint_lower_bounds cannot be null"); - } - constraint_lower_bounds_.resize(size); - n_constraints_ = size; - std::copy( - constraint_lower_bounds, constraint_lower_bounds + size, constraint_lower_bounds_.data()); + constraint_lower_bounds_.assign(constraint_lower_bounds.begin(), constraint_lower_bounds.end()); + n_constraints_ = static_cast(constraint_lower_bounds.size()); } template -void mps_data_model_t::set_constraint_upper_bounds(const f_t* constraint_upper_bounds, - i_t size) +void mps_data_model_t::set_constraint_upper_bounds( + std::span constraint_upper_bounds) { - if (size != 0) { - mps_parser_expects(constraint_upper_bounds != nullptr, - error_type_t::ValidationError, - "constraint_upper_bounds cannot be null"); - } - constraint_upper_bounds_.resize(size); - std::copy( - constraint_upper_bounds, constraint_upper_bounds + size, constraint_upper_bounds_.data()); + constraint_upper_bounds_.assign(constraint_upper_bounds.begin(), constraint_upper_bounds.end()); } template -void mps_data_model_t::set_row_types(const char* row_types, i_t size) +void mps_data_model_t::set_row_types(std::span row_types) { - mps_parser_expects( - row_types != nullptr, 
error_type_t::ValidationError, "row_types cannot be null"); - row_types_.resize(size); - std::copy(row_types, row_types + size, row_types_.data()); + row_types_.assign(row_types.begin(), row_types.end()); } template @@ -167,56 +113,71 @@ void mps_data_model_t::set_row_names(const std::vector& r } template -void mps_data_model_t::set_initial_primal_solution(const f_t* initial_primal_solution, - i_t size) +void mps_data_model_t::set_initial_primal_solution( + std::span initial_primal_solution) { - mps_parser_expects(initial_primal_solution != nullptr, - error_type_t::ValidationError, - "initial_primal_solution cannot be null"); - initial_primal_solution_.resize(size); - std::copy( - initial_primal_solution, initial_primal_solution + size, initial_primal_solution_.data()); + initial_primal_solution_.assign(initial_primal_solution.begin(), initial_primal_solution.end()); } template -void mps_data_model_t::set_initial_dual_solution(const f_t* initial_dual_solution, - i_t size) +void mps_data_model_t::set_initial_dual_solution( + std::span initial_dual_solution) { - mps_parser_expects(initial_dual_solution != nullptr, - error_type_t::ValidationError, - "initial_dual_solution cannot be null"); - initial_dual_solution_.resize(size); - std::copy(initial_dual_solution, initial_dual_solution + size, initial_dual_solution_.data()); + initial_dual_solution_.assign(initial_dual_solution.begin(), initial_dual_solution.end()); } template -void mps_data_model_t::set_quadratic_objective_matrix(const f_t* Q_values, - i_t size_values, - const i_t* Q_indices, - i_t size_indices, - const i_t* Q_offsets, - i_t size_offsets) +void mps_data_model_t::set_quadratic_objective_matrix(std::span Q_values, + std::span Q_indices, + std::span Q_offsets) { - if (size_values != 0) { - mps_parser_expects( - Q_values != nullptr, error_type_t::ValidationError, "Q_values cannot be null"); - } - Q_objective_values_.resize(size_values); - std::copy(Q_values, Q_values + size_values, 
Q_objective_values_.data()); + mps_parser_expects( + !Q_offsets.empty(), error_type_t::ValidationError, "Q_offsets cannot be empty"); + Q_objective_values_.assign(Q_values.begin(), Q_values.end()); + Q_objective_indices_.assign(Q_indices.begin(), Q_indices.end()); + Q_objective_offsets_.assign(Q_offsets.begin(), Q_offsets.end()); +} - if (size_indices != 0) { - mps_parser_expects( - Q_indices != nullptr, error_type_t::ValidationError, "Q_indices cannot be null"); - } - Q_objective_indices_.resize(size_indices); - std::copy(Q_indices, Q_indices + size_indices, Q_objective_indices_.data()); +template +void mps_data_model_t::append_quadratic_constraint(i_t constraint_row_index, + const std::string& constraint_row_name, + char constraint_row_type, + std::span linear_values, + std::span linear_indices, + f_t rhs_value, + std::span quadratic_values, + std::span quadratic_indices, + std::span quadratic_offsets) +{ + mps_parser_expects(constraint_row_index >= 0, + error_type_t::ValidationError, + "constraint_row_index must be non-negative"); + + mps_parser_expects(constraint_row_type == 'L', + error_type_t::ValidationError, + "Quadratic constraint ROWS type must be 'L' (less-or-equal); got '%c'. 
" + "Only 'L' is supported for convex quadratic constraints.", + constraint_row_type); + + mps_parser_expects(linear_values.size() == linear_indices.size(), + error_type_t::ValidationError, + "linear_values and linear_indices must have the same nnz count"); mps_parser_expects( - Q_offsets != nullptr, error_type_t::ValidationError, "Q_offsets cannot be null"); - mps_parser_expects( - size_offsets > 0, error_type_t::ValidationError, "size_offsets cannot be empty"); - Q_objective_offsets_.resize(size_offsets); - std::copy(Q_offsets, Q_offsets + size_offsets, Q_objective_offsets_.data()); + !quadratic_offsets.empty(), error_type_t::ValidationError, "quadratic_offsets cannot be empty"); + + quadratic_constraint_t qc; + qc.constraint_row_index = constraint_row_index; + qc.constraint_row_name = constraint_row_name; + qc.constraint_row_type = constraint_row_type; + qc.rhs_value = rhs_value; + qc.linear_values.assign(linear_values.begin(), linear_values.end()); + qc.linear_indices.assign(linear_indices.begin(), linear_indices.end()); + qc.quadratic_values.assign(quadratic_values.begin(), quadratic_values.end()); + qc.quadratic_indices.assign(quadratic_indices.begin(), quadratic_indices.end()); + qc.quadratic_offsets.assign(quadratic_offsets.begin(), quadratic_offsets.end()); + + quadratic_constraints_.push_back(std::move(qc)); } template @@ -454,12 +415,25 @@ std::vector& mps_data_model_t::get_quadratic_objective_offsets() return Q_objective_offsets_; } +template +auto mps_data_model_t::get_quadratic_constraints() const + -> const std::vector& +{ + return quadratic_constraints_; +} + template bool mps_data_model_t::has_quadratic_objective() const noexcept { return !Q_objective_values_.empty(); } +template +bool mps_data_model_t::has_quadratic_constraints() const noexcept +{ + return !quadratic_constraints_.empty(); +} + // NOTE: Explicitly instantiate all types here in order to avoid linker error template class mps_data_model_t; diff --git 
a/cpp/libmps_parser/src/mps_parser.cpp b/cpp/libmps_parser/src/mps_parser.cpp index 6a81b3b6c1..c58a843ed5 100644 --- a/cpp/libmps_parser/src/mps_parser.cpp +++ b/cpp/libmps_parser/src/mps_parser.cpp @@ -14,11 +14,11 @@ #include #include #include -#include #include #include #include #include +#include #ifdef MPS_PARSER_WITH_BZIP2 #include @@ -44,6 +44,13 @@ struct FcloseDeleter { fclose(fp) == 0, error_type_t::ValidationError, "Error closing MPS file!"); } }; + +std::vector string_to_buffer(std::string_view input) +{ + std::vector buf(input.begin(), input.end()); + buf.push_back('\0'); + return buf; +} } // end namespace #ifdef MPS_PARSER_WITH_BZIP2 @@ -243,14 +250,14 @@ BoundType convert(std::string_view str) return LowerBoundIntegerVariable; } else if (str == "UI") { return UpperBoundIntegerVariable; - } else if (str == "LC") { - return SemiContiniousVariable; + } else if (str == "SC" || str == "LC") { + return SemiContinuousVariable; } else { mps_parser_expects(false, error_type_t::ValidationError, "Invalid variable bound type found in BOUNDS section! Bound type=%s", std::string(str).c_str()); - return SemiContiniousVariable; + return SemiContinuousVariable; } } @@ -272,35 +279,43 @@ ObjSenseType convert_to_obj_sense(const std::string& str) template void mps_parser_t::fill_problem(mps_data_model_t& problem) { + // Row indices that have QCMATRIX blocks (quadratic rows follow linear rows in ROWS under + // our MPS section rules; names are not required to be QC0..QCN) + std::unordered_set quadratic_row_ids{}; + for (const auto& block : qcmatrix_blocks_) { + quadratic_row_ids.insert(block.constraint_row_id); + } + const auto is_quadratic_row = [&quadratic_row_ids](i_t row) { + return quadratic_row_ids.count(row); + }; + { std::vector h_offsets{}, h_indices{}; std::vector h_values{}; h_offsets.push_back(0); + i_t num_linear_rows = 0; for (i_t i = 0; i < (i_t)A_indices.size(); ++i) { - i_t off = h_offsets.size() > 0 ? 
h_offsets[h_offsets.size() - 1] : 0; + // Quadratic constraint rows are omitted from the linear CSR; linear pieces live in each + // quadratic_constraint_t bundle. + if (is_quadratic_row(i)) { continue; } + ++num_linear_rows; for (const auto& idx_itr : A_indices[i]) { h_indices.push_back(idx_itr); } for (const auto& val_itr : A_values[i]) { h_values.push_back(val_itr); } - off += A_indices[i].size(); - h_offsets.push_back(off); + h_offsets.push_back(static_cast(h_indices.size())); } - problem.set_csr_constraint_matrix(h_values.data(), - h_values.size(), - h_indices.data(), - h_indices.size(), - h_offsets.data(), - h_offsets.size()); + problem.set_csr_constraint_matrix(h_values, h_indices, h_offsets); - mps_parser_expects(A_indices.size() + 1 == h_offsets.size(), + mps_parser_expects(static_cast(num_linear_rows) + 1 == h_offsets.size(), error_type_t::ValidationError, "The row indexing vector for the constraint matrix was not constructed " "successfully. Should be size %zu, but was size %zu", - A_indices.size() + 1, + static_cast(num_linear_rows) + 1, h_offsets.size()); mps_parser_expects( h_indices.size() == h_values.size(), @@ -320,17 +335,22 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) h_offsets[h_offsets.size() - 1]); } - // Set b & c - problem.set_constraint_bounds(b_values.data(), b_values.size()); - problem.set_objective_coefficients(c_values.data(), c_values.size()); + // Set b & c (RHS entries for quadratic rows are stored only on quadratic_constraint_t) + std::vector b_compacted{}; + b_compacted.reserve(b_values.size()); + for (i_t i = 0; i < (i_t)b_values.size(); ++i) { + if (!is_quadratic_row(i)) { b_compacted.push_back(b_values[i]); } + } + problem.set_constraint_bounds(b_compacted); + problem.set_objective_coefficients(c_values); // Set offset and scaling factor of objective function problem.set_objective_scaling_factor(objective_scaling_factor_value); problem.set_objective_offset(objective_offset_value); // Set lower and upper bounds - 
problem.set_variable_lower_bounds(variable_lower_bounds.data(), variable_lower_bounds.size()); - problem.set_variable_upper_bounds(variable_upper_bounds.data(), variable_upper_bounds.size()); + problem.set_variable_lower_bounds(variable_lower_bounds); + problem.set_variable_upper_bounds(variable_upper_bounds); mps_parser_expects( (problem.get_variable_lower_bounds().size() == problem.get_variable_upper_bounds().size()) && @@ -343,22 +363,25 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) problem.get_variable_lower_bounds().size(), problem.get_variable_upper_bounds().size()); - // Determine the constraint bounds based on row types + // Determine the constraint bounds based on row types (quadratic rows use bundles only, not + // counted here) { std::vector h_constraint_lower_bounds{}; std::vector h_constraint_upper_bounds{}; for (i_t i = 0; i < (i_t)row_types.size(); ++i) { + if (is_quadratic_row(i)) { continue; } if (row_types[i] == Equality) { h_constraint_lower_bounds.push_back(b_values[i]); h_constraint_upper_bounds.push_back(b_values[i]); + const size_t r = h_constraint_lower_bounds.size() - 1; if (ranges_values.size() > 0 && ranges_values[i] != unset_range_value) // Add range value if specified { - mps_parser_expects(!std::isnan(h_constraint_lower_bounds[i]), + mps_parser_expects(!std::isnan(h_constraint_lower_bounds[r]), error_type_t::ValidationError, "Constraints lower bound %d shouldn't be nan", i); - mps_parser_expects(!std::isnan(h_constraint_upper_bounds[i]), + mps_parser_expects(!std::isnan(h_constraint_upper_bounds[r]), error_type_t::ValidationError, "Constraints upper bound %d shouldn't be nan", i); @@ -367,17 +390,18 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) "Equality range value %d shouldn't be nan", i); if (ranges_values[i] < f_t(0)) - h_constraint_lower_bounds[i] = h_constraint_lower_bounds[i] + ranges_values[i]; + h_constraint_lower_bounds[r] = h_constraint_lower_bounds[r] + ranges_values[i]; else // Positive - 
h_constraint_upper_bounds[i] = h_constraint_upper_bounds[i] + ranges_values[i]; + h_constraint_upper_bounds[r] = h_constraint_upper_bounds[r] + ranges_values[i]; } } else if (row_types[i] == GreaterThanOrEqual) { h_constraint_lower_bounds.push_back(b_values[i]); h_constraint_upper_bounds.push_back(std::numeric_limits::infinity()); + const size_t r = h_constraint_lower_bounds.size() - 1; if (ranges_values.size() > 0 && ranges_values[i] != unset_range_value) // Add range value if specified { - mps_parser_expects(!std::isnan(h_constraint_lower_bounds[i]), + mps_parser_expects(!std::isnan(h_constraint_lower_bounds[r]), error_type_t::ValidationError, "Constraints lower bound %d shouldn't be nan", i); @@ -385,15 +409,16 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) error_type_t::ValidationError, "Greater range value %d shouldn't be nan", i); - h_constraint_upper_bounds[i] = h_constraint_lower_bounds[i] + std::abs(ranges_values[i]); + h_constraint_upper_bounds[r] = h_constraint_lower_bounds[r] + std::abs(ranges_values[i]); } } else if (row_types[i] == LesserThanOrEqual) { h_constraint_lower_bounds.push_back(-std::numeric_limits::infinity()); h_constraint_upper_bounds.push_back(b_values[i]); + const size_t r = h_constraint_lower_bounds.size() - 1; if (ranges_values.size() > 0 && ranges_values[i] != unset_range_value) // Add range value if specified { - mps_parser_expects(!std::isnan(h_constraint_upper_bounds[i]), + mps_parser_expects(!std::isnan(h_constraint_upper_bounds[r]), error_type_t::ValidationError, "Constraints upper bound %d shouldn't be nan", i); @@ -401,23 +426,22 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) error_type_t::ValidationError, "Lesser range value %d shouldn't be nan", i); - h_constraint_lower_bounds[i] = h_constraint_upper_bounds[i] - std::abs(ranges_values[i]); + h_constraint_lower_bounds[r] = h_constraint_upper_bounds[r] - std::abs(ranges_values[i]); } } else { mps_parser_expects(false, 
error_type_t::ValidationError, "Unsupported row type was passed to the Optimization Problem"); } + const size_t r = h_constraint_lower_bounds.size() - 1; mps_parser_expects( - !std::isnan(h_constraint_lower_bounds[i]), error_type_t::ValidationError, "Cannot be nan"); + !std::isnan(h_constraint_lower_bounds[r]), error_type_t::ValidationError, "Cannot be nan"); mps_parser_expects( - !std::isnan(h_constraint_upper_bounds[i]), error_type_t::ValidationError, "Cannot be nan"); + !std::isnan(h_constraint_upper_bounds[r]), error_type_t::ValidationError, "Cannot be nan"); } - problem.set_constraint_lower_bounds(h_constraint_lower_bounds.data(), - h_constraint_lower_bounds.size()); - problem.set_constraint_upper_bounds(h_constraint_upper_bounds.data(), - h_constraint_upper_bounds.size()); + problem.set_constraint_lower_bounds(h_constraint_lower_bounds); + problem.set_constraint_upper_bounds(h_constraint_upper_bounds); mps_parser_expects( (problem.get_constraint_lower_bounds().size() == @@ -432,20 +456,26 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) problem.get_constraint_upper_bounds().size()); } + const i_t num_vars_for_quad = static_cast(var_names.size()); + problem.set_problem_name(problem_name); problem.set_objective_name(objective_name); problem.set_variable_names(std::move(var_names)); problem.set_variable_types(std::move(var_types)); - problem.set_row_names(std::move(row_names)); problem.set_maximize(maximize); // Helper function to build CSR format using double transpose (O(m+n+nnz) instead of // O(nnz*log(nnz))) For QUADOBJ: handles upper triangular input by expanding to full symmetric - // matrix + // matrix. + // + // @p value_scale: + // QUADOBJ/QMATRIX use 0.5 (MPS ½ xᵀQx vs internal xᵀQx); + // QCMATRIX uses 1.0 (symmetric Q defines xᵀQx directly in the constraint). 
auto build_csr_via_transpose = [](const std::vector>& entries, i_t num_rows, i_t num_cols, - bool is_quadobj = false) { + bool symmetrize_upper_triangular, + f_t value_scale) { struct CSRResult { std::vector values; std::vector indices; @@ -467,7 +497,7 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) // For QUADOBJ (upper triangular), add both (row,col) and (col,row) if off-diagonal csc_data[col].emplace_back(row, val); - if (is_quadobj && row != col) { csc_data[row].emplace_back(col, val); } + if (symmetrize_upper_triangular && row != col) { csc_data[row].emplace_back(col, val); } } // Second transpose: convert CSC to CSR (entries sorted by row, columns within rows sorted) @@ -485,9 +515,10 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) for (i_t row = 0; row < num_rows; ++row) { for (const auto& [col, val] : csr_data[row]) { - // While the mps format expects to optimize for 0.5 xT Q x, cuopt optimizes for xT Q x - // so we have to multiply the value by 0.5 to get the correct value. - result.values.push_back(val * 0.5); + // While the mps format expects to optimize for 0.5 xT Q x, cuopt optimizes for xT Q x, + // so we have to multiply the value by value_scale (0.5 for QUADOBJ/QMATRIX; 1.0 for QCMATRIX) + // to get the correct value.
+ result.values.push_back(val * value_scale); result.indices.push_back(col); } result.offsets.push_back(result.values.size()); @@ -500,29 +531,70 @@ void mps_parser_t::fill_problem(mps_data_model_t& problem) if (!quadobj_entries.empty()) { // Convert quadratic objective entries to CSR format using double transpose // QUADOBJ stores upper triangular elements, so we expand to full symmetric matrix - i_t num_vars = static_cast(var_names.size()); - auto csr_result = build_csr_via_transpose(quadobj_entries, num_vars, num_vars, true); + constexpr f_t k_mps_quad_half_scale = f_t(0.5); // MPS ½ xᵀQx vs internal xᵀQx + auto csr_result = build_csr_via_transpose( + quadobj_entries, num_vars_for_quad, num_vars_for_quad, true, k_mps_quad_half_scale); // Use optimized double transpose method - O(m+n+nnz) instead of O(nnz*log(nnz)) - problem.set_quadratic_objective_matrix(csr_result.values.data(), - csr_result.values.size(), - csr_result.indices.data(), - csr_result.indices.size(), - csr_result.offsets.data(), - csr_result.offsets.size()); + problem.set_quadratic_objective_matrix( + csr_result.values, csr_result.indices, csr_result.offsets); } else if (!qmatrix_entries.empty()) { // Convert quadratic objective entries to CSR format using double transpose // QMATRIX stores full symmetric matrix - i_t num_vars = static_cast(var_names.size()); - auto csr_result = build_csr_via_transpose(qmatrix_entries, num_vars, num_vars, false); + constexpr f_t k_mps_quad_half_scale = f_t(0.5); + auto csr_result = build_csr_via_transpose( + qmatrix_entries, num_vars_for_quad, num_vars_for_quad, false, k_mps_quad_half_scale); // Use optimized double transpose method - O(m+n+nnz) instead of O(nnz*log(nnz)) - problem.set_quadratic_objective_matrix(csr_result.values.data(), - csr_result.values.size(), - csr_result.indices.data(), - csr_result.indices.size(), - csr_result.offsets.data(), - csr_result.offsets.size()); + problem.set_quadratic_objective_matrix( + csr_result.values, csr_result.indices, 
csr_result.offsets); + } + + // QCMATRIX: one symmetric Q per constraint row (no extra ½ factor vs file coeffs). + // Bundle row metadata, row-linear coefficients (from COLUMNS), rhs, and quadratic part together. + constexpr f_t k_qcmatrix_value_scale = f_t(1); + const i_t linear_row_count = static_cast(row_types.size() - quadratic_row_ids.size()); + i_t quadratic_row_id = 0; + for (const auto& block : qcmatrix_blocks_) { + auto csr_result = build_csr_via_transpose( + block.entries, num_vars_for_quad, num_vars_for_quad, false, k_qcmatrix_value_scale); + const i_t row_id = block.constraint_row_id; + mps_parser_expects(row_id >= 0 && row_id < static_cast(row_types.size()), + error_type_t::ValidationError, + "QCMATRIX row index %d is out of range for constraints", + static_cast(row_id)); + problem.append_quadratic_constraint(linear_row_count + quadratic_row_id, + row_names[row_id], + static_cast(row_types[row_id]), + A_values[row_id], + A_indices[row_id], + b_values[row_id], + csr_result.values, + csr_result.indices, + csr_result.offsets); + ++quadratic_row_id; + } + + if (!quadratic_row_ids.empty()) { + std::vector linear_row_names{}; + std::vector row_types_linear{}; + linear_row_names.reserve(row_names.size()); + row_types_linear.reserve(row_names.size()); + for (size_t i = 0; i < row_names.size(); ++i) { + if (!is_quadratic_row(static_cast(i))) { + linear_row_names.push_back(row_names[i]); + row_types_linear.push_back(static_cast(row_types[i])); + } + } + problem.set_row_names(std::move(linear_row_names)); + problem.set_row_types(row_types_linear); + } else { + std::vector row_types_host(row_types.size()); + for (size_t i = 0; i < row_types.size(); ++i) { + row_types_host[i] = static_cast(row_types[i]); + } + problem.set_row_names(std::move(row_names)); + problem.set_row_types(row_types_host); } } @@ -544,35 +616,30 @@ std::vector mps_parser_t::file_to_string(const std::string& file #endif // MPS_PARSER_WITH_ZLIB // Faster than using C++ I/O - FILE* fp = 
fopen(file.c_str(), "r"); + std::unique_ptr fp{fopen(file.c_str(), "r")}; mps_parser_expects(fp != nullptr, error_type_t::ValidationError, "Error opening MPS file! Given path: %s", mps_file.c_str()); - mps_parser_expects(fseek(fp, 0L, SEEK_END) == 0, + mps_parser_expects(fseek(fp.get(), 0L, SEEK_END) == 0, error_type_t::ValidationError, "File browsing MPS file! Given path: %s", mps_file.c_str()); - const long bufsize = ftell(fp); + const long bufsize = ftell(fp.get()); mps_parser_expects(bufsize != -1L, error_type_t::ValidationError, "File browsing MPS file! Given path: %s", mps_file.c_str()); std::vector buf(bufsize + 1); - rewind(fp); + rewind(fp.get()); - mps_parser_expects(fread(buf.data(), sizeof(char), bufsize, fp) == bufsize, + mps_parser_expects(fread(buf.data(), sizeof(char), bufsize, fp.get()) == bufsize, error_type_t::ValidationError, "Error reading MPS file! Given path: %s", mps_file.c_str()); buf[bufsize] = '\0'; - mps_parser_expects(fclose(fp) == 0, - error_type_t::ValidationError, - "Error closing MPS file! 
Given path: %s", - mps_file.c_str()); - return buf; } @@ -582,7 +649,8 @@ void mps_parser_t::parse_string(char* buf) // raft::common::nvtx::range fun_scope("parse string"); // Faster than C++ std::get_line - char* c_line = strtok(buf, "\n"); + char* saveptr = nullptr; + char* c_line = strtok_r(buf, "\n", &saveptr); bool skip_line = false; mps_parser_expects(c_line != nullptr, @@ -598,6 +666,11 @@ void mps_parser_t::parse_string(char* buf) // these lines mark the start of a particular "section" if (line[0] != ' ') { skip_line = false; + // Leaving QCMATRIX: any non-QCMATRIX section header ends the current block + if (inside_qcmatrix_ && line.find("QCMATRIX", 0, 8) != 0) { + flush_qcmatrix_block(); + inside_qcmatrix_ = false; + } if (line.find("NAME", 0, 4) == 0) { encountered_sections.insert("NAME"); auto name_start = line.find_first_not_of(" \t", 4); @@ -708,6 +781,7 @@ void mps_parser_t::parse_string(char* buf) inside_objname_ = false; inside_objsense_ = false; inside_qmatrix_ = false; + inside_qcmatrix_ = false; inside_quadobj_ = true; } else if (line.find("QMATRIX", 0, 7) == 0) { encountered_sections.insert("QMATRIX"); @@ -720,6 +794,21 @@ void mps_parser_t::parse_string(char* buf) inside_objsense_ = false; inside_quadobj_ = false; inside_qmatrix_ = true; + inside_qcmatrix_ = false; + } else if (line.find("QCMATRIX", 0, 8) == 0) { + encountered_sections.insert("QCMATRIX"); + flush_qcmatrix_block(); + inside_rows_ = false; + inside_columns_ = false; + inside_rhs_ = false; + inside_bounds_ = false; + inside_ranges_ = false; + inside_objname_ = false; + inside_objsense_ = false; + inside_quadobj_ = false; + inside_qmatrix_ = false; + inside_qcmatrix_ = true; + parse_qcmatrix_header(line); } else if (line.find("ENDATA", 0, 6) == 0) { encountered_sections.insert("ENDATA"); break; @@ -736,6 +825,7 @@ void mps_parser_t::parse_string(char* buf) inside_objname_ = false; inside_quadobj_ = false; inside_qmatrix_ = false; + inside_qcmatrix_ = false; } else { 
mps_parser_expects(false, error_type_t::ValidationError, @@ -762,13 +852,15 @@ void mps_parser_t::parse_string(char* buf) parse_quad(line, true); } else if (inside_qmatrix_) { parse_quad(line, false); + } else if (inside_qcmatrix_) { + parse_qcmatrix_data(line); } else { mps_parser_expects(false, error_type_t::ValidationError, "Ended up at a bad parser state! Line=%s", std::string(line).c_str()); } - } while ((c_line = strtok(nullptr, "\n")) != nullptr); + } while ((c_line = strtok_r(nullptr, "\n", &saveptr)) != nullptr); mps_parser_expects(!objective_name.empty(), error_type_t::ValidationError, "No objective found!"); mps_parser_expects( @@ -829,6 +921,19 @@ mps_parser_t::mps_parser_t(mps_data_model_t& problem, fill_problem(problem); } +template +mps_parser_t::mps_parser_t(mps_data_model_t& problem, + std::string_view input, + bool _fixed_mps_format) + : mps_file{""}, fixed_mps_format(_fixed_mps_format) +{ + std::vector buf = string_to_buffer(input); + + parse_string(buf.data()); + + fill_problem(problem); +} + template void mps_parser_t::parse_rows(std::string_view line) { @@ -1281,6 +1386,123 @@ void mps_parser_t::parse_objname(std::string_view line) } } +template +void mps_parser_t::flush_qcmatrix_block() +{ + if (qcmatrix_active_row_id_ < 0) { return; } + if (qcmatrix_current_entries_.empty()) { + qcmatrix_active_row_id_ = -1; + return; + } + for (const auto& b : qcmatrix_blocks_) { + mps_parser_expects(b.constraint_row_id != qcmatrix_active_row_id_, + error_type_t::ValidationError, + "Duplicate QCMATRIX block for the same constraint row (index %d)", + static_cast(qcmatrix_active_row_id_)); + } + qcmatrix_raw_block_t block; + block.constraint_row_id = qcmatrix_active_row_id_; + block.entries = std::move(qcmatrix_current_entries_); + qcmatrix_blocks_.push_back(std::move(block)); + qcmatrix_active_row_id_ = -1; +} + +template +void mps_parser_t::parse_qcmatrix_header(std::string_view line) +{ + std::string row_name; + if (fixed_mps_format) { + 
mps_parser_expects(line.size() >= 19, + error_type_t::ValidationError, + "QCMATRIX header line too short! line=%s", + std::string(line).c_str()); + // fixed MPS: constraint name starts in column 12 (1-based) → 0-based index 11, 8 chars + row_name = std::string(trim(line.substr(11, 8))); + } else { + std::stringstream ss{std::string(line)}; + std::string kw; + ss >> kw; + mps_parser_expects(kw == "QCMATRIX", + error_type_t::ValidationError, + "Expected QCMATRIX keyword! line=%s", + std::string(line).c_str()); + ss >> row_name; + mps_parser_expects(!row_name.empty(), + error_type_t::ValidationError, + "QCMATRIX missing constraint row name! line=%s", + std::string(line).c_str()); + } + + auto row_it = row_names_map.find(row_name); + mps_parser_expects(row_it != row_names_map.end(), + error_type_t::ValidationError, + "Unknown constraint row name '%s' in QCMATRIX! line=%s", + row_name.c_str(), + std::string(line).c_str()); + + qcmatrix_active_row_id_ = row_it->second; +} + +template +void mps_parser_t::parse_qcmatrix_data(std::string_view line) +{ + mps_parser_expects(qcmatrix_active_row_id_ >= 0, + error_type_t::ValidationError, + "QCMATRIX data line before a valid QCMATRIX header! line=%s", + std::string(line).c_str()); + + std::string var1_name, var2_name; + f_t value; + + if (fixed_mps_format) { + mps_parser_expects(line.size() >= 25, + error_type_t::ValidationError, + "QCMATRIX data line should have at least 3 entities! line=%s", + std::string(line).c_str()); + + var1_name = std::string(trim(line.substr(4, 8))); + var2_name = std::string(trim(line.substr(14, 8))); + if (var1_name[0] == '$' || var2_name[0] == '$') return; + + i_t pos = 24; + value = get_numerical_bound(line, pos); + } else { + i_t pos = 0; + i_t end = 0; + const std::string_view var1_sv = get_next_string(line, pos, end); + mps_parser_expects(!var1_sv.empty(), + error_type_t::ValidationError, + "QCMATRIX data line missing first variable name! 
line=%s", + std::string(line).c_str()); + if (var1_sv[0] == '$') return; + const std::string_view var2_sv = get_next_string(line, pos, end); + mps_parser_expects(!var2_sv.empty(), + error_type_t::ValidationError, + "QCMATRIX data line missing second variable name! line=%s", + std::string(line).c_str()); + if (var2_sv[0] == '$') return; + value = get_numerical_bound(line, end); + var1_name = std::string(var1_sv); + var2_name = std::string(var2_sv); + } + + auto var1_it = var_names_map.find(var1_name); + auto var2_it = var_names_map.find(var2_name); + + mps_parser_expects(var1_it != var_names_map.end(), + error_type_t::ValidationError, + "Variable '%s' not found in QCMATRIX! line=%s", + var1_name.c_str(), + std::string(line).c_str()); + mps_parser_expects(var2_it != var_names_map.end(), + error_type_t::ValidationError, + "Variable '%s' not found in QCMATRIX! line=%s", + var2_name.c_str(), + std::string(line).c_str()); + + qcmatrix_current_entries_.emplace_back(var1_it->second, var2_it->second, value); +} + template void mps_parser_t::parse_quad(std::string_view line, bool is_quadobj) { @@ -1303,9 +1525,23 @@ void mps_parser_t::parse_quad(std::string_view line, bool is_quadobj) i_t pos = 24; value = get_numerical_bound(line, pos); } else { - std::stringstream ss{std::string(line)}; - ss >> var1_name >> var2_name >> value; - if (var1_name[0] == '$' || var2_name[0] == '$') return; + i_t pos = 0; + i_t end = 0; + const std::string_view var1_sv = get_next_string(line, pos, end); + mps_parser_expects(!var1_sv.empty(), + error_type_t::ValidationError, + "QUADOBJ/QMATRIX data line missing first variable name! line=%s", + std::string(line).c_str()); + if (var1_sv[0] == '$') return; + const std::string_view var2_sv = get_next_string(line, pos, end); + mps_parser_expects(!var2_sv.empty(), + error_type_t::ValidationError, + "QUADOBJ/QMATRIX data line missing second variable name! 
line=%s", + std::string(line).c_str()); + if (var2_sv[0] == '$') return; + value = get_numerical_bound(line, end); + var1_name = std::string(var1_sv); + var2_name = std::string(var2_sv); } // Find variable indices @@ -1377,6 +1613,7 @@ void mps_parser_t::read_bound_and_value(std::string_view line, switch (bound_type) { case LowerBound: { variable_lower_bounds[var_id] = get_numerical_bound(line, start); + lower_bounds_defined_for_var_id.insert(var_id); break; } case UpperBound: { @@ -1393,15 +1630,18 @@ void mps_parser_t::read_bound_and_value(std::string_view line, const f_t val = get_numerical_bound(line, start); variable_lower_bounds[var_id] = val; variable_upper_bounds[var_id] = val; + lower_bounds_defined_for_var_id.insert(var_id); break; } case Free: { variable_lower_bounds[var_id] = -std::numeric_limits::infinity(); variable_upper_bounds[var_id] = +std::numeric_limits::infinity(); + lower_bounds_defined_for_var_id.insert(var_id); break; } case LowerBoundNegInf: variable_lower_bounds[var_id] = -std::numeric_limits::infinity(); + lower_bounds_defined_for_var_id.insert(var_id); break; case UpperBoundInf: variable_upper_bounds[var_id] = +std::numeric_limits::infinity(); @@ -1410,6 +1650,7 @@ void mps_parser_t::read_bound_and_value(std::string_view line, variable_lower_bounds[var_id] = 0; variable_upper_bounds[var_id] = 1; var_types[var_id] = 'I'; + lower_bounds_defined_for_var_id.insert(var_id); break; case LowerBoundIntegerVariable: // CPLEX MPS file references seems to imply that integer variables default to an upper bound @@ -1419,6 +1660,7 @@ void mps_parser_t::read_bound_and_value(std::string_view line, } variable_lower_bounds[var_id] = get_numerical_bound(line, start); var_types[var_id] = 'I'; + lower_bounds_defined_for_var_id.insert(var_id); break; case UpperBoundIntegerVariable: variable_upper_bounds[var_id] = get_numerical_bound(line, start); @@ -1430,11 +1672,15 @@ void mps_parser_t::read_bound_and_value(std::string_view line, } var_types[var_id] = 'I'; 
break; - case SemiContiniousVariable: - mps_parser_expects(false, + case SemiContinuousVariable: + // SC bound type: value is the upper bound U. + mps_parser_expects(start >= 0 && static_cast(start) < line.size() && + !trim(line.substr(static_cast(start))).empty(), error_type_t::ValidationError, - "Unsupported semi continous bound type found! Line=%s", + "SC bound requires an upper bound value! Line=%s", std::string(line).c_str()); + variable_upper_bounds[var_id] = get_numerical_bound(line, start); + var_types[var_id] = 'S'; break; default: mps_parser_expects(false, diff --git a/cpp/libmps_parser/src/mps_parser.hpp b/cpp/libmps_parser/src/mps_parser.hpp index facad14c66..f2a9ce14e0 100644 --- a/cpp/libmps_parser/src/mps_parser.hpp +++ b/cpp/libmps_parser/src/mps_parser.hpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -41,7 +42,7 @@ enum BoundType { BinaryVariable, LowerBoundIntegerVariable, UpperBoundIntegerVariable, - SemiContiniousVariable, + SemiContinuousVariable, }; // enum BoundType /** @@ -76,6 +77,18 @@ class mps_parser_t { const std::string& file, bool fixed_mps_format = true); + /** + * @brief Ctor. Parses the MPS text and generates the internal representation. + * + * @param[out] problem Problem representation that will be filled after parsing the MPS text + * @param[in] input MPS text to be parsed + * @param[in] fixed_mps_format Bool which describes whether the MPS file is in fixed format or + * not. Default is true. 
+ */ + mps_parser_t(mps_data_model_t& problem, + std::string_view input, + bool fixed_mps_format = true); + /** path to the mps file being parsed */ std::string mps_file{}; /** whether the MPS file is in fixed format or not */ @@ -130,11 +143,24 @@ class mps_parser_t { // QPS-specific parsing states bool inside_quadobj_{false}; bool inside_qmatrix_{false}; + bool inside_qcmatrix_{false}; + + /** (free-format) QCMATRIX: finalized blocks (row id + triples) */ + struct qcmatrix_raw_block_t { + i_t constraint_row_id{}; + std::vector> entries{}; + }; + std::vector qcmatrix_blocks_{}; + /** Triples for the QCMATRIX block currently being read (-1 row id means none) */ + i_t qcmatrix_active_row_id_{-1}; + std::vector> qcmatrix_current_entries_{}; + std::unordered_set encountered_sections{}; std::unordered_map row_names_map{}; std::unordered_map var_names_map{}; std::unordered_set ignored_objective_names{}; std::unordered_set bounds_defined_for_var_id{}; + std::unordered_set lower_bounds_defined_for_var_id{}; static constexpr f_t unset_range_value = std::numeric_limits::infinity(); /* Reads an MPS input file into a buffer. 
@@ -170,6 +196,11 @@ class mps_parser_t { // QPS-specific parsing methods void parse_quad(std::string_view line, bool is_quadobj); + // QCMATRIX-specific parsing methods + void flush_qcmatrix_block(); + void parse_qcmatrix_header(std::string_view line); + void parse_qcmatrix_data(std::string_view line); + }; // class mps_parser_t } // namespace cuopt::mps_parser diff --git a/cpp/libmps_parser/src/mps_writer.cpp b/cpp/libmps_parser/src/mps_writer.cpp index 3a0997774b..b112b53476 100644 --- a/cpp/libmps_parser/src/mps_writer.cpp +++ b/cpp/libmps_parser/src/mps_writer.cpp @@ -12,16 +12,29 @@ #include #include +#include #include #include #include -#include #include #include #include +#include namespace cuopt::mps_parser { +namespace { + +template +char linear_row_type_from_bounds(f_t cl, f_t cu) +{ + if (cl == cu) { return 'E'; } + if (std::isinf(cu)) { return 'G'; } + return 'L'; +} + +} // namespace + template mps_writer_t::mps_writer_t(const data_model_view_t& problem) : problem_(problem) { @@ -103,6 +116,12 @@ data_model_view_t mps_writer_t::create_view( static_cast(Q_offsets.size())); } + if (model.has_quadratic_constraints()) { + view.set_quadratic_constraints( + std::vector::quadratic_constraint_t>( + model.get_quadratic_constraints())); + } + return view; } @@ -129,6 +148,8 @@ void mps_writer_t::write(const std::string& mps_file_path) n_constraints = problem_.get_constraint_bounds().size(); else n_constraints = problem_.get_constraint_lower_bounds().size(); + const auto& quadratic_constraints = problem_.get_quadratic_constraints(); + const i_t n_quadratic_constraints = static_cast(quadratic_constraints.size()); std::vector objective_coefficients(problem_.get_objective_coefficients().size()); std::vector constraint_lower_bounds(n_constraints); @@ -211,16 +232,20 @@ void mps_writer_t::write(const std::string& mps_file_path) mps_file << " N " << (problem_.get_objective_name().empty() ? 
"OBJ" : problem_.get_objective_name()) << "\n"; - for (size_t i = 0; i < (size_t)n_constraints; i++) { + for (size_t k = 0; k < static_cast(n_constraints); ++k) { std::string row_name = - i < problem_.get_row_names().size() ? problem_.get_row_names()[i] : "R" + std::to_string(i); - char type = 'L'; - if (constraint_lower_bounds[i] == constraint_upper_bounds[i]) - type = 'E'; - else if (std::isinf(constraint_upper_bounds[i])) - type = 'G'; + k < problem_.get_row_names().size() ? problem_.get_row_names()[k] : "R" + std::to_string(k); + char const type = + linear_row_type_from_bounds(constraint_lower_bounds[k], constraint_upper_bounds[k]); mps_file << " " << type << " " << row_name << "\n"; } + for (size_t q = 0; q < quadratic_constraints.size(); ++q) { + const auto& qc = quadratic_constraints[q]; + std::string row_name = + qc.constraint_row_name.empty() ? "QC" + std::to_string(q) : qc.constraint_row_name; + // Quadratic rows are currently restricted to MPS 'L' (<=). + mps_file << " L " << row_name << "\n"; + } // COLUMNS section mps_file << "COLUMNS\n"; @@ -230,9 +255,13 @@ void mps_writer_t::write(const std::string& mps_file_path) std::vector var_in_constraint(n_variables, false); std::map>> integral_col_nnzs; std::map>> continuous_col_nnzs; - for (size_t row_id = 0; row_id < (size_t)n_constraints; row_id++) { - for (size_t k = (size_t)constraint_matrix_offsets[row_id]; - k < (size_t)constraint_matrix_offsets[row_id + 1]; + + // iterate over the constraint matrix and add the nonzeros to the integral and continuous col_nnzs + // maps + for (size_t csr_row = 0; csr_row < (size_t)n_constraints; csr_row++) { + const i_t row_id = static_cast(csr_row); + for (size_t k = (size_t)constraint_matrix_offsets[csr_row]; + k < (size_t)constraint_matrix_offsets[csr_row + 1]; k++) { size_t var = (size_t)constraint_matrix_indices[k]; if (variable_types[var] == 'I') { @@ -244,6 +273,24 @@ void mps_writer_t::write(const std::string& mps_file_path) } } + // Quadratic constraint rows 
omit linear coefficients from global A; add them from QC bundles. + if (problem_.has_quadratic_constraints()) { + for (size_t q = 0; q < quadratic_constraints.size(); ++q) { + const auto& qc = quadratic_constraints[q]; + const size_t row_id = static_cast(n_constraints) + q; + for (size_t t = 0; t < qc.linear_indices.size(); ++t) { + size_t var = static_cast(qc.linear_indices[t]); + f_t val = qc.linear_values[t]; + if (variable_types[var] == 'I') { + integral_col_nnzs[var].emplace_back(row_id, val); + } else { + continuous_col_nnzs[var].emplace_back(row_id, val); + } + var_in_constraint[var] = true; + } + } + } + // Record and explicitely declared variables not contained in any constraint. std::vector orphan_continuous_vars; std::vector orphan_integer_vars; @@ -276,9 +323,21 @@ void mps_writer_t::write(const std::string& mps_file_path) ? problem_.get_variable_names()[var_id] : "C" + std::to_string(var_id); for (auto& nnz : nnzs) { - std::string row_name = nnz.first < problem_.get_row_names().size() - ? problem_.get_row_names()[nnz.first] - : "R" + std::to_string(nnz.first); + std::string row_name; + if (static_cast(nnz.first) < static_cast(n_constraints)) { + // Linear rows: do not use row-name count here—names are optional; row id is 0..m-1. + row_name = static_cast(nnz.first) < problem_.get_row_names().size() + ? problem_.get_row_names()[nnz.first] + : "R" + std::to_string(nnz.first); + } else if (static_cast(nnz.first) < + static_cast(n_constraints) + quadratic_constraints.size()) { + const size_t q = static_cast(nnz.first) - static_cast(n_constraints); + row_name = quadratic_constraints[q].constraint_row_name.empty() + ? 
"QC" + std::to_string(q) + : quadratic_constraints[q].constraint_row_name; + } else { + row_name = "R" + std::to_string(nnz.first); + } mps_file << " " << col_name << " " << row_name << " " << nnz.second << "\n"; } // Write objective coefficients @@ -293,21 +352,28 @@ void mps_writer_t::write(const std::string& mps_file_path) // RHS section mps_file << "RHS\n"; - for (size_t i = 0; i < (size_t)n_constraints; i++) { + for (size_t k = 0; k < static_cast(n_constraints); ++k) { std::string row_name = - i < problem_.get_row_names().size() ? problem_.get_row_names()[i] : "R" + std::to_string(i); - - f_t rhs; + k < problem_.get_row_names().size() ? problem_.get_row_names()[k] : "R" + std::to_string(k); + f_t rhs{0}; if (constraint_bounds.size() > 0) - rhs = constraint_bounds[i]; - else if (std::isinf(constraint_lower_bounds[i])) { - rhs = constraint_upper_bounds[i]; - } else if (std::isinf(constraint_upper_bounds[i])) { - rhs = constraint_lower_bounds[i]; - } else { // RANGES, encode the lower bound - rhs = constraint_lower_bounds[i]; + rhs = constraint_bounds[k]; + else if (std::isinf(constraint_lower_bounds[k])) { + rhs = constraint_upper_bounds[k]; + } else if (std::isinf(constraint_upper_bounds[k])) { + rhs = constraint_lower_bounds[k]; + } else { + rhs = constraint_lower_bounds[k]; } - + if (std::isfinite(rhs) && rhs != 0.0) { + mps_file << " RHS1 " << row_name << " " << rhs << "\n"; + } + } + for (size_t q = 0; q < quadratic_constraints.size(); ++q) { + const auto& qc = quadratic_constraints[q]; + std::string row_name = + qc.constraint_row_name.empty() ? 
"QC" + std::to_string(q) : qc.constraint_row_name; + const f_t rhs = qc.rhs_value; if (std::isfinite(rhs) && rhs != 0.0) { mps_file << " RHS1 " << row_name << " " << rhs << "\n"; } @@ -427,6 +493,29 @@ void mps_writer_t::write(const std::string& mps_file_path) } } + // QCMATRIX sections for quadratic constraints (QCQP) + if (problem_.has_quadratic_constraints()) { + for (const auto& qc : problem_.get_quadratic_constraints()) { + mps_file << "QCMATRIX " << qc.constraint_row_name << "\n"; + const i_t n_quad_rows = static_cast(qc.quadratic_offsets.size()) - 1; + for (i_t i = 0; i < n_quad_rows; ++i) { + std::string row_var_name = static_cast(i) < problem_.get_variable_names().size() + ? problem_.get_variable_names()[i] + : "C" + std::to_string(i); + for (i_t p = qc.quadratic_offsets[i]; p < qc.quadratic_offsets[i + 1]; ++p) { + i_t j = qc.quadratic_indices[p]; + f_t v = qc.quadratic_values[p]; + std::string col_var_name = static_cast(j) < problem_.get_variable_names().size() + ? problem_.get_variable_names()[j] + : "C" + std::to_string(j); + if (v != f_t(0)) { + mps_file << " " << row_var_name << " " << col_var_name << " " << v << "\n"; + } + } + } + } + } + mps_file << "ENDATA\n"; mps_file.close(); } diff --git a/cpp/libmps_parser/src/parser.cpp b/cpp/libmps_parser/src/parser.cpp index 3cbb4aee98..681fddf380 100644 --- a/cpp/libmps_parser/src/parser.cpp +++ b/cpp/libmps_parser/src/parser.cpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -19,8 +19,21 @@ mps_data_model_t parse_mps(const std::string& mps_file, bool fixed_mps return problem; } +template +mps_data_model_t parse_mps_from_string(std::string_view mps_contents, + bool fixed_mps_format) +{ + mps_data_model_t problem; + mps_parser_t parser(problem, mps_contents, fixed_mps_format); + return problem; +} + template mps_data_model_t parse_mps(const std::string& mps_file, bool fixed_mps_format); template mps_data_model_t parse_mps(const std::string& mps_file, bool fixed_mps_format); +template mps_data_model_t parse_mps_from_string(std::string_view mps_contents, + bool fixed_mps_format); +template mps_data_model_t parse_mps_from_string(std::string_view mps_contents, + bool fixed_mps_format); } // namespace cuopt::mps_parser diff --git a/cpp/libmps_parser/src/utilities/error.hpp b/cpp/libmps_parser/src/utilities/error.hpp index 4ce68f5098..595a29059d 100644 --- a/cpp/libmps_parser/src/utilities/error.hpp +++ b/cpp/libmps_parser/src/utilities/error.hpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -49,9 +49,7 @@ inline void mps_parser_expects(bool cond, error_type_t error_type, const char* f if (not cond) { va_list args; va_start(args, fmt); - char msg[2048]; - va_start(args, fmt); vsnprintf(msg, sizeof(msg), fmt, args); va_end(args); @@ -75,9 +73,7 @@ inline void mps_parser_expects_fatal(bool cond, error_type_t error_type, const c if (not cond) { va_list args; va_start(args, fmt); - char msg[2048]; - va_start(args, fmt); vsnprintf(msg, sizeof(msg), fmt, args); va_end(args); std::string error_string = error_to_string(error_type); diff --git a/cpp/libmps_parser/tests/CMakeLists.txt b/cpp/libmps_parser/tests/CMakeLists.txt index 2d86a1da18..6d8b5b2ca5 100644 --- a/cpp/libmps_parser/tests/CMakeLists.txt +++ b/cpp/libmps_parser/tests/CMakeLists.txt @@ -12,11 +12,6 @@ function(ConfigureTest CMAKE_TEST_NAME) set_target_properties(${CMAKE_TEST_NAME} PROPERTIES - # set target compile options - CXX_STANDARD 20 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 20 - CUDA_STANDARD_REQUIRED ON POSITION_INDEPENDENT_CODE ON CXX_SCAN_FOR_MODULES OFF ) @@ -26,6 +21,7 @@ function(ConfigureTest CMAKE_TEST_NAME) "${CMAKE_CURRENT_SOURCE_DIR}/../include" "${CMAKE_CURRENT_SOURCE_DIR}/../src" "${CMAKE_CURRENT_SOURCE_DIR}" + "${CMAKE_CURRENT_SOURCE_DIR}/../../tests" ) target_link_libraries(${CMAKE_TEST_NAME} @@ -53,4 +49,5 @@ endfunction() ConfigureTest(MPS_PARSER_TEST mps_parser_test.cpp ) + set_tests_properties(MPS_PARSER_TEST PROPERTIES LABELS "numopt") ################################################################################################### diff --git a/cpp/libmps_parser/tests/mps_parser_test.cpp b/cpp/libmps_parser/tests/mps_parser_test.cpp index f915fb2df5..0c3b2dcb5a 100644 --- a/cpp/libmps_parser/tests/mps_parser_test.cpp +++ b/cpp/libmps_parser/tests/mps_parser_test.cpp @@ -6,6 +6,7 @@ /* clang-format on */ #include +#include #include #include @@ -13,10 +14,13 @@ #include +#include #include #include 
#include +#include #include +#include #include #include @@ -422,6 +426,59 @@ TEST(mps_bounds, upper_inf_var_bound) EXPECT_EQ(std::numeric_limits::infinity(), mps.variable_upper_bounds[1]); } +TEST(mps_bounds, semi_continuous_var_bounds_from_dataset) +{ + struct Case { + const char* name; + const char* mps; + int n_vars; + double lower; + double upper; + }; + const std::vector cases = { + {"sc_standard", cuopt::test::inline_mps::sc_standard_mps, 2, 2.0, 10.0}, + {"sc_lb_zero", cuopt::test::inline_mps::sc_lb_zero_mps, 2, 0.0, 10.0}, + {"sc_no_ub", cuopt::test::inline_mps::sc_no_ub_mps, 2, 2.0, 1e30}, + }; + + for (const auto& c : cases) { + SCOPED_TRACE(c.name); + auto mps = cuopt::test::inline_mps::parse_inline_mps(c.mps); + const auto& var_types = mps.get_variable_types(); + const auto& lower = mps.get_variable_lower_bounds(); + const auto& upper = mps.get_variable_upper_bounds(); + + ASSERT_EQ(c.n_vars, static_cast(var_types.size())); + EXPECT_EQ('S', var_types[0]); + ASSERT_EQ(c.n_vars, static_cast(lower.size())); + ASSERT_EQ(c.n_vars, static_cast(upper.size())); + EXPECT_DOUBLE_EQ(c.lower, lower[0]); + EXPECT_DOUBLE_EQ(c.upper, upper[0]); + } +} + +TEST(mps_bounds, semi_continuous_missing_lower_defaults_to_zero) +{ + auto mps = cuopt::test::inline_mps::parse_inline_mps(cuopt::test::inline_mps::sc_lb_zero_mps); + const auto& var_types = mps.get_variable_types(); + const auto& lower = mps.get_variable_lower_bounds(); + const auto& upper = mps.get_variable_upper_bounds(); + + ASSERT_EQ(2, static_cast(var_types.size())); + EXPECT_EQ('S', var_types[0]); + ASSERT_EQ(2, static_cast(lower.size())); + ASSERT_EQ(2, static_cast(upper.size())); + EXPECT_DOUBLE_EQ(0.0, lower[0]); + EXPECT_DOUBLE_EQ(10.0, upper[0]); +} + +TEST(mps_bounds, semi_continuous_missing_upper_rejected) +{ + EXPECT_THROW( + cuopt::test::inline_mps::parse_inline_mps(cuopt::test::inline_mps::sc_missing_upper_mps), + std::logic_error); +} + TEST(mps_ranges, fixed_ranges) { std::string file = 
"linear_programming/good-mps-fixed-ranges.mps"; @@ -555,16 +612,22 @@ TEST(mps_ranges, bad_value) std::logic_error); } -TEST(mps_bounds, unsupported_or_invalid_mps_types) +TEST(mps_bounds, semi_continuous_bound_type) { - std::stringstream ss; - static constexpr int NumMpsFiles = 2; - for (int i = 1; i <= NumMpsFiles; ++i) { - ss << "linear_programming/bad-mps-bound-" << i << ".mps"; - ASSERT_THROW(read_from_mps(ss.str(), false), std::logic_error); - ss.str(std::string{}); - ss.clear(); - }; + auto mps = read_from_mps("linear_programming/good-mps-semi-continuous-bound.mps", false); + + ASSERT_EQ(int(2), mps.var_names.size()); + ASSERT_EQ(int(2), mps.var_types.size()); + EXPECT_EQ('S', mps.var_types[0]); + ASSERT_EQ(int(2), mps.variable_lower_bounds.size()); + ASSERT_EQ(int(2), mps.variable_upper_bounds.size()); + EXPECT_DOUBLE_EQ(0.0, mps.variable_lower_bounds[0]); + EXPECT_DOUBLE_EQ(2.0, mps.variable_upper_bounds[0]); +} + +TEST(mps_bounds, invalid_bound_type) +{ + ASSERT_THROW(read_from_mps("linear_programming/bad-mps-bound-1.mps", false), std::logic_error); } TEST(mps_parser, good_mps_file_mip_1) @@ -841,12 +904,7 @@ TEST(qps_parser, quadratic_objective_basic) std::vector Q_indices = {0, 1, 0, 1}; std::vector Q_offsets = {0, 2, 4}; // CSR offsets - model.set_quadratic_objective_matrix(Q_values.data(), - Q_values.size(), - Q_indices.data(), - Q_indices.size(), - Q_offsets.data(), - Q_offsets.size()); + model.set_quadratic_objective_matrix(Q_values, Q_indices, Q_offsets); // Verify the data was stored correctly EXPECT_TRUE(model.has_quadratic_objective()); @@ -855,6 +913,163 @@ TEST(qps_parser, quadratic_objective_basic) EXPECT_EQ(1.0, model.get_quadratic_objective_values()[1]); } +// ================================================================================================ +// QCMATRIX Support Tests +// ================================================================================================ + +TEST(qps_parser, qcmatrix_append_api) +{ + using model_t 
= mps_data_model_t; + model_t model; + + // Validate default-constructed struct shape. + model_t::quadratic_constraint_t default_qcm; + EXPECT_EQ(0, default_qcm.constraint_row_index); + EXPECT_TRUE(default_qcm.quadratic_values.empty()); + EXPECT_TRUE(default_qcm.quadratic_indices.empty()); + EXPECT_TRUE(default_qcm.quadratic_offsets.empty()); + EXPECT_TRUE(default_qcm.linear_values.empty()); + EXPECT_TRUE(default_qcm.linear_indices.empty()); + EXPECT_EQ(0.0, default_qcm.rhs_value); + + // QC0: [[10, 2], [2, 2]] + const std::vector qc0_values = {10.0, 2.0, 2.0, 2.0}; + const std::vector qc0_indices = {0, 1, 0, 1}; + const std::vector qc0_offsets = {0, 2, 4}; + const std::vector qc0_linear_values = {1.0, 1.0}; + const std::vector qc0_linear_indices = {0, 1}; + model.append_quadratic_constraint(0, + "QC0", + 'L', + qc0_linear_values, + qc0_linear_indices, + 5.0, + qc0_values, + qc0_indices, + qc0_offsets); + + // QC1: [[4, 1], [1, 6]] + const std::vector qc1_values = {4.0, 1.0, 1.0, 6.0}; + const std::vector qc1_indices = {0, 1, 0, 1}; + const std::vector qc1_offsets = {0, 2, 4}; + const std::vector qc1_linear_values = {3.0, 1.0}; + const std::vector qc1_linear_indices = {0, 1}; + model.append_quadratic_constraint(1, + "QC1", + 'L', + qc1_linear_values, + qc1_linear_indices, + 10.0, + qc1_values, + qc1_indices, + qc1_offsets); + + ASSERT_TRUE(model.has_quadratic_constraints()); + const auto& qcs = model.get_quadratic_constraints(); + ASSERT_EQ(2u, qcs.size()); + + EXPECT_EQ(0, qcs[0].constraint_row_index); + EXPECT_EQ("QC0", qcs[0].constraint_row_name); + EXPECT_EQ('L', qcs[0].constraint_row_type); + EXPECT_EQ(qc0_linear_values, qcs[0].linear_values); + EXPECT_EQ(qc0_linear_indices, qcs[0].linear_indices); + EXPECT_EQ(5.0, qcs[0].rhs_value); + EXPECT_EQ(qc0_values, qcs[0].quadratic_values); + EXPECT_EQ(qc0_indices, qcs[0].quadratic_indices); + EXPECT_EQ(qc0_offsets, qcs[0].quadratic_offsets); + + EXPECT_EQ(1, qcs[1].constraint_row_index); + EXPECT_EQ("QC1", 
qcs[1].constraint_row_name); + EXPECT_EQ('L', qcs[1].constraint_row_type); + EXPECT_EQ(qc1_linear_values, qcs[1].linear_values); + EXPECT_EQ(qc1_linear_indices, qcs[1].linear_indices); + EXPECT_EQ(10.0, qcs[1].rhs_value); + EXPECT_EQ(qc1_values, qcs[1].quadratic_values); + EXPECT_EQ(qc1_indices, qcs[1].quadratic_indices); + EXPECT_EQ(qc1_offsets, qcs[1].quadratic_offsets); +} + +// QCQP MPS: each quadratic constraint bundles row + linear + rhs + quadratic. +TEST(qps_parser, qcmatrix_mps_linear_rhs_and_bounds) +{ + if (!file_exists("qcqp/QC_Test_1.mps")) { + GTEST_SKIP() << "qcqp/QC_Test_1.mps not in dataset root"; + } + const auto model = parse_mps( + cuopt::test::get_rapids_dataset_root_dir() + "/qcqp/QC_Test_1.mps", false); + + ASSERT_TRUE(model.has_quadratic_constraints()); + const auto& qcs = model.get_quadratic_constraints(); + ASSERT_EQ(2u, qcs.size()); + + ASSERT_EQ(1, model.get_n_constraints()); + ASSERT_EQ(1u, model.get_row_names().size()); + EXPECT_EQ("LIN0", model.get_row_names()[0]); + EXPECT_EQ('L', model.get_row_types()[0]); + + // LIN0: 2*x1 + x2 ≤ 15 (linear row only; not duplicated in quadratic_constraints) + EXPECT_DOUBLE_EQ(-std::numeric_limits::infinity(), + model.get_constraint_lower_bounds()[0]); + EXPECT_DOUBLE_EQ(15.0, model.get_constraint_upper_bounds()[0]); + const auto& A_off = model.get_constraint_matrix_offsets(); + const auto& A_val = model.get_constraint_matrix_values(); + const auto& A_idx = model.get_constraint_matrix_indices(); + ASSERT_EQ(2, A_off[1] - A_off[0]); + EXPECT_EQ(2.0, A_val[A_off[0] + 0]); + EXPECT_EQ(1.0, A_val[A_off[0] + 1]); + EXPECT_EQ(0, A_idx[A_off[0] + 0]); + EXPECT_EQ(1, A_idx[A_off[0] + 1]); + + // QC0: x1 + x2 + xᵀQ₀x ≤ 5 (MPS ROWS declaration index 1; OBJ 'N' rows are not counted) + EXPECT_EQ(1, qcs[0].constraint_row_index); + EXPECT_EQ("QC0", qcs[0].constraint_row_name); + EXPECT_EQ('L', qcs[0].constraint_row_type); + ASSERT_EQ(2u, qcs[0].linear_values.size()); + EXPECT_EQ(1.0, qcs[0].linear_values[0]); + 
EXPECT_EQ(1.0, qcs[0].linear_values[1]); + EXPECT_EQ(0, qcs[0].linear_indices[0]); + EXPECT_EQ(1, qcs[0].linear_indices[1]); + EXPECT_DOUBLE_EQ(5.0, qcs[0].rhs_value); + EXPECT_FALSE(qcs[0].quadratic_values.empty()); + + // QC1: 3*x1 + x2 + xᵀQ₁x ≤ 10 + EXPECT_EQ(2, qcs[1].constraint_row_index); + EXPECT_EQ("QC1", qcs[1].constraint_row_name); + EXPECT_EQ('L', qcs[1].constraint_row_type); + ASSERT_EQ(2u, qcs[1].linear_values.size()); + EXPECT_EQ(3.0, qcs[1].linear_values[0]); + EXPECT_EQ(1.0, qcs[1].linear_values[1]); + EXPECT_DOUBLE_EQ(10.0, qcs[1].rhs_value); +} + +TEST(qps_parser, qcqp_p0033_mps_sections) +{ + if (!file_exists("qcqp/p0033_qc1.mps")) { + GTEST_SKIP() << "qcqp/p0033_qc1.mps not in dataset root"; + } + const auto model = parse_mps( + cuopt::test::get_rapids_dataset_root_dir() + "/qcqp/p0033_qc1.mps", false); + + EXPECT_EQ(12, model.get_n_constraints()); + EXPECT_EQ(33, model.get_n_variables()); + ASSERT_EQ(12u, model.get_row_types().size()); + ASSERT_EQ(12u, model.get_row_names().size()); + + const auto& qcs = model.get_quadratic_constraints(); + ASSERT_EQ(4u, qcs.size()); + EXPECT_EQ(12, qcs[0].constraint_row_index); + ASSERT_EQ(1u, qcs[0].linear_values.size()); + EXPECT_DOUBLE_EQ(1.0, qcs[0].linear_values[0]); + + const auto& vnames = model.get_variable_names(); + auto c159_it = std::find(vnames.begin(), vnames.end(), std::string("C159")); + ASSERT_NE(c159_it, vnames.end()); + EXPECT_EQ(static_cast(c159_it - vnames.begin()), qcs[0].linear_indices[0]); + + EXPECT_DOUBLE_EQ(1.0, qcs[0].rhs_value); + EXPECT_FALSE(qcs[0].quadratic_values.empty()); +} + // Test actual QPS files from the dataset TEST(qps_parser, test_qps_files) { @@ -1017,6 +1232,37 @@ void compare_data_models(const mps_data_model_t& original, EXPECT_EQ(orig_Q_off[i], reload_Q_off[i]) << "Q offset mismatch at index " << i; } } + + EXPECT_EQ(original.has_quadratic_constraints(), reloaded.has_quadratic_constraints()); + if (original.has_quadratic_constraints() && 
reloaded.has_quadratic_constraints()) { + const auto& oqc = original.get_quadratic_constraints(); + const auto& rq = reloaded.get_quadratic_constraints(); + ASSERT_EQ(oqc.size(), rq.size()) << "Quadratic constraint count mismatch"; + for (size_t k = 0; k < oqc.size(); ++k) { + EXPECT_EQ(oqc[k].constraint_row_index, rq[k].constraint_row_index); + EXPECT_EQ(oqc[k].constraint_row_name, rq[k].constraint_row_name); + EXPECT_EQ(oqc[k].constraint_row_type, rq[k].constraint_row_type); + EXPECT_NEAR(oqc[k].rhs_value, rq[k].rhs_value, tol); + ASSERT_EQ(oqc[k].linear_values.size(), rq[k].linear_values.size()); + ASSERT_EQ(oqc[k].linear_indices.size(), rq[k].linear_indices.size()); + for (size_t i = 0; i < oqc[k].linear_values.size(); ++i) { + EXPECT_NEAR(oqc[k].linear_values[i], rq[k].linear_values[i], tol); + EXPECT_EQ(oqc[k].linear_indices[i], rq[k].linear_indices[i]); + } + ASSERT_EQ(oqc[k].quadratic_values.size(), rq[k].quadratic_values.size()); + ASSERT_EQ(oqc[k].quadratic_indices.size(), rq[k].quadratic_indices.size()); + ASSERT_EQ(oqc[k].quadratic_offsets.size(), rq[k].quadratic_offsets.size()); + for (size_t i = 0; i < oqc[k].quadratic_values.size(); ++i) { + EXPECT_NEAR(oqc[k].quadratic_values[i], rq[k].quadratic_values[i], tol); + } + for (size_t i = 0; i < oqc[k].quadratic_indices.size(); ++i) { + EXPECT_EQ(oqc[k].quadratic_indices[i], rq[k].quadratic_indices[i]); + } + for (size_t i = 0; i < oqc[k].quadratic_offsets.size(); ++i) { + EXPECT_EQ(oqc[k].quadratic_offsets[i], rq[k].quadratic_offsets[i]); + } + } + } } TEST(mps_roundtrip, linear_programming_basic) @@ -1127,4 +1373,29 @@ TEST(mps_roundtrip, quadratic_programming_qp_test_2) std::filesystem::remove(temp_file); } +TEST(mps_roundtrip, qcqp_p0033_qc1) +{ + if (!file_exists("qcqp/p0033_qc1.mps")) { GTEST_SKIP() << "Test file not found"; } + + std::string input_file = cuopt::test::get_rapids_dataset_root_dir() + "/qcqp/p0033_qc1.mps"; + std::string temp_file = "/tmp/mps_roundtrip_p0033_qc1.mps"; + std::string 
temp_file_2 = "/tmp/mps_roundtrip_p0033_qc1_r2.mps"; + + auto original = parse_mps(input_file, false); + ASSERT_TRUE(original.has_quadratic_objective()); + ASSERT_TRUE(original.has_quadratic_constraints()); + + mps_writer_t writer(original); + writer.write(temp_file); + + auto reloaded = parse_mps(temp_file, false); + mps_writer_t writer_r2(reloaded); + writer_r2.write(temp_file_2); + auto reloaded_2 = parse_mps(temp_file_2, false); + compare_data_models(reloaded, reloaded_2); + + std::filesystem::remove(temp_file); + std::filesystem::remove(temp_file_2); +} + } // namespace cuopt::mps_parser diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu index 4da66abe77..778038db1f 100644 --- a/cpp/src/barrier/barrier.cu +++ b/cpp/src/barrier/barrier.cu @@ -40,7 +40,9 @@ #include #include +#include #include +#include namespace cuopt::linear_programming::dual_simplex { @@ -1092,6 +1094,7 @@ class iteration_data_t { std::sort(column_nz_permutation.begin(), column_nz_permutation.end(), [&column_nz](i_t i, i_t j) { return column_nz[i] < column_nz[j]; }); + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } // We then compute the exact sparsity pattern for columns of A whose where // the number of nonzeros is less than a threshold. This part can be done @@ -1122,6 +1125,7 @@ class iteration_data_t { // The best way to do that is to have A stored in CSR format. 
csr_matrix_t A_row(0, 0, 0); A.to_compressed_row(A_row); + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } std::vector histogram(m + 1, 0); for (i_t j = 0; j < n; j++) { @@ -1251,6 +1255,7 @@ class iteration_data_t { std::sort(permutation.begin(), permutation.end(), [&delta_nz](i_t i, i_t j) { return delta_nz[i] < delta_nz[j]; }); + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } // Now we make a forward pass and compute the number of nonzeros in C // assuming we had included column j @@ -2295,6 +2300,12 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_tfactorize(data.device_augmented); #ifdef CHOLESKY_DEBUG_CHECK @@ -2303,6 +2314,12 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_tfactorize(data.device_ADAT); } diff --git a/cpp/src/barrier/iterative_refinement.hpp b/cpp/src/barrier/iterative_refinement.hpp index d37760cd07..69e72d66bc 100644 --- a/cpp/src/barrier/iterative_refinement.hpp +++ b/cpp/src/barrier/iterative_refinement.hpp @@ -13,6 +13,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/barrier/sparse_cholesky.cuh b/cpp/src/barrier/sparse_cholesky.cuh index f7938fb989..52fea89502 100644 --- a/cpp/src/barrier/sparse_cholesky.cuh +++ b/cpp/src/barrier/sparse_cholesky.cuh @@ -247,8 +247,8 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t { CUDSS_CALL_AND_CHECK_EXIT(cudssSetStream(handle, stream), status, "cudaStreamCreate"); mem_handler.ctx = reinterpret_cast(handle_ptr_->get_workspace_resource()); - mem_handler.device_alloc = cudss_device_alloc; - mem_handler.device_free = cudss_device_dealloc; + mem_handler.device_alloc = cudss_device_alloc; + mem_handler.device_free = cudss_device_dealloc; CUDSS_CALL_AND_CHECK_EXIT( cudssSetDeviceMemHandler(handle, &mem_handler), status, "cudssSetDeviceMemHandler"); diff --git a/cpp/src/branch_and_bound/CMakeLists.txt b/cpp/src/branch_and_bound/CMakeLists.txt 
index 5bb1017120..1e40c1bbf1 100644 --- a/cpp/src/branch_and_bound/CMakeLists.txt +++ b/cpp/src/branch_and_bound/CMakeLists.txt @@ -5,7 +5,6 @@ set(BRANCH_AND_BOUND_SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/branch_and_bound.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/mip_node.cpp ${CMAKE_CURRENT_SOURCE_DIR}/pseudo_costs.cpp ${CMAKE_CURRENT_SOURCE_DIR}/diving_heuristics.cpp ) diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index 33a2d983c9..1acc16af54 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -6,10 +6,13 @@ /* clang-format on */ #include +#include #include #include #include +#include +#include #include #include @@ -25,6 +28,7 @@ #include #include +#include #include @@ -33,17 +37,12 @@ #include #include #include -#include #include -#include #include #include -#include -#include #include namespace cuopt::linear_programming::dual_simplex { - namespace { template @@ -258,7 +257,7 @@ branch_and_bound_t::branch_and_bound_t( incumbent_(1), root_relax_soln_(1, 1), root_crossover_soln_(1, 1), - pc_(1), + pc_(1, solver_settings), solver_status_(mip_status_t::UNSET) { exploration_stats_.start_time = start_time; @@ -299,10 +298,11 @@ branch_and_bound_t::branch_and_bound_t( template f_t branch_and_bound_t::get_lower_bound() { - f_t lower_bound = lower_bound_ceiling_.load(); - f_t heap_lower_bound = node_queue_.get_lower_bound(); - lower_bound = std::min(heap_lower_bound, lower_bound); - lower_bound = std::min(worker_pool_.get_lower_bound(), lower_bound); + f_t lower_bound = lower_bound_ceiling_.load(); + f_t heap_lower_bound = node_queue_.get_lower_bound(); + f_t worker_lower_bound = worker_pool_.get_lower_bound(); + lower_bound = std::min(heap_lower_bound, lower_bound); + lower_bound = std::min(worker_lower_bound, lower_bound); if (std::isfinite(lower_bound)) { return lower_bound; @@ -809,7 +809,7 @@ void branch_and_bound_t::add_feasible_solution(f_t 
leaf_objective, // Technische Universit¨at Berlin, Berlin, 1999. Accessed: Aug. 08, 2025. // [Online]. Available: https://opus4.kobv.de/opus4-zib/frontdoor/index/index/docId/391 template -rounding_direction_t martin_criteria(f_t val, f_t root_val) +branch_direction_t martin_criteria(f_t val, f_t root_val) { const f_t down_val = std::floor(root_val); const f_t up_val = std::ceil(root_val); @@ -818,10 +818,10 @@ rounding_direction_t martin_criteria(f_t val, f_t root_val) constexpr f_t eps = 1e-6; if (down_dist < up_dist + eps) { - return rounding_direction_t::DOWN; + return branch_direction_t::DOWN; } else { - return rounding_direction_t::UP; + return branch_direction_t::UP; } } @@ -832,9 +832,9 @@ branch_variable_t branch_and_bound_t::variable_selection( branch_and_bound_worker_t* worker) { logger_t log; - log.log = false; - i_t branch_var = -1; - rounding_direction_t round_dir = rounding_direction_t::NONE; + log.log = false; + i_t branch_var = -1; + branch_direction_t round_dir = branch_direction_t::NONE; std::vector current_incumbent; std::vector& solution = worker->leaf_solution.x; @@ -847,14 +847,12 @@ branch_variable_t branch_and_bound_t::variable_selection( worker, var_types_, exploration_stats_, - settings_, upper_bound_, worker_pool_.num_idle_workers(), - log, new_slacks_, original_lp_); } else { - branch_var = pc_.variable_selection(fractional, solution, log); + branch_var = pc_.variable_selection(fractional, solution); } round_dir = martin_criteria(solution[branch_var], root_relax_soln_.x[branch_var]); @@ -879,7 +877,7 @@ branch_variable_t branch_and_bound_t::variable_selection( default: log.debug("Unknown variable selection method: %d\n", worker->search_strategy); - return {-1, rounding_direction_t::NONE}; + return {-1, branch_direction_t::NONE}; } } @@ -906,7 +904,7 @@ struct tree_update_policy_t { const std::vector& x) = 0; virtual void on_node_completed(mip_node_t* node, node_status_t status, - rounding_direction_t dir) = 0; + branch_direction_t dir) = 
0; virtual void on_numerical_issue(mip_node_t*) = 0; virtual void graphviz(search_tree_t&, mip_node_t*, const char*, f_t) = 0; virtual void on_optimal_callback(const std::vector&, f_t) = 0; @@ -951,9 +949,7 @@ struct nondeterministic_policy_t : tree_update_policy_t { const std::vector& x) override { if (worker->search_strategy == search_strategy_t::BEST_FIRST) { - logger_t pc_log; - pc_log.log = false; - node->objective_estimate = bnb.pc_.obj_estimate(fractional, x, node->lower_bound, pc_log); + node->objective_estimate = bnb.pc_.obj_estimate(fractional, x, node->lower_bound); } } @@ -985,7 +981,7 @@ struct nondeterministic_policy_t : tree_update_policy_t { } } - void on_node_completed(mip_node_t*, node_status_t, rounding_direction_t) override {} + void on_node_completed(mip_node_t*, node_status_t, branch_direction_t) override {} }; template @@ -1004,7 +1000,7 @@ struct deterministic_policy_base_t : tree_update_policy_t { { if (node->branch_var < 0) return; f_t change = std::max(leaf_obj - node->lower_bound, f_t(0)); - f_t frac = node->branch_dir == rounding_direction_t::DOWN + f_t frac = node->branch_dir == branch_direction_t::DOWN ? 
node->fractional_val - std::floor(node->fractional_val) : std::ceil(node->fractional_val) - node->fractional_val; if (frac > 1e-10) { @@ -1048,13 +1044,15 @@ struct deterministic_bfs_policy_t const std::vector& fractional, const std::vector& x) override { + logger_t log; + log.log = false; node->objective_estimate = this->worker.pc_snapshot.obj_estimate(fractional, x, node->lower_bound); } void on_node_completed(mip_node_t* node, node_status_t status, - rounding_direction_t dir) override + branch_direction_t dir) override { switch (status) { case node_status_t::INFEASIBLE: this->worker.record_infeasible(node); break; @@ -1114,25 +1112,28 @@ struct deterministic_diving_policy_t const std::vector& fractional, const std::vector& x) override { + logger_t log; + log.log = false; + switch (this->worker.diving_type) { case search_strategy_t::PSEUDOCOST_DIVING: - return this->worker.variable_selection_from_snapshot(fractional, x); + return pseudocost_diving( + this->worker.pc_snapshot, fractional, x, *this->worker.root_solution, log); case search_strategy_t::LINE_SEARCH_DIVING: - if (this->worker.root_solution) { - logger_t log; - log.log = false; - return line_search_diving(fractional, x, *this->worker.root_solution, log); - } - return this->worker.variable_selection_from_snapshot(fractional, x); + return line_search_diving(fractional, x, *this->worker.root_solution, log); case search_strategy_t::GUIDED_DIVING: - return this->worker.guided_variable_selection(fractional, x); + if (this->worker.incumbent_snapshot.empty()) { + return pseudocost_diving( + this->worker.pc_snapshot, fractional, x, *this->worker.root_solution, log); + } else { + return guided_diving( + this->worker.pc_snapshot, fractional, x, this->worker.incumbent_snapshot, log); + } case search_strategy_t::COEFFICIENT_DIVING: { - logger_t log; - log.log = false; - return coefficient_diving(this->bnb.original_lp_, + return coefficient_diving(this->worker.leaf_problem, fractional, x, this->bnb.var_up_locks_, @@ 
-1140,7 +1141,7 @@ struct deterministic_diving_policy_t log); } - default: return this->worker.variable_selection_from_snapshot(fractional, x); + default: CUOPT_LOG_ERROR("Invalid diving method!"); return {-1, branch_direction_t::NONE}; } } @@ -1152,10 +1153,10 @@ struct deterministic_diving_policy_t void on_node_completed(mip_node_t* node, node_status_t status, - rounding_direction_t dir) override + branch_direction_t dir) override { if (status == node_status_t::HAS_CHILDREN) { - if (dir == rounding_direction_t::UP) { + if (dir == branch_direction_t::UP) { stack.push_front(node->get_down_child()); stack.push_front(node->get_up_child()); } else { @@ -1174,7 +1175,7 @@ struct deterministic_diving_policy_t template template -std::pair branch_and_bound_t::update_tree_impl( +std::pair branch_and_bound_t::update_tree_impl( mip_node_t* node_ptr, search_tree_t& search_tree, WorkerT* worker, @@ -1186,7 +1187,10 @@ std::pair branch_and_bound_t::upd lp_solution_t& leaf_solution = worker->leaf_solution; const f_t upper_bound = policy.upper_bound(); node_status_t status = node_status_t::PENDING; - rounding_direction_t round_dir = rounding_direction_t::NONE; + branch_direction_t round_dir = branch_direction_t::NONE; + + worker->recompute_basis = true; + worker->recompute_bounds = true; if (lp_status == dual::status_t::DUAL_UNBOUNDED) { node_ptr->lower_bound = inf; @@ -1244,9 +1248,11 @@ std::pair branch_and_bound_t::upd assert(node_ptr->vstatus.size() == leaf_problem.num_cols); assert(branch_var >= 0); - assert(dir != rounding_direction_t::NONE); + assert(dir != branch_direction_t::NONE); policy.update_objective_estimate(node_ptr, leaf_fractional, leaf_solution.x); + worker->recompute_basis = false; + worker->recompute_bounds = false; logger_t log; log.log = false; @@ -1283,7 +1289,7 @@ std::pair branch_and_bound_t::upd } template -std::pair branch_and_bound_t::update_tree( +std::pair branch_and_bound_t::update_tree( mip_node_t* node_ptr, search_tree_t& search_tree, 
branch_and_bound_worker_t* worker, @@ -1376,7 +1382,7 @@ dual::status_t branch_and_bound_t::solve_node_lp( node_ptr->node_id, node_ptr->depth, node_ptr->branch_var, - node_ptr->branch_dir == rounding_direction_t::DOWN ? "DOWN" : "UP", + node_ptr->branch_dir == branch_direction_t::DOWN ? "DOWN" : "UP", node_ptr->fractional_val, node_ptr->branch_var_lower, node_ptr->branch_var_upper, @@ -1510,7 +1516,7 @@ void branch_and_bound_t::plunge_with(branch_and_bound_worker_tget_down_child()); } else { @@ -1622,7 +1628,7 @@ void branch_and_bound_t::dive_with(branch_and_bound_worker_t worker->recompute_bounds = node_status != node_status_t::HAS_CHILDREN; if (node_status == node_status_t::HAS_CHILDREN) { - if (round_dir == rounding_direction_t::UP) { + if (round_dir == branch_direction_t::UP) { stack.push_front(node_ptr->get_down_child()); stack.push_front(node_ptr->get_up_child()); } else { @@ -1754,7 +1760,7 @@ void branch_and_bound_t::run_scheduler() active_workers_per_strategy_[strategy]++; launched_any_task = true; -#pragma omp task affinity(worker) +#pragma omp task affinity(worker) default(none) firstprivate(worker) plunge_with(worker); } else { @@ -1775,7 +1781,7 @@ void branch_and_bound_t::run_scheduler() active_workers_per_strategy_[strategy]++; launched_any_task = true; -#pragma omp task affinity(worker) +#pragma omp task affinity(worker) default(none) firstprivate(worker) dive_with(worker); } } @@ -1800,7 +1806,9 @@ void branch_and_bound_t::run_scheduler() template void branch_and_bound_t::single_threaded_solve() { - branch_and_bound_worker_t worker(0, original_lp_, Arow_, var_types_, settings_); + raft::common::nvtx::range scope("BB::single_threaded_solve"); + worker_pool_.init(1, original_lp_, Arow_, var_types_, settings_); + branch_and_bound_worker_t* worker = worker_pool_.get_idle_worker(); f_t lower_bound = get_lower_bound(); f_t abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); @@ -1808,7 +1816,6 @@ void 
branch_and_bound_t::single_threaded_solve() while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol && rel_gap > settings_.relative_mip_gap_tol && node_queue_.best_first_queue_size() > 0) { - bool launched_any_task = false; repair_heuristic_solutions(); f_t now = toc(exploration_stats_.start_time); @@ -1844,8 +1851,8 @@ void branch_and_bound_t::single_threaded_solve() continue; } - worker.init_best_first(start_node.value(), original_lp_); - plunge_with(&worker); + worker->init_best_first(start_node.value(), original_lp_); + plunge_with(worker); lower_bound = get_lower_bound(); abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); @@ -1873,27 +1880,28 @@ lp_status_t branch_and_bound_t::solve_root_relaxation( i_t iter = 0; std::string solver_name = ""; - // Root node path lp_status_t root_status; - std::future root_status_future; - root_status_future = std::async(std::launch::async, - &solve_linear_program_with_advanced_basis, - std::ref(original_lp_), - exploration_stats_.start_time, - std::ref(lp_settings), - std::ref(root_relax_soln), - std::ref(basis_update), - std::ref(basic_list), - std::ref(nonbasic_list), - std::ref(root_vstatus), - std::ref(edge_norms), - nullptr); + +// Launch a task for solving the root LP relaxation via dual simplex. 
+#pragma omp task default(shared) depend(out : root_status) + { + root_status = solve_linear_program_with_advanced_basis(original_lp_, + exploration_stats_.start_time, + lp_settings, + root_relax_soln_, + basis_update, + basic_list, + nonbasic_list, + root_vstatus_, + edge_norms_, + nullptr); + } + // Wait for the root relaxation solution to be sent by the diversity manager or dual simplex - // to finish while (!root_crossover_solution_set_.load(std::memory_order_acquire) && *get_root_concurrent_halt() == 0) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); - continue; +#pragma omp taskyield } if (root_crossover_solution_set_.load(std::memory_order_acquire)) { @@ -1929,9 +1937,11 @@ lp_status_t branch_and_bound_t::solve_root_relaxation( // Check if crossover was stopped by dual simplex if (crossover_status == crossover_status_t::OPTIMAL) { - set_root_concurrent_halt(1); // Stop dual simplex - root_status = root_status_future.get(); // Wait for dual simplex to finish - set_root_concurrent_halt(0); // Clear the concurrent halt flag + // Stop dual simplex and then wait it to finish + set_root_concurrent_halt(1); +#pragma omp taskwait depend(in : root_status) + + set_root_concurrent_halt(0); // Clear the concurrent halt flag // Override the root relaxation solution with the crossover solution root_relax_soln = root_crossover_soln_; root_vstatus = crossover_vstatus_; @@ -1981,14 +1991,16 @@ lp_status_t branch_and_bound_t::solve_root_relaxation( solver_name = method_to_string(root_relax_solved_by); } else { - root_status = root_status_future.get(); +// Wait for the dual simplex to finish (after telling PDLP/Barrier to stop) +#pragma omp taskwait depend(in : root_status) user_objective = root_relax_soln_.user_objective; iter = root_relax_soln_.iterations; root_relax_solved_by = DualSimplex; solver_name = "Dual Simplex"; } } else { - root_status = root_status_future.get(); + // Wait for the dual simplex to finish (crossover do not produced a solution) +#pragma 
omp taskwait depend(in : root_status) user_objective = root_relax_soln_.user_objective; iter = root_relax_soln_.iterations; root_relax_solved_by = DualSimplex; @@ -2013,6 +2025,283 @@ lp_status_t branch_and_bound_t::solve_root_relaxation( return root_status; } +template +auto branch_and_bound_t::do_cut_pass( + [[maybe_unused]] i_t cut_pass, + mip_solution_t& solution, + i_t& num_fractional, + std::vector& fractional, + cut_generation_t& cut_generation, + basis_update_mpf_t& basis_update, + std::vector& basic_list, + std::vector& nonbasic_list, + variable_bounds_t& variable_bounds, + cut_pool_t& cut_pool, + cut_info_t& cut_info, + simplex_solver_settings_t& lp_settings, + i_t original_rows, + f_t& last_upper_bound, + f_t& last_objective, + f_t root_relax_objective, + i_t& cut_pool_size, + [[maybe_unused]] const std::vector& saved_solution) -> cut_pass_result_t +{ +#ifdef PRINT_FRACTIONAL_INFO + settings_.log.printf("Found %d fractional variables on cut pass %d\n", num_fractional, cut_pass); + for (i_t j : fractional) { + settings_.log.printf("Fractional variable %d lower %e value %e upper %e\n", + j, + original_lp_.lower[j], + root_relax_soln_.x[j], + original_lp_.upper[j]); + } +#endif + + f_t cut_start_time = tic(); + bool problem_feasible = cut_generation.generate_cuts(original_lp_, + settings_, + Arow_, + new_slacks_, + var_types_, + basis_update, + root_relax_soln_.x, + root_relax_soln_.y, + root_relax_soln_.z, + basic_list, + nonbasic_list, + variable_bounds, + exploration_stats_.start_time); + if (!problem_feasible) { + if (settings_.heuristic_preemption_callback != nullptr) { + settings_.heuristic_preemption_callback(); + } + return {cut_pass_action_t::RETURN, mip_status_t::INFEASIBLE}; + } + f_t cut_generation_time = toc(cut_start_time); + if (cut_generation_time > 1.0) { + settings_.log.debug("Cut generation time %.2f seconds\n", cut_generation_time); + } + // Score the cuts + f_t score_start_time = tic(); + cut_pool.score_cuts(root_relax_soln_.x); + f_t 
score_time = toc(score_start_time); + if (score_time > 1.0) { settings_.log.debug("Cut scoring time %.2f seconds\n", score_time); } + // Get the best cuts from the cut pool + csr_matrix_t cuts_to_add(0, original_lp_.num_cols, 0); + std::vector cut_rhs; + std::vector cut_types; + i_t num_cuts = cut_pool.get_best_cuts(cuts_to_add, cut_rhs, cut_types); + if (num_cuts == 0) { return {cut_pass_action_t::BREAK, mip_status_t::UNSET}; } + cut_info.record_cut_types(cut_types); +#ifdef PRINT_CUT_POOL_TYPES + cut_pool.print_cutpool_types(); + print_cut_types("In LP ", cut_types, settings_); + printf("Cut pool size: %d\n", cut_pool.pool_size()); +#endif + +#ifdef CHECK_CUT_MATRIX + if (cuts_to_add.check_matrix() != 0) { + settings_.log.printf("Bad cuts matrix\n"); + for (i_t i = 0; i < static_cast(cut_types.size()); ++i) { + settings_.log.printf("row %d cut type %d\n", i, cut_types[i]); + } + return {cut_pass_action_t::RETURN, mip_status_t::NUMERICAL}; + } +#endif +#ifdef CHECK_CUTS_AGAINST_SAVED_SOLUTION + verify_cuts_against_saved_solution(cuts_to_add, cut_rhs, saved_solution); +#endif + cut_pool_size = cut_pool.pool_size(); + + // Resolve the LP with the new cuts + settings_.log.debug( + "Solving LP with %d cuts (%d cut nonzeros). Cuts in pool %d. 
Total constraints %d\n", + num_cuts, + cuts_to_add.row_start[cuts_to_add.m], + cut_pool.pool_size(), + cuts_to_add.m + original_lp_.num_rows); + lp_settings.log.log = false; + + f_t add_cuts_start_time = tic(); + mutex_original_lp_.lock(); + i_t add_cuts_status = add_cuts(settings_, + cuts_to_add, + cut_rhs, + original_lp_, + new_slacks_, + root_relax_soln_, + basis_update, + basic_list, + nonbasic_list, + root_vstatus_, + edge_norms_); + var_types_.resize(original_lp_.num_cols, variable_type_t::CONTINUOUS); + variable_bounds.resize(original_lp_.num_cols); + mutex_original_lp_.unlock(); + f_t add_cuts_time = toc(add_cuts_start_time); + if (add_cuts_time > 1.0) { settings_.log.debug("Add cuts time %.2f seconds\n", add_cuts_time); } + if (add_cuts_status != 0) { + settings_.log.printf("Failed to add cuts\n"); + return {cut_pass_action_t::RETURN, mip_status_t::NUMERICAL}; + } + + if (settings_.reduced_cost_strengthening >= 1 && upper_bound_.load() < last_upper_bound) { + mutex_upper_.lock(); + last_upper_bound = upper_bound_.load(); + std::vector lower_bounds; + std::vector upper_bounds; + find_reduced_cost_fixings(upper_bound_.load(), lower_bounds, upper_bounds); + mutex_upper_.unlock(); + mutex_original_lp_.lock(); + original_lp_.lower = lower_bounds; + original_lp_.upper = upper_bounds; + mutex_original_lp_.unlock(); + } + + // Try to do bound strengthening + std::vector bounds_changed(original_lp_.num_cols, true); + std::vector row_sense; +#ifdef CHECK_MATRICES + settings_.log.printf("Before A check\n"); + original_lp_.A.check_matrix(); +#endif + original_lp_.A.to_compressed_row(Arow_); + + f_t node_presolve_start_time = tic(); + bounds_strengthening_t node_presolve(original_lp_, Arow_, row_sense, var_types_); + std::vector new_lower = original_lp_.lower; + std::vector new_upper = original_lp_.upper; + bool feasible = + node_presolve.bounds_strengthening(settings_, bounds_changed, new_lower, new_upper); + mutex_original_lp_.lock(); + original_lp_.lower = 
new_lower; + original_lp_.upper = new_upper; + mutex_original_lp_.unlock(); + f_t node_presolve_time = toc(node_presolve_start_time); + if (node_presolve_time > 1.0) { + settings_.log.debug("Node presolve time %.2f seconds\n", node_presolve_time); + } + if (!feasible) { + settings_.log.printf("Bound strengthening detected infeasibility\n"); +#ifdef WRITE_BOUND_STRENGTHENING_INFEASIBLE_MPS + original_lp_.write_mps("bound_strengthening_infeasible.mps"); +#endif + return {cut_pass_action_t::RETURN, mip_status_t::INFEASIBLE}; + } + + i_t iter = 0; + bool initialize_basis = false; + lp_settings.concurrent_halt = NULL; + f_t dual_phase2_start_time = tic(); + dual::status_t cut_status = dual_phase2_with_advanced_basis(2, + 0, + initialize_basis, + exploration_stats_.start_time, + original_lp_, + lp_settings, + root_vstatus_, + basis_update, + basic_list, + nonbasic_list, + root_relax_soln_, + iter, + edge_norms_); + exploration_stats_.total_lp_iters += iter; + f_t dual_phase2_time = toc(dual_phase2_start_time); + if (dual_phase2_time > 1.0) { + settings_.log.debug("Dual phase2 time %.2f seconds\n", dual_phase2_time); + } + if (cut_status == dual::status_t::TIME_LIMIT) { + solver_status_ = mip_status_t::TIME_LIMIT; + set_final_solution(solution, root_objective_); + return {cut_pass_action_t::RETURN, solver_status_}; + } + + if (cut_status != dual::status_t::OPTIMAL) { + settings_.log.printf("Numerical issue at root node. 
Resolving from scratch\n"); + lp_status_t scratch_status = + solve_linear_program_with_advanced_basis(original_lp_, + exploration_stats_.start_time, + lp_settings, + root_relax_soln_, + basis_update, + basic_list, + nonbasic_list, + root_vstatus_, + edge_norms_); + if (scratch_status == lp_status_t::OPTIMAL) { + // We recovered + cut_status = convert_lp_status_to_dual_status(scratch_status); + exploration_stats_.total_lp_iters += root_relax_soln_.iterations; + root_objective_ = compute_objective(original_lp_, root_relax_soln_.x); + } else { + settings_.log.printf("Cut status %s\n", dual::status_to_string(cut_status).c_str()); +#ifdef WRITE_CUT_INFEASIBLE_MPS + original_lp_.write_mps("cut_infeasible.mps"); +#endif + return {cut_pass_action_t::RETURN, mip_status_t::NUMERICAL}; + } + } + root_objective_ = compute_objective(original_lp_, root_relax_soln_.x); + + f_t remove_cuts_start_time = tic(); + mutex_original_lp_.lock(); + remove_cuts(original_lp_, + settings_, + exploration_stats_.start_time, + Arow_, + new_slacks_, + original_rows, + var_types_, + root_vstatus_, + edge_norms_, + root_relax_soln_.x, + root_relax_soln_.y, + root_relax_soln_.z, + basic_list, + nonbasic_list, + basis_update); + variable_bounds.resize(original_lp_.num_cols); + mutex_original_lp_.unlock(); + f_t remove_cuts_time = toc(remove_cuts_start_time); + if (remove_cuts_time > 1.0) { + settings_.log.debug("Remove cuts time %.2f seconds\n", remove_cuts_time); + } + fractional.clear(); + num_fractional = fractional_variables(settings_, root_relax_soln_.x, var_types_, fractional); + + if (num_fractional == 0) { + upper_bound_ = root_objective_; + mutex_upper_.lock(); + incumbent_.set_incumbent_solution(root_objective_, root_relax_soln_.x); + mutex_upper_.unlock(); + } + f_t obj = upper_bound_.load(); + report(' ', obj, root_objective_, 0, num_fractional); + + f_t rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), root_objective_); + f_t abs_gap = compute_user_abs_gap(original_lp_, 
upper_bound_.load(), root_objective_); + if (rel_gap < settings_.relative_mip_gap_tol || abs_gap < settings_.absolute_mip_gap_tol) { + if (num_fractional == 0) { set_solution_at_root(solution, cut_info); } + set_final_solution(solution, root_objective_); + return {cut_pass_action_t::RETURN, mip_status_t::OPTIMAL}; + } + + f_t change_in_objective = root_objective_ - last_objective; + const f_t factor = settings_.cut_change_threshold; + const f_t min_objective = 1e-3; + if (factor > 0.0 && + change_in_objective <= factor * std::max(min_objective, std::abs(root_relax_objective))) { + settings_.log.printf( + "Change in objective %.16e is less than 1e-3 of root relax objective %.16e\n", + change_in_objective, + root_relax_objective); + return {cut_pass_action_t::BREAK, mip_status_t::UNSET}; + } + last_objective = root_objective_; + return {cut_pass_action_t::CONTINUE, mip_status_t::UNSET}; +} + template mip_status_t branch_and_bound_t::solve(mip_solution_t& solution) { @@ -2054,29 +2343,26 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut root_relax_soln_.resize(original_lp_.num_rows, original_lp_.num_cols); - if (settings_.clique_cuts != 0 && clique_table_ == nullptr) { + omp_atomic_t* clique_signal = &signal_extend_cliques_; + + if (settings_.clique_cuts != 0 && clique_table_ == nullptr && + omp_get_num_threads() >= CUOPT_MIP_CLIQUE_CUTS_REQUIRED_THREAD_COUNT) { signal_extend_cliques_.store(false, std::memory_order_release); - typename ::cuopt::linear_programming::mip_solver_settings_t::tolerances_t - tolerances_for_clique{}; + typename mip_solver_settings_t::tolerances_t tolerances_for_clique{}; tolerances_for_clique.presolve_absolute_tolerance = settings_.primal_tol; tolerances_for_clique.absolute_tolerance = settings_.primal_tol; tolerances_for_clique.relative_tolerance = settings_.zero_tol; tolerances_for_clique.integrality_tolerance = settings_.integer_tol; tolerances_for_clique.absolute_mip_gap = settings_.absolute_mip_gap_tol; 
tolerances_for_clique.relative_mip_gap = settings_.relative_mip_gap_tol; - auto* signal_ptr = &signal_extend_cliques_; - clique_table_future_ = - std::async(std::launch::async, - [this, - tolerances_for_clique, - signal_ptr]() -> std::shared_ptr> { - user_problem_t problem_copy = original_problem_; - cuopt::timer_t timer(std::numeric_limits::infinity()); - std::shared_ptr> table; - detail::find_initial_cliques( - problem_copy, tolerances_for_clique, &table, timer, false, signal_ptr); - return table; - }); + +#pragma omp task depend(out : *clique_signal) firstprivate(tolerances_for_clique) + { + user_problem_t problem_copy = original_problem_; + timer_t timer(std::numeric_limits::infinity()); + detail::find_initial_cliques( + problem_copy, tolerances_for_clique, &clique_table_, timer, false, clique_signal); + } } i_t original_rows = original_lp_.num_rows; @@ -2119,16 +2405,10 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut exploration_stats_.total_lp_iters = root_relax_soln_.iterations; exploration_stats_.total_lp_solve_time = toc(exploration_stats_.start_time); - auto finish_clique_thread = [this]() { - if (clique_table_future_.valid()) { - signal_extend_cliques_.store(true, std::memory_order_release); - clique_table_ = clique_table_future_.get(); - } - }; - if (root_status == lp_status_t::INFEASIBLE) { settings_.log.printf("MIP Infeasible\n"); - finish_clique_thread(); + signal_extend_cliques_.store(true, std::memory_order_release); +#pragma omp taskwait depend(in : *clique_signal) return mip_status_t::INFEASIBLE; } if (root_status == lp_status_t::UNBOUNDED) { @@ -2136,27 +2416,31 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut if (settings_.heuristic_preemption_callback != nullptr) { settings_.heuristic_preemption_callback(); } - finish_clique_thread(); + signal_extend_cliques_.store(true, std::memory_order_release); +#pragma omp taskwait depend(in : *clique_signal) return mip_status_t::UNBOUNDED; } if (root_status == 
lp_status_t::TIME_LIMIT) { solver_status_ = mip_status_t::TIME_LIMIT; set_final_solution(solution, -inf); - finish_clique_thread(); + signal_extend_cliques_.store(true, std::memory_order_release); +#pragma omp taskwait depend(in : *clique_signal) return solver_status_; } if (root_status == lp_status_t::WORK_LIMIT) { solver_status_ = mip_status_t::WORK_LIMIT; set_final_solution(solution, -inf); - finish_clique_thread(); + signal_extend_cliques_.store(true, std::memory_order_release); +#pragma omp taskwait depend(in : *clique_signal) return solver_status_; } if (root_status == lp_status_t::NUMERICAL_ISSUES) { solver_status_ = mip_status_t::NUMERICAL; set_final_solution(solution, -inf); - finish_clique_thread(); + signal_extend_cliques_.store(true, std::memory_order_release); +#pragma omp taskwait depend(in : *clique_signal) return solver_status_; } @@ -2187,7 +2471,8 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut if (num_fractional == 0) { set_solution_at_root(solution, cut_info); - finish_clique_thread(); + signal_extend_cliques_.store(true, std::memory_order_release); +#pragma omp taskwait depend(in : *clique_signal) return mip_status_t::OPTIMAL; } @@ -2211,8 +2496,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut original_problem_, probing_implied_bound_, clique_table_, - &clique_table_future_, - &signal_extend_cliques_); + clique_signal); std::vector saved_solution; #ifdef CHECK_CUTS_AGAINST_SAVED_SOLUTION @@ -2223,272 +2507,92 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut f_t last_objective = root_objective_; f_t root_relax_objective = root_objective_; + constexpr bool enable_root_cut_cpufj = true; + std::unique_ptr> root_cut_cpufj_task; + auto root_cut_cpufj_improvement_callback = + [this](f_t obj, const std::vector& assignment, double work_units) { + std::vector user_assignment; + mutex_original_lp_.lock(); + uncrush_primal_solution(original_problem_, original_lp_, assignment, user_assignment); + 
mutex_original_lp_.unlock(); + settings_.log.debug("Root cut CPUFJ found solution with objective %.16e\n", obj); + // In deterministic mode the solution must be ordered by its work-unit timestamp so + // B&B sees incumbents in a reproducible sequence; otherwise apply it immediately. + if (settings_.deterministic) { + queue_external_solution_deterministic(user_assignment, work_units); + } else { + set_new_solution(user_assignment); + } + }; + auto stop_root_cut_cpufj = [&]() { + if (!root_cut_cpufj_task) { return; } + detail::stop_fj_cpu_task(*root_cut_cpufj_task); + root_cut_cpufj_task.reset(); + }; + cuopt::scope_guard root_cut_cpufj_guard([&]() { stop_root_cut_cpufj(); }); + f_t cut_generation_start_time = tic(); i_t cut_pool_size = 0; for (i_t cut_pass = 0; cut_pass < settings_.max_cut_passes; cut_pass++) { if (num_fractional == 0) { set_solution_at_root(solution, cut_info); + signal_extend_cliques_.store(true, std::memory_order_release); +#pragma omp taskwait depend(in : *clique_signal) return mip_status_t::OPTIMAL; - } else { -#ifdef PRINT_FRACTIONAL_INFO - settings_.log.printf( - "Found %d fractional variables on cut pass %d\n", num_fractional, cut_pass); - for (i_t j : fractional) { - settings_.log.printf("Fractional variable %d lower %e value %e upper %e\n", - j, - original_lp_.lower[j], - root_relax_soln_.x[j], - original_lp_.upper[j]); - } -#endif - - // Generate cuts and add them to the cut pool - f_t cut_start_time = tic(); - bool problem_feasible = cut_generation.generate_cuts(original_lp_, - settings_, - Arow_, - new_slacks_, - var_types_, - basis_update, - root_relax_soln_.x, - root_relax_soln_.y, - root_relax_soln_.z, - basic_list, - nonbasic_list, - variable_bounds, - exploration_stats_.start_time); - if (!problem_feasible) { - if (settings_.heuristic_preemption_callback != nullptr) { - settings_.heuristic_preemption_callback(); - } - finish_clique_thread(); - return mip_status_t::INFEASIBLE; - } - f_t cut_generation_time = toc(cut_start_time); - 
if (cut_generation_time > 1.0) { - settings_.log.debug("Cut generation time %.2f seconds\n", cut_generation_time); - } - // Score the cuts - f_t score_start_time = tic(); - cut_pool.score_cuts(root_relax_soln_.x); - f_t score_time = toc(score_start_time); - if (score_time > 1.0) { settings_.log.debug("Cut scoring time %.2f seconds\n", score_time); } - // Get the best cuts from the cut pool - csr_matrix_t cuts_to_add(0, original_lp_.num_cols, 0); - std::vector cut_rhs; - std::vector cut_types; - i_t num_cuts = cut_pool.get_best_cuts(cuts_to_add, cut_rhs, cut_types); - if (num_cuts == 0) { break; } - cut_info.record_cut_types(cut_types); -#ifdef PRINT_CUT_POOL_TYPES - cut_pool.print_cutpool_types(); - print_cut_types("In LP ", cut_types, settings_); - printf("Cut pool size: %d\n", cut_pool.pool_size()); -#endif - -#ifdef CHECK_CUT_MATRIX - if (cuts_to_add.check_matrix() != 0) { - settings_.log.printf("Bad cuts matrix\n"); - for (i_t i = 0; i < static_cast(cut_types.size()); ++i) { - settings_.log.printf("row %d cut type %d\n", i, cut_types[i]); - } - return mip_status_t::NUMERICAL; - } -#endif - // Check against saved solution -#ifdef CHECK_CUTS_AGAINST_SAVED_SOLUTION - verify_cuts_against_saved_solution(cuts_to_add, cut_rhs, saved_solution); -#endif - cut_pool_size = cut_pool.pool_size(); - - // Resolve the LP with the new cuts - settings_.log.debug( - "Solving LP with %d cuts (%d cut nonzeros). Cuts in pool %d. 
Total constraints %d\n", - num_cuts, - cuts_to_add.row_start[cuts_to_add.m], - cut_pool.pool_size(), - cuts_to_add.m + original_lp_.num_rows); - lp_settings.log.log = false; - - f_t add_cuts_start_time = tic(); - mutex_original_lp_.lock(); - i_t add_cuts_status = add_cuts(settings_, - cuts_to_add, - cut_rhs, - original_lp_, - new_slacks_, - root_relax_soln_, - basis_update, - basic_list, - nonbasic_list, - root_vstatus_, - edge_norms_); - var_types_.resize(original_lp_.num_cols, variable_type_t::CONTINUOUS); - variable_bounds.resize(original_lp_.num_cols); - mutex_original_lp_.unlock(); - f_t add_cuts_time = toc(add_cuts_start_time); - if (add_cuts_time > 1.0) { - settings_.log.debug("Add cuts time %.2f seconds\n", add_cuts_time); - } - if (add_cuts_status != 0) { - settings_.log.printf("Failed to add cuts\n"); - return mip_status_t::NUMERICAL; - } - - if (settings_.reduced_cost_strengthening >= 1 && upper_bound_.load() < last_upper_bound) { - mutex_upper_.lock(); - last_upper_bound = upper_bound_.load(); - std::vector lower_bounds; - std::vector upper_bounds; - find_reduced_cost_fixings(upper_bound_.load(), lower_bounds, upper_bounds); - mutex_upper_.unlock(); - mutex_original_lp_.lock(); - original_lp_.lower = lower_bounds; - original_lp_.upper = upper_bounds; - mutex_original_lp_.unlock(); - } - - // Try to do bound strengthening - std::vector bounds_changed(original_lp_.num_cols, true); - std::vector row_sense; -#ifdef CHECK_MATRICES - settings_.log.printf("Before A check\n"); - original_lp_.A.check_matrix(); -#endif - original_lp_.A.to_compressed_row(Arow_); - - f_t node_presolve_start_time = tic(); - bounds_strengthening_t node_presolve(original_lp_, Arow_, row_sense, var_types_); - std::vector new_lower = original_lp_.lower; - std::vector new_upper = original_lp_.upper; - bool feasible = - node_presolve.bounds_strengthening(settings_, bounds_changed, new_lower, new_upper); - mutex_original_lp_.lock(); - original_lp_.lower = new_lower; - original_lp_.upper = 
new_upper; - mutex_original_lp_.unlock(); - f_t node_presolve_time = toc(node_presolve_start_time); - if (node_presolve_time > 1.0) { - settings_.log.debug("Node presolve time %.2f seconds\n", node_presolve_time); - } - if (!feasible) { - settings_.log.printf("Bound strengthening detected infeasibility\n"); -#ifdef WRITE_BOUND_STRENGTHENING_INFEASIBLE_MPS - original_lp_.write_mps("bound_strengthening_infeasible.mps"); -#endif - return mip_status_t::INFEASIBLE; - } - - i_t iter = 0; - bool initialize_basis = false; - lp_settings.concurrent_halt = NULL; - f_t dual_phase2_start_time = tic(); - dual::status_t cut_status = dual_phase2_with_advanced_basis(2, - 0, - initialize_basis, - exploration_stats_.start_time, - original_lp_, - lp_settings, - root_vstatus_, - basis_update, - basic_list, - nonbasic_list, - root_relax_soln_, - iter, - edge_norms_); - exploration_stats_.total_lp_iters += iter; - f_t dual_phase2_time = toc(dual_phase2_start_time); - if (dual_phase2_time > 1.0) { - settings_.log.debug("Dual phase2 time %.2f seconds\n", dual_phase2_time); - } - if (cut_status == dual::status_t::TIME_LIMIT) { - solver_status_ = mip_status_t::TIME_LIMIT; - set_final_solution(solution, root_objective_); - return solver_status_; - } - - if (cut_status != dual::status_t::OPTIMAL) { - settings_.log.printf("Numerical issue at root node. 
Resolving from scratch\n"); - lp_status_t scratch_status = - solve_linear_program_with_advanced_basis(original_lp_, - exploration_stats_.start_time, - lp_settings, - root_relax_soln_, - basis_update, - basic_list, - nonbasic_list, - root_vstatus_, - edge_norms_); - if (scratch_status == lp_status_t::OPTIMAL) { - // We recovered - cut_status = convert_lp_status_to_dual_status(scratch_status); - exploration_stats_.total_lp_iters += root_relax_soln_.iterations; - root_objective_ = compute_objective(original_lp_, root_relax_soln_.x); - } else { - settings_.log.printf("Cut status %s\n", dual::status_to_string(cut_status).c_str()); -#ifdef WRITE_CUT_INFEASIBLE_MPS - original_lp_.write_mps("cut_infeasible.mps"); -#endif - return mip_status_t::NUMERICAL; - } - } - root_objective_ = compute_objective(original_lp_, root_relax_soln_.x); - - f_t remove_cuts_start_time = tic(); - mutex_original_lp_.lock(); - remove_cuts(original_lp_, - settings_, - exploration_stats_.start_time, - Arow_, - new_slacks_, - original_rows, - var_types_, - root_vstatus_, - edge_norms_, - root_relax_soln_.x, - root_relax_soln_.y, - root_relax_soln_.z, - basic_list, - nonbasic_list, - basis_update); - variable_bounds.resize(original_lp_.num_cols); - mutex_original_lp_.unlock(); - f_t remove_cuts_time = toc(remove_cuts_start_time); - if (remove_cuts_time > 1.0) { - settings_.log.debug("Remove cuts time %.2f seconds\n", remove_cuts_time); - } - fractional.clear(); - num_fractional = fractional_variables(settings_, root_relax_soln_.x, var_types_, fractional); + } - if (num_fractional == 0) { - upper_bound_ = root_objective_; - mutex_upper_.lock(); - incumbent_.set_incumbent_solution(root_objective_, root_relax_soln_.x); - mutex_upper_.unlock(); - } - f_t obj = upper_bound_.load(); - report(' ', obj, root_objective_, 0, num_fractional); - - f_t rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), root_objective_); - f_t abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), 
root_objective_); - if (rel_gap < settings_.relative_mip_gap_tol || abs_gap < settings_.absolute_mip_gap_tol) { - if (num_fractional == 0) { set_solution_at_root(solution, cut_info); } - set_final_solution(solution, root_objective_); - return mip_status_t::OPTIMAL; - } + cut_pass_result_t cut_pass_result; + if (root_cut_cpufj_task) { +#pragma omp task shared(root_cut_cpufj_task) default(none) depend(out : *root_cut_cpufj_task) + detail::run_fj_cpu_task(*root_cut_cpufj_task, + std::numeric_limits::infinity(), + std::numeric_limits::infinity()); + } + + cut_pass_result = do_cut_pass(cut_pass, + solution, + num_fractional, + fractional, + cut_generation, + basis_update, + basic_list, + nonbasic_list, + variable_bounds, + cut_pool, + cut_info, + lp_settings, + original_rows, + last_upper_bound, + last_objective, + root_relax_objective, + cut_pool_size, + saved_solution); + + if (root_cut_cpufj_task) { + detail::stop_fj_cpu_task(*root_cut_cpufj_task); +#pragma omp taskwait depend(in : *root_cut_cpufj_task) + } + + if (cut_pass_result.action == cut_pass_action_t::RETURN) { + signal_extend_cliques_.store(true, std::memory_order_release); +#pragma omp taskwait depend(in : *clique_signal) + return cut_pass_result.status; + } + if (cut_pass_result.action == cut_pass_action_t::BREAK) { break; } - f_t change_in_objective = root_objective_ - last_objective; - const f_t factor = settings_.cut_change_threshold; - const f_t min_objective = 1e-3; - if (factor > 0.0 && - change_in_objective <= factor * std::max(min_objective, std::abs(root_relax_objective))) { - settings_.log.printf( - "Change in objective %.16e is less than 1e-3 of root relax objective %.16e\n", - change_in_objective, - root_relax_objective); - break; - } - last_objective = root_objective_; + if (enable_root_cut_cpufj && !settings_.deterministic && settings_.num_threads >= 2 && + cut_pass + 1 < settings_.max_cut_passes) { + f_t root_cut_cpufj_build_start_time = tic(); + root_cut_cpufj_task = + 
detail::make_fj_cpu_task_from_host_lp(original_lp_, + var_types_, + root_relax_soln_.x, + settings_, + root_cut_cpufj_improvement_callback, + "[RootCut CPUFJ] "); + settings_.log.debug("Root cut CPUFJ problem build time after pass %d: %.6f seconds\n", + cut_pass, + toc(root_cut_cpufj_build_start_time)); } } @@ -2503,10 +2607,37 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut original_lp_.A.col_start[original_lp_.A.n]); } + if (enable_root_cut_cpufj && cut_info.has_cuts()) { + f_t root_cut_cpufj_build_start_time = tic(); + // In deterministic mode this CPUFJ is built on the B&B task while the LS deterministic + // CPUFJ is being built on the main thread; both would otherwise race on the global + // seed_generator and pick non-reproducible seeds. Pin a stable seed here so this + // climber's behavior depends only on settings_.random_seed. + int64_t root_cut_cpufj_seed = + settings_.deterministic ? static_cast(settings_.random_seed) : -1; + root_cut_cpufj_task = + detail::make_fj_cpu_task_from_host_lp(original_lp_, + var_types_, + root_relax_soln_.x, + settings_, + root_cut_cpufj_improvement_callback, + "[RootCut CPUFJ] ", + root_cut_cpufj_seed); + settings_.log.debug("Root cut CPUFJ final problem build time: %.6f seconds\n", + toc(root_cut_cpufj_build_start_time)); + f_t remaining_time = f_t(settings_.time_limit - toc(exploration_stats_.start_time)); + // Reserve at least half of the remaining time for B&B exploration; cap absolute spend + // at 1s so generous budgets don't grant CPUFJ more than the historical ceiling. + f_t fj_time_limit = + settings_.deterministic ? 
remaining_time : std::min(remaining_time * f_t{0.5}, f_t{1}); + detail::run_fj_cpu_task(*root_cut_cpufj_task, fj_time_limit, 0.5); + root_cut_cpufj_task.reset(); + } + set_uninitialized_steepest_edge_norms(original_lp_, basic_list, edge_norms_); pc_.resize(original_lp_.num_cols); - original_lp_.A.transpose(pc_.AT); + original_lp_.A.transpose(*pc_.AT); { raft::common::nvtx::range scope_sb("BB::strong_branching"); strong_branching(original_lp_, @@ -2577,7 +2708,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut } // Choose variable to branch on - i_t branch_var = pc_.variable_selection(fractional, root_relax_soln_.x, log); + i_t branch_var = pc_.variable_selection(fractional, root_relax_soln_.x); search_tree_.root = std::move(mip_node_t(root_objective_, root_vstatus_)); search_tree_.num_nodes = 0; @@ -2614,17 +2745,16 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut "| Gap | Time |\n"); } - if (settings_.deterministic) { - run_deterministic_coordinator(Arow_); - } else if (settings_.num_threads > 1) { -#pragma omp parallel num_threads(settings_.num_threads) - { -#pragma omp master +#pragma omp taskgroup + { + if (settings_.deterministic) { + run_deterministic_coordinator(Arow_); + } else if (settings_.num_threads > 1) { run_scheduler(); + } else { + single_threaded_solve(); } - } else { - single_threaded_solve(); - } + } // Implicit barrier for all tasks created within the group (RINS, B&B workers) is_running_ = false; @@ -2787,7 +2917,7 @@ void branch_and_bound_t::run_deterministic_coordinator(const csr_matri deterministic_horizon_step_ = 0.50; // Compute worker counts using the same formula as reliability-branching scheduler - const i_t num_workers = 2 * settings_.num_threads; + const i_t num_workers = settings_.num_threads; std::vector search_strategies = get_search_strategies(settings_.diving_settings); std::array max_num_workers = @@ -3079,6 +3209,19 @@ void branch_and_bound_t::deterministic_sync_callback() f_t abs_gap = 
compute_user_abs_gap(original_lp_, upper_bound, lower_bound); f_t rel_gap = user_relative_gap(original_lp_, upper_bound, lower_bound); + // Apply limit-based statuses first so a definitive answer (gap closure or tree exhaustion) + // detected in the same callback can override them. Otherwise a long producer wait that + // pushes the wall clock past time_limit would clobber a true INFEASIBLE/OPTIMAL conclusion + // and the solver would report TIME_LIMIT for an already-solved instance. + if (toc(exploration_stats_.start_time) > settings_.time_limit) { + deterministic_global_termination_status_ = mip_status_t::TIME_LIMIT; + } + + // Stop early if next horizon exceeds work limit + if (deterministic_current_horizon_ > settings_.work_limit) { + deterministic_global_termination_status_ = mip_status_t::WORK_LIMIT; + } + if (abs_gap <= settings_.absolute_mip_gap_tol || rel_gap <= settings_.relative_mip_gap_tol) { deterministic_global_termination_status_ = mip_status_t::OPTIMAL; } @@ -3092,15 +3235,6 @@ void branch_and_bound_t::deterministic_sync_callback() } } - if (toc(exploration_stats_.start_time) > settings_.time_limit) { - deterministic_global_termination_status_ = mip_status_t::TIME_LIMIT; - } - - // Stop early if next horizon exceeds work limit - if (deterministic_current_horizon_ > settings_.work_limit) { - deterministic_global_termination_status_ = mip_status_t::WORK_LIMIT; - } - // Signal shutdown to prevent threads from entering barriers after termination if (deterministic_global_termination_status_ != mip_status_t::UNSET) { deterministic_scheduler_->signal_shutdown(); @@ -3321,11 +3455,12 @@ template void branch_and_bound_t::deterministic_broadcast_snapshots( PoolT& pool, const std::vector& incumbent_snapshot) { - deterministic_snapshot_t snap; - snap.upper_bound = upper_bound_.load(); - snap.total_lp_iters = exploration_stats_.total_lp_iters.load(); - snap.incumbent = incumbent_snapshot; - snap.pc_snapshot = pc_.create_snapshot(); + deterministic_snapshot_t 
snap{ + .upper_bound = upper_bound_, + .pc_snapshot = pc_, + .incumbent = incumbent_snapshot, + .total_lp_iters = exploration_stats_.total_lp_iters, + }; for (auto& worker : pool) { worker.set_snapshots(snap); diff --git a/cpp/src/branch_and_bound/branch_and_bound.hpp b/cpp/src/branch_and_bound/branch_and_bound.hpp index f2917ba930..bb4e7a1040 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.hpp +++ b/cpp/src/branch_and_bound/branch_and_bound.hpp @@ -8,12 +8,12 @@ #pragma once #include -#include #include -#include #include #include #include +#include +#include #include @@ -162,8 +162,7 @@ class branch_and_bound_t { const simplex_solver_settings_t settings_; const probing_implied_bound_t& probing_implied_bound_; std::shared_ptr> clique_table_; - std::future>> clique_table_future_; - std::atomic signal_extend_cliques_{false}; + omp_atomic_t signal_extend_cliques_{false}; work_limit_context_t work_unit_context_{"B&B"}; @@ -270,6 +269,31 @@ class branch_and_bound_t { i_t node_int_infeas, double work_time = -1); + enum class cut_pass_action_t { CONTINUE, BREAK, RETURN }; + struct cut_pass_result_t { + cut_pass_action_t action{cut_pass_action_t::CONTINUE}; + mip_status_t status{mip_status_t::UNSET}; + }; + + cut_pass_result_t do_cut_pass(i_t cut_pass, + mip_solution_t& solution, + i_t& num_fractional, + std::vector& fractional, + cut_generation_t& cut_generation, + basis_update_mpf_t& basis_update, + std::vector& basic_list, + std::vector& nonbasic_list, + variable_bounds_t& variable_bounds, + cut_pool_t& cut_pool, + cut_info_t& cut_info, + simplex_solver_settings_t& lp_settings, + i_t original_rows, + f_t& last_upper_bound, + f_t& last_objective, + f_t root_relax_objective, + i_t& cut_pool_size, + const std::vector& saved_solution); + // Set the solution when found at the root node void set_solution_at_root(mip_solution_t& solution, const cut_info_t& cut_info); @@ -318,7 +342,7 @@ class branch_and_bound_t { // Policy-based tree update shared between opportunistic 
and deterministic codepaths. template - std::pair update_tree_impl( + std::pair update_tree_impl( mip_node_t* node_ptr, search_tree_t& search_tree, WorkerT* worker, @@ -326,7 +350,7 @@ class branch_and_bound_t { Policy& policy); // Opportunistic tree update wrapper. - std::pair update_tree( + std::pair update_tree( mip_node_t* node_ptr, search_tree_t& search_tree, branch_and_bound_worker_t* worker, diff --git a/cpp/src/branch_and_bound/constants.hpp b/cpp/src/branch_and_bound/constants.hpp new file mode 100644 index 0000000000..39bfa0bf3a --- /dev/null +++ b/cpp/src/branch_and_bound/constants.hpp @@ -0,0 +1,31 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once + +namespace cuopt::linear_programming::dual_simplex { + +constexpr int num_search_strategies = 5; + +// Indicate the search and variable selection algorithms used by each thread +// in B&B (See [1]). +// +// [1] T. Achterberg, “Constraint Integer Programming,” PhD, Technischen Universität Berlin, +// Berlin, 2007. doi: 10.14279/depositonce-1634. +enum search_strategy_t : int { + BEST_FIRST = 0, // Best-First + Plunging. + PSEUDOCOST_DIVING = 1, // Pseudocost diving (9.2.5) + LINE_SEARCH_DIVING = 2, // Line search diving (9.2.4) + GUIDED_DIVING = 3, // Guided diving (9.2.3). 
+ COEFFICIENT_DIVING = 4 // Coefficient diving (9.2.1) +}; + +enum class branch_direction_t { NONE = -1, DOWN = 0, UP = 1 }; + +enum class branch_and_bound_mode_t { PARALLEL = 0, DETERMINISTIC = 1 }; + +} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/branch_and_bound/deterministic_workers.hpp b/cpp/src/branch_and_bound/deterministic_workers.hpp index 7a074051c6..53d7e4ef65 100644 --- a/cpp/src/branch_and_bound/deterministic_workers.hpp +++ b/cpp/src/branch_and_bound/deterministic_workers.hpp @@ -8,9 +8,9 @@ #pragma once #include -#include #include #include +#include #include @@ -58,7 +58,7 @@ struct deterministic_snapshot_t { f_t upper_bound; pseudo_cost_snapshot_t pc_snapshot; std::vector incumbent; - i_t total_lp_iters; + int64_t total_lp_iters; }; template @@ -74,7 +74,7 @@ class deterministic_worker_base_t : public branch_and_bound_worker_t { // Diving-specific snapshots (ignored by BFS workers) std::vector incumbent_snapshot; - i_t total_lp_iters_snapshot{0}; + int64_t total_lp_iters_snapshot{0}; std::vector> integer_solutions; int next_solution_seq{0}; @@ -90,7 +90,9 @@ class deterministic_worker_base_t : public branch_and_bound_worker_t { const std::vector& var_types, const simplex_solver_settings_t& settings, const std::string& context_name) - : base_t(id, original_lp, Arow, var_types, settings), work_context(context_name) + : base_t(id, original_lp, Arow, var_types, settings), + work_context(context_name), + pc_snapshot(1, settings) { work_context.deterministic = true; } @@ -156,7 +158,7 @@ class deterministic_bfs_worker_t mip_node_t* enqueue_children_for_plunge(mip_node_t* down_child, mip_node_t* up_child, - rounding_direction_t preferred_direction) + branch_direction_t preferred_direction) { if (!plunge_stack.empty()) { backlog.push(plunge_stack.back()); @@ -169,7 +171,7 @@ class deterministic_bfs_worker_t up_child->creation_seq = next_creation_seq++; mip_node_t* first_child; - if (preferred_direction == 
rounding_direction_t::UP) { + if (preferred_direction == branch_direction_t::UP) { plunge_stack.push_front(down_child); plunge_stack.push_front(up_child); first_child = up_child; @@ -342,22 +344,6 @@ class deterministic_diving_worker_t {objective, solution, depth, this->worker_id, this->next_solution_seq++}); ++this->total_integer_solutions; } - - branch_variable_t variable_selection_from_snapshot(const std::vector& fractional, - const std::vector& solution) const - { - assert(root_solution != nullptr); - return this->pc_snapshot.pseudocost_diving(fractional, solution, *root_solution); - } - - branch_variable_t guided_variable_selection(const std::vector& fractional, - const std::vector& solution) const - { - if (this->incumbent_snapshot.empty()) { - return variable_selection_from_snapshot(fractional, solution); - } - return this->pc_snapshot.guided_diving(fractional, solution, this->incumbent_snapshot); - } }; template diff --git a/cpp/src/branch_and_bound/diving_heuristics.cpp b/cpp/src/branch_and_bound/diving_heuristics.cpp index f9791280a6..a0bb731c1e 100644 --- a/cpp/src/branch_and_bound/diving_heuristics.cpp +++ b/cpp/src/branch_and_bound/diving_heuristics.cpp @@ -7,8 +7,6 @@ #include -#include - namespace cuopt::linear_programming::dual_simplex { template @@ -17,26 +15,26 @@ branch_variable_t line_search_diving(const std::vector& fractional, const std::vector& root_solution, logger_t& log) { - constexpr f_t eps = 1e-6; - i_t branch_var = -1; - f_t min_score = std::numeric_limits::max(); - rounding_direction_t round_dir = rounding_direction_t::NONE; + constexpr f_t eps = 1e-6; + i_t branch_var = -1; + f_t min_score = std::numeric_limits::max(); + branch_direction_t round_dir = branch_direction_t::NONE; for (i_t j : fractional) { - f_t score = inf; - rounding_direction_t dir = rounding_direction_t::NONE; + f_t score = inf; + branch_direction_t dir = branch_direction_t::NONE; if (solution[j] < root_solution[j] - eps) { f_t f = solution[j] - 
std::floor(solution[j]); f_t d = root_solution[j] - solution[j]; score = f / d; - dir = rounding_direction_t::DOWN; + dir = branch_direction_t::DOWN; } else if (solution[j] > root_solution[j] + eps) { f_t f = std::ceil(solution[j]) - solution[j]; f_t d = solution[j] - root_solution[j]; score = f / d; - dir = rounding_direction_t::UP; + dir = branch_direction_t::UP; } if (min_score > score) { @@ -48,12 +46,12 @@ branch_variable_t line_search_diving(const std::vector& fractional, // If the current solution is equal to the root solution, arbitrarily // set the branch variable to the first fractional variable and round it down - if (round_dir == rounding_direction_t::NONE) { + if (round_dir == branch_direction_t::NONE) { branch_var = fractional[0]; - round_dir = rounding_direction_t::DOWN; + round_dir = branch_direction_t::DOWN; } - assert(round_dir != rounding_direction_t::NONE); + assert(round_dir != branch_direction_t::NONE); assert(branch_var >= 0); log.debug("Line search diving: selected var %d with val = %e, round dir = %d and score = %e\n", @@ -72,14 +70,63 @@ branch_variable_t pseudocost_diving(pseudo_costs_t& pc, const std::vector& root_solution, logger_t& log) { - return pseudocost_diving_from_arrays(pc.pseudo_cost_sum_down.data(), - pc.pseudo_cost_sum_up.data(), - pc.pseudo_cost_num_down.data(), - pc.pseudo_cost_num_up.data(), - (i_t)pc.pseudo_cost_sum_down.size(), - fractional, - solution, - root_solution); + const i_t num_fractional = fractional.size(); + if (num_fractional == 0) return {-1, branch_direction_t::NONE}; + + f_t avg_down = pc.compute_pseudocost_average_down(); + f_t avg_up = pc.compute_pseudocost_average_up(); + + i_t branch_var = fractional[0]; + f_t max_score = std::numeric_limits::lowest(); + branch_direction_t round_dir = branch_direction_t::DOWN; + constexpr f_t eps = f_t(1e-6); + + for (i_t j : fractional) { + f_t f_down = solution[j] - std::floor(solution[j]); + f_t f_up = std::ceil(solution[j]) - solution[j]; + f_t pc_down = 
pc.get_pseudocost_down(j, avg_down); + f_t pc_up = pc.get_pseudocost_up(j, avg_up); + f_t score_down = std::sqrt(f_up) * (1 + pc_up) / (1 + pc_down); + f_t score_up = std::sqrt(f_down) * (1 + pc_down) / (1 + pc_up); + + f_t score = 0; + branch_direction_t dir = branch_direction_t::DOWN; + + f_t root_val = (j < static_cast(root_solution.size())) ? root_solution[j] : solution[j]; + + if (solution[j] < root_val - f_t(0.4)) { + score = score_down; + dir = branch_direction_t::DOWN; + } else if (solution[j] > root_val + f_t(0.4)) { + score = score_up; + dir = branch_direction_t::UP; + } else if (f_down < f_t(0.3)) { + score = score_down; + dir = branch_direction_t::DOWN; + } else if (f_down > f_t(0.7)) { + score = score_up; + dir = branch_direction_t::UP; + } else if (pc_down < pc_up + eps) { + score = score_down; + dir = branch_direction_t::DOWN; + } else { + score = score_up; + dir = branch_direction_t::UP; + } + + if (score > max_score) { + max_score = score; + branch_var = j; + round_dir = dir; + } + } + + if (round_dir == branch_direction_t::NONE) { + branch_var = fractional[0]; + round_dir = branch_direction_t::DOWN; + } + + return {branch_var, round_dir}; } template @@ -89,14 +136,39 @@ branch_variable_t guided_diving(pseudo_costs_t& pc, const std::vector& incumbent, logger_t& log) { - return guided_diving_from_arrays(pc.pseudo_cost_sum_down.data(), - pc.pseudo_cost_sum_up.data(), - pc.pseudo_cost_num_down.data(), - pc.pseudo_cost_num_up.data(), - (i_t)pc.pseudo_cost_sum_down.size(), - fractional, - solution, - incumbent); + const i_t num_fractional = fractional.size(); + if (num_fractional == 0) return {-1, branch_direction_t::NONE}; + + f_t avg_down = pc.compute_pseudocost_average_down(); + f_t avg_up = pc.compute_pseudocost_average_up(); + + i_t branch_var = fractional[0]; + f_t max_score = std::numeric_limits::lowest(); + branch_direction_t round_dir = branch_direction_t::DOWN; + constexpr f_t eps = f_t(1e-6); + + for (i_t j : fractional) { + f_t f_down = 
solution[j] - std::floor(solution[j]); + f_t f_up = std::ceil(solution[j]) - solution[j]; + f_t down_dist = std::abs(incumbent[j] - std::floor(solution[j])); + f_t up_dist = std::abs(std::ceil(solution[j]) - incumbent[j]); + branch_direction_t dir = + down_dist < up_dist + eps ? branch_direction_t::DOWN : branch_direction_t::UP; + + f_t pc_down = pc.get_pseudocost_down(j, avg_down); + f_t pc_up = pc.get_pseudocost_up(j, avg_up); + f_t score1 = dir == branch_direction_t::DOWN ? 5 * pc_down * f_down : 5 * pc_up * f_up; + f_t score2 = dir == branch_direction_t::DOWN ? pc_up * f_up : pc_down * f_down; + f_t score = (score1 + score2) / 6; + + if (score > max_score) { + max_score = score; + branch_var = j; + round_dir = dir; + } + } + + return {branch_var, round_dir}; } template @@ -130,10 +202,10 @@ branch_variable_t coefficient_diving(const lp_problem_t& lp_probl const std::vector& down_locks, logger_t& log) { - i_t branch_var = -1; - i_t min_locks = std::numeric_limits::max(); - rounding_direction_t round_dir = rounding_direction_t::NONE; - constexpr f_t eps = 1e-6; + i_t branch_var = -1; + i_t min_locks = std::numeric_limits::max(); + branch_direction_t round_dir = branch_direction_t::NONE; + constexpr f_t eps = 1e-6; for (i_t j : fractional) { f_t f_down = solution[j] - std::floor(solution[j]); @@ -151,18 +223,18 @@ branch_variable_t coefficient_diving(const lp_problem_t& lp_probl branch_var = j; if (up_lock < down_lock) { - round_dir = rounding_direction_t::UP; + round_dir = branch_direction_t::UP; } else if (up_lock > down_lock) { - round_dir = rounding_direction_t::DOWN; + round_dir = branch_direction_t::DOWN; } else if (f_down < f_up + eps) { - round_dir = rounding_direction_t::DOWN; + round_dir = branch_direction_t::DOWN; } else { - round_dir = rounding_direction_t::UP; + round_dir = branch_direction_t::UP; } } } - assert(round_dir != rounding_direction_t::NONE); + assert(round_dir != branch_direction_t::NONE); assert(branch_var >= 0); log.debug( diff --git 
a/cpp/src/branch_and_bound/mip_node.cpp b/cpp/src/branch_and_bound/mip_node.cpp deleted file mode 100644 index 7b0f644f4e..0000000000 --- a/cpp/src/branch_and_bound/mip_node.cpp +++ /dev/null @@ -1,18 +0,0 @@ -/* clang-format off */ -/* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - */ -/* clang-format on */ - -#include - -namespace cuopt::linear_programming::dual_simplex { - -bool inactive_status(node_status_t status) -{ - return (status == node_status_t::FATHOMED || status == node_status_t::INTEGER_FEASIBLE || - status == node_status_t::INFEASIBLE || status == node_status_t::NUMERICAL); -} - -} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/branch_and_bound/mip_node.hpp b/cpp/src/branch_and_bound/mip_node.hpp index a24f67c3bc..694a7099c4 100644 --- a/cpp/src/branch_and_bound/mip_node.hpp +++ b/cpp/src/branch_and_bound/mip_node.hpp @@ -7,6 +7,8 @@ #pragma once +#include + #include #include @@ -29,9 +31,11 @@ enum class node_status_t : int { NUMERICAL = 5 // Encountered numerical issue when solving the LP relaxation }; -enum class rounding_direction_t : int8_t { NONE = -1, DOWN = 0, UP = 1 }; - -bool inactive_status(node_status_t status); +inline bool inactive_status(node_status_t status) +{ + return (status == node_status_t::FATHOMED || status == node_status_t::INTEGER_FEASIBLE || + status == node_status_t::INFEASIBLE || status == node_status_t::NUMERICAL); +} template class mip_node_t { @@ -64,7 +68,7 @@ class mip_node_t { parent(nullptr), node_id(0), branch_var(-1), - branch_dir(rounding_direction_t::NONE), + branch_dir(branch_direction_t::NONE), branch_var_lower(-std::numeric_limits::infinity()), branch_var_upper(std::numeric_limits::infinity()), fractional_val(std::numeric_limits::infinity()), @@ -82,7 +86,7 @@ class mip_node_t { parent(nullptr), node_id(0), branch_var(-1), - branch_dir(rounding_direction_t::NONE), + 
branch_dir(branch_direction_t::NONE), integer_infeasible(-1), objective_estimate(std::numeric_limits::infinity()), vstatus(basis) @@ -95,7 +99,7 @@ class mip_node_t { mip_node_t* parent_node, i_t node_num, i_t branch_variable, - rounding_direction_t branch_direction, + branch_direction_t branch_direction, f_t branch_var_value, i_t integer_inf, const std::vector& basis) @@ -111,10 +115,10 @@ class mip_node_t { objective_estimate(parent_node->objective_estimate), vstatus(basis) { - branch_var_lower = branch_direction == rounding_direction_t::DOWN ? problem.lower[branch_var] - : std::ceil(branch_var_value); - branch_var_upper = branch_direction == rounding_direction_t::DOWN ? std::floor(branch_var_value) - : problem.upper[branch_var]; + branch_var_lower = branch_direction == branch_direction_t::DOWN ? problem.lower[branch_var] + : std::ceil(branch_var_value); + branch_var_upper = branch_direction == branch_direction_t::DOWN ? std::floor(branch_var_value) + : problem.upper[branch_var]; children[0] = nullptr; children[1] = nullptr; } @@ -282,7 +286,7 @@ class mip_node_t { i_t depth; i_t node_id; i_t branch_var; - rounding_direction_t branch_dir; + branch_direction_t branch_dir; f_t branch_var_lower; f_t branch_var_upper; f_t fractional_val; @@ -312,7 +316,7 @@ class mip_node_t { const mip_node_t* node = this; while (node != nullptr && node->branch_var >= 0) { uint64_t step = static_cast(node->branch_var) << 1; - step |= (node->branch_dir == rounding_direction_t::UP) ? 1 : 0; + step |= (node->branch_dir == branch_direction_t::UP) ? 
1 : 0; path_steps.push_back(step); node = node->parent; } @@ -359,7 +363,7 @@ class search_tree_t { parent_node, ++id, branch_var, - rounding_direction_t::DOWN, + branch_direction_t::DOWN, fractional_val, integer_infeasible, parent_vstatus); @@ -367,14 +371,14 @@ class search_tree_t { parent_node, down_child.get(), branch_var, - rounding_direction_t::DOWN, + branch_direction_t::DOWN, std::floor(fractional_val)); auto up_child = std::make_unique>(original_lp, parent_node, ++id, branch_var, - rounding_direction_t::UP, + branch_direction_t::UP, fractional_val, integer_infeasible, parent_vstatus); @@ -383,7 +387,7 @@ class search_tree_t { parent_node, up_child.get(), branch_var, - rounding_direction_t::UP, + branch_direction_t::UP, std::ceil(fractional_val)); assert(parent_vstatus.size() == original_lp.num_cols); @@ -405,7 +409,7 @@ class search_tree_t { const mip_node_t* origin_ptr, const mip_node_t* dest_ptr, const i_t branch_var, - rounding_direction_t branch_dir, + branch_direction_t branch_dir, const f_t bound) { if (write_graphviz) { @@ -413,7 +417,7 @@ class search_tree_t { origin_ptr->node_id, dest_ptr->node_id, branch_var, - branch_dir == rounding_direction_t::DOWN ? "<=" : ">=", + branch_dir == branch_direction_t::DOWN ? 
"<=" : ">=", bound); } } diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index c38e98e27d..9cef45edb0 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -7,13 +7,14 @@ #include #include +#include #include #include #include #include -#include +#include #include @@ -24,7 +25,6 @@ #include namespace cuopt::linear_programming::dual_simplex { - namespace { static bool is_dual_simplex_done(dual::status_t status) @@ -218,8 +218,10 @@ void initialize_pseudo_costs_with_estimate(const lp_problem_t& lp, const std::vector& basic_list, const std::vector& nonbasic_list, const std::vector& fractional, + const csc_matrix_t& AT, basis_update_mpf_t& basis_factors, - pseudo_costs_t& pc) + std::vector& strong_branch_down, + std::vector& strong_branch_up) { i_t m = lp.num_rows; i_t n = lp.num_cols; @@ -246,7 +248,7 @@ void initialize_pseudo_costs_with_estimate(const lp_problem_t& lp, objective_change_estimate_t estimate = single_pivot_objective_change_estimate(lp, settings, - pc.AT, + AT, vstatus, j, basic_map[j], @@ -258,8 +260,8 @@ void initialize_pseudo_costs_with_estimate(const lp_problem_t& lp, workspace, delta_z, work_estimate); - pc.strong_branch_down[k] = estimate.down_obj_change; - pc.strong_branch_up[k] = estimate.up_obj_change; + strong_branch_down[k] = estimate.down_obj_change; + strong_branch_up[k] = estimate.up_obj_change; } } @@ -298,12 +300,14 @@ void strong_branch_helper(i_t start, f_t root_obj, f_t upper_bound, i_t iter_limit, - pseudo_costs_t& pc, + std::vector& strong_branch_down, + std::vector& strong_branch_up, std::vector& dual_simplex_obj_down, std::vector& dual_simplex_obj_up, std::vector& dual_simplex_status_down, std::vector& dual_simplex_status_up, - shared_strong_branching_context_view_t& sb_view) + shared_strong_branching_context_view_t& sb_view, + omp_atomic_t& num_strong_branches_completed) { raft::common::nvtx::range 
scope("BB::strong_branch_helper"); lp_problem_t child_problem = original_lp; @@ -380,7 +384,7 @@ void strong_branch_helper(i_t start, } if (branch == 0) { - pc.strong_branch_down[k] = std::max(obj - root_obj, 0.0); + strong_branch_down[k] = std::max(obj - root_obj, 0.0); dual_simplex_obj_down[k] = std::max(obj - root_obj, 0.0); dual_simplex_status_down[k] = status; if (verbose) { @@ -393,7 +397,7 @@ void strong_branch_helper(i_t start, toc(start_time)); } } else { - pc.strong_branch_up[k] = std::max(obj - root_obj, 0.0); + strong_branch_up[k] = std::max(obj - root_obj, 0.0); dual_simplex_obj_up[k] = std::max(obj - root_obj, 0.0); dual_simplex_status_up[k] = status; if (verbose) { @@ -431,7 +435,7 @@ void strong_branch_helper(i_t start, } if (toc(start_time) > settings.time_limit) { break; } - const i_t completed = pc.num_strong_branches_completed++; + const i_t completed = num_strong_branches_completed++; if (thread_id == 0 && toc(last_log) > 10) { last_log = tic(); @@ -463,7 +467,7 @@ std::pair trial_branching(const lp_problem_t& ori f_t upper_bound, f_t start_time, i_t iter_limit, - omp_atomic_t& total_lp_iter) + i_t& iter) { lp_problem_t child_problem = original_lp; child_problem.lower[branch_var] = branch_var_lower; @@ -479,7 +483,7 @@ std::pair trial_branching(const lp_problem_t& ori objective_upper_bound(child_problem, upper_bound, child_settings.dual_tol); lp_solution_t solution(original_lp.num_rows, original_lp.num_cols); - i_t iter = 0; + iter = 0; std::vector child_vstatus = vstatus; std::vector child_edge_norms = edge_norms; std::vector child_basic_list = basic_list; @@ -502,7 +506,7 @@ std::pair trial_branching(const lp_problem_t& ori solution, iter, child_edge_norms); - total_lp_iter += iter; + settings.log.debug("Trial branching on variable %d. Lo: %e Up: %e. Iter %d. Status %s. 
Obj %e\n", branch_var, child_problem.lower[branch_var], @@ -569,10 +573,13 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data // Set CSR constraint matrix mps_model.set_csr_constraint_matrix( - csr_A.x.data(), nz, csr_A.j.data(), nz, csr_A.row_start.data(), m + 1); + std::span{csr_A.x.data(), static_cast(nz)}, + std::span{csr_A.j.data(), static_cast(nz)}, + std::span{csr_A.row_start.data(), static_cast(m + 1)}); // Set objective coefficients - mps_model.set_objective_coefficients(lp.objective.data(), n); + mps_model.set_objective_coefficients( + std::span{lp.objective.data(), static_cast(n)}); // The LP is already in minimization form (objective negated for max problems). // Pass identity scaling so PDLP returns the raw DS-space objective directly. @@ -580,8 +587,10 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data mps_model.set_objective_offset(f_t(0.0)); // Set variable bounds - mps_model.set_variable_lower_bounds(lp.lower.data(), n); - mps_model.set_variable_upper_bounds(lp.upper.data(), n); + mps_model.set_variable_lower_bounds( + std::span{lp.lower.data(), static_cast(n)}); + mps_model.set_variable_upper_bounds( + std::span{lp.upper.data(), static_cast(n)}); // Convert row sense and RHS to constraint bounds std::vector constraint_lower(m); @@ -629,8 +638,8 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data } } - mps_model.set_constraint_lower_bounds(constraint_lower.data(), m); - mps_model.set_constraint_upper_bounds(constraint_upper.data(), m); + mps_model.set_constraint_lower_bounds(constraint_lower); + mps_model.set_constraint_upper_bounds(constraint_upper); mps_model.set_maximize(false); return mps_model; @@ -732,9 +741,9 @@ static void batch_pdlp_strong_branching_task( std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); if (warm_start_remaining_time <= 0.0) { return; } - assert(!pc.pdlp_warm_cache.populated && "PDLP warm cache should not be populated at this point"); + 
assert(!pc.pdlp_warm_cache->populated && "PDLP warm cache should not be populated at this point"); - if (!pc.pdlp_warm_cache.populated) { + if (!pc.pdlp_warm_cache->populated) { pdlp_solver_settings_t ws_settings; ws_settings.method = method_t::PDLP; ws_settings.presolver = presolver_t::None; @@ -746,7 +755,7 @@ static void batch_pdlp_strong_branching_task( constexpr int warm_start_iteration_limit = 500000; ws_settings.iteration_limit = warm_start_iteration_limit; ws_settings.time_limit = warm_start_remaining_time; - constexpr f_t pdlp_tolerance = 1e-5; + constexpr f_t pdlp_tolerance = 1e-4; ws_settings.tolerances.relative_dual_tolerance = pdlp_tolerance; ws_settings.tolerances.absolute_dual_tolerance = pdlp_tolerance; ws_settings.tolerances.relative_primal_tolerance = pdlp_tolerance; @@ -756,14 +765,15 @@ static void batch_pdlp_strong_branching_task( ws_settings.inside_mip = true; if (effective_batch_pdlp == 1) { ws_settings.concurrent_halt = &concurrent_halt; } - auto start_time = std::chrono::high_resolution_clock::now(); + auto pdlp_start_time = std::chrono::high_resolution_clock::now(); - auto ws_solution = solve_lp(&pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, ws_settings); + auto ws_solution = solve_lp(&pc.pdlp_warm_cache->batch_pdlp_handle, mps_model, ws_settings); if (verbose) { - auto end_time = std::chrono::high_resolution_clock::now(); + auto pdlp_end_time = std::chrono::high_resolution_clock::now(); auto duration = - std::chrono::duration_cast(end_time - start_time).count(); + std::chrono::duration_cast(pdlp_end_time - pdlp_start_time) + .count(); settings.log.printf( "Original problem solved in %d milliseconds" " and iterations: %d\n", @@ -777,21 +787,21 @@ static void batch_pdlp_strong_branching_task( const auto& ws_dual = ws_solution.get_dual_solution(); // Need to use the pc steam since the batch pdlp handle will get destroyed after the warm // start - cache.initial_primal = rmm::device_uvector(ws_primal, ws_primal.stream()); - 
cache.initial_dual = rmm::device_uvector(ws_dual, ws_dual.stream()); - cache.step_size = ws_solution.get_pdlp_warm_start_data().initial_step_size_; - cache.primal_weight = ws_solution.get_pdlp_warm_start_data().initial_primal_weight_; - cache.pdlp_iteration = ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_; - cache.populated = true; + cache->initial_primal = rmm::device_uvector(ws_primal, ws_primal.stream()); + cache->initial_dual = rmm::device_uvector(ws_dual, ws_dual.stream()); + cache->step_size = ws_solution.get_pdlp_warm_start_data().initial_step_size_; + cache->primal_weight = ws_solution.get_pdlp_warm_start_data().initial_primal_weight_; + cache->pdlp_iteration = ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_; + cache->populated = true; if (verbose) { settings.log.printf( "Cached PDLP warm start: primal=%zu dual=%zu step_size=%e primal_weight=%e iters=%d\n", - cache.initial_primal.size(), - cache.initial_dual.size(), - cache.step_size, - cache.primal_weight, - cache.pdlp_iteration); + cache->initial_primal.size(), + cache->initial_dual.size(), + cache->step_size, + cache->primal_weight, + cache->pdlp_iteration); } } else { if (verbose) { @@ -817,22 +827,23 @@ static void batch_pdlp_strong_branching_task( if (batch_remaining_time <= 0.0) { return; } pdlp_settings.time_limit = batch_remaining_time; - if (pc.pdlp_warm_cache.populated) { + if (pc.pdlp_warm_cache->populated) { auto& cache = pc.pdlp_warm_cache; - pdlp_settings.set_initial_primal_solution(cache.initial_primal.data(), - cache.initial_primal.size(), - cache.batch_pdlp_handle.get_stream()); - pdlp_settings.set_initial_dual_solution( - cache.initial_dual.data(), cache.initial_dual.size(), cache.batch_pdlp_handle.get_stream()); - pdlp_settings.set_initial_step_size(cache.step_size); - pdlp_settings.set_initial_primal_weight(cache.primal_weight); - pdlp_settings.set_initial_pdlp_iteration(cache.pdlp_iteration); + 
pdlp_settings.set_initial_primal_solution(cache->initial_primal.data(), + cache->initial_primal.size(), + cache->batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_dual_solution(cache->initial_dual.data(), + cache->initial_dual.size(), + cache->batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_step_size(cache->step_size); + pdlp_settings.set_initial_primal_weight(cache->primal_weight); + pdlp_settings.set_initial_pdlp_iteration(cache->pdlp_iteration); } if (concurrent_halt.load() == 1) { return; } const auto solutions = batch_pdlp_solve( - &pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, fractional, fraction_values, pdlp_settings); + &pc.pdlp_warm_cache->batch_pdlp_handle, mps_model, fractional, fraction_values, pdlp_settings); f_t batch_pdlp_strong_branching_time = toc(start_batch); // Fail safe in case the batch PDLP failed and produced no solutions @@ -888,13 +899,13 @@ static void batch_pdlp_reliability_branching_task( const std::vector& candidate_vars, const simplex_solver_settings_t& settings, shared_strong_branching_context_view_t& sb_view, - batch_pdlp_warm_cache_t& pdlp_warm_cache, + batch_pdlp_warm_cache_t* pdlp_warm_cache, std::vector& pdlp_obj_down, std::vector& pdlp_obj_up) { - log.printf(rb_mode == 2 ? "RB batch PDLP only for %d candidates\n" - : "RB cooperative batch PDLP and DS for %d candidates\n", - num_candidates); + log.debug(rb_mode == 2 ? 
"RB batch PDLP only for %d candidates\n" + : "RB cooperative batch PDLP and DS for %d candidates\n", + num_candidates); f_t start_batch = tic(); @@ -935,15 +946,16 @@ static void batch_pdlp_reliability_branching_task( } pdlp_settings.time_limit = batch_remaining_time; - if (pdlp_warm_cache.populated) { - auto& cache = pdlp_warm_cache; - pdlp_settings.set_initial_primal_solution( - cache.initial_primal.data(), cache.initial_primal.size(), batch_pdlp_handle.get_stream()); - pdlp_settings.set_initial_dual_solution( - cache.initial_dual.data(), cache.initial_dual.size(), batch_pdlp_handle.get_stream()); - pdlp_settings.set_initial_step_size(cache.step_size); - pdlp_settings.set_initial_primal_weight(cache.primal_weight); - pdlp_settings.set_initial_pdlp_iteration(cache.pdlp_iteration); + if (pdlp_warm_cache->populated) { + pdlp_settings.set_initial_primal_solution(pdlp_warm_cache->initial_primal.data(), + pdlp_warm_cache->initial_primal.size(), + batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_dual_solution(pdlp_warm_cache->initial_dual.data(), + pdlp_warm_cache->initial_dual.size(), + batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_step_size(pdlp_warm_cache->step_size); + pdlp_settings.set_initial_primal_weight(pdlp_warm_cache->primal_weight); + pdlp_settings.set_initial_pdlp_iteration(pdlp_warm_cache->pdlp_iteration); } if (concurrent_halt.load() == 1) { return; } @@ -955,7 +967,7 @@ static void batch_pdlp_reliability_branching_task( if (solutions.get_additional_termination_informations().size() != static_cast(num_candidates) * 2) { - log.printf("RB batch PDLP failed and produced no solutions\n"); + log.debug("RB batch PDLP failed and produced no solutions\n"); return; } @@ -966,10 +978,10 @@ static void batch_pdlp_reliability_branching_task( } } - log.printf("RB batch PDLP completed in %.2fs. Solved %d/%d\n", - batch_pdlp_time, - amount_done, - num_candidates * 2); + log.debug("RB batch PDLP completed in %.2fs. 
Solved %d/%d\n", + batch_pdlp_time, + amount_done, + num_candidates * 2); for (i_t k = 0; k < num_candidates; k++) { if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { @@ -999,21 +1011,31 @@ void strong_branching(const lp_problem_t& original_lp, basis_update_mpf_t& basis_factors, pseudo_costs_t& pc) { - constexpr bool verbose = false; + raft::common::nvtx::range scope("BB::strong_branching"); pc.resize(original_lp.num_cols); - pc.strong_branch_down.assign(fractional.size(), 0); - pc.strong_branch_up.assign(fractional.size(), 0); - pc.num_strong_branches_completed = 0; + std::vector strong_branch_down(fractional.size(), std::numeric_limits::quiet_NaN()); + std::vector strong_branch_up(fractional.size(), std::numeric_limits::quiet_NaN()); + omp_atomic_t num_strong_branches_completed = 0; const f_t elapsed_time = toc(start_time); if (elapsed_time > settings.time_limit) { return; } // 0: no batch PDLP, 1: cooperative batch PDLP and DS, 2: batch PDLP only - const i_t effective_batch_pdlp = - (settings.sub_mip || (settings.deterministic && settings.mip_batch_pdlp_strong_branching == 1)) - ? 0 - : settings.mip_batch_pdlp_strong_branching; + i_t effective_batch_pdlp = settings.mip_batch_pdlp_strong_branching; + + // Disable for sub MIP + if (settings.sub_mip) { effective_batch_pdlp = 0; } + + // Disable if running in deterministic mode + if (settings.deterministic && settings.mip_batch_pdlp_strong_branching == 1) { + effective_batch_pdlp = 0; + } + + // Disable if the number of threads available is too low. 
+ if (omp_get_num_threads() < CUOPT_MIP_BATCH_PDLP_REQUIRED_THREAD_COUNT) { + effective_batch_pdlp = 0; + } if (settings.mip_batch_pdlp_strong_branching != 0 && (settings.sub_mip || settings.deterministic)) { @@ -1049,78 +1071,81 @@ void strong_branching(const lp_problem_t& original_lp, basic_list, nonbasic_list, fractional, + *pc.AT, basis_factors, - pc); + strong_branch_down, + strong_branch_up); } else { -#pragma omp parallel num_threads(settings.num_threads) - { -#pragma omp single nowait - { - if (effective_batch_pdlp != 0) { -#pragma omp task - batch_pdlp_strong_branching_task(settings, - effective_batch_pdlp, - start_time, - concurrent_halt, - original_lp, - new_slacks, - root_solution.x, - fractional, - root_obj, - pc, - sb_view, - pdlp_obj_down, - pdlp_obj_up); - } + if (effective_batch_pdlp != 0) { +#pragma omp task default(shared) + batch_pdlp_strong_branching_task(settings, + effective_batch_pdlp, + start_time, + concurrent_halt, + original_lp, + new_slacks, + root_solution.x, + fractional, + root_obj, + pc, + sb_view, + pdlp_obj_down, + pdlp_obj_up); + } - if (effective_batch_pdlp != 2) { - i_t n = std::min(4 * settings.num_threads, fractional.size()); + if (effective_batch_pdlp != 2) { + i_t n = std::min(4 * settings.num_threads, fractional.size()); // Here we are creating more tasks than the number of threads // such that they can be scheduled dynamically to the threads. -#pragma omp taskloop num_tasks(n) - for (i_t k = 0; k < n; k++) { - i_t start = std::floor(k * fractional.size() / n); - i_t end = std::floor((k + 1) * fractional.size() / n); - - constexpr bool verbose = false; - if (verbose) { - settings.log.printf("Thread id %d task id %d start %d end %d. 
size %d\n", - omp_get_thread_num(), - k, - start, - end, - end - start); - } - - strong_branch_helper(start, - end, - start_time, - original_lp, - settings, - var_types, - fractional, - root_solution.x, - root_vstatus, - edge_norms, - root_obj, - upper_bound, - simplex_iteration_limit, - pc, - dual_simplex_obj_down, - dual_simplex_obj_up, - dual_simplex_status_down, - dual_simplex_status_up, - sb_view); - } - // DS done: signal PDLP to stop (time-limit or all work done) and wait - if (effective_batch_pdlp == 1) { concurrent_halt.store(1); } +#pragma omp taskloop num_tasks(n) default(shared) + for (i_t k = 0; k < n; ++k) { + i_t start = std::floor(k * fractional.size() / n); + i_t end = std::floor((k + 1) * fractional.size() / n); + + constexpr bool verbose = false; + if (verbose) { + settings.log.printf("Thread id %d task id %d start %d end %d. size %d\n", + omp_get_thread_num(), + k, + start, + end, + end - start); } + + strong_branch_helper(start, + end, + start_time, + original_lp, + settings, + var_types, + fractional, + root_solution.x, + root_vstatus, + edge_norms, + root_obj, + upper_bound, + simplex_iteration_limit, + strong_branch_down, + strong_branch_up, + dual_simplex_obj_down, + dual_simplex_obj_up, + dual_simplex_status_down, + dual_simplex_status_up, + sb_view, + num_strong_branches_completed); } + // DS done: signal PDLP to stop (time-limit or all work done) and wait + if (effective_batch_pdlp == 1) { concurrent_halt.store(1); } + } + + if (effective_batch_pdlp != 0) { +#pragma omp taskwait // Wait for the batch PDLP task to finish } } settings.log.printf("Strong branching completed in %.2fs\n", toc(strong_branching_start_time)); + constexpr bool verbose = false; if (verbose) { // Collect Dual Simplex statistics i_t dual_simplex_optimal = 0, dual_simplex_infeasible = 0, dual_simplex_iter_limit = 0; @@ -1178,7 +1203,7 @@ void strong_branching(const lp_problem_t& original_lp, for (i_t k = 0; k < fractional.size(); k++) { for (i_t branch = 0; branch < 
2; branch++) { const bool is_down = (branch == 0); - f_t& sb_dest = is_down ? pc.strong_branch_down[k] : pc.strong_branch_up[k]; + f_t& sb_dest = is_down ? strong_branch_down[k] : strong_branch_up[k]; f_t ds_obj = is_down ? dual_simplex_obj_down[k] : dual_simplex_obj_up[k]; dual::status_t ds_status = is_down ? dual_simplex_status_down[k] : dual_simplex_status_up[k]; @@ -1211,12 +1236,12 @@ void strong_branching(const lp_problem_t& original_lp, } } - pc.pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root = + pc.pdlp_warm_cache->percent_solved_by_batch_pdlp_at_root = (f_t(merged_from_pdlp) / f_t(fractional.size() * 2)) * 100.0; if (verbose) { settings.log.printf( "Batch PDLP for strong branching. Percent solved by batch PDLP at root: %f\n", - pc.pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root); + pc.pdlp_warm_cache->percent_solved_by_batch_pdlp_at_root); settings.log.printf( "Merged results: %d from DS, %d from PDLP, %d unresolved (NaN), %d solved by both\n", merged_from_ds, @@ -1226,22 +1251,57 @@ void strong_branching(const lp_problem_t& original_lp, } } - pc.update_pseudo_costs_from_strong_branching(fractional, root_solution.x); + pc.update_pseudo_costs_from_strong_branching( + fractional, strong_branch_down, strong_branch_up, root_solution.x); +} + +template +inline f_t pseudo_costs_t::compute_pseudocost_average_down() +{ + i_t num_initialized = 0; + f_t avg = 0.0; + + for (size_t j = 0; j < pseudo_cost_sum_down.size(); ++j) { + i_t num = pseudo_cost_num_down[j]; + f_t sum = pseudo_cost_sum_down[j]; + if (num > 0 && std::isfinite(sum)) { + ++num_initialized; + avg += sum / num; + } + } + + return (num_initialized > 0) ? 
avg / num_initialized : 1.0; +} + +template +inline f_t pseudo_costs_t::compute_pseudocost_average_up() +{ + i_t num_initialized = 0; + f_t avg = 0.0; + + for (size_t j = 0; j < pseudo_cost_sum_up.size(); ++j) { + i_t num = pseudo_cost_num_up[j]; + f_t sum = pseudo_cost_sum_up[j]; + if (num > 0 && std::isfinite(sum)) { + ++num_initialized; + avg += sum / num; + } + } + + return (num_initialized > 0) ? avg / num_initialized : 1.0; } template f_t pseudo_costs_t::calculate_pseudocost_score(i_t j, const std::vector& solution, - f_t pseudo_cost_up_avg, - f_t pseudo_cost_down_avg) const + f_t avg_down, + f_t avg_up) const { constexpr f_t eps = 1e-6; - i_t num_up = pseudo_cost_num_up[j]; - i_t num_down = pseudo_cost_num_down[j]; - f_t pc_up = num_up > 0 ? pseudo_cost_sum_up[j] / num_up : pseudo_cost_up_avg; - f_t pc_down = num_down > 0 ? pseudo_cost_sum_down[j] / num_down : pseudo_cost_down_avg; f_t f_down = solution[j] - std::floor(solution[j]); f_t f_up = std::ceil(solution[j]) - solution[j]; + f_t pc_down = get_pseudocost_down(j, avg_down); + f_t pc_up = get_pseudocost_up(j, avg_up); return std::max(f_down * pc_down, eps) * std::max(f_up * pc_up, eps); } @@ -1250,11 +1310,11 @@ void pseudo_costs_t::update_pseudo_costs(mip_node_t* node_pt f_t leaf_objective) { const f_t change_in_obj = std::max(leaf_objective - node_ptr->lower_bound, 0.0); - const f_t frac = node_ptr->branch_dir == rounding_direction_t::DOWN + const f_t frac = node_ptr->branch_dir == branch_direction_t::DOWN ? 
node_ptr->fractional_val - std::floor(node_ptr->fractional_val) : std::ceil(node_ptr->fractional_val) - node_ptr->fractional_val; - if (node_ptr->branch_dir == rounding_direction_t::DOWN) { + if (node_ptr->branch_dir == branch_direction_t::DOWN) { pseudo_cost_sum_down[node_ptr->branch_var] += change_in_obj / frac; pseudo_cost_num_down[node_ptr->branch_var]++; } else { @@ -1263,43 +1323,21 @@ void pseudo_costs_t::update_pseudo_costs(mip_node_t* node_pt } } -template -void pseudo_costs_t::initialized(i_t& num_initialized_down, - i_t& num_initialized_up, - f_t& pseudo_cost_down_avg, - f_t& pseudo_cost_up_avg) const -{ - auto avgs = compute_pseudo_cost_averages(pseudo_cost_sum_down.data(), - pseudo_cost_sum_up.data(), - pseudo_cost_num_down.data(), - pseudo_cost_num_up.data(), - pseudo_cost_sum_down.size()); - pseudo_cost_down_avg = avgs.down_avg; - pseudo_cost_up_avg = avgs.up_avg; -} - template i_t pseudo_costs_t::variable_selection(const std::vector& fractional, - const std::vector& solution, - logger_t& log) + const std::vector& solution) { + raft::common::nvtx::range scope("BB::pseudocost_branching"); + i_t branch_var = fractional[0]; f_t max_score = -1; - i_t num_initialized_down; - i_t num_initialized_up; - f_t pseudo_cost_down_avg; - f_t pseudo_cost_up_avg; - - initialized(num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg); + f_t avg_down = compute_pseudocost_average_down(); + f_t avg_up = compute_pseudocost_average_up(); - log.printf("PC: num initialized down %d up %d avg down %e up %e\n", - num_initialized_down, - num_initialized_up, - pseudo_cost_down_avg, - pseudo_cost_up_avg); + settings.log.debug("PC: avg down %e up %e\n", avg_down, avg_up); for (i_t j : fractional) { - f_t score = calculate_pseudocost_score(j, solution, pseudo_cost_up_avg, pseudo_cost_down_avg); + f_t score = calculate_pseudocost_score(j, solution, avg_down, avg_up); if (score > max_score) { max_score = score; @@ -1307,10 +1345,10 @@ i_t 
pseudo_costs_t::variable_selection(const std::vector& fractio } } - log.debug("Pseudocost branching on %d. Value %e. Score %e.\n", - branch_var, - solution[branch_var], - max_score); + settings.log.debug("Pseudocost branching on %d. Value %e. Score %e.\n", + branch_var, + solution[branch_var], + max_score); return branch_var; } @@ -1322,19 +1360,19 @@ i_t pseudo_costs_t::reliable_variable_selection( branch_and_bound_worker_t* worker, const std::vector& var_types, const branch_and_bound_stats_t& bnb_stats, - const simplex_solver_settings_t& settings, f_t upper_bound, int max_num_tasks, - logger_t& log, const std::vector& new_slacks, const lp_problem_t& original_lp) { - constexpr f_t eps = 1e-6; - f_t start_time = bnb_stats.start_time; - i_t branch_var = fractional[0]; - f_t max_score = -1; - f_t pseudo_cost_down_avg = -1; - f_t pseudo_cost_up_avg = -1; + raft::common::nvtx::range scope("BB::reliability_branching"); + + constexpr f_t eps = 1e-6; + f_t start_time = bnb_stats.start_time; + i_t branch_var = fractional[0]; + f_t max_score = -1; + f_t avg_down{0}; + f_t avg_up{0}; lp_solution_t& leaf_solution = worker->leaf_solution; const int64_t branch_and_bound_lp_iters = bnb_stats.total_lp_iters; @@ -1367,14 +1405,9 @@ i_t pseudo_costs_t::reliable_variable_selection( // In the latter, we are not using the average pseudocost (which calculated in the `initialized` // method). 
if (reliable_threshold == 0) { - i_t num_initialized_up; - i_t num_initialized_down; - initialized(num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg); - log.printf("PC: num initialized down %d up %d avg down %e up %e\n", - num_initialized_down, - num_initialized_up, - pseudo_cost_down_avg, - pseudo_cost_up_avg); + avg_down = compute_pseudocost_average_down(); + avg_up = compute_pseudocost_average_up(); + settings.log.debug("PC: avg down %e up %e\n", avg_down, avg_up); } std::vector> unreliable_list; @@ -1386,8 +1419,7 @@ i_t pseudo_costs_t::reliable_variable_selection( unreliable_list.push_back(std::make_pair(-1, j)); continue; } - f_t score = - calculate_pseudocost_score(j, leaf_solution.x, pseudo_cost_up_avg, pseudo_cost_down_avg); + f_t score = calculate_pseudocost_score(j, leaf_solution.x, avg_down, avg_up); if (score > max_score) { max_score = score; @@ -1396,16 +1428,17 @@ i_t pseudo_costs_t::reliable_variable_selection( } if (unreliable_list.empty()) { - log.printf("pc branching on %d. Value %e. Score %e\n", - branch_var, - leaf_solution.x[branch_var], - max_score); + settings.log.debug("pc branching on %d. Value %e. 
Score %e\n", + branch_var, + leaf_solution.x[branch_var], + max_score); return branch_var; } // 0: no batch PDLP, 1: cooperative batch PDLP and DS, 2: batch PDLP only const i_t rb_mode = settings.mip_batch_pdlp_reliability_branching; + // We don't use batch PDLP in reliability branching if the PDLP warm start data was not filled // This indicates that PDLP alone (not batched) couldn't even run at the root node // So it will most likely perform poorly compared to DS @@ -1415,34 +1448,68 @@ i_t pseudo_costs_t::reliable_variable_selection( constexpr i_t min_num_candidates_for_pdlp = 5; constexpr f_t min_percent_solved_by_batch_pdlp_at_root_for_pdlp = 5.0; // Batch PDLP is either forced or we use the heuristic to decide if it should be used - const bool use_pdlp = (rb_mode == 2) || (rb_mode != 0 && !settings.sub_mip && - !settings.deterministic && pdlp_warm_cache.populated && - unreliable_list.size() > min_num_candidates_for_pdlp && - pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root > - min_percent_solved_by_batch_pdlp_at_root_for_pdlp); - - if (rb_mode != 0 && !pdlp_warm_cache.populated) { - log.printf("PDLP warm start data not populated, using DS only\n"); + // Check if batch PDLP was forced to be on + bool use_pdlp = rb_mode == 2; + + // Use the heuristic to decide if it should be used (in case it is set to automatic) + if (!use_pdlp && rb_mode != 0) { + // Check if it is a sub MIP or the determinism mode is on. 
+ use_pdlp = !settings.sub_mip; + use_pdlp &= !settings.deterministic; + + // Check if the warm cache was filled at the root + use_pdlp &= pdlp_warm_cache->populated; + + // Check if there are enough candidates for batch PDLP + use_pdlp &= unreliable_list.size() > min_num_candidates_for_pdlp; + + // Check if batch PDLP was effective for strong branching at the root node + use_pdlp &= pdlp_warm_cache->percent_solved_by_batch_pdlp_at_root > + min_percent_solved_by_batch_pdlp_at_root_for_pdlp; + + // Check if there are enough threads available + use_pdlp &= omp_get_num_threads() >= CUOPT_MIP_BATCH_PDLP_REQUIRED_THREAD_COUNT; + } + + // Use the heuristic to decide if it should be used (in case it is set to automatic) + if (!use_pdlp && rb_mode != 0) { + // Check if it is a sub MIP or the determinism mode is on. + use_pdlp = !settings.sub_mip; + use_pdlp &= !settings.deterministic; + + // Check if the warm cache was filled at the root + use_pdlp &= pdlp_warm_cache->populated; + + // Check if there are enough candidates for batch PDLP + use_pdlp &= unreliable_list.size() > min_num_candidates_for_pdlp; + + // Check if batch PDLP was effective for strong branching at the root node + use_pdlp &= pdlp_warm_cache->percent_solved_by_batch_pdlp_at_root > + min_percent_solved_by_batch_pdlp_at_root_for_pdlp; + } + + if (rb_mode != 0 && !pdlp_warm_cache->populated) { + settings.log.debug("PDLP warm start data not populated, using DS only\n"); } else if (rb_mode != 0 && settings.sub_mip) { - log.printf("Batch PDLP reliability branching is disabled because sub-MIP is enabled\n"); + settings.log.debug("Batch PDLP reliability branching is disabled because sub-MIP is enabled\n"); } else if (rb_mode != 0 && settings.deterministic) { - log.printf( + settings.log.debug( "Batch PDLP reliability branching is disabled because deterministic mode is enabled\n"); } else if (rb_mode != 0 && unreliable_list.size() < min_num_candidates_for_pdlp) { - log.printf("Not enough candidates to use batch 
PDLP, using DS only\n"); - } else if (rb_mode != 0 && pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root < 5.0) { - log.printf("Percent solved by batch PDLP at root is too low, using DS only\n"); + settings.log.debug("Not enough candidates to use batch PDLP, using DS only\n"); + } else if (rb_mode != 0 && pdlp_warm_cache->percent_solved_by_batch_pdlp_at_root < 5.0) { + settings.log.debug("Percent solved by batch PDLP at root is too low, using DS only\n"); } else if (use_pdlp) { - log.printf( + settings.log.debug( "Using batch PDLP because populated, unreliable list size is %d (> %d), and percent solved " "by batch PDLP at root is %f%% (> %f%%)\n", static_cast(unreliable_list.size()), min_num_candidates_for_pdlp, - pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root, + pdlp_warm_cache->percent_solved_by_batch_pdlp_at_root, min_percent_solved_by_batch_pdlp_at_root_for_pdlp); } - const int num_tasks = std::max(max_num_tasks, 10); + const int num_tasks = std::max(max_num_tasks, 1); const int task_priority = reliability_branching_settings.task_priority; // If both batch PDLP and DS are used we double the max number of candidates const i_t max_num_candidates = use_pdlp ? 
2 * reliability_branching_settings.max_num_candidates @@ -1454,9 +1521,9 @@ i_t pseudo_costs_t::reliable_variable_selection( assert(num_candidates > 0); assert(num_tasks > 0); - log.printf( + settings.log.debug( "RB iters = %d, B&B iters = %d, unreliable = %d, num_tasks = %d, reliable_threshold = %d\n", - strong_branching_lp_iter.load(), + static_cast(strong_branching_lp_iter), branch_and_bound_lp_iters, unreliable_list.size(), num_tasks, @@ -1487,7 +1554,7 @@ i_t pseudo_costs_t::reliable_variable_selection( objective_change_estimate_t estimate = single_pivot_objective_change_estimate(worker->leaf_problem, settings, - AT, + *AT, node_ptr->vstatus, j, basic_map[j], @@ -1503,8 +1570,7 @@ i_t pseudo_costs_t::reliable_variable_selection( score = std::max(estimate.up_obj_change, eps) * std::max(estimate.down_obj_change, eps); } else { // Use the previous score, even if it is unreliable - score = calculate_pseudocost_score( - j, leaf_solution.x, pseudo_cost_up_avg, pseudo_cost_down_avg); + score = calculate_pseudocost_score(j, leaf_solution.x, avg_down, avg_up); } } } else { @@ -1542,7 +1608,7 @@ i_t pseudo_costs_t::reliable_variable_selection( if (use_pdlp) { #pragma omp task default(shared) - batch_pdlp_reliability_branching_task(log, + batch_pdlp_reliability_branching_task(settings.log, rb_mode, num_candidates, start_time, @@ -1554,16 +1620,16 @@ i_t pseudo_costs_t::reliable_variable_selection( candidate_vars, settings, sb_view, - pdlp_warm_cache, + pdlp_warm_cache.get(), pdlp_obj_down, pdlp_obj_up); } if (toc(start_time) > settings.time_limit) { - log.printf("Time limit reached\n"); + settings.log.debug("Time limit reached\n"); if (use_pdlp) { concurrent_halt.store(1); -#pragma omp taskwait +#pragma omp taskwait // Wait for the batch PDLP task to finish } return branch_var; } @@ -1576,26 +1642,20 @@ i_t pseudo_costs_t::reliable_variable_selection( f_t dual_simplex_start_time = tic(); if (rb_mode != 2) { -#pragma omp taskloop if (num_tasks > 1) priority(task_priority) 
num_tasks(num_tasks) \ - shared(score_mutex, \ - sb_view, \ - dual_simplex_obj_down, \ - dual_simplex_obj_up, \ - dual_simplex_status_down, \ - dual_simplex_status_up, \ - unreliable_list) +#pragma omp taskloop if (num_tasks > 1) priority(task_priority) num_tasks(num_tasks) default(shared) for (i_t i = 0; i < num_candidates; ++i) { auto [score, j] = unreliable_list[i]; if (toc(start_time) > settings.time_limit) { continue; } if (rb_mode == 1 && sb_view.is_solved(i)) { - log.printf( + settings.log.debug( "DS skipping variable %d branch down (shared_idx %d): already solved by PDLP\n", j, i); } else { pseudo_cost_mutex_down[j].lock(); if (pseudo_cost_num_down[j] < reliable_threshold) { // Do trial branching on the down branch + i_t iter = 0; const auto [obj, status] = trial_branching(worker->leaf_problem, settings, var_types, @@ -1610,7 +1670,8 @@ i_t pseudo_costs_t::reliable_variable_selection( upper_bound, start_time, iter_limit_per_trial, - strong_branching_lp_iter); + iter); + strong_branching_lp_iter += iter; dual_simplex_obj_down[i] = obj; dual_simplex_status_down[i] = status; @@ -1619,7 +1680,6 @@ i_t pseudo_costs_t::reliable_variable_selection( f_t change_in_x = leaf_solution.x[j] - std::floor(leaf_solution.x[j]); pseudo_cost_sum_down[j] += change_in_obj / change_in_x; pseudo_cost_num_down[j]++; - // Should be valid if were are already here if (rb_mode == 1 && is_dual_simplex_done(status)) { sb_view.mark_solved(i); } } } else { @@ -1633,12 +1693,14 @@ i_t pseudo_costs_t::reliable_variable_selection( const i_t shared_idx = i + num_candidates; if (rb_mode == 1 && sb_view.is_solved(shared_idx)) { - log.printf("DS skipping variable %d branch up (shared_idx %d): already solved by PDLP\n", - j, - shared_idx); + settings.log.debug( + "DS skipping variable %d branch up (shared_idx %d): already solved by PDLP\n", + j, + shared_idx); } else { pseudo_cost_mutex_up[j].lock(); if (pseudo_cost_num_up[j] < reliable_threshold) { + i_t iter = 0; const auto [obj, status] = 
trial_branching(worker->leaf_problem, settings, var_types, @@ -1653,7 +1715,8 @@ i_t pseudo_costs_t::reliable_variable_selection( upper_bound, start_time, iter_limit_per_trial, - strong_branching_lp_iter); + iter); + strong_branching_lp_iter += iter; dual_simplex_obj_up[i] = obj; dual_simplex_status_up[i] = status; @@ -1662,7 +1725,6 @@ i_t pseudo_costs_t::reliable_variable_selection( f_t change_in_x = std::ceil(leaf_solution.x[j]) - leaf_solution.x[j]; pseudo_cost_sum_up[j] += change_in_obj / change_in_x; pseudo_cost_num_up[j]++; - // Should be valid if were are already here if (rb_mode == 1 && is_dual_simplex_done(status)) { sb_view.mark_solved(shared_idx); } } } else { @@ -1674,9 +1736,7 @@ i_t pseudo_costs_t::reliable_variable_selection( if (toc(start_time) > settings.time_limit) { continue; } - score = - calculate_pseudocost_score(j, leaf_solution.x, pseudo_cost_up_avg, pseudo_cost_down_avg); - + score = calculate_pseudocost_score(j, leaf_solution.x, avg_down, avg_up); score_mutex.lock(); if (score > max_score) { max_score = score; @@ -1690,26 +1750,8 @@ i_t pseudo_costs_t::reliable_variable_selection( f_t dual_simplex_elapsed = toc(dual_simplex_start_time); - // TODO put back - // if (rb_mode != 2) { - // if (rb_mode == 1) { - // log.printf( - // "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed, %d skipped - // (PDLP) in %.2fs\n", num_candidates, dual_simplex_optimal.load(), num_candidates * 2, - // dual_simplex_infeasible.load(), num_candidates * 2, - // dual_simplex_failed.load(), num_candidates * 2, - // dual_simplex_skipped.load(), dual_simplex_elapsed); - // } else { - // log.printf( - // "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed in - // %.2fs\n", num_candidates, dual_simplex_optimal.load(), num_candidates * 2, - // dual_simplex_infeasible.load(), num_candidates * 2, dual_simplex_failed.load(), - // num_candidates * 2, dual_simplex_elapsed); - // } - //} - if (use_pdlp) { -#pragma omp 
taskwait +#pragma omp taskwait // Wait for the batch PDLP task to finish i_t pdlp_applied = 0; i_t pdlp_optimal = 0; @@ -1756,22 +1798,21 @@ i_t pseudo_costs_t::reliable_variable_selection( } } - f_t score = - calculate_pseudocost_score(j, leaf_solution.x, pseudo_cost_up_avg, pseudo_cost_down_avg); + f_t score = calculate_pseudocost_score(j, leaf_solution.x, avg_down, avg_up); if (score > max_score) { max_score = score; branch_var = j; } } - log.printf("RB batch PDLP: %d candidates, %d/%d optimal, %d applied to pseudo-costs\n", - num_candidates, - pdlp_optimal, - num_candidates * 2, - pdlp_applied); + settings.log.debug("RB batch PDLP: %d candidates, %d/%d optimal, %d applied to pseudo-costs\n", + num_candidates, + pdlp_optimal, + num_candidates * 2, + pdlp_applied); } - log.printf( + settings.log.debug( "pc branching on %d. Value %e. Score %e\n", branch_var, leaf_solution.x[branch_var], max_score); return branch_var; @@ -1780,37 +1821,30 @@ i_t pseudo_costs_t::reliable_variable_selection( template f_t pseudo_costs_t::obj_estimate(const std::vector& fractional, const std::vector& solution, - f_t lower_bound, - logger_t& log) + f_t lower_bound) { - const i_t num_fractional = fractional.size(); - f_t estimate = lower_bound; - - i_t num_initialized_down; - i_t num_initialized_up; - f_t pseudo_cost_down_avg; - f_t pseudo_cost_up_avg; - - initialized(num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg); + f_t estimate = lower_bound; + f_t avg_down = compute_pseudocost_average_down(); + f_t avg_up = compute_pseudocost_average_up(); for (i_t j : fractional) { - constexpr f_t eps = 1e-6; - i_t num_up = pseudo_cost_num_up[j]; - i_t num_down = pseudo_cost_num_down[j]; - f_t pc_up = num_up > 0 ? pseudo_cost_sum_up[j] / num_up : pseudo_cost_up_avg; - f_t pc_down = num_down > 0 ? 
pseudo_cost_sum_down[j] / num_down : pseudo_cost_down_avg; - f_t f_down = solution[j] - std::floor(solution[j]); - f_t f_up = std::ceil(solution[j]) - solution[j]; + f_t pc_down = get_pseudocost_down(j, avg_down); + f_t pc_up = get_pseudocost_up(j, avg_up); + f_t f_down = solution[j] - std::floor(solution[j]); + f_t f_up = std::ceil(solution[j]) - solution[j]; estimate += std::min(pc_down * f_down, pc_up * f_up); } - log.printf("pseudocost estimate = %e\n", estimate); + settings.log.debug("pseudocost estimate = %e\n", estimate); return estimate; } template void pseudo_costs_t::update_pseudo_costs_from_strong_branching( - const std::vector& fractional, const std::vector& root_soln) + const std::vector& fractional, + const std::vector& strong_branch_down, + const std::vector& strong_branch_up, + const std::vector& root_soln) { for (i_t k = 0; k < fractional.size(); k++) { const i_t j = fractional[k]; @@ -1835,6 +1869,7 @@ void pseudo_costs_t::update_pseudo_costs_from_strong_branching( #ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE template class pseudo_costs_t; +template class pseudo_cost_snapshot_t; template void strong_branching(const lp_problem_t& original_lp, const simplex_solver_settings_t& settings, diff --git a/cpp/src/branch_and_bound/pseudo_costs.hpp b/cpp/src/branch_and_bound/pseudo_costs.hpp index 009bd8b81a..8139054a7b 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.hpp +++ b/cpp/src/branch_and_bound/pseudo_costs.hpp @@ -7,8 +7,9 @@ #pragma once -#include +#include #include +#include #include #include @@ -18,7 +19,6 @@ #include #include -#include #include #include @@ -27,354 +27,6 @@ namespace cuopt::linear_programming::dual_simplex { -template -struct branch_variable_t { - i_t variable; - rounding_direction_t direction; -}; - -template -struct pseudo_cost_update_t { - i_t variable; - rounding_direction_t direction; - f_t delta; - double work_timestamp; - int worker_id; - - bool operator<(const pseudo_cost_update_t& other) const - { - if (work_timestamp != 
other.work_timestamp) return work_timestamp < other.work_timestamp; - if (variable != other.variable) return variable < other.variable; - if (delta != other.delta) return delta < other.delta; - return worker_id < other.worker_id; - } -}; - -template -struct pseudo_cost_averages_t { - f_t down_avg; - f_t up_avg; -}; - -// used to get T from omp_atomic_t based on the fact that omp_atomic_t::operator++ returns T -template -using underlying_type = decltype(std::declval()++); - -// Necessary because omp_atomic_t may be passed instead of f_t -template -auto compute_pseudo_cost_averages(const MaybeWrappedF* pc_sum_down, - const MaybeWrappedF* pc_sum_up, - const MaybeWrappedI* pc_num_down, - const MaybeWrappedI* pc_num_up, - size_t n) -{ - using underlying_f_t = underlying_type; - using underlying_i_t = underlying_type; - - underlying_i_t num_initialized_down = 0; - underlying_i_t num_initialized_up = 0; - underlying_f_t pseudo_cost_down_avg = 0.0; - underlying_f_t pseudo_cost_up_avg = 0.0; - - for (size_t j = 0; j < n; ++j) { - if (pc_num_down[j] > 0) { - ++num_initialized_down; - if (std::isfinite(pc_sum_down[j])) { - pseudo_cost_down_avg += pc_sum_down[j] / pc_num_down[j]; - } - } - if (pc_num_up[j] > 0) { - ++num_initialized_up; - if (std::isfinite(pc_sum_up[j])) { pseudo_cost_up_avg += pc_sum_up[j] / pc_num_up[j]; } - } - } - - pseudo_cost_down_avg = - (num_initialized_down > 0) ? pseudo_cost_down_avg / num_initialized_down : 1.0; - pseudo_cost_up_avg = (num_initialized_up > 0) ? 
pseudo_cost_up_avg / num_initialized_up : 1.0; - - return pseudo_cost_averages_t{pseudo_cost_down_avg, pseudo_cost_up_avg}; -} - -// Variable selection using pseudo-cost product scoring -// Returns the best variable to branch on -template -i_t variable_selection_from_pseudo_costs(const f_t* pc_sum_down, - const f_t* pc_sum_up, - const i_t* pc_num_down, - const i_t* pc_num_up, - i_t n_vars, - const std::vector& fractional, - const std::vector& solution) -{ - const i_t num_fractional = fractional.size(); - if (num_fractional == 0) return -1; - - auto [pc_down_avg, pc_up_avg] = - compute_pseudo_cost_averages(pc_sum_down, pc_sum_up, pc_num_down, pc_num_up, n_vars); - - i_t branch_var = fractional[0]; - f_t max_score = std::numeric_limits::lowest(); - constexpr f_t eps = f_t(1e-6); - - for (i_t j : fractional) { - f_t pc_down = pc_num_down[j] != 0 ? pc_sum_down[j] / pc_num_down[j] : pc_down_avg; - f_t pc_up = pc_num_up[j] != 0 ? pc_sum_up[j] / pc_num_up[j] : pc_up_avg; - const f_t f_down = solution[j] - std::floor(solution[j]); - const f_t f_up = std::ceil(solution[j]) - solution[j]; - f_t score = std::max(f_down * pc_down, eps) * std::max(f_up * pc_up, eps); - if (score > max_score) { - max_score = score; - branch_var = j; - } - } - - return branch_var; -} - -// Objective estimate using pseudo-costs (lock-free implementation) -// Returns lower_bound + estimated cost to reach integer feasibility -template -f_t obj_estimate_from_arrays(const f_t* pc_sum_down, - const f_t* pc_sum_up, - const i_t* pc_num_down, - const i_t* pc_num_up, - i_t n_vars, - const std::vector& fractional, - const std::vector& solution, - f_t lower_bound) -{ - auto [pc_down_avg, pc_up_avg] = - compute_pseudo_cost_averages(pc_sum_down, pc_sum_up, pc_num_down, pc_num_up, n_vars); - - f_t estimate = lower_bound; - constexpr f_t eps = f_t(1e-6); - - for (i_t j : fractional) { - f_t pc_down = pc_num_down[j] != 0 ? pc_sum_down[j] / pc_num_down[j] : pc_down_avg; - f_t pc_up = pc_num_up[j] != 0 ? 
pc_sum_up[j] / pc_num_up[j] : pc_up_avg; - const f_t f_down = solution[j] - std::floor(solution[j]); - const f_t f_up = std::ceil(solution[j]) - solution[j]; - estimate += std::min(std::max(pc_down * f_down, eps), std::max(pc_up * f_up, eps)); - } - - return estimate; -} - -template -branch_variable_t pseudocost_diving_from_arrays(const MaybeWrappedF* pc_sum_down, - const MaybeWrappedF* pc_sum_up, - const MaybeWrappedI* pc_num_down, - const MaybeWrappedI* pc_num_up, - i_t n_vars, - const std::vector& fractional, - const std::vector& solution, - const std::vector& root_solution) -{ - const i_t num_fractional = fractional.size(); - if (num_fractional == 0) return {-1, rounding_direction_t::NONE}; - - auto avgs = compute_pseudo_cost_averages(pc_sum_down, pc_sum_up, pc_num_down, pc_num_up, n_vars); - - i_t branch_var = fractional[0]; - f_t max_score = std::numeric_limits::lowest(); - rounding_direction_t round_dir = rounding_direction_t::DOWN; - constexpr f_t eps = f_t(1e-6); - - for (i_t j : fractional) { - f_t f_down = solution[j] - std::floor(solution[j]); - f_t f_up = std::ceil(solution[j]) - solution[j]; - f_t pc_down = pc_num_down[j] != 0 ? (f_t)pc_sum_down[j] / (f_t)pc_num_down[j] : avgs.down_avg; - f_t pc_up = pc_num_up[j] != 0 ? (f_t)pc_sum_up[j] / (f_t)pc_num_up[j] : avgs.up_avg; - - f_t score_down = std::sqrt(f_up) * (1 + pc_up) / (1 + pc_down); - f_t score_up = std::sqrt(f_down) * (1 + pc_down) / (1 + pc_up); - - f_t score = 0; - rounding_direction_t dir = rounding_direction_t::DOWN; - - f_t root_val = (j < static_cast(root_solution.size())) ? 
root_solution[j] : solution[j]; - - if (solution[j] < root_val - f_t(0.4)) { - score = score_down; - dir = rounding_direction_t::DOWN; - } else if (solution[j] > root_val + f_t(0.4)) { - score = score_up; - dir = rounding_direction_t::UP; - } else if (f_down < f_t(0.3)) { - score = score_down; - dir = rounding_direction_t::DOWN; - } else if (f_down > f_t(0.7)) { - score = score_up; - dir = rounding_direction_t::UP; - } else if (pc_down < pc_up + eps) { - score = score_down; - dir = rounding_direction_t::DOWN; - } else { - score = score_up; - dir = rounding_direction_t::UP; - } - - if (score > max_score) { - max_score = score; - branch_var = j; - round_dir = dir; - } - } - - if (round_dir == rounding_direction_t::NONE) { - branch_var = fractional[0]; - round_dir = rounding_direction_t::DOWN; - } - - return {branch_var, round_dir}; -} - -template -branch_variable_t guided_diving_from_arrays(const MaybeWrappedF* pc_sum_down, - const MaybeWrappedF* pc_sum_up, - const MaybeWrappedI* pc_num_down, - const MaybeWrappedI* pc_num_up, - i_t n_vars, - const std::vector& fractional, - const std::vector& solution, - const std::vector& incumbent) -{ - const i_t num_fractional = fractional.size(); - if (num_fractional == 0) return {-1, rounding_direction_t::NONE}; - - auto avgs = compute_pseudo_cost_averages(pc_sum_down, pc_sum_up, pc_num_down, pc_num_up, n_vars); - - i_t branch_var = fractional[0]; - f_t max_score = std::numeric_limits::lowest(); - rounding_direction_t round_dir = rounding_direction_t::DOWN; - constexpr f_t eps = f_t(1e-6); - - for (i_t j : fractional) { - f_t f_down = solution[j] - std::floor(solution[j]); - f_t f_up = std::ceil(solution[j]) - solution[j]; - f_t down_dist = std::abs(incumbent[j] - std::floor(solution[j])); - f_t up_dist = std::abs(std::ceil(solution[j]) - incumbent[j]); - rounding_direction_t dir = - down_dist < up_dist + eps ? rounding_direction_t::DOWN : rounding_direction_t::UP; - - f_t pc_down = pc_num_down[j] != 0 ? 
(f_t)pc_sum_down[j] / (f_t)pc_num_down[j] : avgs.down_avg; - f_t pc_up = pc_num_up[j] != 0 ? (f_t)pc_sum_up[j] / (f_t)pc_num_up[j] : avgs.up_avg; - - f_t score1 = dir == rounding_direction_t::DOWN ? 5 * pc_down * f_down : 5 * pc_up * f_up; - f_t score2 = dir == rounding_direction_t::DOWN ? pc_up * f_up : pc_down * f_down; - f_t score = (score1 + score2) / 6; - - if (score > max_score) { - max_score = score; - branch_var = j; - round_dir = dir; - } - } - - return {branch_var, round_dir}; -} - -template -class pseudo_cost_snapshot_t { - public: - pseudo_cost_snapshot_t() = default; - - pseudo_cost_snapshot_t(std::vector sum_down, - std::vector sum_up, - std::vector num_down, - std::vector num_up) - : sum_down_(std::move(sum_down)), - sum_up_(std::move(sum_up)), - num_down_(std::move(num_down)), - num_up_(std::move(num_up)) - { - } - - i_t variable_selection(const std::vector& fractional, const std::vector& solution) const - { - return variable_selection_from_pseudo_costs(sum_down_.data(), - sum_up_.data(), - num_down_.data(), - num_up_.data(), - n_vars(), - fractional, - solution); - } - - f_t obj_estimate(const std::vector& fractional, - const std::vector& solution, - f_t lower_bound) const - { - return obj_estimate_from_arrays(sum_down_.data(), - sum_up_.data(), - num_down_.data(), - num_up_.data(), - n_vars(), - fractional, - solution, - lower_bound); - } - - branch_variable_t pseudocost_diving(const std::vector& fractional, - const std::vector& solution, - const std::vector& root_solution) const - { - return pseudocost_diving_from_arrays(sum_down_.data(), - sum_up_.data(), - num_down_.data(), - num_up_.data(), - n_vars(), - fractional, - solution, - root_solution); - } - - branch_variable_t guided_diving(const std::vector& fractional, - const std::vector& solution, - const std::vector& incumbent) const - { - return guided_diving_from_arrays(sum_down_.data(), - sum_up_.data(), - num_down_.data(), - num_up_.data(), - n_vars(), - fractional, - solution, - 
incumbent); - } - - void queue_update( - i_t variable, rounding_direction_t direction, f_t delta, double clock, int worker_id) - { - updates_.push_back({variable, direction, delta, clock, worker_id}); - if (direction == rounding_direction_t::DOWN) { - sum_down_[variable] += delta; - num_down_[variable]++; - } else { - sum_up_[variable] += delta; - num_up_[variable]++; - } - } - - std::vector> take_updates() - { - std::vector> result; - result.swap(updates_); - return result; - } - - i_t n_vars() const { return (i_t)sum_down_.size(); } - - std::vector sum_down_; - std::vector sum_up_; - std::vector num_down_; - std::vector num_up_; - - private: - std::vector> updates_; -}; - template struct reliability_branching_settings_t { // Lower bound for the maximum number of LP iterations for a single trial branching @@ -413,6 +65,12 @@ struct reliability_branching_settings_t { bool rank_candidates_with_dual_pivot = true; }; +template +struct branch_variable_t { + i_t variable; + branch_direction_t direction; +}; + template struct batch_pdlp_warm_cache_t { const raft::handle_t batch_pdlp_handle{}; @@ -425,41 +83,63 @@ struct batch_pdlp_warm_cache_t { bool populated{false}; }; +template +struct pseudo_cost_update_t { + i_t variable; + branch_direction_t direction; + f_t delta; + double work_timestamp; + int worker_id; + + bool operator<(const pseudo_cost_update_t& other) const + { + if (work_timestamp != other.work_timestamp) return work_timestamp < other.work_timestamp; + if (variable != other.variable) return variable < other.variable; + if (delta != other.delta) return delta < other.delta; + return worker_id < other.worker_id; + } +}; + template class pseudo_costs_t { public: - explicit pseudo_costs_t(i_t num_variables) - : pseudo_cost_sum_down(num_variables), + explicit pseudo_costs_t(i_t num_variables, const simplex_solver_settings_t& settings) + : settings(settings), + pseudo_cost_sum_down(num_variables), pseudo_cost_sum_up(num_variables), 
pseudo_cost_num_down(num_variables), pseudo_cost_num_up(num_variables), pseudo_cost_mutex_up(num_variables), pseudo_cost_mutex_down(num_variables), - AT(1, 1, 1) + AT(std::make_shared>(1, 1, 1)), + pdlp_warm_cache(std::make_shared>()) { } - void update_pseudo_costs(mip_node_t* node_ptr, f_t leaf_objective); + pseudo_costs_t(const pseudo_costs_t& other) : pseudo_costs_t(1, other.settings) + { + *this = other; + } - pseudo_cost_snapshot_t create_snapshot() const + pseudo_costs_t& operator=(const pseudo_costs_t& other) { - const i_t n = (i_t)pseudo_cost_sum_down.size(); - std::vector sd(n), su(n); - std::vector nd(n), nu(n); - for (i_t j = 0; j < n; ++j) { - sd[j] = pseudo_cost_sum_down[j]; - su[j] = pseudo_cost_sum_up[j]; - nd[j] = pseudo_cost_num_down[j]; - nu[j] = pseudo_cost_num_up[j]; + if (this != &other) { + this->AT = other.AT; + this->pdlp_warm_cache = other.pdlp_warm_cache; + this->pseudo_cost_num_down = other.pseudo_cost_num_down; + this->pseudo_cost_num_up = other.pseudo_cost_num_up; + this->pseudo_cost_sum_down = other.pseudo_cost_sum_down; + this->pseudo_cost_sum_up = other.pseudo_cost_sum_up; } - return pseudo_cost_snapshot_t( - std::move(sd), std::move(su), std::move(nd), std::move(nu)); + return *this; } + void update_pseudo_costs(mip_node_t* node_ptr, f_t leaf_objective); + void merge_updates(const std::vector>& updates) { for (const auto& upd : updates) { - if (upd.direction == rounding_direction_t::DOWN) { + if (upd.direction == branch_direction_t::DOWN) { pseudo_cost_sum_down[upd.variable] += upd.delta; pseudo_cost_num_down[upd.variable]++; } else { @@ -479,33 +159,42 @@ class pseudo_costs_t { pseudo_cost_mutex_down.resize(num_variables); } - void initialized(i_t& num_initialized_down, - i_t& num_initialized_up, - f_t& pseudo_cost_down_avg, - f_t& pseudo_cost_up_avg) const; + f_t get_pseudocost_down(i_t j, f_t avg) const + { + i_t num = pseudo_cost_num_down[j]; + f_t sum = pseudo_cost_sum_down[j]; + return num > 0 ? 
sum / num : avg; + } + + f_t get_pseudocost_up(i_t j, f_t avg) const + { + i_t num = pseudo_cost_num_up[j]; + f_t sum = pseudo_cost_sum_up[j]; + return num > 0 ? sum / num : avg; + } + + f_t compute_pseudocost_average_down(); + f_t compute_pseudocost_average_up(); f_t obj_estimate(const std::vector& fractional, const std::vector& solution, - f_t lower_bound, - logger_t& log); + f_t lower_bound); - i_t variable_selection(const std::vector& fractional, - const std::vector& solution, - logger_t& log); + i_t variable_selection(const std::vector& fractional, const std::vector& solution); i_t reliable_variable_selection(const mip_node_t* node_ptr, const std::vector& fractional, branch_and_bound_worker_t* worker, const std::vector& var_types, const branch_and_bound_stats_t& bnb_stats, - const simplex_solver_settings_t& settings, f_t upper_bound, int max_num_tasks, - logger_t& log, const std::vector& new_slacks, const lp_problem_t& original_lp); void update_pseudo_costs_from_strong_branching(const std::vector& fractional, + const std::vector& strong_branch_down, + const std::vector& strong_branch_up, const std::vector& root_soln); uint32_t compute_state_hash() const @@ -514,31 +203,68 @@ class pseudo_costs_t { detail::compute_hash(pseudo_cost_num_down) ^ detail::compute_hash(pseudo_cost_num_up); } - uint32_t compute_strong_branch_hash() const - { - return detail::compute_hash(strong_branch_down) ^ detail::compute_hash(strong_branch_up); - } - f_t calculate_pseudocost_score(i_t j, const std::vector& solution, - f_t pseudo_cost_up_avg, - f_t pseudo_cost_down_avg) const; + f_t avg_down, + f_t avg_up) const; + + std::shared_ptr> AT; // Transpose of the constraint matrix A + std::shared_ptr> pdlp_warm_cache; reliability_branching_settings_t reliability_branching_settings; + simplex_solver_settings_t settings; - csc_matrix_t AT; // Transpose of the constraint matrix A + protected: std::vector> pseudo_cost_sum_up; std::vector> pseudo_cost_sum_down; std::vector> 
pseudo_cost_num_up; std::vector> pseudo_cost_num_down; - std::vector strong_branch_down; - std::vector strong_branch_up; std::vector pseudo_cost_mutex_up; std::vector pseudo_cost_mutex_down; - omp_atomic_t num_strong_branches_completed = 0; - omp_atomic_t strong_branching_lp_iter = 0; - batch_pdlp_warm_cache_t pdlp_warm_cache; + omp_atomic_t strong_branching_lp_iter = 0; +}; + +template +class pseudo_cost_snapshot_t : public pseudo_costs_t { + public: + using Base = pseudo_costs_t; + using Base::Base; + + pseudo_cost_snapshot_t(const pseudo_costs_t& other) : Base(1, other.settings) + { + Base::operator=(other); + } + + pseudo_cost_snapshot_t operator=(const pseudo_costs_t& other) + { + return Base::operator=(other); + } + + void queue_update( + i_t variable, branch_direction_t direction, f_t delta, double clock, int worker_id) + { + updates_.push_back({variable, direction, delta, clock, worker_id}); + if (direction == branch_direction_t::DOWN) { + this->pseudo_cost_sum_down[variable] += delta; + ++this->pseudo_cost_num_down[variable]; + } else { + this->pseudo_cost_sum_up[variable] += delta; + ++this->pseudo_cost_num_up[variable]; + } + } + + std::vector> take_updates() + { + std::vector> result; + result.swap(updates_); + return result; + } + + i_t n_vars() const { return this->pseudo_cost_sum_down.size(); } + + private: + std::vector> updates_; }; template diff --git a/cpp/src/branch_and_bound/branch_and_bound_worker.hpp b/cpp/src/branch_and_bound/worker.hpp similarity index 52% rename from cpp/src/branch_and_bound/branch_and_bound_worker.hpp rename to cpp/src/branch_and_bound/worker.hpp index 4de2b43cae..87689e57bb 100644 --- a/cpp/src/branch_and_bound/branch_and_bound_worker.hpp +++ b/cpp/src/branch_and_bound/worker.hpp @@ -7,36 +7,19 @@ #pragma once +#include #include #include #include -#include #include -#include #include -#include #include namespace cuopt::linear_programming::dual_simplex { -constexpr int num_search_strategies = 5; - -// Indicate the search 
and variable selection algorithms used by each thread -// in B&B (See [1]). -// -// [1] T. Achterberg, “Constraint Integer Programming,” PhD, Technischen Universität Berlin, -// Berlin, 2007. doi: 10.14279/depositonce-1634. -enum search_strategy_t : int { - BEST_FIRST = 0, // Best-First + Plunging. - PSEUDOCOST_DIVING = 1, // Pseudocost diving (9.2.5) - LINE_SEARCH_DIVING = 2, // Line search diving (9.2.4) - GUIDED_DIVING = 3, // Guided diving (9.2.3). - COEFFICIENT_DIVING = 4 // Coefficient diving (9.2.1) -}; - template struct branch_and_bound_stats_t { f_t start_time = 0.0; @@ -116,9 +99,8 @@ class branch_and_bound_worker_t { const lp_problem_t& original_lp, const simplex_solver_settings_t& settings) { - internal_node = node->detach_copy(); - start_node = &internal_node; - + internal_node = node->detach_copy(); + start_node = &internal_node; start_lower = original_lp.lower; start_upper = original_lp.upper; search_strategy = type; @@ -130,7 +112,7 @@ class branch_and_bound_worker_t { return node_presolver.bounds_strengthening(settings, bounds_changed, start_lower, start_upper); } - // Set the variables bounds for the LP relaxation of the current node. + // Set the variables bounds for the LP relaxation in the current node. 
bool set_lp_variable_bounds(mip_node_t* node_ptr, const simplex_solver_settings_t& settings) { @@ -162,120 +144,4 @@ class branch_and_bound_worker_t { mip_node_t internal_node; }; -template -class branch_and_bound_worker_pool_t { - public: - void init(i_t num_workers, - const lp_problem_t& original_lp, - const csr_matrix_t& Arow, - const std::vector& var_type, - const simplex_solver_settings_t& settings) - { - workers_.resize(num_workers); - num_idle_workers_ = num_workers; - for (i_t i = 0; i < num_workers; ++i) { - workers_[i] = std::make_unique>( - i, original_lp, Arow, var_type, settings); - idle_workers_.push_front(i); - } - - is_initialized = true; - } - - // Here, we are assuming that the scheduler is the only - // thread that can retrieve/pop an idle worker. - branch_and_bound_worker_t* get_idle_worker() - { - std::lock_guard lock(mutex_); - if (idle_workers_.empty()) { - return nullptr; - } else { - i_t idx = idle_workers_.front(); - return workers_[idx].get(); - } - } - - // Here, we are assuming that the scheduler is the only - // thread that can retrieve/pop an idle worker. 
- void pop_idle_worker() - { - std::lock_guard lock(mutex_); - if (!idle_workers_.empty()) { - idle_workers_.pop_front(); - num_idle_workers_--; - } - } - - void return_worker_to_pool(branch_and_bound_worker_t* worker) - { - worker->is_active = false; - std::lock_guard lock(mutex_); - idle_workers_.push_back(worker->worker_id); - num_idle_workers_++; - } - - f_t get_lower_bound() - { - f_t lower_bound = std::numeric_limits::infinity(); - - if (is_initialized) { - for (i_t i = 0; i < workers_.size(); ++i) { - if (workers_[i]->search_strategy == BEST_FIRST && workers_[i]->is_active) { - lower_bound = std::min(workers_[i]->lower_bound.load(), lower_bound); - } - } - } - - return lower_bound; - } - - i_t num_idle_workers() { return num_idle_workers_; } - - private: - // Worker pool - std::vector>> workers_; - bool is_initialized = false; - - omp_mutex_t mutex_; - std::deque idle_workers_; - omp_atomic_t num_idle_workers_; -}; - -template -std::vector get_search_strategies( - diving_heuristics_settings_t settings) -{ - std::vector types; - types.reserve(num_search_strategies); - types.push_back(BEST_FIRST); - if (settings.pseudocost_diving != 0) { types.push_back(PSEUDOCOST_DIVING); } - if (settings.line_search_diving != 0) { types.push_back(LINE_SEARCH_DIVING); } - if (settings.guided_diving != 0) { types.push_back(GUIDED_DIVING); } - if (settings.coefficient_diving != 0) { types.push_back(COEFFICIENT_DIVING); } - return types; -} - -template -std::array get_max_workers( - i_t num_workers, const std::vector& strategies) -{ - std::array max_num_workers; - max_num_workers.fill(0); - - i_t bfs_workers = std::max(strategies.size() == 1 ? 
num_workers : num_workers / 4, 1); - max_num_workers[BEST_FIRST] = bfs_workers; - - i_t diving_workers = (num_workers - bfs_workers); - i_t m = strategies.size() - 1; - - for (size_t i = 1, k = 0; i < strategies.size(); ++i) { - i_t start = (double)k * diving_workers / m; - i_t end = (double)(k + 1) * diving_workers / m; - max_num_workers[strategies[i]] = end - start; - ++k; - } - - return max_num_workers; -} - } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/branch_and_bound/worker_pool.hpp b/cpp/src/branch_and_bound/worker_pool.hpp new file mode 100644 index 0000000000..2b52b6e7bf --- /dev/null +++ b/cpp/src/branch_and_bound/worker_pool.hpp @@ -0,0 +1,130 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once + +#include + +namespace cuopt::linear_programming::dual_simplex { + +template +class branch_and_bound_worker_pool_t { + public: + void init(i_t num_workers, + const lp_problem_t& original_lp, + const csr_matrix_t& Arow, + const std::vector& var_type, + const simplex_solver_settings_t& settings) + { + workers_.resize(num_workers); + num_idle_workers_ = num_workers; + for (i_t i = 0; i < num_workers; ++i) { + workers_[i] = std::make_unique>( + i, original_lp, Arow, var_type, settings); + idle_workers_.push_front(i); + } + + is_initialized = true; + } + + // Here, we are assuming that the scheduler is the only + // thread that can retrieve/pop an idle worker. + branch_and_bound_worker_t* get_idle_worker() + { + std::lock_guard lock(mutex_); + if (idle_workers_.empty()) { + return nullptr; + } else { + i_t idx = idle_workers_.front(); + return workers_[idx].get(); + } + } + + // Here, we are assuming that the scheduler is the only + // thread that can retrieve/pop an idle worker. 
+ void pop_idle_worker() + { + std::lock_guard lock(mutex_); + if (!idle_workers_.empty()) { + idle_workers_.pop_front(); + num_idle_workers_--; + } + } + + void return_worker_to_pool(branch_and_bound_worker_t* worker) + { + worker->is_active = false; + std::lock_guard lock(mutex_); + idle_workers_.push_back(worker->worker_id); + num_idle_workers_++; + } + + f_t get_lower_bound() + { + f_t lower_bound = std::numeric_limits::infinity(); + + if (is_initialized) { + for (i_t i = 0; i < workers_.size(); ++i) { + if (workers_[i]->search_strategy == BEST_FIRST && workers_[i]->is_active) { + lower_bound = std::min(workers_[i]->lower_bound.load(), lower_bound); + } + } + } + + return lower_bound; + } + + i_t num_idle_workers() { return num_idle_workers_; } + + private: + // Worker pool + std::vector>> workers_; + bool is_initialized = false; + + omp_mutex_t mutex_; + std::deque idle_workers_; + omp_atomic_t num_idle_workers_; +}; + +template +std::vector get_search_strategies( + diving_heuristics_settings_t settings) +{ + std::vector types; + types.reserve(num_search_strategies); + types.push_back(BEST_FIRST); + if (settings.pseudocost_diving != 0) { types.push_back(PSEUDOCOST_DIVING); } + if (settings.line_search_diving != 0) { types.push_back(LINE_SEARCH_DIVING); } + if (settings.guided_diving != 0) { types.push_back(GUIDED_DIVING); } + if (settings.coefficient_diving != 0) { types.push_back(COEFFICIENT_DIVING); } + return types; +} + +template +std::array get_max_workers( + i_t num_workers, const std::vector& strategies) +{ + std::array max_num_workers; + max_num_workers.fill(0); + + i_t bfs_workers = std::max(strategies.size() == 1 ? 
num_workers : num_workers / 4, 1); + max_num_workers[BEST_FIRST] = bfs_workers; + + i_t diving_workers = (num_workers - bfs_workers); + i_t m = strategies.size() - 1; + + for (size_t i = 1, k = 0; i < strategies.size(); ++i) { + i_t start = (double)k * diving_workers / m; + i_t end = (double)(k + 1) * diving_workers / m; + max_num_workers[strategies[i]] = end - start; + ++k; + } + + return max_num_workers; +} + +} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/cuts/cuts.cpp b/cpp/src/cuts/cuts.cpp index 6d7d97ef0a..0b93ece0c7 100644 --- a/cpp/src/cuts/cuts.cpp +++ b/cpp/src/cuts/cuts.cpp @@ -1878,12 +1878,10 @@ bool cut_generation_t::generate_clique_cuts( static_cast(settings.time_limit), static_cast(toc(start_time))); - if (clique_table_ == nullptr && clique_table_future_ != nullptr && - clique_table_future_->valid()) { + if (clique_table_ == nullptr) { CLIQUE_CUTS_DEBUG("generate_clique_cuts signaling background thread and waiting"); if (signal_extend_) { signal_extend_->store(true, std::memory_order_release); } - clique_table_ = clique_table_future_->get(); - clique_table_future_ = nullptr; +#pragma omp taskwait depend(in : *signal_extend_) if (clique_table_) { CLIQUE_CUTS_DEBUG("generate_clique_cuts received clique table first=%lld addtl=%lld", static_cast(clique_table_->first.size()), diff --git a/cpp/src/cuts/cuts.hpp b/cpp/src/cuts/cuts.hpp index 2da9760e27..2d2a2dcd21 100644 --- a/cpp/src/cuts/cuts.hpp +++ b/cpp/src/cuts/cuts.hpp @@ -406,24 +406,21 @@ class variable_bounds_t; template class cut_generation_t { public: - cut_generation_t( - cut_pool_t& cut_pool, - const lp_problem_t& lp, - const simplex_solver_settings_t& settings, - csr_matrix_t& Arow, - const std::vector& new_slacks, - const std::vector& var_types, - const user_problem_t& user_problem, - const probing_implied_bound_t& probing_implied_bound, - std::shared_ptr> clique_table = nullptr, - std::future>>* clique_table_future = nullptr, - std::atomic* signal_extend = 
nullptr) + cut_generation_t(cut_pool_t& cut_pool, + const lp_problem_t& lp, + const simplex_solver_settings_t& settings, + csr_matrix_t& Arow, + const std::vector& new_slacks, + const std::vector& var_types, + const user_problem_t& user_problem, + const probing_implied_bound_t& probing_implied_bound, + std::shared_ptr> clique_table = nullptr, + omp_atomic_t* signal_extend = nullptr) : cut_pool_(cut_pool), knapsack_generation_(lp, settings, Arow, new_slacks, var_types), user_problem_(user_problem), probing_implied_bound_(probing_implied_bound), clique_table_(std::move(clique_table)), - clique_table_future_(clique_table_future), signal_extend_(signal_extend) { } @@ -493,8 +490,7 @@ class cut_generation_t { const user_problem_t& user_problem_; const probing_implied_bound_t& probing_implied_bound_; std::shared_ptr> clique_table_; - std::future>>* clique_table_future_{nullptr}; - std::atomic* signal_extend_{nullptr}; + omp_atomic_t* signal_extend_{nullptr}; }; template diff --git a/cpp/src/dual_simplex/basis_updates.cpp b/cpp/src/dual_simplex/basis_updates.cpp index 9c56ada50e..fdf8acf07d 100644 --- a/cpp/src/dual_simplex/basis_updates.cpp +++ b/cpp/src/dual_simplex/basis_updates.cpp @@ -2431,7 +2431,22 @@ int basis_update_mpf_t::refactor_basis( assert(q.size() == A.m); reorder_basic_list(q, basic_list); // We no longer need q after reordering the basic list work_estimate_ += 3 * q.size(); - reset(); + + // Check halt before the transpose operations: these can take hundreds of ms + // on large problems (L0 and U0 each have O(fill-in) nonzeros) and have no + // internal halt checks. Catching the flag here avoids the dead zone. + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { + return CONCURRENT_HALT_RETURN; + } + // Inline reset() so we can check halt between the two transposes. 
+ clear(); + L0_.transpose(L0_transpose_); + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { + return CONCURRENT_HALT_RETURN; + } + U0_.transpose(U0_transpose_); + work_estimate_ += 6 * L0_.col_start[L0_.n] + 6 * U0_.col_start[U0_.n]; + reset_stats(); return 0; } diff --git a/cpp/src/dual_simplex/phase2.cpp b/cpp/src/dual_simplex/phase2.cpp index 5b1130796e..75e5ecae3c 100644 --- a/cpp/src/dual_simplex/phase2.cpp +++ b/cpp/src/dual_simplex/phase2.cpp @@ -2488,7 +2488,6 @@ dual::status_t dual_phase2(i_t phase, const i_t n = lp.num_cols; std::vector basic_list(m); std::vector nonbasic_list; - std::vector superbasic_list; basis_update_mpf_t ft(m, settings.refactor_frequency); const bool initialize_basis = true; return dual_phase2_with_advanced_basis(phase, @@ -2688,6 +2687,10 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, vector_norm2(delta_y_steepest_edge)); } + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { + return dual::status_t::CONCURRENT_LIMIT; + } + if (phase == 2) { settings.log.printf(" Iter Objective Num Inf. Sum Inf. 
Perturb Time\n"); } @@ -2735,10 +2738,18 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 0); #endif + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { + return dual::status_t::CONCURRENT_LIMIT; + } + csc_matrix_t A_transpose(1, 1, 0); lp.A.transpose(A_transpose); phase2_work_estimate += 2 * lp.A.col_start[lp.A.n]; + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { + return dual::status_t::CONCURRENT_LIMIT; + } + f_t obj = compute_objective(lp, x); phase2_work_estimate += 2 * n; @@ -2908,6 +2919,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, phase2::compute_delta_y(ft, basic_leaving_index, direction, delta_y_sparse, UTsol_sparse); } timers.btran_time += timers.stop_timer(); + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { + return dual::status_t::CONCURRENT_LIMIT; + } const f_t steepest_edge_norm_check = delta_y_sparse.norm2_squared(); phase2_work_estimate += 2 * delta_y_sparse.i.size(); @@ -2966,6 +2980,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, } } timers.delta_z_time += timers.stop_timer(); + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { + return dual::status_t::CONCURRENT_LIMIT; + } #ifdef COMPUTE_DUAL_RESIDUAL std::vector dual_residual; @@ -3301,6 +3318,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, } timers.ftran_time += timers.stop_timer(); + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { + return dual::status_t::CONCURRENT_LIMIT; + } #ifdef CHECK_PRIMAL_STEP std::vector residual(m); @@ -3331,6 +3351,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, #endif assert(steepest_edge_status == 0); timers.se_norms_time += timers.stop_timer(); + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { + return dual::status_t::CONCURRENT_LIMIT; + 
} timers.start_timer(); // x <- x + delta_x diff --git a/cpp/src/dual_simplex/presolve.hpp b/cpp/src/dual_simplex/presolve.hpp index d570ea933e..d0e2d52812 100644 --- a/cpp/src/dual_simplex/presolve.hpp +++ b/cpp/src/dual_simplex/presolve.hpp @@ -50,58 +50,6 @@ struct lp_problem_t { f_t obj_scale; // 1.0 for min, -1.0 for max bool objective_is_integral{false}; - void write_problem(const std::string& path) const - { - FILE* fid = fopen(path.c_str(), "w"); - if (fid) { - fwrite(&num_rows, sizeof(i_t), 1, fid); - fwrite(&num_cols, sizeof(i_t), 1, fid); - fwrite(&obj_constant, sizeof(f_t), 1, fid); - fwrite(&obj_scale, sizeof(f_t), 1, fid); - i_t is_integral = objective_is_integral ? 1 : 0; - fwrite(&is_integral, sizeof(i_t), 1, fid); - fwrite(objective.data(), sizeof(f_t), num_cols, fid); - fwrite(rhs.data(), sizeof(f_t), num_rows, fid); - fwrite(lower.data(), sizeof(f_t), num_cols, fid); - fwrite(upper.data(), sizeof(f_t), num_cols, fid); - fwrite(A.col_start.data(), sizeof(i_t), A.col_start.size(), fid); - fwrite(A.i.data(), sizeof(i_t), A.i.size(), fid); - fwrite(A.x.data(), sizeof(f_t), A.x.size(), fid); - fclose(fid); - } - } - - void read_problem(const std::string& path) - { - FILE* fid = fopen(path.c_str(), "r"); - if (fid) { - fread(&num_rows, sizeof(i_t), 1, fid); - fread(&num_cols, sizeof(i_t), 1, fid); - fread(&obj_constant, sizeof(f_t), 1, fid); - fread(&obj_scale, sizeof(f_t), 1, fid); - i_t is_integral; - fread(&is_integral, sizeof(i_t), 1, fid); - objective_is_integral = is_integral == 1; - objective.resize(num_cols); - fread(objective.data(), sizeof(f_t), num_cols, fid); - rhs.resize(num_rows); - fread(rhs.data(), sizeof(f_t), num_rows, fid); - lower.resize(num_cols); - fread(lower.data(), sizeof(f_t), num_cols, fid); - upper.resize(num_cols); - fread(upper.data(), sizeof(f_t), num_cols, fid); - A.n = num_cols; - A.m = num_rows; - A.col_start.resize(num_cols + 1); - fread(A.col_start.data(), sizeof(i_t), num_cols + 1, fid); - 
A.i.resize(A.col_start[num_cols]); - fread(A.i.data(), sizeof(i_t), A.i.size(), fid); - A.x.resize(A.i.size()); - fread(A.x.data(), sizeof(f_t), A.x.size(), fid); - fclose(fid); - } - } - void write_mps(const std::string& path) const { std::ofstream mps_file(path); diff --git a/cpp/src/dual_simplex/right_looking_lu.cpp b/cpp/src/dual_simplex/right_looking_lu.cpp index 5cb0185c8c..9eecdc254c 100644 --- a/cpp/src/dual_simplex/right_looking_lu.cpp +++ b/cpp/src/dual_simplex/right_looking_lu.cpp @@ -247,7 +247,7 @@ i_t markowitz_search(const std::vector& Cdegree, constexpr bool verbose = false; i_t nz_max = std::min(m, n); while (nz <= nz_max) { - i_t markowitz_lower_bound = (nz - 1) * (nz - 1); + int64_t markowitz_lower_bound = static_cast(nz - 1) * static_cast(nz - 1); // Search columns of length nz for (const i_t j : col_count[nz]) { assert(Cdegree[j] == nz); @@ -272,7 +272,7 @@ i_t markowitz_search(const std::vector& Cdegree, } #endif assert(Rdegree[i] >= 0); - const i_t Mij = (Rdegree[i] - 1) * (nz - 1); + const int64_t Mij = static_cast(Rdegree[i] - 1) * static_cast(nz - 1); if (Mij < markowitz && std::abs(entry->x) >= threshold_tol * max_in_col && #ifdef THRESHOLD_ROOK_PIVOTING std::abs(entry->x) >= threshold_tol * max_in_row[i] && @@ -291,7 +291,7 @@ i_t markowitz_search(const std::vector& Cdegree, if (markowitz <= markowitz_lower_bound) { break; } - markowitz_lower_bound = (nz - 1) * nz; + markowitz_lower_bound = static_cast(nz - 1) * static_cast(nz); // Search rows of length nz assert(row_count[nz].size() >= 0); @@ -307,7 +307,7 @@ i_t markowitz_search(const std::vector& Cdegree, assert(entry->i == i); const f_t max_in_col = max_in_column[j]; assert(Cdegree[j] >= 0); - const i_t Mij = (nz - 1) * (Cdegree[j] - 1); + const int64_t Mij = static_cast(nz - 1) * static_cast(Cdegree[j] - 1); if (Mij < markowitz && std::abs(entry->x) >= threshold_tol * max_in_col && #ifdef THRESHOLD_ROOK_PIVOTING std::abs(entry->x) >= threshold_tol * max_in_row_i && diff --git 
a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp index b7c619f246..82d922eec3 100644 --- a/cpp/src/dual_simplex/solve.cpp +++ b/cpp/src/dual_simplex/solve.cpp @@ -120,16 +120,17 @@ lp_status_t solve_linear_program_advanced(const lp_problem_t& original std::vector basic_list(m); std::vector nonbasic_list; basis_update_mpf_t ft(m, settings.refactor_frequency); - return solve_linear_program_with_advanced_basis(original_lp, - start_time, - settings, - original_solution, - ft, - basic_list, - nonbasic_list, - vstatus, - edge_norms, - work_unit_context); + lp_status_t result = solve_linear_program_with_advanced_basis(original_lp, + start_time, + settings, + original_solution, + ft, + basic_list, + nonbasic_list, + vstatus, + edge_norms, + work_unit_context); + return result; } template @@ -222,7 +223,10 @@ lp_status_t solve_linear_program_with_advanced_basis( if (phase1_status == dual::status_t::TIME_LIMIT) { return lp_status_t::TIME_LIMIT; } if (phase1_status == dual::status_t::WORK_LIMIT) { return lp_status_t::WORK_LIMIT; } if (phase1_status == dual::status_t::ITERATION_LIMIT) { return lp_status_t::ITERATION_LIMIT; } - if (phase1_status == dual::status_t::CONCURRENT_LIMIT) { return lp_status_t::CONCURRENT_LIMIT; } + if (phase1_status == dual::status_t::CONCURRENT_LIMIT) { + original_solution.iterations = iter; + return lp_status_t::CONCURRENT_LIMIT; + } phase1_obj = phase1_solution.objective; if (phase1_obj > -settings.primal_tol) { settings.log.printf("Dual feasible solution found.\n"); @@ -309,7 +313,10 @@ lp_status_t solve_linear_program_with_advanced_basis( if (status == dual::status_t::TIME_LIMIT) { lp_status = lp_status_t::TIME_LIMIT; } if (status == dual::status_t::WORK_LIMIT) { lp_status = lp_status_t::WORK_LIMIT; } if (status == dual::status_t::ITERATION_LIMIT) { lp_status = lp_status_t::ITERATION_LIMIT; } - if (status == dual::status_t::CONCURRENT_LIMIT) { lp_status = lp_status_t::CONCURRENT_LIMIT; } + if (status == 
dual::status_t::CONCURRENT_LIMIT) { + original_solution.iterations = iter; + return lp_status_t::CONCURRENT_LIMIT; + } if (status == dual::status_t::NUMERICAL) { lp_status = lp_status_t::NUMERICAL_ISSUES; } if (status == dual::status_t::CUTOFF) { lp_status = lp_status_t::CUTOFF; } original_solution.iterations = iter; @@ -581,6 +588,8 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us solution.iterations = barrier_solution.iterations; } + if (barrier_status == lp_status_t::CONCURRENT_LIMIT) { return lp_status_t::CONCURRENT_LIMIT; } + // If we aren't doing crossover, we're done if (!settings.crossover || barrier_lp.Q.n > 0) { return barrier_status; } @@ -681,6 +690,10 @@ lp_status_t solve_linear_program(const user_problem_t& user_problem, std::vector edge_norms; lp_status_t status = solve_linear_program_advanced( original_lp, start_time, settings, lp_solution, vstatus, edge_norms); + if (status == lp_status_t::CONCURRENT_LIMIT) { + solution.iterations = lp_solution.iterations; + return lp_status_t::CONCURRENT_LIMIT; + } uncrush_primal_solution(user_problem, original_lp, lp_solution.x, solution.x); uncrush_dual_solution( user_problem, original_lp, lp_solution.y, lp_solution.z, solution.y, solution.z); diff --git a/cpp/src/grpc/client/solve_remote.cpp b/cpp/src/grpc/client/solve_remote.cpp index 19908557e8..fb39a6d184 100644 --- a/cpp/src/grpc/client/solve_remote.cpp +++ b/cpp/src/grpc/client/solve_remote.cpp @@ -20,6 +20,8 @@ #include #include +#include + namespace cuopt::linear_programming { // Buffer added to the solver's time_limit to account for worker startup, @@ -209,6 +211,15 @@ std::unique_ptr> solve_mip_remote( // Check if user has set incumbent callbacks auto mip_callbacks = settings.get_mip_callbacks(); + const auto var_types = cpu_problem.get_variable_types_host(); + const bool has_sc_variables = + thrust::count(var_types.begin(), var_types.end(), var_t::SEMI_CONTINUOUS) > 0; + if (has_sc_variables && !mip_callbacks.empty()) { + 
CUOPT_LOG_WARN( + "Disabling remote MIP get/set callbacks: semi-continuous models are not " + "supported with callbacks"); + mip_callbacks.clear(); + } bool has_incumbents = !mip_callbacks.empty(); bool enable_tracking = has_incumbents; diff --git a/cpp/src/grpc/cuopt_remote.proto b/cpp/src/grpc/cuopt_remote.proto index d58145a8e6..5231abeaef 100644 --- a/cpp/src/grpc/cuopt_remote.proto +++ b/cpp/src/grpc/cuopt_remote.proto @@ -19,6 +19,7 @@ enum ProblemCategory { enum VariableType { CONTINUOUS = 0; INTEGER = 1; + SEMI_CONTINUOUS = 2; } // Optimization problem representation (field names match cpu_optimization_problem_t) @@ -50,7 +51,7 @@ message OptimizationProblem { repeated double constraint_upper_bounds = 17; bytes row_types = 18; // char array: 'E' (=), 'L' (<=), 'G' (>=), 'N' (objective) - // Variable types (enum-based: CONTINUOUS or INTEGER) + // Variable types (enum-based: CONTINUOUS, INTEGER, or SEMI_CONTINUOUS) repeated VariableType variable_types = 19; // Initial solutions @@ -122,6 +123,10 @@ message PDLPSolverSettings { bool save_best_primal_so_far = 28; bool first_primal_feasible = 29; int32 pdlp_precision = 30; + // Batch-only PDLP settings (e.g. all_primal_feasible, new_bounds, + // fixed_batch_size, generate_batch_primal_dual_solution) are intentionally + // not exposed on the wire: the gRPC SolveLPRequest is single-problem only, + // and these knobs only have meaning under a batch entry point. 
// Warm start data (if provided) PDLPWarmStartData warm_start_data = 50; diff --git a/cpp/src/grpc/grpc_problem_mapper.cpp b/cpp/src/grpc/grpc_problem_mapper.cpp index bc5342defe..14461a5a7c 100644 --- a/cpp/src/grpc/grpc_problem_mapper.cpp +++ b/cpp/src/grpc/grpc_problem_mapper.cpp @@ -111,6 +111,9 @@ void map_problem_to_proto(const cpu_optimization_problem_t& cpu_proble switch (vt) { case var_t::CONTINUOUS: pb_problem->add_variable_types(cuopt::remote::CONTINUOUS); break; case var_t::INTEGER: pb_problem->add_variable_types(cuopt::remote::INTEGER); break; + case var_t::SEMI_CONTINUOUS: + pb_problem->add_variable_types(cuopt::remote::SEMI_CONTINUOUS); + break; default: throw std::runtime_error("map_problem_to_proto: unknown var_t value " + std::to_string(static_cast(vt))); @@ -214,6 +217,7 @@ void map_proto_to_problem(const cuopt::remote::OptimizationProblem& pb_problem, switch (pb_problem.variable_types(i)) { case cuopt::remote::CONTINUOUS: var_types.push_back(var_t::CONTINUOUS); break; case cuopt::remote::INTEGER: var_types.push_back(var_t::INTEGER); break; + case cuopt::remote::SEMI_CONTINUOUS: var_types.push_back(var_t::SEMI_CONTINUOUS); break; default: throw std::runtime_error("Unknown VariableType enum value " + std::to_string(pb_problem.variable_types(i))); @@ -513,6 +517,10 @@ void map_chunked_arrays_to_problem(const cuopt::remote::ChunkedProblemHeader& he vtypes.push_back(var_t::INTEGER); has_ints = true; break; + case cuopt::remote::SEMI_CONTINUOUS: + vtypes.push_back(var_t::SEMI_CONTINUOUS); + has_ints = true; + break; default: throw std::runtime_error("Unknown VariableType enum value " + std::to_string(v) + " in chunked variable_types"); @@ -641,6 +649,7 @@ std::vector build_array_chunk_requests( switch (vt) { case var_t::CONTINUOUS: vt_enums.push_back(cuopt::remote::CONTINUOUS); break; case var_t::INTEGER: vt_enums.push_back(cuopt::remote::INTEGER); break; + case var_t::SEMI_CONTINUOUS: vt_enums.push_back(cuopt::remote::SEMI_CONTINUOUS); break; 
default: throw std::runtime_error("chunk_problem_to_proto: unknown var_t value " + std::to_string(static_cast(vt))); diff --git a/cpp/src/grpc/server/grpc_server_main.cpp b/cpp/src/grpc/server/grpc_server_main.cpp index d638c191b1..3c2f6e0c15 100644 --- a/cpp/src/grpc/server/grpc_server_main.cpp +++ b/cpp/src/grpc/server/grpc_server_main.cpp @@ -189,16 +189,16 @@ int main(int argc, char** argv) ensure_log_dir_exists(); - shm_unlink(SHM_JOB_QUEUE); - shm_unlink(SHM_RESULT_QUEUE); - shm_unlink(SHM_CONTROL); + shm_unlink(SHM_JOB_QUEUE.c_str()); + shm_unlink(SHM_RESULT_QUEUE.c_str()); + shm_unlink(SHM_CONTROL.c_str()); job_queue = static_cast( - create_shared_memory(SHM_JOB_QUEUE, sizeof(JobQueueEntry) * MAX_JOBS)); + create_shared_memory(SHM_JOB_QUEUE.c_str(), sizeof(JobQueueEntry) * MAX_JOBS)); result_queue = static_cast( - create_shared_memory(SHM_RESULT_QUEUE, sizeof(ResultQueueEntry) * MAX_RESULTS)); + create_shared_memory(SHM_RESULT_QUEUE.c_str(), sizeof(ResultQueueEntry) * MAX_RESULTS)); shm_ctrl = static_cast( - create_shared_memory(SHM_CONTROL, sizeof(SharedMemoryControl))); + create_shared_memory(SHM_CONTROL.c_str(), sizeof(SharedMemoryControl))); new (shm_ctrl) SharedMemoryControl{}; for (size_t i = 0; i < MAX_JOBS; ++i) { diff --git a/cpp/src/grpc/server/grpc_server_types.hpp b/cpp/src/grpc/server/grpc_server_types.hpp index dc6684dea5..a88d272242 100644 --- a/cpp/src/grpc/server/grpc_server_types.hpp +++ b/cpp/src/grpc/server/grpc_server_types.hpp @@ -255,9 +255,16 @@ inline std::map chunked_uploads; inline std::mutex chunked_downloads_mutex; inline std::map chunked_downloads; -inline const char* SHM_JOB_QUEUE = "/cuopt_job_queue"; -inline const char* SHM_RESULT_QUEUE = "/cuopt_result_queue"; -inline const char* SHM_CONTROL = "/cuopt_control"; +// Shared memory names include PID to prevent local users from accessing +// segments belonging to other server instances on the same host. 
+inline std::string make_shm_name(const char* base) +{ + return std::string(base) + "_" + std::to_string(getpid()); +} + +inline std::string SHM_JOB_QUEUE = make_shm_name("/cuopt_job_queue"); +inline std::string SHM_RESULT_QUEUE = make_shm_name("/cuopt_result_queue"); +inline std::string SHM_CONTROL = make_shm_name("/cuopt_control"); inline const std::string LOG_DIR = "/tmp/cuopt_logs"; diff --git a/cpp/src/grpc/server/grpc_worker_infra.cpp b/cpp/src/grpc/server/grpc_worker_infra.cpp index b2e28b4550..b1726ffc8b 100644 --- a/cpp/src/grpc/server/grpc_worker_infra.cpp +++ b/cpp/src/grpc/server/grpc_worker_infra.cpp @@ -12,15 +12,15 @@ void cleanup_shared_memory() { if (job_queue) { munmap(job_queue, sizeof(JobQueueEntry) * MAX_JOBS); - shm_unlink(SHM_JOB_QUEUE); + shm_unlink(SHM_JOB_QUEUE.c_str()); } if (result_queue) { munmap(result_queue, sizeof(ResultQueueEntry) * MAX_RESULTS); - shm_unlink(SHM_RESULT_QUEUE); + shm_unlink(SHM_RESULT_QUEUE.c_str()); } if (shm_ctrl) { munmap(shm_ctrl, sizeof(SharedMemoryControl)); - shm_unlink(SHM_CONTROL); + shm_unlink(SHM_CONTROL.c_str()); } } diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu index c23b1d27ca..b968ad18ea 100644 --- a/cpp/src/math_optimization/solver_settings.cu +++ b/cpp/src/math_optimization/solver_settings.cu @@ -113,6 +113,7 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_MIP_HYPER_HEURISTIC_INITIAL_INFEASIBILITY_WEIGHT, &mip_settings.heuristic_params.initial_infeasibility_weight, f_t(1e-9), std::numeric_limits::infinity(), f_t(1000.0), "constraint violation penalty seed"}, {CUOPT_MIP_HYPER_HEURISTIC_RELAXED_LP_TIME_LIMIT, &mip_settings.heuristic_params.relaxed_lp_time_limit, f_t(1e-9), std::numeric_limits::infinity(), f_t(1.0), "base relaxed LP time cap in heuristics"}, {CUOPT_MIP_HYPER_HEURISTIC_RELATED_VARS_TIME_LIMIT, &mip_settings.heuristic_params.related_vars_time_limit, f_t(1e-9), std::numeric_limits::infinity(), 
f_t(30.0), "time for related-variable structure build"}, + {CUOPT_MIP_SEMICONTINUOUS_BIG_M, &mip_settings.semi_continuous_big_m, f_t(1.0), std::numeric_limits::infinity(), f_t(1e10), "big-M value for semi-continuous variables with no finite upper bound"}, }; // Int parameters @@ -146,7 +147,7 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_RANDOM_SEED, &mip_settings.seed, -1, std::numeric_limits::max(), -1}, {CUOPT_MIP_RELIABILITY_BRANCHING, &mip_settings.reliability_branching, -1, std::numeric_limits::max(), -1}, {CUOPT_PDLP_PRECISION, reinterpret_cast(&pdlp_settings.pdlp_precision), CUOPT_PDLP_DEFAULT_PRECISION, CUOPT_PDLP_MIXED_PRECISION, CUOPT_PDLP_DEFAULT_PRECISION}, - {CUOPT_MIP_SCALING, &mip_settings.mip_scaling, CUOPT_MIP_SCALING_OFF, CUOPT_MIP_SCALING_NO_OBJECTIVE, CUOPT_MIP_SCALING_ON}, + {CUOPT_MIP_SCALING, &mip_settings.mip_scaling, CUOPT_MIP_SCALING_OFF, CUOPT_MIP_SCALING_NO_OBJECTIVE, CUOPT_MIP_SCALING_NO_OBJECTIVE}, // MIP heuristic hyper-parameters (hidden from default --help: name contains "hyper_") {CUOPT_MIP_HYPER_HEURISTIC_POPULATION_SIZE, &mip_settings.heuristic_params.population_size, 1, std::numeric_limits::max(), 32, "max solutions in pool"}, {CUOPT_MIP_HYPER_HEURISTIC_NUM_CPUFJ_THREADS, &mip_settings.heuristic_params.num_cpufj_threads, 0, std::numeric_limits::max(), 8, "parallel CPU FJ climbers"}, diff --git a/cpp/src/mip_heuristics/CMakeLists.txt b/cpp/src/mip_heuristics/CMakeLists.txt index 13649682a6..9d5ef320f2 100644 --- a/cpp/src/mip_heuristics/CMakeLists.txt +++ b/cpp/src/mip_heuristics/CMakeLists.txt @@ -36,6 +36,7 @@ set(MIP_NON_LP_FILES ${CMAKE_CURRENT_SOURCE_DIR}/local_search/line_segment_search/line_segment_search.cu ${CMAKE_CURRENT_SOURCE_DIR}/presolve/bounds_presolve.cu ${CMAKE_CURRENT_SOURCE_DIR}/presolve/bounds_update_data.cu + ${CMAKE_CURRENT_SOURCE_DIR}/presolve/semi_continuous.cu ${CMAKE_CURRENT_SOURCE_DIR}/presolve/conditional_bound_strengthening.cu 
${CMAKE_CURRENT_SOURCE_DIR}/presolve/multi_probe.cu ${CMAKE_CURRENT_SOURCE_DIR}/presolve/probing_cache.cu diff --git a/cpp/src/mip_heuristics/diversity/diversity_manager.cu b/cpp/src/mip_heuristics/diversity/diversity_manager.cu index b8dc3d33bf..ddc7b9836d 100644 --- a/cpp/src/mip_heuristics/diversity/diversity_manager.cu +++ b/cpp/src/mip_heuristics/diversity/diversity_manager.cu @@ -599,22 +599,20 @@ solution_t diversity_manager_t::run_solver() run_fj_alone(sol); return sol; } - rins.enable(); + + if (omp_get_num_threads() > CUOPT_MIP_RINS_REQUIRED_THREAD_COUNT) { rins.enable(); } generate_solution(timer.remaining_time(), false); if (timer.check_time_limit()) { - rins.stop_rins(); population.add_external_solutions_to_population(); return population.best_feasible(); } if (check_b_b_preemption()) { - rins.stop_rins(); population.add_external_solutions_to_population(); return population.best_feasible(); } run_fp_alone(); - rins.stop_rins(); population.add_external_solutions_to_population(); return population.best_feasible(); }; diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cu b/cpp/src/mip_heuristics/diversity/lns/rins.cu index c4331343de..9396d7158a 100644 --- a/cpp/src/mip_heuristics/diversity/lns/rins.cu +++ b/cpp/src/mip_heuristics/diversity/lns/rins.cu @@ -24,6 +24,7 @@ #include #include +#include namespace cuopt::linear_programming::detail { template @@ -36,19 +37,6 @@ rins_t::rins_t(mip_solver_context_t& context_, time_limit = context.settings.heuristic_params.rins_time_limit; } -template -rins_thread_t::~rins_thread_t() -{ - this->request_termination(); -} - -template -void rins_thread_t::run_worker() -{ - raft::common::nvtx::range fun_scope("Running RINS"); - rins_ptr->run_rins(); -} - template void rins_t::new_best_incumbent_callback(const std::vector& solution) { @@ -59,23 +47,27 @@ template void rins_t::node_callback(const std::vector& solution, f_t objective) { if (!enabled) return; - node_count++; if (node_count - 
node_count_at_last_improvement < settings.nodes_after_later_improvement) return; - if (node_count - node_count_at_last_rins > settings.node_freq) { // opportunistic early test w/ atomic to avoid having to take the lock - if (!rins_thread->cpu_thread_done) return; - std::lock_guard lock(rins_mutex); + if (!launch_new_task.exchange(false)) return; + bool population_ready = false; - if (rins_thread->cpu_thread_done) { + { std::lock_guard pop_lock(dm.population.write_mutex); population_ready = dm.population.current_size() > 0 && dm.population.is_feasible(); } + if (population_ready) { lp_optimal_solution = solution; - rins_thread->start_cpu_solver(); + + CUOPT_LOG_DEBUG("Launching RINS task"); +#pragma omp task default(none) + run_rins(); + } else { + launch_new_task = true; } } } @@ -83,27 +75,19 @@ void rins_t::node_callback(const std::vector& solution, f_t objec template void rins_t::enable() { - rins_thread = std::make_unique>(); - rins_thread->rins_ptr = this; - seed = cuopt::seed_generator::get_seed(); + seed = cuopt::seed_generator::get_seed(); problem_ptr->handle_ptr->sync_stream(); problem_copy = std::make_unique>(*problem_ptr, &rins_handle); enabled = true; } -template -void rins_t::stop_rins() -{ - enabled = false; - if (rins_thread) rins_thread->request_termination(); - rins_thread.reset(); -} - template void rins_t::run_rins() { - if (total_calls == 0) RAFT_CUDA_TRY(cudaSetDevice(context.handle_ptr->get_device())); + raft::common::nvtx::range fun_scope("Running RINS"); + scope_guard guard([this]() { this->launch_new_task = true; }); + RAFT_CUDA_TRY(cudaSetDevice(context.handle_ptr->get_device())); cuopt_assert(lp_optimal_solution.size() == problem_copy->n_variables, "Assignment size mismatch"); cuopt_assert(problem_copy->handle_ptr == &rins_handle, "Handle mismatch"); // Do not make assertions based on problem_ptr. 
The original problem may have been modified within @@ -229,18 +213,20 @@ void rins_t::run_rins() solution_t fj_solution(fixed_problem); fj_solution.copy_new_assignment(cuopt::host_copy(fixed_assignment, rins_handle.get_stream())); std::vector default_weights(fixed_problem.n_constraints, 1.); - cpu_fj_thread_t cpu_fj_thread; - cpu_fj_thread.fj_cpu = fj.create_cpu_climber(fj_solution, - default_weights, - default_weights, - 0., - context.preempt_heuristic_solver_, - fj_settings_t{}, - true); - cpu_fj_thread.fj_ptr = &fj; - cpu_fj_thread.fj_cpu->log_prefix = "[RINS] "; - cpu_fj_thread.time_limit = time_limit; - cpu_fj_thread.start_cpu_solver(); + + std::unique_ptr> fj_cpu = + fj.create_cpu_climber(fj_solution, + default_weights, + default_weights, + 0., + context.preempt_heuristic_solver_, + fj_settings_t{}, + true); + fj_cpu->log_prefix = "[RINS] "; + + CUOPT_LOG_DEBUG("Launching CPUFJ (RINS) task"); +#pragma omp task shared(fj_cpu) firstprivate(time_limit) default(none) + cpufj_solve(fj_cpu.get(), time_limit); f_t lower_bound = context.branch_and_bound_ptr ? context.branch_and_bound_ptr->get_lower_bound() : -std::numeric_limits::infinity(); @@ -311,13 +297,13 @@ void rins_t::run_rins() static_cast(context.settings.heuristic_params.rins_max_time_limit)); } - cpu_fj_thread.stop_cpu_solver(); - bool fj_solution_found = cpu_fj_thread.wait_for_cpu_solver(); - CUOPT_LOG_DEBUG("RINS FJ ran for %d iterations", cpu_fj_thread.fj_cpu->iterations); - if (fj_solution_found) { - CUOPT_LOG_DEBUG("RINS FJ solution found. Objective %.16e", - cpu_fj_thread.fj_cpu->h_best_objective); - rins_solution_queue.push_back(cpu_fj_thread.fj_cpu->h_best_assignment); +#pragma omp taskwait // Wait for the CPU FJ (RINS) to finish + CUOPT_LOG_DEBUG("CPUFJ (RINS) task was stopped"); + + CUOPT_LOG_DEBUG("RINS FJ ran for %d iterations", fj_cpu->iterations); + if (fj_cpu->feasible_found) { + CUOPT_LOG_DEBUG("RINS FJ solution found. 
Objective %.16e", fj_cpu->h_best_objective); + rins_solution_queue.push_back(fj_cpu->h_best_assignment); } // Thread will be automatically terminated and joined by destructor @@ -357,12 +343,10 @@ void rins_t::run_rins() } #if MIP_INSTANTIATE_FLOAT -template class rins_thread_t; template class rins_t; #endif #if MIP_INSTANTIATE_DOUBLE -template class rins_thread_t; template class rins_t; #endif diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cuh b/cpp/src/mip_heuristics/diversity/lns/rins.cuh index 0a9133f848..b1b62bd1ae 100644 --- a/cpp/src/mip_heuristics/diversity/lns/rins.cuh +++ b/cpp/src/mip_heuristics/diversity/lns/rins.cuh @@ -17,19 +17,11 @@ #pragma once -#include #include #include -#include -#include +#include -#include -#include -#include -#include -#include -#include #include namespace cuopt::linear_programming::detail { @@ -52,18 +44,6 @@ struct rins_settings_t { template class rins_t; -template -struct rins_thread_t : public cpu_worker_thread_base_t> { - ~rins_thread_t(); - - void run_worker(); - void on_terminate() {} - void on_start() {} - bool get_result() { return true; } - - rins_t* rins_ptr{nullptr}; -}; - template class rins_t { public: @@ -74,7 +54,6 @@ class rins_t { void node_callback(const std::vector& solution, f_t objective); void new_best_incumbent_callback(const std::vector& solution); void enable(); - void stop_rins(); void run_rins(); @@ -96,15 +75,13 @@ class rins_t { f_t time_limit{10.}; i_t seed; - std::atomic enabled{false}; - std::atomic lower_bound{0.}; - - std::atomic node_count{0}; - std::atomic node_count_at_last_rins{0}; - std::atomic node_count_at_last_improvement{0}; - std::mutex rins_mutex; + omp_atomic_t enabled{false}; + omp_atomic_t lower_bound{0.}; - std::unique_ptr> rins_thread; + omp_atomic_t node_count{0}; + omp_atomic_t node_count_at_last_rins{0}; + omp_atomic_t node_count_at_last_improvement{0}; + omp_atomic_t launch_new_task{true}; }; } // namespace cuopt::linear_programming::detail diff --git 
a/cpp/src/mip_heuristics/diversity/population.cu b/cpp/src/mip_heuristics/diversity/population.cu index bb0fdd6d11..a870f654de 100644 --- a/cpp/src/mip_heuristics/diversity/population.cu +++ b/cpp/src/mip_heuristics/diversity/population.cu @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -279,6 +280,13 @@ void population_t::invoke_get_solution_callback( temp_sol.assignment.size(), temp_sol.handle_ptr->get_stream()); temp_sol.handle_ptr->sync_stream(); + if (detail::mip_solver_settings_accessor::has_semi_continuous_callback_translation( + context.settings)) { + detail::strip_semi_continuous_auxiliaries_from_assignment( + user_assignment_vec, + detail::mip_solver_settings_accessor::get_semi_continuous_original_num_variables( + context.settings)); + } callback->get_solution(user_assignment_vec.data(), user_objective_vec.data(), user_bound_vec.data(), @@ -314,6 +322,13 @@ void population_t::run_solution_callbacks(solution_t& sol) auto set_sol_callback = static_cast(callback); f_t user_bound = context.stats.get_solution_bound(); auto callback_num_variables = problem_ptr->original_problem_ptr->get_n_variables(); + const bool has_semi_continuous_callback_translation = + detail::mip_solver_settings_accessor::has_semi_continuous_callback_translation( + context.settings); + if (has_semi_continuous_callback_translation) { + callback_num_variables = detail::mip_solver_settings_accessor:: + get_semi_continuous_original_num_variables(context.settings); + } rmm::device_uvector incumbent_assignment(callback_num_variables, sol.handle_ptr->get_stream()); solution_t outside_sol(sol); @@ -333,6 +348,14 @@ void population_t::run_solution_callbacks(solution_t& sol) // asserts if (outside_sol_objective == inf) { return; } d_outside_sol_objective.set_value_async(outside_sol_objective, sol.handle_ptr->get_stream()); + if (has_semi_continuous_callback_translation) { + detail::append_semi_continuous_auxiliaries_to_assignment( + h_incumbent_assignment, + 
detail::mip_solver_settings_accessor:: + get_semi_continuous_binary_to_original_indices(context.settings), + context.settings.get_tolerances()); + } + incumbent_assignment.resize(h_incumbent_assignment.size(), sol.handle_ptr->get_stream()); raft::copy(incumbent_assignment.data(), h_incumbent_assignment.data(), incumbent_assignment.size(), diff --git a/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh b/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh index 5a637aae8e..1d0b9245d7 100644 --- a/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh +++ b/cpp/src/mip_heuristics/diversity/recombiners/sub_mip.cuh @@ -83,6 +83,7 @@ class sub_mip_recombiner_t : public recombiner_t { fixed_problem.reverse_constraints, nullptr, context.settings.hyper_params, + static_cast(1), true); scaling.scale_problem(); fixed_problem.presolve_data.reset_additional_vars(fixed_problem, offspring.handle_ptr); diff --git a/cpp/src/mip_heuristics/feasibility_jump/cpu_fj_thread.cuh b/cpp/src/mip_heuristics/feasibility_jump/cpu_fj_thread.cuh new file mode 100644 index 0000000000..040674e47a --- /dev/null +++ b/cpp/src/mip_heuristics/feasibility_jump/cpu_fj_thread.cuh @@ -0,0 +1,56 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace cuopt::linear_programming::detail { + +template +struct fj_cpu_climber_t; + +template +struct fj_cpu_task_t { + struct fj_cpu_deleter_t { + void operator()(fj_cpu_climber_t* ptr) const; + }; + std::atomic preemption_flag{false}; + std::unique_ptr, fj_cpu_deleter_t> fj_cpu; +}; + +// `seed` selects the FJ RNG seed: pass a non-negative value for a deterministic seed, +// or -1 to draw from the global cuopt::seed_generator (the historical behavior). 
+// In deterministic mode the caller MUST pass an explicit seed, otherwise the underlying +// seed_generator::get_seed() racing with concurrent callers breaks reproducibility. +template +std::unique_ptr> make_fj_cpu_task_from_host_lp( + const dual_simplex::lp_problem_t& problem, + const std::vector& variable_types, + const std::vector& seed_assignment, + const dual_simplex::simplex_solver_settings_t& settings, + std::function&, double)> improvement_callback, + std::string log_prefix, + int64_t seed = -1); + +template +void run_fj_cpu_task(fj_cpu_task_t& task, + f_t time_limit = std::numeric_limits::infinity(), + double work_unit_limit = std::numeric_limits::infinity()); + +template +void stop_fj_cpu_task(fj_cpu_task_t& task); + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu index 8109653e6f..12b6c04070 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu @@ -7,9 +7,7 @@ #include "early_cpufj.cuh" -#include #include -#include namespace cuopt::linear_programming::detail { @@ -32,40 +30,40 @@ early_cpufj_t::~early_cpufj_t() template void early_cpufj_t::start() { - if (cpu_fj_thread_) { return; } + // 1: presolve, 1: early GPU FJ, 1: early CPU FJ + if (fj_cpu_ || omp_get_num_threads() < CUOPT_MIP_EARLY_CPUFJ_REQUIRED_THREAD_COUNT) { return; } this->preemption_flag_.store(false); this->start_time_ = std::chrono::steady_clock::now(); - cpu_fj_thread_ = std::make_unique>(); - cpu_fj_thread_->fj_cpu = - init_fj_cpu_standalone(*this->problem_ptr_, *this->solution_ptr_, preemption_flag_); - cpu_fj_thread_->time_limit = std::numeric_limits::infinity(); + fj_cpu_ = init_fj_cpu_standalone(*this->problem_ptr_, *this->solution_ptr_, preemption_flag_); - cpu_fj_thread_->fj_cpu->log_prefix = "[Early CPUFJ] "; + fj_cpu_->log_prefix = "[Early CPUFJ] "; - 
cpu_fj_thread_->fj_cpu->improvement_callback = - [this](f_t solver_obj, const std::vector& assignment, double) { - this->try_update_best(solver_obj, assignment); - }; + fj_cpu_->improvement_callback = [this](f_t solver_obj, + const std::vector& assignment, + double) { this->try_update_best(solver_obj, assignment); }; - cpu_fj_thread_->start_cpu_solver(); + CUOPT_LOG_DEBUG("Launching early CPUFJ task"); +#pragma omp task shared(fj_cpu_) depend(out : *fj_cpu_) default(none) + cpufj_solve(fj_cpu_.get()); } template void early_cpufj_t::stop() { - if (!cpu_fj_thread_) { return; } + if (!fj_cpu_) { return; } preemption_flag_.store(true); - cpu_fj_thread_->stop_cpu_solver(); - cpu_fj_thread_->wait_for_cpu_solver(); + + fj_cpu_->halted = true; +#pragma omp taskwait depend(in : *fj_cpu_) // Wait for the early CPUFJ task to finish CUOPT_LOG_DEBUG("[Early CPUFJ] Stopped after %d iterations, solution_found=%d", - cpu_fj_thread_->fj_cpu ? cpu_fj_thread_->fj_cpu->iterations : 0, + fj_cpu_ ? fj_cpu_->iterations : 0, this->solution_found_); - cpu_fj_thread_.reset(); + fj_cpu_.reset(); } #if MIP_INSTANTIATE_FLOAT diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh index 911e846551..fd85e4b9f3 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh +++ b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh @@ -8,15 +8,13 @@ #pragma once #include +#include #include #include namespace cuopt::linear_programming::detail { -template -struct cpu_fj_thread_t; - template class early_cpufj_t : public early_heuristic_t> { public: @@ -32,7 +30,7 @@ class early_cpufj_t : public early_heuristic_t void stop(); private: - std::unique_ptr> cpu_fj_thread_; + std::unique_ptr> fj_cpu_; std::atomic preemption_flag_{false}; }; diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu index 3f77427d87..758c6272c1 100644 --- 
a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu @@ -38,7 +38,8 @@ early_gpufj_t::~early_gpufj_t() template void early_gpufj_t::start() { - if (worker_thread_) { return; } + // 1: presolve, 1: early GPU FJ, 1: early CPU FJ + if (fj_ptr_ || omp_get_num_threads() < CUOPT_MIP_EARLY_GPUFJ_REQUIRED_THREAD_COUNT) { return; } this->start_time_ = std::chrono::steady_clock::now(); @@ -57,29 +58,26 @@ void early_gpufj_t::start() this->try_update_best(solver_obj, h_assignment); }; - worker_thread_ = std::make_unique(&early_gpufj_t::run_worker, this); -} + CUOPT_LOG_DEBUG("Launching early GPUFJ task"); -template -void early_gpufj_t::run_worker() -{ - RAFT_CUDA_TRY(cudaSetDevice(this->device_id_)); - fj_ptr_->solve(*this->solution_ptr_); +#pragma omp task default(none) shared(fj_ptr_) depend(out : *fj_ptr_) + { + RAFT_CUDA_TRY(cudaSetDevice(this->device_id_)); + fj_ptr_->solve(*this->solution_ptr_); + } } template void early_gpufj_t::stop() { - if (!worker_thread_) { return; } + if (!fj_ptr_) { return; } context_ptr_->preempt_heuristic_solver_.store(true); - - if (worker_thread_->joinable()) { worker_thread_->join(); } +#pragma omp taskwait depend(in : *fj_ptr_) // Wait for the early GPU FJ task to finish CUOPT_LOG_DEBUG("[Early GPU FJ] Stopped, solution_found=%d", this->solution_found_); fj_ptr_.reset(); - worker_thread_.reset(); } #if MIP_INSTANTIATE_FLOAT diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh index 4a7769143e..e5ceaaeb61 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh +++ b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh @@ -10,7 +10,6 @@ #include #include -#include namespace cuopt::linear_programming::detail { @@ -35,11 +34,8 @@ class early_gpufj_t : public early_heuristic_t void stop(); private: - void run_worker(); - std::unique_ptr> context_ptr_; std::unique_ptr> fj_ptr_; - 
std::unique_ptr worker_thread_; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cu b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cu index 748dd41dfb..6b440aed4f 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cu @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -705,7 +706,9 @@ void fj_t::run_step_device(const rmm::cuda_stream_view& climber_stream data.cub_storage_bytes.resize(compaction_temp_storage_bytes, climber_stream); } - if (use_graph) { cudaStreamBeginCapture(climber_stream, cudaStreamCaptureModeThreadLocal); } + if (use_graph) { + RAFT_CUDA_TRY(cudaStreamBeginCapture(climber_stream, cudaStreamCaptureModeThreadLocal)); + } for (i_t i = 0; i < (use_graph ? iterations_per_graph : 1); ++i) { { // related varialbe array has to be dynamically computed each iteration @@ -718,52 +721,52 @@ void fj_t::run_step_device(const rmm::cuda_stream_view& climber_stream load_balancing_score_update(climber_stream, climber_idx); } else { if (is_binary_pb) { - cudaLaunchCooperativeKernel( + RAFT_CUDA_TRY(cudaLaunchCooperativeKernel( (void*)compute_mtm_moves_kernel, grid_resetmoves_bin, blocks_resetmoves_bin, reset_moves_args, 0, - climber_stream); + climber_stream)); } else { - cudaLaunchCooperativeKernel( + RAFT_CUDA_TRY(cudaLaunchCooperativeKernel( (void*)compute_mtm_moves_kernel, grid_resetmoves, blocks_resetmoves, reset_moves_args, 0, - climber_stream); + climber_stream)); } } #if FJ_DEBUG_LOAD_BALANCING if (use_load_balancing) { - cudaLaunchCooperativeKernel((void*)compute_mtm_moves_kernel, - grid_resetmoves_bin, - blocks_resetmoves_bin, - reset_moves_args, - 0, - climber_stream); - cudaLaunchCooperativeKernel((void*)load_balancing_sanity_checks, - 512, - 128, - kernel_args, - 0, - climber_stream); + RAFT_CUDA_TRY(cudaLaunchCooperativeKernel((void*)compute_mtm_moves_kernel, + 
grid_resetmoves_bin, + blocks_resetmoves_bin, + reset_moves_args, + 0, + climber_stream)); + RAFT_CUDA_TRY(cudaLaunchCooperativeKernel((void*)load_balancing_sanity_checks, + 512, + 128, + kernel_args, + 0, + climber_stream)); } #endif - cudaLaunchKernel((void*)update_lift_moves_kernel, - grid_lift_move, - blocks_lift_move, - kernel_args, - 0, - climber_stream); - cudaLaunchKernel((void*)update_breakthrough_moves_kernel, - grid_lift_move, - blocks_lift_move, - kernel_args, - 0, - climber_stream); + RAFT_CUDA_TRY(cudaLaunchKernel((void*)update_lift_moves_kernel, + grid_lift_move, + blocks_lift_move, + kernel_args, + 0, + climber_stream)); + RAFT_CUDA_TRY(cudaLaunchKernel((void*)update_breakthrough_moves_kernel, + grid_lift_move, + blocks_lift_move, + kernel_args, + 0, + climber_stream)); } // compaction kernel @@ -776,44 +779,49 @@ void fj_t::run_step_device(const rmm::cuda_stream_view& climber_stream pb_ptr->n_variables, climber_stream); - cudaLaunchKernel((void*)select_variable_kernel, - dim3(1), - dim3(256), - kernel_args, - 0, - climber_stream); - - cudaLaunchCooperativeKernel((void*)handle_local_minimum_kernel, - grid_update_weights, - blocks_update_weights, - kernel_args, - 0, - climber_stream); + RAFT_CUDA_TRY(cudaLaunchKernel((void*)select_variable_kernel, + dim3(1), + dim3(256), + kernel_args, + 0, + climber_stream)); + + RAFT_CUDA_TRY(cudaLaunchCooperativeKernel((void*)handle_local_minimum_kernel, + grid_update_weights, + blocks_update_weights, + kernel_args, + 0, + climber_stream)); raft::copy(data.break_condition.data(), data.temp_break_condition.data(), 1, climber_stream); - cudaLaunchKernel((void*)update_assignment_kernel, - grid_setval, - blocks_setval, - update_assignment_args, - 0, - climber_stream); - cudaLaunchKernel((void*)update_changed_constraints_kernel, - 1, - blocks_update_changed_constraints, - kernel_args, - 0, - climber_stream); + RAFT_CUDA_TRY(cudaLaunchKernel((void*)update_assignment_kernel, + grid_setval, + blocks_setval, + 
update_assignment_args, + 0, + climber_stream)); + RAFT_CUDA_TRY(cudaLaunchKernel((void*)update_changed_constraints_kernel, + 1, + blocks_update_changed_constraints, + kernel_args, + 0, + climber_stream)); } if (use_graph) { - cudaStreamEndCapture(climber_stream, &graph); - cudaGraphInstantiate(&graph_instance, graph); + RAFT_CUDA_TRY(cudaStreamEndCapture(climber_stream, &graph)); + try { + RAFT_CUDA_TRY(cudaGraphInstantiate(&graph_instance, graph)); + } catch (...) { + RAFT_CUDA_TRY(cudaGraphDestroy(graph)); + throw; + } RAFT_CHECK_CUDA(climber_stream); - cudaGraphDestroy(graph); + RAFT_CUDA_TRY(cudaGraphDestroy(graph)); graph_created = true; } } - if (use_graph) cudaGraphLaunch(graph_instance, climber_stream); + if (use_graph) RAFT_CUDA_TRY(cudaGraphLaunch(graph_instance, climber_stream)); } template diff --git a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh index 50b451a86e..33d1ac527f 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh +++ b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh @@ -216,8 +216,6 @@ class fj_t { std::atomic& preemption_flag, fj_settings_t settings = fj_settings_t{}, bool randomize_params = false); - bool cpu_solve(fj_cpu_climber_t& fj_cpu, - f_t time_limit = +std::numeric_limits::infinity()); i_t alloc_max_climbers(i_t desired_climbers); void resize_vectors(const raft::handle_t* handle_ptr); void device_init(const rmm::cuda_stream_view& stream); diff --git a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cu b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cu index ebbb761277..e9137503a5 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump_kernels.cu @@ -14,6 +14,8 @@ #include +#include + #include #include "feasibility_jump_impl_common.cuh" diff --git 
a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu index b16f299bf1..575228895b 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu @@ -7,6 +7,10 @@ #include +#include +#include + +#include "cpu_fj_thread.cuh" #include "feasibility_jump.cuh" #include "feasibility_jump_impl_common.cuh" #include "fj_cpu.cuh" @@ -15,7 +19,12 @@ #include +#include +#include + +#include #include +#include #include #include #include @@ -38,6 +47,15 @@ namespace cuopt::linear_programming::detail { +template +void finalize_fj_cpu_host_initialization( + fj_cpu_climber_t& fj_cpu, + i_t n_variables, + i_t n_constraints, + i_t n_integer_vars, + i_t nnz, + const typename mip_solver_settings_t::tolerances_t& tolerances); + template thrust::tuple get_mtm_for_bound(const typename fj_t::climber_data_t::view_t& fj, i_t var_idx, @@ -353,7 +371,7 @@ static void log_regression_features(fj_cpu_climber_t& fj_cpu, double violated_ratio = (double)fj_cpu.violated_constraints.size() / n_cstrs; // Compute per-iteration metrics - double nnz_per_move = 0.0; + [[maybe_unused]] double nnz_per_move = 0.0; i_t total_moves = fj_cpu.n_lift_moves_window + fj_cpu.n_mtm_viol_moves_window + fj_cpu.n_mtm_sat_moves_window; if (total_moves > 0) { nnz_per_move = (double)fj_cpu.nnz_processed_window / total_moves; } @@ -789,9 +807,8 @@ static void apply_move(fj_cpu_climber_t& fj_cpu, fj_cpu.h_incumbent_objective - fj_cpu.settings.parameters.breakthrough_move_epsilon; fj_cpu.h_best_assignment = fj_cpu.h_assignment; fj_cpu.iterations_since_best = 0; - CUOPT_LOG_TRACE("%sCPUFJ: new best objective: %g", - fj_cpu.log_prefix.c_str(), - fj_cpu.pb_ptr->get_user_obj_from_solver_obj(fj_cpu.h_incumbent_objective)); + CUOPT_LOG_TRACE( + "%sCPUFJ: new best objective: %g", fj_cpu.log_prefix.c_str(), fj_cpu.h_incumbent_objective); if (fj_cpu.improvement_callback) { double current_work_units = 
fj_cpu.work_units_elapsed.load(std::memory_order_acquire); fj_cpu.improvement_callback( @@ -826,7 +843,6 @@ static thrust::tuple find_mtm_move( fj_cpu_climber_t& fj_cpu, const std::vector& target_cstrs, bool localmin = false) { CPUFJ_NVTX_RANGE("CPUFJ::find_mtm_move"); - auto& problem = *fj_cpu.pb_ptr; raft::random::PCGenerator rng(fj_cpu.settings.seed + fj_cpu.iterations, 0, 0); @@ -1255,33 +1271,29 @@ static void init_fj_cpu(fj_cpu_climber_t& fj_cpu, fj_cpu.h_tabu_lastinc.resize(fj_cpu.pb_ptr->n_variables, 0); fj_cpu.iterations = 0; - // set pointers to host copies - // technically not 'device_span's but raft doesn't have a universal span. - // cuda::std::span? - fj_cpu.view.cstr_left_weights = - raft::device_span(fj_cpu.h_cstr_left_weights.data(), fj_cpu.h_cstr_left_weights.size()); - fj_cpu.view.cstr_right_weights = - raft::device_span(fj_cpu.h_cstr_right_weights.data(), fj_cpu.h_cstr_right_weights.size()); - fj_cpu.view.objective_weight = &fj_cpu.h_objective_weight; - fj_cpu.view.incumbent_assignment = - raft::device_span(fj_cpu.h_assignment.data(), fj_cpu.h_assignment.size()); - fj_cpu.view.incumbent_lhs = raft::device_span(fj_cpu.h_lhs.data(), fj_cpu.h_lhs.size()); - fj_cpu.view.incumbent_lhs_sumcomp = - raft::device_span(fj_cpu.h_lhs_sumcomp.data(), fj_cpu.h_lhs_sumcomp.size()); - fj_cpu.view.tabu_nodec_until = - raft::device_span(fj_cpu.h_tabu_nodec_until.data(), fj_cpu.h_tabu_nodec_until.size()); - fj_cpu.view.tabu_noinc_until = - raft::device_span(fj_cpu.h_tabu_noinc_until.data(), fj_cpu.h_tabu_noinc_until.size()); - fj_cpu.view.tabu_lastdec = - raft::device_span(fj_cpu.h_tabu_lastdec.data(), fj_cpu.h_tabu_lastdec.size()); - fj_cpu.view.tabu_lastinc = - raft::device_span(fj_cpu.h_tabu_lastinc.data(), fj_cpu.h_tabu_lastinc.size()); - fj_cpu.view.objective_vars = - raft::device_span(fj_cpu.h_objective_vars.data(), fj_cpu.h_objective_vars.size()); - fj_cpu.view.incumbent_objective = &fj_cpu.h_incumbent_objective; - fj_cpu.view.best_objective = 
&fj_cpu.h_best_objective; + finalize_fj_cpu_host_initialization(fj_cpu, + problem.n_variables, + problem.n_constraints, + problem.n_integer_vars, + problem.nnz, + problem.tolerances); +} + +template +static void set_host_data_view( + fj_cpu_climber_t& fj_cpu, + i_t n_variables, + i_t n_constraints, + i_t n_integer_vars, + i_t nnz, + const typename mip_solver_settings_t::tolerances_t& tolerances) +{ + fj_cpu.view.pb.tolerances = tolerances; + fj_cpu.view.pb.n_variables = n_variables; + fj_cpu.view.pb.n_integer_vars = n_integer_vars; + fj_cpu.view.pb.n_constraints = n_constraints; + fj_cpu.view.pb.nnz = nnz; - fj_cpu.view.settings = &fj_cpu.settings; fj_cpu.view.pb.constraint_lower_bounds = raft::device_span(fj_cpu.h_cstr_lb.data(), fj_cpu.h_cstr_lb.size()); fj_cpu.view.pb.constraint_upper_bounds = @@ -1292,6 +1304,8 @@ static void init_fj_cpu(fj_cpu_climber_t& fj_cpu, raft::device_span(fj_cpu.h_var_types.data(), fj_cpu.h_var_types.size()); fj_cpu.view.pb.is_binary_variable = raft::device_span(fj_cpu.h_is_binary_variable.data(), fj_cpu.h_is_binary_variable.size()); + fj_cpu.view.pb.binary_indices = + raft::device_span(fj_cpu.h_binary_indices.data(), fj_cpu.h_binary_indices.size()); fj_cpu.view.pb.coefficients = raft::device_span(fj_cpu.h_coefficients.data(), fj_cpu.h_coefficients.size()); fj_cpu.view.pb.offsets = raft::device_span(fj_cpu.h_offsets.data(), fj_cpu.h_offsets.size()); @@ -1305,13 +1319,61 @@ static void init_fj_cpu(fj_cpu_climber_t& fj_cpu, raft::device_span(fj_cpu.h_reverse_offsets.data(), fj_cpu.h_reverse_offsets.size()); fj_cpu.view.pb.objective_coefficients = raft::device_span(fj_cpu.h_obj_coeffs.data(), fj_cpu.h_obj_coeffs.size()); - fj_cpu.h_objective_vars.resize(problem.n_variables); +} + +template +void finalize_fj_cpu_host_initialization( + fj_cpu_climber_t& fj_cpu, + i_t n_variables, + i_t n_constraints, + i_t n_integer_vars, + i_t nnz, + const typename mip_solver_settings_t::tolerances_t& tolerances) +{ + raft::common::nvtx::range 
scope("finalize_fj_cpu_host_initialization"); + + cuopt_assert(n_variables >= 0, "invalid variable count"); + cuopt_assert(n_constraints >= 0, "invalid constraint count"); + cuopt_assert(fj_cpu.h_offsets.size() == static_cast(n_constraints + 1), + "invalid CSR offsets"); + cuopt_assert(fj_cpu.h_reverse_offsets.size() == static_cast(n_variables + 1), + "invalid reverse offsets"); + cuopt_assert(fj_cpu.h_assignment.size() == static_cast(n_variables), + "seed assignment size mismatch"); + + set_host_data_view(fj_cpu, n_variables, n_constraints, n_integer_vars, nnz, tolerances); + + fj_cpu.view.cstr_left_weights = + raft::device_span(fj_cpu.h_cstr_left_weights.data(), fj_cpu.h_cstr_left_weights.size()); + fj_cpu.view.cstr_right_weights = + raft::device_span(fj_cpu.h_cstr_right_weights.data(), fj_cpu.h_cstr_right_weights.size()); + fj_cpu.view.objective_weight = &fj_cpu.h_objective_weight; + fj_cpu.view.incumbent_assignment = + raft::device_span(fj_cpu.h_assignment.data(), fj_cpu.h_assignment.size()); + fj_cpu.view.incumbent_lhs = raft::device_span(fj_cpu.h_lhs.data(), fj_cpu.h_lhs.size()); + fj_cpu.view.incumbent_lhs_sumcomp = + raft::device_span(fj_cpu.h_lhs_sumcomp.data(), fj_cpu.h_lhs_sumcomp.size()); + fj_cpu.view.tabu_nodec_until = + raft::device_span(fj_cpu.h_tabu_nodec_until.data(), fj_cpu.h_tabu_nodec_until.size()); + fj_cpu.view.tabu_noinc_until = + raft::device_span(fj_cpu.h_tabu_noinc_until.data(), fj_cpu.h_tabu_noinc_until.size()); + fj_cpu.view.tabu_lastdec = + raft::device_span(fj_cpu.h_tabu_lastdec.data(), fj_cpu.h_tabu_lastdec.size()); + fj_cpu.view.tabu_lastinc = + raft::device_span(fj_cpu.h_tabu_lastinc.data(), fj_cpu.h_tabu_lastinc.size()); + fj_cpu.view.incumbent_objective = &fj_cpu.h_incumbent_objective; + fj_cpu.view.best_objective = &fj_cpu.h_best_objective; + fj_cpu.view.settings = &fj_cpu.settings; + + fj_cpu.h_objective_vars.resize(n_variables); auto end = std::copy_if( thrust::counting_iterator(0), - 
thrust::counting_iterator(problem.n_variables), + thrust::counting_iterator(n_variables), fj_cpu.h_objective_vars.begin(), [&fj_cpu](i_t idx) { return !fj_cpu.view.pb.integer_equal(fj_cpu.h_obj_coeffs[idx], (f_t)0); }); fj_cpu.h_objective_vars.resize(end - fj_cpu.h_objective_vars.begin()); + fj_cpu.view.objective_vars = + raft::device_span(fj_cpu.h_objective_vars.data(), fj_cpu.h_objective_vars.size()); fj_cpu.h_best_objective = +std::numeric_limits::infinity(); @@ -1320,7 +1382,7 @@ static void init_fj_cpu(fj_cpu_climber_t& fj_cpu, std::make_pair(0, fj_staged_score_t::zero())); fj_cpu.cached_cstr_bounds.resize(fj_cpu.h_reverse_coefficients.size()); - for (i_t var_idx = 0; var_idx < (i_t)fj_cpu.view.pb.n_variables; ++var_idx) { + for (i_t var_idx = 0; var_idx < n_variables; ++var_idx) { auto [offset_begin, offset_end] = reverse_range_for_var(fj_cpu, var_idx); for (i_t i = offset_begin; i < offset_end; ++i) { fj_cpu.cached_cstr_bounds[i] = @@ -1329,9 +1391,9 @@ static void init_fj_cpu(fj_cpu_climber_t& fj_cpu, } } - fj_cpu.flip_move_computed.resize(fj_cpu.view.pb.n_variables, false); - fj_cpu.var_bitmap.resize(fj_cpu.view.pb.n_variables, false); - fj_cpu.iter_mtm_vars.reserve(fj_cpu.view.pb.n_variables); + fj_cpu.flip_move_computed.resize(n_variables, false); + fj_cpu.var_bitmap.resize(n_variables, false); + fj_cpu.iter_mtm_vars.reserve(n_variables); recompute_lhs(fj_cpu); @@ -1339,6 +1401,119 @@ static void init_fj_cpu(fj_cpu_climber_t& fj_cpu, precompute_problem_features(fj_cpu); } +template +static std::unique_ptr> init_fj_cpu_from_host_lp( + const dual_simplex::lp_problem_t& problem, + const std::vector& variable_types, + const std::vector& seed_assignment, + const dual_simplex::simplex_solver_settings_t& settings, + std::atomic& preemption_flag, + int64_t seed) +{ + using f_t2 = typename type_2::type; + + cuopt_assert(variable_types.size() >= static_cast(problem.num_cols), + "variable type size mismatch"); + + typename mip_solver_settings_t::tolerances_t 
tolerances{}; + tolerances.absolute_tolerance = settings.primal_tol; + tolerances.relative_tolerance = settings.zero_tol; + tolerances.integrality_tolerance = settings.integer_tol; + tolerances.absolute_mip_gap = settings.absolute_mip_gap_tol; + tolerances.relative_mip_gap = settings.relative_mip_gap_tol; + + const i_t n_variables = problem.num_cols; + const i_t n_constraints = problem.num_rows; + + dual_simplex::csr_matrix_t csr_A(problem.num_rows, problem.num_cols, problem.A.nnz()); + problem.A.to_compressed_row(csr_A); + std::vector coefficients = csr_A.x; + std::vector variables = csr_A.j; + std::vector offsets = csr_A.row_start; + std::vector constraint_lower_bounds = problem.rhs; + std::vector constraint_upper_bounds = problem.rhs; + std::vector variable_bounds(n_variables); + std::vector cpufj_variable_types(n_variables); + std::vector is_binary_variable(n_variables, 0); + i_t n_integer_vars = 0; + + for (i_t j = 0; j < n_variables; ++j) { + variable_bounds[j] = f_t2{problem.lower[j], problem.upper[j]}; + const auto var_type = variable_types[j]; + cpufj_variable_types[j] = + var_type == dual_simplex::variable_type_t::CONTINUOUS ? 
var_t::CONTINUOUS : var_t::INTEGER; + + const bool is_integer = cpufj_variable_types[j] == var_t::INTEGER; + const bool is_binary = is_integer && + integer_equal(problem.lower[j], f_t{0}, settings.integer_tol) && + integer_equal(problem.upper[j], f_t{1}, settings.integer_tol); + if (is_integer) { ++n_integer_vars; } + if (is_binary) { is_binary_variable[j] = 1; } + } + + const i_t nnz = static_cast(variables.size()); + dual_simplex::csc_matrix_t reverse_csc(n_constraints, n_variables, nnz); + csr_A.to_compressed_col(reverse_csc); + std::vector reverse_coefficients = std::move(reverse_csc.x); + std::vector reverse_constraints = std::move(reverse_csc.i); + std::vector reverse_offsets = std::move(reverse_csc.col_start); + + std::vector projected_seed(n_variables, f_t{0}); + for (i_t j = 0; j < n_variables; ++j) { + f_t value = j < static_cast(seed_assignment.size()) ? seed_assignment[j] : f_t{0}; + value = std::clamp(value, problem.lower[j], problem.upper[j]); + if (variable_types[j] != dual_simplex::variable_type_t::CONTINUOUS) { + value = std::clamp(std::round(value), problem.lower[j], problem.upper[j]); + } + projected_seed[j] = value; + } + + fj_settings_t fj_settings; + fj_settings.mode = fj_mode_t::EXIT_NON_IMPROVING; + fj_settings.n_of_minimums_for_exit = std::numeric_limits::max(); + fj_settings.time_limit = std::numeric_limits::infinity(); + fj_settings.iteration_limit = std::numeric_limits::max(); + fj_settings.update_weights = true; + fj_settings.feasibility_run = false; + fj_settings.seed = seed >= 0 ? 
seed : cuopt::seed_generator::get_seed(); + + auto fj_cpu = std::make_unique>(preemption_flag); + fj_cpu->view = typename fj_t::climber_data_t::view_t{}; + fj_cpu->pb_ptr = nullptr; + fj_cpu->settings = fj_settings; + + fj_cpu->h_reverse_coefficients = std::move(reverse_coefficients); + fj_cpu->h_reverse_constraints = std::move(reverse_constraints); + fj_cpu->h_reverse_offsets = std::move(reverse_offsets); + fj_cpu->h_coefficients = std::move(coefficients); + fj_cpu->h_offsets = std::move(offsets); + fj_cpu->h_variables = std::move(variables); + fj_cpu->h_obj_coeffs = problem.objective; + fj_cpu->h_var_bounds = std::move(variable_bounds); + fj_cpu->h_cstr_lb = std::move(constraint_lower_bounds); + fj_cpu->h_cstr_ub = std::move(constraint_upper_bounds); + fj_cpu->h_var_types = std::move(cpufj_variable_types); + fj_cpu->h_is_binary_variable = std::move(is_binary_variable); + + fj_cpu->h_cstr_left_weights.resize(n_constraints, 1.0); + fj_cpu->h_cstr_right_weights.resize(n_constraints, 1.0); + fj_cpu->max_weight = 1.0; + fj_cpu->h_objective_weight = 0.0; + fj_cpu->h_assignment = projected_seed; + fj_cpu->h_best_assignment = std::move(projected_seed); + fj_cpu->h_lhs.resize(n_constraints); + fj_cpu->h_lhs_sumcomp.resize(n_constraints, 0); + fj_cpu->h_tabu_nodec_until.resize(n_variables, 0); + fj_cpu->h_tabu_noinc_until.resize(n_variables, 0); + fj_cpu->h_tabu_lastdec.resize(n_variables, 0); + fj_cpu->h_tabu_lastinc.resize(n_variables, 0); + fj_cpu->iterations = 0; + + finalize_fj_cpu_host_initialization( + *fj_cpu, n_variables, n_constraints, n_integer_vars, nnz, tolerances); + return fj_cpu; +} + template static void sanity_checks(fj_cpu_climber_t& fj_cpu) { @@ -1414,45 +1589,45 @@ std::unique_ptr> fj_t::create_cpu_climber( } template -static bool cpufj_solve_loop(fj_cpu_climber_t& fj_cpu, f_t in_time_limit) +void cpufj_solve(fj_cpu_climber_t* fj_cpu, f_t in_time_limit, double work_unit_limit) { - i_t local_mins = 0; - auto loop_start = 
std::chrono::high_resolution_clock::now(); - auto time_limit = std::chrono::milliseconds((int)(in_time_limit * 1000)); + i_t local_mins = 0; + auto loop_start = std::chrono::high_resolution_clock::now(); + auto time_limit = std::chrono::milliseconds(static_cast(std::floor(in_time_limit * 1000.0))); auto loop_time_start = std::chrono::high_resolution_clock::now(); // Initialize feature tracking - fj_cpu.last_feature_log_time = loop_start; - fj_cpu.prev_best_objective = fj_cpu.h_best_objective; - fj_cpu.iterations_since_best = 0; + fj_cpu->last_feature_log_time = loop_start; + fj_cpu->prev_best_objective = fj_cpu->h_best_objective; + fj_cpu->iterations_since_best = 0; - while (!fj_cpu.halted && !fj_cpu.preemption_flag.load()) { + while (!fj_cpu->halted && !fj_cpu->preemption_flag.load()) { // Check if 5 seconds have passed auto now = std::chrono::high_resolution_clock::now(); if (in_time_limit < std::numeric_limits::infinity() && now - loop_time_start > time_limit) { CUOPT_LOG_TRACE("%sTime limit of %.4f seconds reached, breaking loop at iteration %d", - fj_cpu.log_prefix.c_str(), + fj_cpu->log_prefix.c_str(), time_limit.count() / 1000.f, - fj_cpu.iterations); + fj_cpu->iterations); break; } - if (fj_cpu.iterations >= fj_cpu.settings.iteration_limit) { + if (fj_cpu->iterations >= fj_cpu->settings.iteration_limit) { CUOPT_LOG_TRACE("%sIteration limit of %d reached, breaking loop at iteration %d", - fj_cpu.log_prefix.c_str(), - fj_cpu.settings.iteration_limit, - fj_cpu.iterations); + fj_cpu->log_prefix.c_str(), + fj_cpu->settings.iteration_limit, + fj_cpu->iterations); break; } // periodically recompute the LHS and violation scores // to correct any accumulated numerical errors - cuopt_assert(fj_cpu.settings.parameters.lhs_refresh_period > 0, + cuopt_assert(fj_cpu->settings.parameters.lhs_refresh_period > 0, "lhs_refresh_period should be positive"); - if (fj_cpu.iterations % fj_cpu.settings.parameters.lhs_refresh_period == 0 || - fj_cpu.trigger_early_lhs_recomputation) 
{ - recompute_lhs(fj_cpu); - fj_cpu.trigger_early_lhs_recomputation = false; + if (fj_cpu->iterations % fj_cpu->settings.parameters.lhs_refresh_period == 0 || + fj_cpu->trigger_early_lhs_recomputation) { + recompute_lhs(*fj_cpu); + fj_cpu->trigger_early_lhs_recomputation = false; } fj_move_t move = fj_move_t{-1, 0}; @@ -1462,192 +1637,247 @@ static bool cpufj_solve_loop(fj_cpu_climber_t& fj_cpu, f_t in_time_lim bool is_mtm_sat = false; // Perform lift moves - if (fj_cpu.violated_constraints.empty()) { - thrust::tie(move, score) = find_lift_move(fj_cpu); + if (fj_cpu->violated_constraints.empty()) { + thrust::tie(move, score) = find_lift_move(*fj_cpu); if (score > fj_staged_score_t::zero()) is_lift = true; } // Regular MTM if (!(score > fj_staged_score_t::zero())) { - thrust::tie(move, score) = find_mtm_move_viol(fj_cpu, fj_cpu.mtm_viol_samples); + thrust::tie(move, score) = find_mtm_move_viol(*fj_cpu, fj_cpu->mtm_viol_samples); if (score > fj_staged_score_t::zero()) is_mtm_viol = true; } // try with MTM in satisfied constraints - if (fj_cpu.feasible_found && !(score > fj_staged_score_t::zero())) { - thrust::tie(move, score) = find_mtm_move_sat(fj_cpu, fj_cpu.mtm_sat_samples); + if (fj_cpu->feasible_found && !(score > fj_staged_score_t::zero())) { + thrust::tie(move, score) = find_mtm_move_sat(*fj_cpu, fj_cpu->mtm_sat_samples); if (score > fj_staged_score_t::zero()) is_mtm_sat = true; } // if we're in the feasible region but haven't found improvements in the last n iterations, // perturb bool should_perturb = false; - if (fj_cpu.violated_constraints.empty() && - fj_cpu.iterations - fj_cpu.last_feasible_entrance_iter > fj_cpu.perturb_interval) { - should_perturb = true; - fj_cpu.last_feasible_entrance_iter = fj_cpu.iterations; + if (fj_cpu->violated_constraints.empty() && + fj_cpu->iterations - fj_cpu->last_feasible_entrance_iter > fj_cpu->perturb_interval) { + should_perturb = true; + fj_cpu->last_feasible_entrance_iter = fj_cpu->iterations; } if (score > 
fj_staged_score_t::zero() && !should_perturb) { - apply_move(fj_cpu, move.var_idx, move.value, false); + apply_move(*fj_cpu, move.var_idx, move.value, false); // Track move types - if (is_lift) fj_cpu.n_lift_moves_window++; - if (is_mtm_viol) fj_cpu.n_mtm_viol_moves_window++; - if (is_mtm_sat) fj_cpu.n_mtm_sat_moves_window++; + if (is_lift) fj_cpu->n_lift_moves_window++; + if (is_mtm_viol) fj_cpu->n_mtm_viol_moves_window++; + if (is_mtm_sat) fj_cpu->n_mtm_sat_moves_window++; } else { // Local Min - update_weights(fj_cpu); + update_weights(*fj_cpu); if (should_perturb) { - perturb(fj_cpu); - for (size_t i = 0; i < fj_cpu.cached_mtm_moves.size(); i++) - fj_cpu.cached_mtm_moves[i].first = 0; + perturb(*fj_cpu); + for (size_t i = 0; i < fj_cpu->cached_mtm_moves.size(); i++) + fj_cpu->cached_mtm_moves[i].first = 0; } thrust::tie(move, score) = - find_mtm_move_viol(fj_cpu, 1, true); // pick a single random violated constraint + find_mtm_move_viol(*fj_cpu, 1, true); // pick a single random violated constraint i_t var_idx = move.var_idx >= 0 ? move.var_idx : 0; f_t delta = move.var_idx >= 0 ? move.value : 0; - apply_move(fj_cpu, var_idx, delta, true); + apply_move(*fj_cpu, var_idx, delta, true); ++local_mins; - ++fj_cpu.n_local_minima_window; + ++fj_cpu->n_local_minima_window; } // number of violated constraints is usually small (<100). 
recomputing from all LHSs is cheap // and more numerically precise than just adding to the accumulator in apply_move - fj_cpu.total_violations = 0; - for (auto cstr_idx : fj_cpu.violated_constraints) { - fj_cpu.total_violations += fj_cpu.view.excess_score(cstr_idx, fj_cpu.h_lhs[cstr_idx]); + fj_cpu->total_violations = 0; + for (auto cstr_idx : fj_cpu->violated_constraints) { + fj_cpu->total_violations += fj_cpu->view.excess_score(cstr_idx, fj_cpu->h_lhs[cstr_idx]); } - if (fj_cpu.iterations % fj_cpu.log_interval == 0) { - CUOPT_LOG_TRACE( + if (fj_cpu->iterations % fj_cpu->log_interval == 0) { + CUOPT_LOG_DEBUG( "%sCPUFJ iteration: %d/%d, local mins: %d, best_objective: %g, viol: %zu, obj weight %g, " "maxw %g", - fj_cpu.log_prefix.c_str(), - fj_cpu.iterations, - fj_cpu.settings.iteration_limit != std::numeric_limits::max() - ? fj_cpu.settings.iteration_limit + fj_cpu->log_prefix.c_str(), + fj_cpu->iterations, + fj_cpu->settings.iteration_limit != std::numeric_limits::max() + ? fj_cpu->settings.iteration_limit : -1, local_mins, - fj_cpu.pb_ptr->get_user_obj_from_solver_obj(fj_cpu.h_best_objective), - fj_cpu.violated_constraints.size(), - fj_cpu.h_objective_weight, - fj_cpu.max_weight); + fj_cpu->h_best_objective, + fj_cpu->violated_constraints.size(), + fj_cpu->h_objective_weight, + fj_cpu->max_weight); } // send current solution to callback every 3000 steps for diversity - if (fj_cpu.iterations % fj_cpu.diversity_callback_interval == 0) { - if (fj_cpu.diversity_callback) { - fj_cpu.diversity_callback(fj_cpu.h_incumbent_objective, fj_cpu.h_assignment); + if (fj_cpu->iterations % fj_cpu->diversity_callback_interval == 0) { + if (fj_cpu->diversity_callback) { + fj_cpu->diversity_callback(fj_cpu->h_incumbent_objective, fj_cpu->h_assignment); } } // Print timing statistics every N iterations #if CPUFJ_TIMING_TRACE - if (fj_cpu.iterations % fj_cpu.timing_stats_interval == 0 && fj_cpu.iterations > 0) { - print_timing_stats(fj_cpu); + if (fj_cpu->iterations % 
fj_cpu->timing_stats_interval == 0 && fj_cpu->iterations > 0) { + print_timing_stats(*fj_cpu); } #endif - if (fj_cpu.iterations % 100 == 0 && fj_cpu.iterations > 0) { - // Collect memory statistics - auto [loads, stores] = fj_cpu.memory_aggregator.collect(); - double biased_work = (loads + stores) * fj_cpu.work_unit_bias / 1e10; - fj_cpu.work_units_elapsed += biased_work; - - if (fj_cpu.producer_sync != nullptr) { fj_cpu.producer_sync->notify_progress(); } + if (fj_cpu->iterations % 100 == 0 && fj_cpu->iterations > 0) { + // Use cumulative byte counts (collect() without flush). Each window's contribution to + // work_units_elapsed therefore grows roughly with the running total of bytes touched, + // i.e. quadratically in iterations rather than linearly. This is intentional: the + // memory_aggregator is calibrated for medium/large MIPs, and a strictly-linear scheme + // forces tiny instances (few KB per iteration) to run for tens of seconds before the + // accumulated bytes cross a 0.5 horizon, causing the deterministic producer_sync to + // stall and B&B to time out on instances that should solve in milliseconds. The + // accumulation is still deterministic across runs of the same problem, which is what + // the producer_sync contract actually requires. 
+ auto [loads, stores] = fj_cpu->memory_aggregator.collect(); + double biased_work = (loads + stores) * fj_cpu->work_unit_bias / 1e10; + fj_cpu->work_units_elapsed += biased_work; + + if (fj_cpu->producer_sync != nullptr) { fj_cpu->producer_sync->notify_progress(); } + if (fj_cpu->work_units_elapsed >= work_unit_limit) { break; } } - cuopt_func_call(sanity_checks(fj_cpu)); - fj_cpu.iterations++; - fj_cpu.iterations_since_best++; + cuopt_func_call(sanity_checks(*fj_cpu)); + fj_cpu->iterations++; + fj_cpu->iterations_since_best++; } auto loop_end = std::chrono::high_resolution_clock::now(); double total_time = std::chrono::duration_cast>(loop_end - loop_start).count(); - double avg_time_per_iter = total_time / fj_cpu.iterations; + double avg_time_per_iter = fj_cpu->iterations > 0 ? total_time / fj_cpu->iterations : 0; CUOPT_LOG_TRACE("%sCPUFJ Average time per iteration: %.8fms", - fj_cpu.log_prefix.c_str(), + fj_cpu->log_prefix.c_str(), avg_time_per_iter * 1000.0); #if CPUFJ_TIMING_TRACE // Print final timing statistics CUOPT_LOG_TRACE("=== Final Timing Statistics ==="); - print_timing_stats(fj_cpu); + print_timing_stats(*fj_cpu); #endif - - return fj_cpu.feasible_found; } template -bool fj_t::cpu_solve(fj_cpu_climber_t& fj_cpu, f_t in_time_limit) +std::unique_ptr> init_fj_cpu_standalone( + problem_t& problem, + solution_t& solution, + std::atomic& preemption_flag, + fj_settings_t settings) { - raft::common::nvtx::range scope("fj_cpu"); - return cpufj_solve_loop(fj_cpu, in_time_limit); -} + raft::common::nvtx::range scope("init_fj_cpu_standalone"); -template -cpu_fj_thread_t::~cpu_fj_thread_t() -{ - this->request_termination(); -} + auto fj_cpu = std::make_unique>(preemption_flag); -template -void cpu_fj_thread_t::run_worker() -{ - cpu_fj_solution_found = cpufj_solve_loop(*fj_cpu, time_limit); + std::vector default_weights(problem.n_constraints, 1.0); + init_fj_cpu(*fj_cpu, solution, default_weights, default_weights, 0.0); + fj_cpu->settings = settings; + 
fj_cpu->settings.seed = cuopt::seed_generator::get_seed(); + + return fj_cpu; } template -void cpu_fj_thread_t::on_terminate() +void fj_cpu_task_t::fj_cpu_deleter_t::operator()(fj_cpu_climber_t* ptr) const { - if (fj_cpu) fj_cpu->halted = true; + delete ptr; } template -void cpu_fj_thread_t::on_start() +std::unique_ptr> make_fj_cpu_task_from_host_lp( + const dual_simplex::lp_problem_t& problem, + const std::vector& variable_types, + const std::vector& seed_assignment, + const dual_simplex::simplex_solver_settings_t& settings, + std::function&, double)> improvement_callback, + std::string log_prefix, + int64_t seed) { - cuopt_assert(fj_cpu != nullptr, "fj_cpu must not be null"); - fj_cpu->halted = false; + auto task = std::make_unique>(); + auto fj_cpu = init_fj_cpu_from_host_lp( + problem, variable_types, seed_assignment, settings, task->preemption_flag, seed); + fj_cpu->log_prefix = std::move(log_prefix); + fj_cpu->improvement_callback = std::move(improvement_callback); + task->fj_cpu.reset(fj_cpu.release()); + return task; } template -void cpu_fj_thread_t::stop_cpu_solver() +void run_fj_cpu_task(fj_cpu_task_t& task, f_t time_limit, double work_unit_limit) { - fj_cpu->halted = true; + cuopt_assert(task.fj_cpu != nullptr, "CPUFJ task has no climber"); + cpufj_solve(task.fj_cpu.get(), time_limit, work_unit_limit); } template -std::unique_ptr> init_fj_cpu_standalone( - problem_t& problem, - solution_t& solution, - std::atomic& preemption_flag, - fj_settings_t settings) +void stop_fj_cpu_task(fj_cpu_task_t& task) { - raft::common::nvtx::range scope("init_fj_cpu_standalone"); - - auto fj_cpu = std::make_unique>(preemption_flag); - - std::vector default_weights(problem.n_constraints, 1.0); - init_fj_cpu(*fj_cpu, solution, default_weights, default_weights, 0.0); - fj_cpu->settings = settings; - fj_cpu->settings.seed = cuopt::seed_generator::get_seed(); - - return fj_cpu; + if (task.fj_cpu) { + auto& fj_cpu = *task.fj_cpu; + fj_cpu.preemption_flag = true; + fj_cpu.halted 
= true; + } } #if MIP_INSTANTIATE_FLOAT template class fj_t; -template class cpu_fj_thread_t; +template struct fj_cpu_task_t; +template void cpufj_solve(fj_cpu_climber_t* fj_cpu, + float in_time_limit, + double work_unit_limit); template std::unique_ptr> init_fj_cpu_standalone( problem_t& problem, solution_t& solution, std::atomic& preemption_flag, fj_settings_t settings); +template std::unique_ptr> make_fj_cpu_task_from_host_lp( + const dual_simplex::lp_problem_t& problem, + const std::vector& variable_types, + const std::vector& seed_assignment, + const dual_simplex::simplex_solver_settings_t& settings, + std::function&, double)> improvement_callback, + std::string log_prefix, + int64_t seed); +template void run_fj_cpu_task(fj_cpu_task_t& task, + float time_limit, + double work_unit_limit); +template void stop_fj_cpu_task(fj_cpu_task_t& task); +template void finalize_fj_cpu_host_initialization( + fj_cpu_climber_t& fj_cpu, + int n_variables, + int n_constraints, + int n_integer_vars, + int nnz, + const typename mip_solver_settings_t::tolerances_t& tolerances); #endif #if MIP_INSTANTIATE_DOUBLE template class fj_t; -template class cpu_fj_thread_t; +template struct fj_cpu_task_t; +template void cpufj_solve(fj_cpu_climber_t* fj_cpu, + double in_time_limit, + double work_unit_limit); template std::unique_ptr> init_fj_cpu_standalone( problem_t& problem, solution_t& solution, std::atomic& preemption_flag, fj_settings_t settings); +template std::unique_ptr> make_fj_cpu_task_from_host_lp( + const dual_simplex::lp_problem_t& problem, + const std::vector& variable_types, + const std::vector& seed_assignment, + const dual_simplex::simplex_solver_settings_t& settings, + std::function&, double)> improvement_callback, + std::string log_prefix, + int64_t seed); +template void run_fj_cpu_task(fj_cpu_task_t& task, + double time_limit, + double work_unit_limit); +template void stop_fj_cpu_task(fj_cpu_task_t& task); +template void finalize_fj_cpu_host_initialization( + 
fj_cpu_climber_t& fj_cpu, + int n_variables, + int n_constraints, + int n_integer_vars, + int nnz, + const typename mip_solver_settings_t::tolerances_t& tolerances); #endif } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh index 3263609a2b..cdf3a2f58a 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh +++ b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh @@ -8,16 +8,13 @@ #pragma once #include -#include #include #include -#include -#include #include #include +#include #include -#include #include #include @@ -126,7 +123,7 @@ struct fj_cpu_climber_t { // vector is actually likely beneficial here since we're memory bound std::vector flip_move_computed; - ; + // CSR nnz offset -> (delta, score) std::vector> cached_mtm_moves; @@ -194,21 +191,9 @@ struct fj_cpu_climber_t { }; template -struct cpu_fj_thread_t : public cpu_worker_thread_base_t> { - ~cpu_fj_thread_t(); - - void run_worker(); - void on_terminate(); - void on_start(); - bool get_result() { return cpu_fj_solution_found; } - - void stop_cpu_solver(); - - std::atomic cpu_fj_solution_found{false}; - f_t time_limit{+std::numeric_limits::infinity()}; - std::unique_ptr> fj_cpu; - fj_t* fj_ptr{nullptr}; -}; +void cpufj_solve(fj_cpu_climber_t* fj_cpu, + f_t in_time_limit = std::numeric_limits::infinity(), + double work_unit_limit = std::numeric_limits::infinity()); // Standalone CPUFJ init for running without full fj_t infrastructure (avoids GPU allocations). // Used for early CPUFJ during presolve. 
diff --git a/cpp/src/mip_heuristics/local_search/local_search.cu b/cpp/src/mip_heuristics/local_search/local_search.cu index da29511d70..4a13425437 100644 --- a/cpp/src/mip_heuristics/local_search/local_search.cu +++ b/cpp/src/mip_heuristics/local_search/local_search.cu @@ -20,10 +20,6 @@ #include -#include - -#include - namespace cuopt::linear_programming::detail { template @@ -47,25 +43,18 @@ local_search_t::local_search_t(mip_solver_context_t& context problem_with_objective_cut(*context.problem_ptr, context.problem_ptr->handle_ptr) { const int n_cpufj = context.settings.heuristic_params.num_cpufj_threads; - for (int i = 0; i < n_cpufj; ++i) { - ls_cpu_fj.push_back(std::make_unique>()); - ls_cpu_fj.back()->fj_ptr = &fj; - } - scratch_cpu_fj.push_back(std::make_unique>()); - scratch_cpu_fj.back()->fj_ptr = &fj; - scratch_cpu_fj_on_lp_opt.fj_ptr = &fj; - + ls_cpu_fj.resize(n_cpufj); + scratch_cpu_fj.resize(1); fj.settings.n_of_minimums_for_exit = context.settings.heuristic_params.n_of_minimums_for_exit; } -static double local_search_best_obj = std::numeric_limits::max(); -static population_t* pop_ptr = nullptr; - template void local_search_t::start_cpufj_scratch_threads(population_t& population) { - pop_ptr = &population; + // TODO: Find a way to enable this in low core count scenarios + if (omp_get_num_threads() < CUOPT_MIP_FJ_REQUIRED_THREAD_COUNT) return; + pop_ptr = &population; std::vector default_weights(context.problem_ptr->n_constraints, 1.); solution_t solution(*context.problem_ptr); @@ -75,37 +64,40 @@ void local_search_t::start_cpufj_scratch_threads(population_t 0) solution.assign_random_within_bounds(0.4); - cpu_fj.fj_cpu = cpu_fj.fj_ptr->create_cpu_climber(solution, - default_weights, - default_weights, - 0., - context.preempt_heuristic_solver_, - fj_settings_t{}, - /*randomize=*/counter > 0); - - cpu_fj.fj_cpu->log_prefix = "******* scratch " + std::to_string(counter) + ": "; - cpu_fj.fj_cpu->improvement_callback = - [&population, problem_ptr = 
context.problem_ptr]( + cpu_fj = fj.create_cpu_climber(solution, + default_weights, + default_weights, + 0., + context.preempt_heuristic_solver_, + fj_settings_t{}, + /*randomize=*/counter > 0); + + cpu_fj->log_prefix = "******* scratch " + std::to_string(counter) + ": "; + cpu_fj->improvement_callback = + [this, &population, problem_ptr = context.problem_ptr]( f_t obj, const std::vector& h_vec, double /*work_units*/) { population.add_external_solution(h_vec, obj, solution_origin_t::CPUFJ); (void)problem_ptr; - if (obj < local_search_best_obj) { + if (obj < this->local_search_best_obj) { CUOPT_LOG_TRACE("******* New local search best obj %g, best overall %g", problem_ptr->get_user_obj_from_solver_obj(obj), problem_ptr->get_user_obj_from_solver_obj( population.is_feasible() ? population.best_feasible().get_objective() : std::numeric_limits::max())); - local_search_best_obj = obj; + this->local_search_best_obj = obj; } }; counter++; }; - for (auto& cpu_fj_ptr : scratch_cpu_fj) { - cpu_fj_ptr->start_cpu_solver(); + CUOPT_LOG_DEBUG("Launching %d scratch CPUFJ tasks", scratch_cpu_fj.size()); + + for (size_t i = 0; i < scratch_cpu_fj.size(); ++i) { + auto ptr = scratch_cpu_fj[i].get(); +#pragma omp task firstprivate(ptr) depend(out : *ptr) default(none) + cpufj_solve(ptr); } } @@ -113,6 +105,9 @@ template void local_search_t::start_cpufj_lptopt_scratch_threads( population_t& population) { + // TODO: Find a way to enable this in low core count scenarios + if (omp_get_num_threads() < CUOPT_MIP_FJ_REQUIRED_THREAD_COUNT) return; + pop_ptr = &population; std::vector default_weights(context.problem_ptr->n_constraints, 1.); @@ -121,40 +116,59 @@ void local_search_t::start_cpufj_lptopt_scratch_threads( solution_lp.copy_new_assignment( host_copy(lp_optimal_solution, context.problem_ptr->handle_ptr->get_stream())); solution_lp.round_random_nearest(500); - scratch_cpu_fj_on_lp_opt.fj_cpu = fj.create_cpu_climber( + scratch_cpu_fj_on_lp_opt = fj.create_cpu_climber( solution_lp, 
default_weights, default_weights, 0., context.preempt_heuristic_solver_); - scratch_cpu_fj_on_lp_opt.fj_cpu->log_prefix = "******* scratch on LP optimal: "; - scratch_cpu_fj_on_lp_opt.fj_cpu->improvement_callback = + scratch_cpu_fj_on_lp_opt->log_prefix = "******* scratch on LP optimal: "; + scratch_cpu_fj_on_lp_opt->improvement_callback = [this, &population](f_t obj, const std::vector& h_vec, double /*work_units*/) { population.add_external_solution(h_vec, obj, solution_origin_t::CPUFJ); - if (obj < local_search_best_obj) { + if (obj < this->local_search_best_obj) { CUOPT_LOG_DEBUG("******* New local search best obj %g, best overall %g", context.problem_ptr->get_user_obj_from_solver_obj(obj), context.problem_ptr->get_user_obj_from_solver_obj( population.is_feasible() ? population.best_feasible().get_objective() : std::numeric_limits::max())); - local_search_best_obj = obj; + this->local_search_best_obj = obj; } }; - // default weights - cudaDeviceSynchronize(); - scratch_cpu_fj_on_lp_opt.start_cpu_solver(); + CUOPT_LOG_DEBUG("Launching scratch CPUFJ (on LP optimal) task"); + +#pragma omp task shared(scratch_cpu_fj_on_lp_opt) default(none) \ + depend(out : *scratch_cpu_fj_on_lp_opt) + cpufj_solve(scratch_cpu_fj_on_lp_opt.get()); } template void local_search_t::stop_cpufj_scratch_threads() { - for (auto& cpu_fj_ptr : scratch_cpu_fj) { - cpu_fj_ptr->request_termination(); + if (omp_get_num_threads() < CUOPT_MIP_FJ_REQUIRED_THREAD_COUNT) return; + + for (size_t i = 0; i < scratch_cpu_fj.size(); ++i) { + scratch_cpu_fj[i]->halted = true; +#pragma omp taskwait depend(in : *scratch_cpu_fj[i]) // Wait for each scratch CPU FJ task to finish + } + + if (scratch_cpu_fj_on_lp_opt) { + scratch_cpu_fj_on_lp_opt->halted = true; +#pragma omp taskwait depend( \ + in : *scratch_cpu_fj_on_lp_opt) // Wait for the scratch CPU FJ (LP optimal) task to finish + + CUOPT_LOG_DEBUG("All scratch CPUFJ tasks were stopped"); } - scratch_cpu_fj_on_lp_opt.request_termination(); } template void 
local_search_t::start_cpufj_deterministic( dual_simplex::branch_and_bound_t& bb) { + producer_sync_t& producer_sync = bb.get_producer_sync(); + + if (omp_get_num_threads() < CUOPT_MIP_FJ_REQUIRED_THREAD_COUNT) { + producer_sync.registration_complete(); + return; + } + std::vector default_weights(context.problem_ptr->n_constraints, 1.); solution_t solution(*context.problem_ptr); @@ -164,29 +178,29 @@ void local_search_t::start_cpufj_deterministic( 0.0); solution.clamp_within_bounds(); - deterministic_cpu_fj.fj_ptr = &fj; - deterministic_cpu_fj.fj_cpu = fj.create_cpu_climber(solution, - default_weights, - default_weights, - 0., - context.preempt_heuristic_solver_, - fj_settings_t{}, - /*randomize=*/true); + deterministic_cpu_fj = fj.create_cpu_climber(solution, + default_weights, + default_weights, + 0., + context.preempt_heuristic_solver_, + fj_settings_t{}, + /*randomize=*/true); - deterministic_cpu_fj.fj_cpu->log_prefix = "******* deterministic CPUFJ: "; + deterministic_cpu_fj->log_prefix = "******* deterministic CPUFJ: "; // Register with producer_sync for B&B synchronization - producer_sync_t& producer_sync = bb.get_producer_sync(); - deterministic_cpu_fj.fj_cpu->producer_sync = &producer_sync; - producer_sync.register_producer(&deterministic_cpu_fj.fj_cpu->work_units_elapsed); + deterministic_cpu_fj->producer_sync = &producer_sync; + producer_sync.register_producer(&deterministic_cpu_fj->work_units_elapsed); // Set up callback to send solutions to B&B with work unit timestamps - deterministic_cpu_fj.fj_cpu->improvement_callback = + deterministic_cpu_fj->improvement_callback = [&bb](f_t obj, const std::vector& h_vec, double work_units) { bb.queue_external_solution_deterministic(h_vec, work_units); }; - deterministic_cpu_fj.start_cpu_solver(); + CUOPT_LOG_DEBUG("Launching deterministic CPUFJ task"); +#pragma omp task shared(deterministic_cpu_fj) default(none) depend(inout : *deterministic_cpu_fj) + cpufj_solve(deterministic_cpu_fj.get()); // Signal that 
registration is complete - B&B can now wait on producers producer_sync.registration_complete(); @@ -195,12 +209,16 @@ void local_search_t::start_cpufj_deterministic( template void local_search_t::stop_cpufj_deterministic() { - if (deterministic_cpu_fj.fj_cpu) { - if (deterministic_cpu_fj.fj_cpu->producer_sync) { - deterministic_cpu_fj.fj_cpu->producer_sync->deregister_producer( - &deterministic_cpu_fj.fj_cpu->work_units_elapsed); + if (deterministic_cpu_fj) { + if (deterministic_cpu_fj->producer_sync) { + deterministic_cpu_fj->producer_sync->deregister_producer( + &deterministic_cpu_fj->work_units_elapsed); } - deterministic_cpu_fj.request_termination(); + + deterministic_cpu_fj->halted = true; +#pragma omp taskwait depend( \ + in : *deterministic_cpu_fj) // Wait for deterministic CPU FJ task to finish + CUOPT_LOG_DEBUG("Deterministic CPUFJ task was stopped"); } } @@ -233,48 +251,51 @@ bool local_search_t::do_fj_solve(solution_t& solution, } auto h_weights = cuopt::host_copy(in_fj.cstr_weights, solution.handle_ptr->get_stream()); auto h_objective_weight = in_fj.objective_weight.value(solution.handle_ptr->get_stream()); - for (auto& cpu_fj_ptr : ls_cpu_fj) { - auto& cpu_fj = *cpu_fj_ptr; - cpu_fj.fj_cpu = cpu_fj.fj_ptr->create_cpu_climber(solution, - h_weights, - h_weights, - h_objective_weight, - context.preempt_heuristic_solver_, - fj_settings_t{}, - true); + for (auto& cpu_fj : ls_cpu_fj) { + cpu_fj = fj.create_cpu_climber(solution, + h_weights, + h_weights, + h_objective_weight, + context.preempt_heuristic_solver_, + fj_settings_t{}, + true); } auto solution_copy = solution; // Start CPU solver in background thread - for (auto& cpu_fj_ptr : ls_cpu_fj) { - cpu_fj_ptr->start_cpu_solver(); - } +#pragma omp taskgroup + { + if (ls_cpu_fj.size() > 0 && omp_get_num_threads() > CUOPT_MIP_FJ_REQUIRED_THREAD_COUNT) { + size_t n = std::min(omp_get_num_threads() - 1, ls_cpu_fj.size()); + CUOPT_LOG_DEBUG("Launching %d CPUFJ tasks", n); + +#pragma omp taskloop 
shared(ls_cpu_fj) default(none) num_tasks(n) nogroup + for (size_t i = 0; i < n; ++i) { + cpufj_solve(ls_cpu_fj[i].get()); + } + } - // Run GPU solver and measure execution time - auto gpu_fj_start = std::chrono::high_resolution_clock::now(); - in_fj.settings.time_limit = timer.remaining_time(); - in_fj.solve(solution); + // Run GPU solver + in_fj.settings.time_limit = timer.remaining_time(); + in_fj.solve(solution); - // Stop CPU solver - for (auto& cpu_fj_ptr : ls_cpu_fj) { - cpu_fj_ptr->stop_cpu_solver(); - } + for (size_t i = 0; i < ls_cpu_fj.size(); ++i) { + ls_cpu_fj[i]->halted = true; + } + } // implicit barrier that waits all CPU FJ tasks to finish - auto gpu_fj_end = std::chrono::high_resolution_clock::now(); - double gpu_fj_duration = std::chrono::duration(gpu_fj_end - gpu_fj_start).count(); + CUOPT_LOG_DEBUG("All CPUFJ tasks were stopped"); solution_t solution_cpu(*solution.problem_ptr); - f_t best_cpu_obj = std::numeric_limits::max(); - // // Wait for CPU solver to finish - for (auto& cpu_fj_ptr : ls_cpu_fj) { - bool cpu_sol_found = cpu_fj_ptr->wait_for_cpu_solver(); - if (cpu_sol_found) { - f_t cpu_obj = cpu_fj_ptr->fj_cpu->h_best_objective; + + for (size_t i = 0; i < ls_cpu_fj.size(); ++i) { + if (ls_cpu_fj[i]->feasible_found) { + f_t cpu_obj = ls_cpu_fj[i]->h_best_objective; if (cpu_obj < best_cpu_obj) { best_cpu_obj = cpu_obj; - solution_cpu.copy_new_assignment(cpu_fj_ptr->fj_cpu->h_best_assignment); + solution_cpu.copy_new_assignment(ls_cpu_fj[i]->h_best_assignment); solution_cpu.compute_feasibility(); } } diff --git a/cpp/src/mip_heuristics/local_search/local_search.cuh b/cpp/src/mip_heuristics/local_search/local_search.cuh index 94493ebcb3..9befd34ab5 100644 --- a/cpp/src/mip_heuristics/local_search/local_search.cuh +++ b/cpp/src/mip_heuristics/local_search/local_search.cuh @@ -11,16 +11,10 @@ #include #include #include -#include #include +#include #include -#include -#include -#include -#include -#include - namespace 
cuopt::linear_programming::dual_simplex { template class branch_and_bound_t; @@ -126,12 +120,15 @@ class local_search_t { feasibility_pump_t fp; std::mt19937 rng; - std::vector>> ls_cpu_fj; - std::vector>> scratch_cpu_fj; - cpu_fj_thread_t scratch_cpu_fj_on_lp_opt; - cpu_fj_thread_t deterministic_cpu_fj; + std::vector>> ls_cpu_fj; + std::vector>> scratch_cpu_fj; + std::unique_ptr> scratch_cpu_fj_on_lp_opt; + std::unique_ptr> deterministic_cpu_fj; problem_t problem_with_objective_cut; bool cutting_plane_added_for_active_run{false}; + + omp_atomic_t local_search_best_obj{std::numeric_limits::max()}; + population_t* pop_ptr{nullptr}; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cu b/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cu index f3233cc8f4..6512ad05da 100644 --- a/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cu +++ b/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cu @@ -8,8 +8,10 @@ #include "bounds_repair.cuh" #include +#include #include #include +#include #include #include #include diff --git a/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cuh b/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cuh index 29161c5d25..e4f1b4a866 100644 --- a/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cuh +++ b/cpp/src/mip_heuristics/local_search/rounding/bounds_repair.cuh @@ -13,6 +13,9 @@ #include #include +#include +#include + namespace cuopt::linear_programming::detail { // from the paper, probability of choosing random candidate= noise parameter diff --git a/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cu b/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cu index 8db4d7ae85..51c103c74f 100644 --- a/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cu +++ b/cpp/src/mip_heuristics/local_search/rounding/constraint_prop.cu @@ -16,8 +16,10 @@ #include #include #include +#include 
#include #include +#include namespace cuopt::linear_programming::detail { diff --git a/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cu b/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cu index 7d074aea5e..10973f1565 100644 --- a/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cu +++ b/cpp/src/mip_heuristics/local_search/rounding/lb_bounds_repair.cu @@ -8,8 +8,10 @@ #include "lb_bounds_repair.cuh" #include +#include #include #include +#include #include #include #include diff --git a/cpp/src/mip_heuristics/mip_constants.hpp b/cpp/src/mip_heuristics/mip_constants.hpp index 47d3d22de4..34a4b07b23 100644 --- a/cpp/src/mip_heuristics/mip_constants.hpp +++ b/cpp/src/mip_heuristics/mip_constants.hpp @@ -13,3 +13,11 @@ #define MIP_INSTANTIATE_DOUBLE CUOPT_INSTANTIATE_DOUBLE #define PDLP_INSTANTIATE_FLOAT 1 + +/* @brief Minimimum number of threads to enable each part of the MIP Solver */ +#define CUOPT_MIP_FJ_REQUIRED_THREAD_COUNT 8 +#define CUOPT_MIP_EARLY_GPUFJ_REQUIRED_THREAD_COUNT 3 +#define CUOPT_MIP_EARLY_CPUFJ_REQUIRED_THREAD_COUNT 2 +#define CUOPT_MIP_RINS_REQUIRED_THREAD_COUNT 4 +#define CUOPT_MIP_BATCH_PDLP_REQUIRED_THREAD_COUNT 3 +#define CUOPT_MIP_CLIQUE_CUTS_REQUIRED_THREAD_COUNT 3 diff --git a/cpp/src/mip_heuristics/presolve/bounds_presolve.cu b/cpp/src/mip_heuristics/presolve/bounds_presolve.cu index d78f8beb16..0a7c9de41a 100644 --- a/cpp/src/mip_heuristics/presolve/bounds_presolve.cu +++ b/cpp/src/mip_heuristics/presolve/bounds_presolve.cu @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/mip_heuristics/presolve/bounds_presolve.cuh b/cpp/src/mip_heuristics/presolve/bounds_presolve.cuh index 8b57cc7019..ed0b91466d 100644 --- a/cpp/src/mip_heuristics/presolve/bounds_presolve.cuh +++ b/cpp/src/mip_heuristics/presolve/bounds_presolve.cuh @@ -34,7 +34,7 @@ class bound_presolve_t { struct settings_t { f_t time_limit{60.0}; i_t 
iteration_limit{std::numeric_limits::max()}; - i_t num_threads = -1; + i_t num_tasks = -1; bool parallel_bounds_update{true}; }; diff --git a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu index 13412614b8..3d62b99f66 100644 --- a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu +++ b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu @@ -17,6 +17,12 @@ #include "cusparse.h" #include + +#include +#include +#include +#include + #include "conditional_bound_strengthening.cuh" #include @@ -240,11 +246,14 @@ void conditional_bound_strengthening_t::select_constraint_pairs_host( std::vector constraint_pairs_h(max_pair_per_row * problem.n_constraints, {-1, -1}); std::unordered_set cnstr_pair; -#pragma omp parallel for private(cnstr_pair) - for (int cnstr = 0; cnstr < problem.n_constraints; ++cnstr) { - for (int jj = offsets[cnstr]; jj < offsets[cnstr + 1]; ++jj) { + i_t num_tasks = std::max(omp_get_num_threads() - 2, 1); + + CUOPT_LOG_INFO("Selecting constraint pairs with %d tasks", num_tasks); +#pragma omp taskloop num_tasks(num_tasks) private(cnstr_pair) default(shared) + for (i_t cnstr = 0; cnstr < problem.n_constraints; ++cnstr) { + for (i_t jj = offsets[cnstr]; jj < offsets[cnstr + 1]; ++jj) { int var = variables[jj]; - for (int kk = reverse_offsets[var]; kk < reverse_offsets[var + 1]; ++kk) { + for (i_t kk = reverse_offsets[var]; kk < reverse_offsets[var + 1]; ++kk) { if (reverse_constraints[kk] != cnstr) { cnstr_pair.insert(reverse_constraints[kk]); } if (cnstr_pair.size() == max_pair_per_row) { break; } } @@ -257,7 +266,7 @@ void conditional_bound_strengthening_t::select_constraint_pairs_host( constraint_pairs_h[cnstr * max_pair_per_row + counter++] = {cnstr, temp}; } cnstr_pair.clear(); - } + } // implicit barrier that waits for all iterations to finish before proceeding constraint_pairs = cuopt::device_copy(constraint_pairs_h, 
problem.handle_ptr->get_stream()); diff --git a/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cu b/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cu index 82462c11ce..950e3c936c 100644 --- a/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cu +++ b/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cu @@ -1036,7 +1036,7 @@ void find_initial_cliques(dual_simplex::user_problem_t& problem, std::shared_ptr>* clique_table_out, cuopt::timer_t& timer, bool modify_problem, - std::atomic* signal_extend) + omp_atomic_t* signal_extend) { cuopt::timer_t stage_timer(std::numeric_limits::infinity()); #ifdef DEBUG_CLIQUE_TABLE @@ -1141,7 +1141,7 @@ void find_initial_cliques(dual_simplex::user_problem_t& problem, std::shared_ptr> * clique_table_out, \ cuopt::timer_t & timer, \ bool modify_problem, \ - std::atomic* signal_extend); \ + omp_atomic_t* signal_extend); \ template void build_clique_table( \ const dual_simplex::user_problem_t& problem, \ clique_table_t& clique_table, \ diff --git a/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cuh b/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cuh index 944241b4f0..d09051ff78 100644 --- a/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cuh +++ b/cpp/src/mip_heuristics/presolve/conflict_graph/clique_table.cuh @@ -105,7 +105,7 @@ void find_initial_cliques(dual_simplex::user_problem_t& problem, std::shared_ptr>* clique_table_out, cuopt::timer_t& timer, bool modify_problem, - std::atomic* signal_extend = nullptr); + omp_atomic_t* signal_extend = nullptr); template void build_clique_table(const dual_simplex::user_problem_t& problem, diff --git a/cpp/src/mip_heuristics/presolve/lb_probing_cache.cu b/cpp/src/mip_heuristics/presolve/lb_probing_cache.cu index 3a6d1bce21..bbb58c0164 100644 --- a/cpp/src/mip_heuristics/presolve/lb_probing_cache.cu +++ b/cpp/src/mip_heuristics/presolve/lb_probing_cache.cu @@ -10,7 +10,9 @@ #include #include +#include #include 
+#include #include #include diff --git a/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cu b/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cu index 0d16c26cae..f48eae1de8 100644 --- a/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cu +++ b/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve.cu @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve_helpers.cuh b/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve_helpers.cuh index cbcd91a7d7..f276840bdf 100644 --- a/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve_helpers.cuh +++ b/cpp/src/mip_heuristics/presolve/load_balanced_bounds_presolve_helpers.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -10,6 +10,7 @@ #include "load_balanced_bounds_presolve_kernels.cuh" #include "load_balanced_partition_helpers.cuh" +#include #include #include #include diff --git a/cpp/src/mip_heuristics/presolve/multi_probe.cu b/cpp/src/mip_heuristics/presolve/multi_probe.cu index 7789b3281b..f798957e1c 100644 --- a/cpp/src/mip_heuristics/presolve/multi_probe.cu +++ b/cpp/src/mip_heuristics/presolve/multi_probe.cu @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/mip_heuristics/presolve/probing_cache.cu b/cpp/src/mip_heuristics/presolve/probing_cache.cu index 4f5e16ddb9..36b96dceaf 100644 --- a/cpp/src/mip_heuristics/presolve/probing_cache.cu +++ b/cpp/src/mip_heuristics/presolve/probing_cache.cu @@ -14,11 +14,15 @@ #include #include +#include +#include #include +#include #include #include #include +#include namespace cuopt::linear_programming::detail { @@ -857,18 +861,16 @@ bool compute_probing_cache(bound_presolve_t& bound_presolve, bound_presolve.settings.iteration_limit = 50; bound_presolve.settings.time_limit = timer.remaining_time(); - size_t num_threads = bound_presolve.settings.num_threads < 0 - ? 0.2 * omp_get_max_threads() - : bound_presolve.settings.num_threads; - num_threads = std::clamp(num_threads, 1, 8); + size_t num_tasks = bound_presolve.settings.num_tasks < 0 ? 
omp_get_num_threads() - 1 + : bound_presolve.settings.num_tasks; // Create a vector of multi_probe_t objects std::vector> multi_probe_presolve_pool; - std::vector>> modification_vector_pool(num_threads); - std::vector>> substitution_vector_pool(num_threads); + std::vector>> modification_vector_pool(num_tasks); + std::vector>> substitution_vector_pool(num_tasks); // Initialize multi_probe_presolve_pool - for (size_t i = 0; i < num_threads; i++) { + for (size_t i = 0; i < num_tasks; i++) { multi_probe_presolve_pool.emplace_back(bound_presolve.context); multi_probe_presolve_pool[i].resize(problem); multi_probe_presolve_pool[i].compute_stats = true; @@ -887,23 +889,28 @@ bool compute_probing_cache(bound_presolve_t& bound_presolve, // are visible before any per-thread kernel can reference that memory. problem.handle_ptr->sync_stream(); -// Main parallel loop -#pragma omp parallel num_threads(num_threads) - { - for (size_t step_start = 0; step_start < priority_indices.size(); step_start += step_size) { - if (timer.check_time_limit() || early_exit || problem_is_infeasible.load()) { break; } - size_t step_end = std::min(step_start + step_size, priority_indices.size()); + CUOPT_LOG_INFO("Running probing cache with %zu tasks", num_tasks); -#pragma omp for - for (size_t i = step_start; i < step_end; ++i) { - auto var_idx = priority_indices[i]; - if (timer.check_time_limit()) { continue; } + // Main parallel loop + for (size_t step_start = 0; step_start < priority_indices.size(); step_start += step_size) { + if (timer.check_time_limit() || early_exit || problem_is_infeasible.load()) { break; } + size_t step_end = std::min(step_start + step_size, priority_indices.size()); - int thread_idx = omp_get_thread_num(); - CUOPT_LOG_TRACE("Computing probing cache for var %d on thread %d", var_idx, thread_idx); +#pragma omp taskloop num_tasks(num_tasks) default(shared) + for (size_t task_id = 0; task_id < num_tasks; ++task_id) { + size_t n = step_end - step_start; + size_t begin = 
step_start + std::floor(static_cast(n) * task_id / num_tasks); + size_t end = step_start + std::floor(static_cast(n) * (task_id + 1) / num_tasks); + auto& multi_probe_presolve = multi_probe_presolve_pool[task_id]; + auto& modification_vector = modification_vector_pool[task_id]; + auto& substitution_vector = substitution_vector_pool[task_id]; + if (timer.check_time_limit()) { continue; } - auto& multi_probe_presolve = multi_probe_presolve_pool[thread_idx]; + for (size_t i = begin; i < end; ++i) { + auto var_idx = priority_indices[i]; + if (timer.check_time_limit()) { continue; } + CUOPT_LOG_TRACE("Computing probing cache for var %d on task %zu", var_idx, task_id); compute_cache_for_var(var_idx, bound_presolve, problem, @@ -913,30 +920,29 @@ bool compute_probing_cache(bound_presolve_t& bound_presolve, n_of_implied_singletons, n_of_cached_probings, problem_is_infeasible, - modification_vector_pool[thread_idx], - substitution_vector_pool[thread_idx], + modification_vector, + substitution_vector, timer, problem.handle_ptr->get_device()); } + } // implicit barrier that waits for all iterations to finish before proceeding + + // TODO when we have determinism, check current threads work/time counter and filter queue + // items that are smaller or equal to that + apply_modification_queue_to_problem(modification_vector_pool, problem); + // copy host bounds again, because we changed some problem bounds + raft::copy(h_var_bounds.data(), + problem.variable_bounds.data(), + h_var_bounds.size(), + problem.handle_ptr->get_stream()); + problem.handle_ptr->sync_stream(); + if (n_of_implied_singletons - last_it_implied_singletons < + (size_t)std::max(2, (min(100, problem.n_variables / 50)))) { + early_exit = true; } -#pragma omp single - { - // TODO when we have determinism, check current threads work/time counter and filter queue - // items that are smaller or equal to that - apply_modification_queue_to_problem(modification_vector_pool, problem); - // copy host bounds again, because 
we changed some problem bounds - raft::copy(h_var_bounds.data(), - problem.variable_bounds.data(), - h_var_bounds.size(), - problem.handle_ptr->get_stream()); - problem.handle_ptr->sync_stream(); - if (n_of_implied_singletons - last_it_implied_singletons < - (size_t)std::max(2, (min(100, problem.n_variables / 50)))) { - early_exit = true; - } - last_it_implied_singletons = n_of_implied_singletons; - } + last_it_implied_singletons = n_of_implied_singletons; } // end of step + apply_substitution_queue_to_problem(substitution_vector_pool, problem); CUOPT_LOG_DEBUG("Total number of cached probings %lu number of implied singletons %lu", n_of_cached_probings.load(), diff --git a/cpp/src/mip_heuristics/presolve/semi_continuous.cu b/cpp/src/mip_heuristics/presolve/semi_continuous.cu new file mode 100644 index 0000000000..15728d02bb --- /dev/null +++ b/cpp/src/mip_heuristics/presolve/semi_continuous.cu @@ -0,0 +1,388 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#include "semi_continuous.cuh" + +#include "bounds_presolve.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +namespace cuopt::linear_programming::detail { + +namespace { + +constexpr double sc_infinity_threshold = 1e30; + +template +bool is_effectively_infinite_sc_upper_bound(f_t ub) +{ + return !std::isfinite(ub) || ub >= static_cast(sc_infinity_threshold); +} + +template +std::vector call_host_bounds_strengthening(const optimization_problem_t& op_problem, + const mip_solver_settings_t& settings, + const std::vector& sc_indices) +{ + auto user_problem = + cuopt_problem_to_simplex_problem(op_problem.get_handle_ptr(), op_problem); + + dual_simplex::lp_problem_t lp_problem(op_problem.get_handle_ptr(), 1, 1, 1); + std::vector new_slacks; + dual_simplex::dualize_info_t dualize_info; + dual_simplex::simplex_solver_settings_t simplex_settings; + simplex_settings.primal_tol = settings.tolerances.presolve_absolute_tolerance; + simplex_settings.integer_tol = settings.tolerances.integrality_tolerance; + simplex_settings.set_log(false); + + dual_simplex::convert_user_problem( + user_problem, simplex_settings, lp_problem, new_slacks, dualize_info); + + auto var_types = user_problem.var_types; + var_types.resize(lp_problem.num_cols, dual_simplex::variable_type_t::CONTINUOUS); + + dual_simplex::csr_matrix_t Arow(1, 1, 1); + lp_problem.A.to_compressed_row(Arow); + + // convert_user_problem returns an equality-form LP. Empty row_sense makes + // bounds_strengthening_t use rhs as both lower and upper row bounds. 
+ std::vector row_sense; + dual_simplex::bounds_strengthening_t strengthening( + lp_problem, Arow, row_sense, var_types); + std::vector bounds_changed(lp_problem.num_cols, false); + for (i_t idx : sc_indices) { + bounds_changed[idx] = true; + } + auto lower = lp_problem.lower; + auto upper = lp_problem.upper; + auto ok = strengthening.bounds_strengthening(simplex_settings, bounds_changed, lower, upper); + if (!ok) { return op_problem.get_variable_upper_bounds_host(); } + + upper.resize(user_problem.num_cols); + return upper; +} + +} // namespace + +template +bool reformulate_semi_continuous(optimization_problem_t& op_problem, + const mip_solver_settings_t& settings, + std::vector* used_fallback_big_m, + std::vector* semi_continuous_binary_to_original_indices) +{ + // 1. Identify semi-continuous variables + auto var_types = op_problem.get_variable_types_host(); + auto var_lb = op_problem.get_variable_lower_bounds_host(); + auto var_ub = op_problem.get_variable_upper_bounds_host(); + std::vector sc_indices; + bool normalized_zero_lb_sc = false; + bool normalized_large_sc_ub = false; + for (i_t i = 0; i < static_cast(var_types.size()); ++i) { + if (var_types[i] != var_t::SEMI_CONTINUOUS) { continue; } + if (var_lb[i] == f_t(0)) { + CUOPT_LOG_DEBUG("Semi-continuous variable %d has zero lower bound; treating it as continuous", + i); + var_types[i] = var_t::CONTINUOUS; + normalized_zero_lb_sc = true; + continue; + } + sc_indices.push_back(i); + if (is_effectively_infinite_sc_upper_bound(var_ub[i])) { + CUOPT_LOG_DEBUG( + "Semi-continuous variable %d upper bound %.6g exceeds semi-continuous infinity " + "threshold %.6g; treating it as +inf", + i, + static_cast(var_ub[i]), + sc_infinity_threshold); + var_ub[i] = std::numeric_limits::infinity(); + normalized_large_sc_ub = true; + } + } + if (normalized_zero_lb_sc) { op_problem.set_variable_types(var_types.data(), var_types.size()); } + if (sc_indices.empty()) { return false; } + if (normalized_large_sc_ub) { + 
op_problem.set_variable_upper_bounds(var_ub.data(), var_ub.size()); + } + + const i_t n_orig = op_problem.get_n_variables(); + const i_t n_sc = static_cast(sc_indices.size()); + const auto* handle_ptr = op_problem.get_handle_ptr(); + const f_t big_m = settings.semi_continuous_big_m; + if (used_fallback_big_m != nullptr) { used_fallback_big_m->assign(n_orig, uint8_t{0}); } + + CUOPT_LOG_INFO("Reformulating %d semi-continuous variables before presolve", n_sc); + + // 2. Build a relaxed copy where SC vars become continuous [0, original_ub]. + // This lets deterministic CPU bounds strengthening derive tight upper bounds from the + // constraint structure without the binary domain {0} ∪ [L, U]. + optimization_problem_t op_relaxed(op_problem); + { + auto relaxed_types = var_types; + auto relaxed_ub = var_ub; + auto relaxed_lb = op_problem.get_variable_lower_bounds_host(); + for (i_t idx : sc_indices) { + relaxed_types[idx] = var_t::CONTINUOUS; + relaxed_lb[idx] = std::min(f_t(0), relaxed_lb[idx]); + if (std::isfinite(relaxed_ub[idx])) { relaxed_ub[idx] = std::max(f_t(0), relaxed_ub[idx]); } + } + op_relaxed.set_variable_types(relaxed_types.data(), n_orig); + op_relaxed.set_variable_lower_bounds(relaxed_lb.data(), n_orig); + op_relaxed.set_variable_upper_bounds(relaxed_ub.data(), n_orig); + } + + // 3. Run deterministic CPU bounds strengthening on the relaxed problem to tighten UBs. + // Skip strengthening when there are no constraints (nothing to propagate). + auto tight_ub = var_ub; // fallback: normalized original UBs + + if (op_relaxed.get_n_constraints() > 0) { + tight_ub = call_host_bounds_strengthening(op_relaxed, settings, sc_indices); + } + + // 4. Fetch all host arrays we need to extend with the new binary variables + // and linking constraints. 
+ auto obj_c = op_problem.get_objective_coefficients_host(); + auto A_vals = op_problem.get_constraint_matrix_values_host(); + auto A_idx = op_problem.get_constraint_matrix_indices_host(); + auto A_off = op_problem.get_constraint_matrix_offsets_host(); + auto clb = op_problem.get_constraint_lower_bounds_host(); + auto cub = op_problem.get_constraint_upper_bounds_host(); + + // Optional arrays — only extend if they were originally set + auto b_rhs = op_problem.get_constraint_bounds_host(); + auto row_types_h = op_problem.get_row_types_host(); + + // Ensure objective and variable arrays are sized to n_orig + if (obj_c.empty()) { obj_c.assign(n_orig, f_t(0)); } + + // 5. Count how many SC vars truly need the binary-variable reformulation. + // If 0 is already inside [L, U], then "x=0 OR L<=x<=U" simplifies to + // plain continuous [L, U] — no binary needed. + std::vector needs_binary(n_sc, true); + i_t n_binary_needed = 0; + for (i_t s = 0; s < n_sc; ++s) { + const i_t idx = sc_indices[s]; + needs_binary[s] = + !(var_lb[idx] <= f_t(0) && std::isfinite(var_ub[idx]) && var_ub[idx] >= f_t(0)) && + !(var_lb[idx] <= f_t(0) && !std::isfinite(var_ub[idx])); + if (needs_binary[s]) { ++n_binary_needed; } + } + + // Extend variable arrays (one binary per SC var that actually needs it) + var_types.resize(n_orig + n_binary_needed, var_t::INTEGER); + var_lb.resize(n_orig + n_binary_needed, f_t(0)); + var_ub.resize(n_orig + n_binary_needed, f_t(1)); + obj_c.resize(n_orig + n_binary_needed, f_t(0)); + if (semi_continuous_binary_to_original_indices != nullptr) { + semi_continuous_binary_to_original_indices->clear(); + semi_continuous_binary_to_original_indices->reserve(n_binary_needed); + } + + // 6. For each SC variable: derive U when needed, then either add binary + 2 + // linking constraints or simply relax to continuous if 0 is already in + // the interval [L, U]. 
+ i_t binary_count = 0; + for (i_t s = 0; s < n_sc; ++s) { + const i_t idx = sc_indices[s]; + const f_t L = var_lb[idx]; + const f_t orig_u = var_ub[idx]; + + if (!needs_binary[s]) { + // 0 already lies in [L, U], so the SC disjunction is just the interval itself. + CUOPT_LOG_DEBUG( + "Semi-continuous variable %d interval [%.6g, %.6g] already contains 0; treating it as " + "continuous", + idx, + L, + orig_u); + var_types[idx] = var_t::CONTINUOUS; + continue; + } + + // Use CPU-strengthened upper bound for positive-side SC variables when available. + // For negative-side intervals, keep the original upper bound because the relaxed + // convex hull includes 0 and is not useful for tightening the negative upper edge. + f_t U = orig_u; + if (orig_u >= f_t(0) || !std::isfinite(orig_u)) { U = tight_ub[idx]; } + if (!std::isfinite(orig_u) && std::isfinite(U)) { + CUOPT_LOG_DEBUG( + "Semi-continuous variable %d upper bound was tightened from %.6g to %.6g by " + "CPU bounds strengthening", + idx, + static_cast(orig_u), + static_cast(U)); + } + if (!std::isfinite(U)) { U = orig_u; } + if (!std::isfinite(U)) { + cuopt_assert( + std::isfinite(big_m) && big_m >= L, + "Semi-continuous fallback mip_semi_continuous_big_m must be finite and >= lower bound"); + U = big_m; + CUOPT_LOG_DEBUG( + "Semi-continuous variable %d has no finite upper bound after bounds " + "strengthening; using fallback mip_semi_continuous_big_m %.6g", + idx, + static_cast(big_m)); + if (used_fallback_big_m != nullptr) { (*used_fallback_big_m)[idx] = uint8_t{1}; } + } + + CUOPT_LOG_DEBUG("Semi-continuous variable %d: L=%.6g, U=%.6g (after propagation)", idx, L, U); + + const i_t b_idx = n_orig + binary_count; + ++binary_count; + if (semi_continuous_binary_to_original_indices != nullptr) { + semi_continuous_binary_to_original_indices->push_back(idx); + } + + // Convert SC var to the continuous interval [0, U]. 
+ var_types[idx] = var_t::CONTINUOUS; + var_lb[idx] = std::min(f_t(0), L); + var_ub[idx] = std::max(f_t(0), U); + + // Constraint 1: x_i - L * b_i >= 0 (clb=0, cub=+inf) + A_vals.push_back(f_t(1)); + A_idx.push_back(idx); + A_vals.push_back(-L); + A_idx.push_back(b_idx); + A_off.push_back(A_off.back() + 2); + clb.push_back(f_t(0)); + cub.push_back(std::numeric_limits::infinity()); + if (!b_rhs.empty()) { b_rhs.push_back(f_t(0)); } + if (!row_types_h.empty()) { row_types_h.push_back('G'); } + + // Constraint 2: x_i - U * b_i <= 0 (clb=-inf, cub=0) + A_vals.push_back(f_t(1)); + A_idx.push_back(idx); + A_vals.push_back(-U); + A_idx.push_back(b_idx); + A_off.push_back(A_off.back() + 2); + clb.push_back(-std::numeric_limits::infinity()); + cub.push_back(f_t(0)); + if (!b_rhs.empty()) { b_rhs.push_back(f_t(0)); } + if (!row_types_h.empty()) { row_types_h.push_back('L'); } + } + + // 7. Rebuild op_problem with the extended data. + const i_t new_n_vars = n_orig + n_binary_needed; + const i_t new_n_cons = static_cast(clb.size()); + const i_t new_nnz = static_cast(A_vals.size()); + const i_t added_constraints = 2 * n_binary_needed; + + CUOPT_LOG_INFO("Semi-continuous reformulation added %d variables and %d constraints", + n_binary_needed, + added_constraints); + + op_problem.set_objective_coefficients(obj_c.data(), new_n_vars); + op_problem.set_variable_lower_bounds(var_lb.data(), new_n_vars); + op_problem.set_variable_upper_bounds(var_ub.data(), new_n_vars); + op_problem.set_variable_types(var_types.data(), new_n_vars); + op_problem.set_csr_constraint_matrix( + A_vals.data(), new_nnz, A_idx.data(), new_nnz, A_off.data(), new_n_cons + 1); + op_problem.set_constraint_lower_bounds(clb.data(), new_n_cons); + op_problem.set_constraint_upper_bounds(cub.data(), new_n_cons); + if (!b_rhs.empty()) { op_problem.set_constraint_bounds(b_rhs.data(), new_n_cons); } + if (!row_types_h.empty()) { op_problem.set_row_types(row_types_h.data(), new_n_cons); } + + return true; +} + +template 
+void append_semi_continuous_auxiliaries_to_assignment( + std::vector& assignment, + const std::vector& semi_continuous_binary_to_original_indices, + typename mip_solver_settings_t::tolerances_t tolerances) +{ + if (semi_continuous_binary_to_original_indices.empty()) { return; } + + const auto original_size = static_cast(assignment.size()); + const f_t active_tol = std::max(tolerances.absolute_tolerance, tolerances.integrality_tolerance); + assignment.reserve(assignment.size() + semi_continuous_binary_to_original_indices.size()); + for (i_t idx : semi_continuous_binary_to_original_indices) { + cuopt_expects(idx >= 0 && idx < original_size, + error_type_t::ValidationError, + "Semi-continuous callback solution references an invalid parent variable index " + "%d.", + idx); + assignment.push_back(assignment[idx] <= active_tol ? f_t(0) : f_t(1)); + } +} + +template +void strip_semi_continuous_auxiliaries_from_assignment(std::vector& assignment, + i_t original_num_variables) +{ + if (assignment.size() <= static_cast(original_num_variables)) { return; } + cuopt_expects( + original_num_variables >= 0 && original_num_variables <= static_cast(assignment.size()), + error_type_t::ValidationError, + "Semi-continuous callback translation has invalid original variable count %d.", + original_num_variables); + assignment.resize(original_num_variables); +} + +template +void expand_initial_solutions_for_semi_continuous( + mip_solver_settings_t& settings, + const std::vector& semi_continuous_binary_to_original_indices, + rmm::cuda_stream_view stream) +{ + if (semi_continuous_binary_to_original_indices.empty()) { return; } + + for (auto& initial_solution : settings.initial_solutions) { + if (initial_solution == nullptr || initial_solution->is_empty()) { continue; } + + auto host_initial = cuopt::host_copy(*initial_solution, stream); + std::vector expanded_initial(host_initial.begin(), host_initial.end()); + append_semi_continuous_auxiliaries_to_assignment( + expanded_initial, 
semi_continuous_binary_to_original_indices, settings.get_tolerances()); + + initial_solution = std::make_shared>(expanded_initial.size(), stream); + raft::copy(initial_solution->data(), expanded_initial.data(), expanded_initial.size(), stream); + } +} + +#if MIP_INSTANTIATE_FLOAT +template bool reformulate_semi_continuous(optimization_problem_t&, + const mip_solver_settings_t&, + std::vector*, + std::vector*); +template void append_semi_continuous_auxiliaries_to_assignment( + std::vector&, const std::vector&, mip_solver_settings_t::tolerances_t); +template void strip_semi_continuous_auxiliaries_from_assignment(std::vector&, int); +template void expand_initial_solutions_for_semi_continuous(mip_solver_settings_t&, + const std::vector&, + rmm::cuda_stream_view); +#endif + +#if MIP_INSTANTIATE_DOUBLE +template bool reformulate_semi_continuous(optimization_problem_t&, + const mip_solver_settings_t&, + std::vector*, + std::vector*); +template void append_semi_continuous_auxiliaries_to_assignment( + std::vector&, const std::vector&, mip_solver_settings_t::tolerances_t); +template void strip_semi_continuous_auxiliaries_from_assignment(std::vector&, int); +template void expand_initial_solutions_for_semi_continuous(mip_solver_settings_t&, + const std::vector&, + rmm::cuda_stream_view); +#endif + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip_heuristics/presolve/semi_continuous.cuh b/cpp/src/mip_heuristics/presolve/semi_continuous.cuh new file mode 100644 index 0000000000..6d37c62b4d --- /dev/null +++ b/cpp/src/mip_heuristics/presolve/semi_continuous.cuh @@ -0,0 +1,70 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once + +#include +#include + +namespace cuopt::linear_programming::detail { + +/** + * @brief Reformulate semi-continuous variables in-place inside the MIP solver. 
+ * + * A semi-continuous variable x satisfies: x = 0 OR L <= x <= U (0 < L <= U). + * Reformulation adds a binary variable b and two linking constraints when needed. + * Added binaries are appended at the end of the variable arrays, and their linking + * constraints are appended at the end of the CSR row arrays in the same order. + * x - L * b >= 0 (forces x >= L when b=1; allows x=0 when b=0) + * x - U * b <= 0 (forces x <= U when b=1; forces x=0 when b=0) + * b in {0, 1}, x in [0, U] + * + * Deterministic CPU bounds strengthening is seeded only from SC variables to derive tight upper + * bounds for SC variables that have infinite original upper bounds. If strengthening cannot + * derive a finite bound, settings.semi_continuous_big_m is used as a fallback. + * + * This must be called before problem_t construction and Papilo presolve. + * + * @tparam i_t Integer index type + * @tparam f_t Floating-point value type + * @param[in,out] op_problem The optimization problem (modified in-place) + * @param[in] settings MIP solver settings (provides semi_continuous_big_m and tolerances) + * @param[out] used_fallback_big_m Per-original-variable flags. Entry i is set to 1 + * when variable i uses settings.semi_continuous_big_m as a + * fallback upper bound during reformulation. Used to reject the final solution if its upper bound + * lands on big-m within integrality tolerance. + * @param[out] semi_continuous_binary_to_original_indices Optional mapping for appended + * auxiliary + * binaries. Entry k stores the original semi-continuous + * variable index that produced appended binary k, in append + * order. + * @returns true if any semi-continuous variables were found and reformulated. 
+ */ +template +bool reformulate_semi_continuous( + optimization_problem_t& op_problem, + const mip_solver_settings_t& settings, + std::vector* used_fallback_big_m, + std::vector* semi_continuous_binary_to_original_indices = nullptr); + +template +void expand_initial_solutions_for_semi_continuous( + mip_solver_settings_t& settings, + const std::vector& semi_continuous_binary_to_original_indices, + rmm::cuda_stream_view stream); + +template +void append_semi_continuous_auxiliaries_to_assignment( + std::vector& assignment, + const std::vector& semi_continuous_binary_to_original_indices, + typename mip_solver_settings_t::tolerances_t tolerances); + +template +void strip_semi_continuous_auxiliaries_from_assignment(std::vector& assignment, + i_t original_num_variables); + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip_heuristics/presolve/trivial_presolve.cuh b/cpp/src/mip_heuristics/presolve/trivial_presolve.cuh index 568719dfd8..28162d7482 100644 --- a/cpp/src/mip_heuristics/presolve/trivial_presolve.cuh +++ b/cpp/src/mip_heuristics/presolve/trivial_presolve.cuh @@ -14,9 +14,11 @@ #include #include +#include #include #include #include +#include #include #include #include diff --git a/cpp/src/mip_heuristics/problem/problem.cu b/cpp/src/mip_heuristics/problem/problem.cu index 5d5fbc445a..d57bbb992f 100644 --- a/cpp/src/mip_heuristics/problem/problem.cu +++ b/cpp/src/mip_heuristics/problem/problem.cu @@ -27,9 +27,12 @@ #include #include #include +#include +#include #include #include #include +#include #include #include @@ -275,7 +278,8 @@ problem_t::problem_t(const problem_t& problem_, bool no_deep deterministic(problem_.deterministic), handle_ptr(problem_.handle_ptr), integer_fixed_problem(problem_.integer_fixed_problem), - integer_fixed_variable_map(problem_.n_variables, handle_ptr->get_stream()), + integer_fixed_variable_map((!no_deep_copy) ? 
0 : problem_.n_variables, + handle_ptr->get_stream()), n_variables(problem_.n_variables), n_constraints(problem_.n_constraints), n_binary_vars(problem_.n_binary_vars), @@ -339,10 +343,7 @@ problem_t::problem_t(const problem_t& problem_, bool no_deep (!no_deep_copy) ? rmm::device_uvector(problem_.combined_bounds, handle_ptr->get_stream()) : rmm::device_uvector(problem_.combined_bounds.size(), handle_ptr->get_stream())), - variable_types( - (!no_deep_copy) - ? rmm::device_uvector(problem_.variable_types, handle_ptr->get_stream()) - : rmm::device_uvector(problem_.variable_types.size(), handle_ptr->get_stream())), + variable_types((!no_deep_copy) ? 0 : problem_.variable_types.size(), handle_ptr->get_stream()), integer_indices((!no_deep_copy) ? 0 : problem_.integer_indices.size(), handle_ptr->get_stream()), binary_indices((!no_deep_copy) ? 0 : problem_.binary_indices.size(), handle_ptr->get_stream()), @@ -351,7 +352,8 @@ problem_t::problem_t(const problem_t& problem_, bool no_deep is_binary_variable((!no_deep_copy) ? 0 : problem_.is_binary_variable.size(), handle_ptr->get_stream()), related_variables(problem_.related_variables, handle_ptr->get_stream()), - related_variables_offsets(problem_.related_variables_offsets, handle_ptr->get_stream()), + related_variables_offsets((!no_deep_copy) ? 
0 : problem_.related_variables_offsets.size(), + handle_ptr->get_stream()), var_names(problem_.var_names), row_names(problem_.row_names), objective_name(problem_.objective_name), @@ -473,6 +475,7 @@ void csr_to_csc_transpose(const i_t* csr_offsets, // Copy sorted results back raft::copy(csc_indices, row_ind_sorted.data(), nnz, stream); raft::copy(csc_values, val_sorted.data(), nnz, stream); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } template @@ -565,8 +568,15 @@ void problem_t::check_problem_representation(bool check_transposed, "A_indices must be set before calling the solver."); } } - cuopt_assert(objective_coefficients.size() == n_variables, - "objective_coefficients size mismatch"); + if (n_variables == 0) { + cuopt_assert(objective_coefficients.is_empty(), + "objective_coefficients must be empty when n_variables is 0."); + } else { + cuopt_assert(!objective_coefficients.is_empty(), + "objective_coefficients must be set when n_variables > 0."); + cuopt_assert(objective_coefficients.size() % static_cast(n_variables) == 0, + "objective_coefficients size must be a multiple of n_variables"); + } // Check CSR validity check_csr_representation( @@ -591,8 +601,6 @@ void problem_t::check_problem_representation(bool check_transposed, // Check variable bounds are set and with the correct size if (!empty) { cuopt_assert(!variable_bounds.is_empty(), "Variable bounds must be set."); } - cuopt_assert(variable_bounds.size() == objective_coefficients.size(), - "Sizes for vectors related to the variables are not the same."); cuopt_assert(variable_bounds.size() == (std::size_t)n_variables, "Sizes for vectors related to the variables are not the same."); @@ -605,15 +613,18 @@ void problem_t::check_problem_representation(bool check_transposed, } cuopt_assert(constraint_lower_bounds.size() == constraint_upper_bounds.size(), "Sizes for vectors related to the constraints are not the same."); - cuopt_assert(constraint_lower_bounds.size() == (size_t)n_constraints, + 
cuopt_assert(n_constraints == 0 ? constraint_lower_bounds.size() == 0 + : constraint_lower_bounds.size() % (size_t)n_constraints == 0, "Sizes for vectors related to the constraints are not the same."); - cuopt_assert((offsets.size() - 1) == constraint_lower_bounds.size(), + cuopt_assert((offsets.size() - 1) == (size_t)n_constraints, "Sizes for vectors related to the constraints are not the same."); // Check combined bounds - cuopt_assert(combined_bounds.size() == (size_t)n_constraints, + // To handle batch case (% 0 is not allowed) + cuopt_assert(n_constraints == 0 + ? combined_bounds.size() == 0 + : combined_bounds.size() % static_cast(n_constraints) == 0, "Sizes for vectors related to the constraints are not the same."); - // Check the validity of bounds cuopt_expects(thrust::all_of(handle_ptr->get_thrust_policy(), thrust::make_counting_iterator(0), @@ -1346,26 +1357,30 @@ void problem_t::set_implied_integers(const std::vector& implied_i template void problem_t::recompute_objective_integrality() { - // FIXME: we do not consider implied integers here - // because it incorrectly considers neos-827175 as having an integer optimal. - // need to figure out if Papilo is producing an incorrect flag. 
- objective_is_integral = thrust::all_of(handle_ptr->get_thrust_policy(), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(n_variables), - [v = view()] __device__(i_t var_idx) -> bool { - if (v.objective_coefficients[var_idx] == 0) return true; - return v.is_integer(v.objective_coefficients[var_idx]) && - (v.variable_types[var_idx] == var_t::INTEGER); - }); - - bool objvars_all_integral = thrust::all_of(handle_ptr->get_thrust_policy(), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(n_variables), - [v = view()] __device__(i_t var_idx) -> bool { - if (v.objective_coefficients[var_idx] == 0) - return true; - return (v.variable_types[var_idx] == var_t::INTEGER); - }); + using cuopt::linear_programming::detail::is_integer; + + objective_is_integral = + thrust::all_of(handle_ptr->get_thrust_policy(), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(n_variables), + [v = view()] __device__(i_t var_idx) -> bool { + if (v.objective_coefficients[var_idx] == 0) return true; + // Need a tight tolerance for integrality to weed out instances like + // neos-827175 with very small objective coefficients + return is_integer(v.objective_coefficients[var_idx], 1e-9) && + ((v.variable_types[var_idx] == var_t::INTEGER) || + (v.var_flags[var_idx] & (i_t)VAR_IMPLIED_INTEGER)); + }); + + bool objvars_all_integral = + thrust::all_of(handle_ptr->get_thrust_policy(), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(n_variables), + [v = view()] __device__(i_t var_idx) -> bool { + if (v.objective_coefficients[var_idx] == 0) return true; + return (v.variable_types[var_idx] == var_t::INTEGER) || + (v.var_flags[var_idx] & (i_t)VAR_IMPLIED_INTEGER); + }); if (objvars_all_integral && !objective_is_integral) { auto h_objective_coefficients = cuopt::host_copy(objective_coefficients, handle_ptr->get_stream()); diff --git a/cpp/src/mip_heuristics/problem/problem_helpers.cuh 
b/cpp/src/mip_heuristics/problem/problem_helpers.cuh index ebc8a488ea..77cc973aa0 100644 --- a/cpp/src/mip_heuristics/problem/problem_helpers.cuh +++ b/cpp/src/mip_heuristics/problem/problem_helpers.cuh @@ -19,8 +19,10 @@ #include #include #include +#include #include #include +#include namespace cuopt::linear_programming::detail { template @@ -114,8 +116,9 @@ static void set_bounds_if_not_set(detail::problem_t& op_problem) set_variable_bounds(op_problem); if (op_problem.variable_types.is_empty() && !op_problem.objective_coefficients.is_empty()) { - op_problem.variable_types.resize(op_problem.objective_coefficients.size(), - op_problem.handle_ptr->get_stream()); + // variable_types is a per-variable quantity so use n_variables (not + // objective_coefficients.size(), which may be batch-expanded in batch mode). + op_problem.variable_types.resize(op_problem.n_variables, op_problem.handle_ptr->get_stream()); thrust::fill(op_problem.handle_ptr->get_thrust_policy(), op_problem.variable_types.begin(), op_problem.variable_types.end(), diff --git a/cpp/src/mip_heuristics/solution/solution.cu b/cpp/src/mip_heuristics/solution/solution.cu index e4192c0195..2a05a1ca56 100644 --- a/cpp/src/mip_heuristics/solution/solution.cu +++ b/cpp/src/mip_heuristics/solution/solution.cu @@ -19,6 +19,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/mip_heuristics/solve.cu b/cpp/src/mip_heuristics/solve.cu index be01516657..408a5258fd 100644 --- a/cpp/src/mip_heuristics/solve.cu +++ b/cpp/src/mip_heuristics/solve.cu @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -47,6 +49,10 @@ #include #include +#include + +#include +#include namespace cuopt::linear_programming { @@ -63,10 +69,16 @@ static void init_handler(const raft::handle_t* handle_ptr) template static void invoke_solution_callbacks( const std::vector& mip_callbacks, + bool 
strip_semi_continuous_auxiliaries, + int semi_continuous_original_num_variables, f_t objective, std::vector& assignment, f_t bound) { + if (strip_semi_continuous_auxiliaries) { + detail::strip_semi_continuous_auxiliaries_from_assignment( + assignment, semi_continuous_original_num_variables); + } std::vector obj_vec = {objective}; std::vector bound_vec = {bound}; for (auto callback : mip_callbacks) { @@ -80,19 +92,26 @@ static void invoke_solution_callbacks( } template -mip_solution_t run_mip(detail::problem_t& problem, - mip_solver_settings_t const& settings, - timer_t& timer, - f_t& initial_upper_bound, - std::vector& initial_incumbent_assignment) +mip_solution_t run_mip_solver(detail::problem_t& problem, + mip_solver_settings_t const& settings, + timer_t& timer, + f_t& initial_upper_bound, + std::vector& initial_incumbent_assignment) { try { raft::common::nvtx::range fun_scope("run_mip"); if (settings.get_mip_callbacks().size() > 0) { auto callback_num_variables = problem.original_problem_ptr->get_n_variables(); + const bool has_semi_continuous_callback_translation = + detail::mip_solver_settings_accessor::has_semi_continuous_callback_translation( + settings); if (problem.has_papilo_presolve_data()) { callback_num_variables = problem.get_papilo_original_num_variables(); } + if (has_semi_continuous_callback_translation) { + callback_num_variables = detail::mip_solver_settings_accessor:: + get_semi_continuous_original_num_variables(settings); + } for (auto callback : settings.get_mip_callbacks()) { callback->template setup(callback_num_variables); } @@ -132,6 +151,13 @@ mip_solution_t run_mip(detail::problem_t& problem, temp_sol.assignment.size(), temp_sol.handle_ptr->get_stream()); solution.handle_ptr->sync_stream(); + if (detail::mip_solver_settings_accessor:: + has_semi_continuous_callback_translation(settings)) { + detail::strip_semi_continuous_auxiliaries_from_assignment( + user_assignment_vec, + detail::mip_solver_settings_accessor:: + 
get_semi_continuous_original_num_variables(settings)); + } get_sol_callback->get_solution(user_assignment_vec.data(), user_objective_vec.data(), user_bound_vec.data(), @@ -185,26 +211,39 @@ mip_solution_t run_mip(detail::problem_t& problem, auto* presolver_ptr = problem.presolve_data.papilo_presolve_ptr; auto mip_callbacks = settings.get_mip_callbacks(); f_t no_bound = problem.presolve_data.objective_scaling_factor >= 0 ? (f_t)-1e20 : (f_t)1e20; - auto incumbent_callback = [presolver_ptr, - mip_callbacks, - no_bound, - ctx_ptr = &solver.context, - early_fj_start](f_t solver_obj, - f_t user_obj, - const std::vector& assignment, - const char* heuristic_name) { - std::vector user_assignment; - presolver_ptr->uncrush_primal_solution(assignment, user_assignment); - ctx_ptr->initial_incumbent_assignment = user_assignment; - ctx_ptr->initial_upper_bound = user_obj; - double elapsed = - std::chrono::duration(std::chrono::steady_clock::now() - early_fj_start).count(); - CUOPT_LOG_INFO("New solution from early primal heuristics (%s). Objective %+.6e. 
Time %.2f", - heuristic_name, - user_obj, - elapsed); - invoke_solution_callbacks(mip_callbacks, user_obj, user_assignment, no_bound); - }; + auto incumbent_callback = + [presolver_ptr, + mip_callbacks, + no_bound, + has_semi_continuous_callback_translation = + detail::mip_solver_settings_accessor::has_semi_continuous_callback_translation( + settings), + semi_continuous_original_num_variables = detail::mip_solver_settings_accessor:: + get_semi_continuous_original_num_variables(settings), + ctx_ptr = &solver.context, + early_fj_start](f_t solver_obj, + f_t user_obj, + const std::vector& assignment, + const char* heuristic_name) { + std::vector user_assignment; + presolver_ptr->uncrush_primal_solution(assignment, user_assignment); + ctx_ptr->initial_incumbent_assignment = user_assignment; + ctx_ptr->initial_upper_bound = user_obj; + double elapsed = + std::chrono::duration(std::chrono::steady_clock::now() - early_fj_start) + .count(); + CUOPT_LOG_INFO( + "New solution from early primal heuristics (%s). Objective %+.6e. Time %.2f", + heuristic_name, + user_obj, + elapsed); + invoke_solution_callbacks(mip_callbacks, + has_semi_continuous_callback_translation, + semi_continuous_original_num_variables, + user_obj, + user_assignment, + no_bound); + }; early_cpufj = std::make_unique>( *problem.original_problem_ptr, settings.get_tolerances(), incumbent_callback); // Convert initial_upper_bound from user-space to the CPUFJ's solver-space (papilo-presolved). 
@@ -248,8 +287,8 @@ mip_solution_t run_mip(detail::problem_t& problem, } template -mip_solution_t solve_mip(optimization_problem_t& op_problem, - mip_solver_settings_t const& settings_const) +mip_solution_t solve_mip_helper(optimization_problem_t& op_problem, + mip_solver_settings_t const& settings_const) { try { mip_solver_settings_t settings(settings_const); @@ -279,8 +318,8 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, if (settings.seed >= 0) { cuopt::seed_generator::set_seed(settings.seed); } raft::common::nvtx::range fun_scope("Running solver"); + auto timer = timer_t(time_limit); - // This is required as user might forget to set some fields problem_checking_t::check_problem_representation(op_problem); problem_checking_t::check_initial_solution_representation(op_problem, settings); @@ -290,6 +329,29 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, op_problem.get_n_variables(), op_problem.get_n_integers(), op_problem.get_nnz()); + + // Reformulate semi-continuous variables (x = 0 OR L <= x <= U) before Papilo presolve. + // Uses deterministic CPU bounds strengthening to derive tight upper bounds for SC vars with + // infinite UB. + // Track n_orig so that auxiliary binary variables added by reformulation can be stripped + // from the solution before returning it to the caller. 
+ const i_t n_orig_before_sc = op_problem.get_n_variables(); + const auto original_variable_names = op_problem.get_variable_names(); + std::vector sc_used_fallback_big_m; + std::vector semi_continuous_binary_to_original_indices; + const bool has_semi_continuous = detail::reformulate_semi_continuous( + op_problem, settings, &sc_used_fallback_big_m, &semi_continuous_binary_to_original_indices); + if (has_semi_continuous && !settings.initial_solutions.empty()) { + detail::expand_initial_solutions_for_semi_continuous( + settings, + semi_continuous_binary_to_original_indices, + op_problem.get_handle_ptr()->get_stream()); + } + if (has_semi_continuous) { + detail::mip_solver_settings_accessor::set_semi_continuous_callback_translation( + settings, n_orig_before_sc, semi_continuous_binary_to_original_indices); + } + op_problem.print_scaling_information(); // Check for crossing bounds. Return infeasible if there are any @@ -300,10 +362,15 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, } for (auto callback : settings.get_mip_callbacks()) { - callback->template setup(op_problem.get_n_variables()); + auto callback_num_variables = op_problem.get_n_variables(); + if (detail::mip_solver_settings_accessor::has_semi_continuous_callback_translation( + settings)) { + callback_num_variables = detail::mip_solver_settings_accessor:: + get_semi_continuous_original_num_variables(settings); + } + callback->template setup(callback_num_variables); } - auto timer = timer_t(time_limit); if (settings.mip_scaling != CUOPT_MIP_SCALING_OFF) { detail::mip_scaling_strategy_t scaling(op_problem); scaling.scale_problem(settings.mip_scaling != CUOPT_MIP_SCALING_NO_OBJECTIVE); @@ -325,7 +392,7 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, } } if (run_presolve && has_set_solution_callback) { - CUOPT_LOG_WARN("Presolve is disabled because set_solution callbacks are provided."); + CUOPT_LOG_INFO("Presolve is disabled because set_solution callbacks are provided."); 
run_presolve = false; } @@ -334,8 +401,6 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, // Start early FJ (CPU and GPU) during presolve to find incumbents ASAP // Only run if presolve is enabled (gives FJ time to find solutions) // and we're not in deterministic mode - std::unique_ptr> early_cpufj; - std::unique_ptr> early_gpufj; // Track best incumbent found during presolve (shared across CPU and GPU FJ). // early_best_objective is in the original problem's solver-space (always minimization), @@ -347,35 +412,51 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, std::vector early_best_user_assignment; std::mutex early_callback_mutex; + std::unique_ptr> early_cpufj; + std::unique_ptr> early_gpufj; + bool run_early_fj = run_presolve && settings.determinism_mode != CUOPT_MODE_DETERMINISTIC && op_problem.get_n_integers() > 0 && op_problem.get_n_constraints() > 0; f_t no_bound = problem.presolve_data.objective_scaling_factor >= 0 ? (f_t)-1e20 : (f_t)1e20; if (run_early_fj) { - auto early_fj_start = std::chrono::steady_clock::now(); - auto early_fj_callback = [&early_best_objective, - &early_best_user_obj, - &early_best_user_assignment, - &early_callback_mutex, - &early_fj_start, - mip_callbacks = settings.get_mip_callbacks(), - no_bound](f_t solver_obj, - f_t user_obj, - const std::vector& assignment, - const char* heuristic_name) { - std::lock_guard lock(early_callback_mutex); - if (solver_obj >= early_best_objective.load()) { return; } - early_best_objective.store(solver_obj); - early_best_user_obj = user_obj; - early_best_user_assignment = assignment; - double elapsed = - std::chrono::duration(std::chrono::steady_clock::now() - early_fj_start).count(); - CUOPT_LOG_INFO("New solution from early primal heuristics (%s). Objective %+.6e. 
Time %.2f", - heuristic_name, - user_obj, - elapsed); - auto user_assignment = assignment; - invoke_solution_callbacks(mip_callbacks, user_obj, user_assignment, no_bound); - }; + auto early_fj_start = std::chrono::steady_clock::now(); + auto early_fj_callback = + [&early_best_objective, + &early_best_user_obj, + &early_best_user_assignment, + &early_callback_mutex, + early_fj_start, + mip_callbacks = settings.get_mip_callbacks(), + has_semi_continuous_callback_translation = + detail::mip_solver_settings_accessor::has_semi_continuous_callback_translation( + settings), + semi_continuous_original_num_variables = detail::mip_solver_settings_accessor:: + get_semi_continuous_original_num_variables(settings), + no_bound](f_t solver_obj, + f_t user_obj, + const std::vector& assignment, + const char* heuristic_name) { + std::lock_guard lock(early_callback_mutex); + if (solver_obj >= early_best_objective.load()) { return; } + early_best_objective.store(solver_obj); + early_best_user_obj = user_obj; + early_best_user_assignment = assignment; + double elapsed = + std::chrono::duration(std::chrono::steady_clock::now() - early_fj_start) + .count(); + CUOPT_LOG_INFO( + "New solution from early primal heuristics (%s). Objective %+.6e. Time %.2f", + heuristic_name, + user_obj, + elapsed); + auto user_assignment = assignment; + invoke_solution_callbacks(mip_callbacks, + has_semi_continuous_callback_translation, + semi_continuous_original_num_variables, + user_obj, + user_assignment, + no_bound); + }; // Start early CPUFJ on original problem (will restart on presolved problem after Papilo) early_cpufj = std::make_unique>( @@ -469,10 +550,10 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, CUOPT_LOG_INFO("Writing presolved problem to file: %s", settings.presolve_file.c_str()); presolve_result_opt->reduced_problem.write_to_mps(settings.presolve_file); } - // early_best_user_obj is in user-space. 
// run_mip stores it in context.initial_upper_bound and converts to target spaces as needed. - auto sol = run_mip(problem, settings, timer, early_best_user_obj, early_best_user_assignment); + auto sol = + run_mip_solver(problem, settings, timer, early_best_user_obj, early_best_user_assignment); const f_t cuopt_presolve_time = sol.get_stats().presolve_time; if (run_presolve) { @@ -544,6 +625,49 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, } } + // Strip auxiliary binary variables that were injected by SC reformulation. + // The caller only knows about the original n_orig_before_sc variables. + if (has_semi_continuous && sol.get_solution().size() > static_cast(n_orig_before_sc)) { + sol.get_solution().resize(n_orig_before_sc, op_problem.get_handle_ptr()->get_stream()); + } + + if (has_semi_continuous && + (sol.get_termination_status() == mip_termination_status_t::FeasibleFound || + sol.get_termination_status() == mip_termination_status_t::Optimal)) { + auto host_solution = + cuopt::host_copy(sol.get_solution(), op_problem.get_handle_ptr()->get_stream()); + const f_t active_tol = settings.tolerances.integrality_tolerance; + i_t num_active_fallback_big_m = 0; + std::string active_fallback_big_m_var_name; + for (i_t i = 0; i < static_cast(sc_used_fallback_big_m.size()); ++i) { + if (!sc_used_fallback_big_m[i]) { continue; } + if (host_solution[i] >= settings.semi_continuous_big_m - active_tol) { + ++num_active_fallback_big_m; + if (active_fallback_big_m_var_name.empty()) { + if (i < static_cast(original_variable_names.size()) && + !original_variable_names[i].empty()) { + active_fallback_big_m_var_name = original_variable_names[i]; + } else { + active_fallback_big_m_var_name = "X" + std::to_string(i); + } + } + } + } + if (num_active_fallback_big_m > 0) { + std::ostringstream error_msg; + error_msg << "Semi-continuous variable " << active_fallback_big_m_var_name + << " is at upper bound coming from big-M " << settings.semi_continuous_big_m + << "; 
results may depend on artificial upper bound"; + if (num_active_fallback_big_m > 1) { + error_msg << " " << (num_active_fallback_big_m - 1) + << " additional semi-continuous variables are also at fallback big-M"; + } + return mip_solution_t{ + cuopt::logic_error(error_msg.str(), cuopt::error_type_t::RuntimeError), + op_problem.get_handle_ptr()->get_stream()}; + } + } + if (sol.get_termination_status() == mip_termination_status_t::FeasibleFound || sol.get_termination_status() == mip_termination_status_t::Optimal) { sol.log_detailed_summary(); @@ -553,6 +677,7 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, CUOPT_LOG_INFO("Writing solution to file %s", settings.sol_file.c_str()); sol.write_to_sol_file(settings.sol_file, op_problem.get_handle_ptr()->get_stream()); } + return sol; } catch (const cuopt::logic_error& e) { CUOPT_LOG_ERROR("Error in solve_mip: %s", e.what()); @@ -567,12 +692,62 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, throw; } } +template +mip_solution_t solve_mip(optimization_problem_t& op_problem, + mip_solver_settings_t const& settings_const) +{ + std::exception_ptr exception; + i_t num_threads = 0; + if (settings_const.num_cpu_threads < 0) { + num_threads = omp_get_max_threads(); + } else { + num_threads = settings_const.num_cpu_threads; + } + + if (num_threads < 2) { + CUOPT_LOG_ERROR("The MIP solver requires at least 2 CPU threads!"); + return mip_solution_t{ + cuopt::logic_error("The number of CPU threads is less than the expected minimum (2).", + cuopt::error_type_t::RuntimeError), + op_problem.get_handle_ptr()->get_stream()}; + } + + mip_solution_t sol(mip_termination_status_t::NoTermination, + solver_stats_t{}, + op_problem.get_handle_ptr()->get_stream()); + + // The outer solver opens an omp parallel region in solve.cu, so this inner team would + // collapse to a single thread under the default OMP_MAX_ACTIVE_LEVELS=1 and only worker 0 + // would execute. 
Enable two active levels locally and restore on the way out. + const int saved_max_active_levels = omp_get_max_active_levels(); + if (saved_max_active_levels < 2) { omp_set_max_active_levels(2); } + + // Creates the OpenMP thread pool. It will be shared across the entire MIP solver. +#pragma omp parallel num_threads(num_threads) default(none) \ + shared(sol, op_problem, settings_const, exception) + { +#pragma omp masked + { + try { + sol = solve_mip_helper(op_problem, settings_const); + } catch (...) { + // We cannot throw inside an OpenMP parallel region. So we need to catch and then + // re-throw later. + exception = std::current_exception(); + } + } + } // Implicit barrier + + if (saved_max_active_levels < 2) { omp_set_max_active_levels(saved_max_active_levels); } + + if (exception) { std::rethrow_exception(exception); } + return sol; +} template -mip_solution_t solve_mip( - raft::handle_t const* handle_ptr, - const cuopt::mps_parser::mps_data_model_t& mps_data_model, - mip_solver_settings_t const& settings) +mip_solution_t solve_mip(raft::handle_t const* handle_ptr, + const mps_parser::mps_data_model_t& mps_data_model, + mip_solver_settings_t const& settings) { auto op_problem = mps_data_model_to_optimization_problem(handle_ptr, mps_data_model); return solve_mip(op_problem, settings); @@ -624,6 +799,7 @@ std::unique_ptr> solve_mip( try { // Check if remote execution is enabled (always uses CPU backend) +#ifdef CUOPT_ENABLE_GRPC if (is_remote_execution_enabled()) { auto* cpu_prob = dynamic_cast*>(problem_interface); cuopt_expects(cpu_prob != nullptr, @@ -631,6 +807,12 @@ std::unique_ptr> solve_mip( "Remote execution requires CPU memory backend"); return solve_mip_remote(*cpu_prob, settings); } +#else + cuopt_expects( + !is_remote_execution_enabled(), + error_type_t::ValidationError, + "Remote execution was requested, but this build was compiled without gRPC support"); +#endif // Local execution - dispatch to appropriate overload based on problem type auto* 
cpu_prob = dynamic_cast*>(problem_interface); diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu index ce6b602fba..540e31800b 100644 --- a/cpp/src/mip_heuristics/solver.cu +++ b/cpp/src/mip_heuristics/solver.cu @@ -181,6 +181,8 @@ void extract_probing_implied_bounds( template solution_t mip_solver_t::run_solver() { + solution_t sol(*context.problem_ptr); + // we need to keep original problem const cuopt_assert(context.problem_ptr != nullptr, "invalid problem pointer"); context.problem_ptr->tolerances = context.settings.get_tolerances(); @@ -191,7 +193,6 @@ solution_t mip_solver_t::run_solver() diversity_manager_t dm(context); if (context.problem_ptr->empty) { CUOPT_LOG_INFO("Problem fully reduced in presolve"); - solution_t sol(*context.problem_ptr); sol.set_problem_fully_reduced(); for (auto callback : context.settings.get_mip_callbacks()) { if (callback->get_type() == internals::base_solution_callback_type::GET_SOLUTION) { @@ -202,6 +203,7 @@ solution_t mip_solver_t::run_solver() context.problem_ptr->post_process_solution(sol); return sol; } + dm.timer = timer_; const bool run_presolve = context.settings.presolver != presolver_t::None; f_t time_limit = context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC @@ -227,14 +229,13 @@ solution_t mip_solver_t::run_solver() if (!presolve_success) { CUOPT_LOG_INFO("Problem proven infeasible in presolve"); - solution_t sol(*context.problem_ptr); sol.set_problem_fully_reduced(); context.problem_ptr->post_process_solution(sol); return sol; } + if (run_presolve && context.problem_ptr->empty) { CUOPT_LOG_INFO("Problem full reduced in presolve"); - solution_t sol(*context.problem_ptr); sol.set_problem_fully_reduced(); for (auto callback : context.settings.get_mip_callbacks()) { if (callback->get_type() == internals::base_solution_callback_type::GET_SOLUTION) { @@ -248,7 +249,6 @@ solution_t mip_solver_t::run_solver() if (timer_.check_time_limit()) { CUOPT_LOG_INFO("Time limit reached after 
presolve"); - solution_t sol(*context.problem_ptr); context.stats.total_solve_time = timer_.elapsed_time(); context.problem_ptr->post_process_solution(sol); return sol; @@ -265,7 +265,6 @@ solution_t mip_solver_t::run_solver() auto opt_sol = solve_lp_with_method(*context.problem_ptr, settings, lp_timer); - solution_t sol(*context.problem_ptr); sol.copy_new_assignment( host_copy(opt_sol.get_primal_solution(), context.problem_ptr->handle_ptr->get_stream())); if (opt_sol.get_termination_status() == pdlp_termination_status_t::Optimal || @@ -284,10 +283,11 @@ solution_t mip_solver_t::run_solver() context.problem_ptr->post_process_solution(sol); return sol; } + context.work_unit_scheduler_.register_context(context.gpu_heur_loop); - namespace dual_simplex = cuopt::linear_programming::dual_simplex; - std::future branch_and_bound_status_future; + namespace dual_simplex = cuopt::linear_programming::dual_simplex; + dual_simplex::mip_status_t branch_and_bound_status = dual_simplex::mip_status_t::UNSET; dual_simplex::user_problem_t branch_and_bound_problem(context.problem_ptr->handle_ptr); context.problem_ptr->recompute_objective_integrality(); if (context.problem_ptr->is_objective_integral()) { @@ -302,8 +302,9 @@ solution_t mip_solver_t::run_solver() dual_simplex::probing_implied_bound_t probing_implied_bound; - bool run_bb = !context.settings.heuristics_only; - if (run_bb) { + i_t num_threads = omp_get_num_threads(); + + if (!context.settings.heuristics_only) { // Convert the presolved problem to dual_simplex::user_problem_t op_problem_.get_host_user_problem(branch_and_bound_problem); // Resize the solution now that we know the number of columns/variables @@ -317,6 +318,7 @@ solution_t mip_solver_t::run_solver() // Fill in the settings for branch and bound branch_and_bound_settings.time_limit = timer_.get_time_limit(); branch_and_bound_settings.node_limit = context.settings.node_limit; + branch_and_bound_settings.num_threads = std::max(num_threads - 1, 1); 
branch_and_bound_settings.print_presolve_stats = false; branch_and_bound_settings.absolute_mip_gap_tol = context.settings.tolerances.absolute_mip_gap; branch_and_bound_settings.relative_mip_gap_tol = context.settings.tolerances.relative_mip_gap; @@ -356,21 +358,18 @@ solution_t mip_solver_t::run_solver() ? 2 : context.settings.reduced_cost_strengthening; - if (context.settings.num_cpu_threads < 0) { - branch_and_bound_settings.num_threads = std::max(1, omp_get_max_threads() - 1); - } else { - branch_and_bound_settings.num_threads = std::max(1, context.settings.num_cpu_threads); - } - // Set the branch and bound -> primal heuristics callback branch_and_bound_settings.solution_callback = std::bind(&branch_and_bound_solution_helper_t::solution_callback, &solution_helper, std::placeholders::_1, std::placeholders::_2); - // heuristic_preemption_callback is needed in both modes to properly stop the heuristic thread + + // heuristic_preemption_callback is needed in both modes to properly stop the heuristic + // thread branch_and_bound_settings.heuristic_preemption_callback = std::bind( &branch_and_bound_solution_helper_t::preempt_heuristic_solver, &solution_helper); + if (context.settings.determinism_mode == CUOPT_MODE_OPPORTUNISTIC) { branch_and_bound_settings.set_simplex_solution_callback = std::bind(&branch_and_bound_solution_helper_t::set_simplex_solution, @@ -444,33 +443,34 @@ solution_t mip_solver_t::run_solver() if (timer_.check_time_limit()) { CUOPT_LOG_INFO("Time limit reached during B&B setup"); - solution_t sol(*context.problem_ptr); context.stats.total_solve_time = timer_.elapsed_time(); context.problem_ptr->post_process_solution(sol); return sol; } - - // Fork a thread for branch and bound - // std::async and std::future allow us to get the return value of bb::solve() - // without having to manually manage the thread - // std::future.get() performs a join() operation to wait until the return status is available - branch_and_bound_status_future = 
std::async(std::launch::async, - &dual_simplex::branch_and_bound_t::solve, - branch_and_bound.get(), - std::ref(branch_and_bound_solution)); } - // Start the primal heuristics - context.diversity_manager_ptr = &dm; - auto sol = dm.run_solver(); - if (run_bb) { - // Wait for the branch and bound to finish - auto bb_status = branch_and_bound_status_future.get(); +#pragma omp taskgroup + { + if (!context.settings.heuristics_only) { +#pragma omp task default(shared) + { + branch_and_bound_status = branch_and_bound->solve(branch_and_bound_solution); + } + } + + // Start the primal heuristics + context.diversity_manager_ptr = &dm; + sol = dm.run_solver(); + } // implicit barrier for all tasks created in B&B and heuristics + + if (!context.settings.heuristics_only) { if (branch_and_bound_solution.lower_bound > -std::numeric_limits::infinity()) { context.stats.set_solution_bound( context.problem_ptr->get_user_obj_from_solver_obj(branch_and_bound_solution.lower_bound)); } - if (bb_status == dual_simplex::mip_status_t::INFEASIBLE) { sol.set_problem_fully_reduced(); } + if (branch_and_bound_status == dual_simplex::mip_status_t::INFEASIBLE) { + sol.set_problem_fully_reduced(); + } context.stats.num_nodes = branch_and_bound_solution.nodes_explored; context.stats.num_simplex_iterations = branch_and_bound_solution.simplex_iterations; } diff --git a/cpp/src/mip_heuristics/utilities/cpu_worker_thread.cuh b/cpp/src/mip_heuristics/utilities/cpu_worker_thread.cuh deleted file mode 100644 index 2b982e1f47..0000000000 --- a/cpp/src/mip_heuristics/utilities/cpu_worker_thread.cuh +++ /dev/null @@ -1,147 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights - * reserved. SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace cuopt::linear_programming::detail { - -template -class cpu_worker_thread_base_t { - public: - cpu_worker_thread_base_t(); - ~cpu_worker_thread_base_t(); - - void start_cpu_solver(); - bool wait_for_cpu_solver(); - - // Derived classes MUST call this in their destructor before the base destructor runs. - // This ensures on_terminate() is called while the derived object is still fully alive. - void request_termination(); - - // Internal method for thread management - safe to call during destruction - void join_worker(); - void cpu_worker_thread(); - - std::thread cpu_worker; - std::mutex cpu_mutex; - std::condition_variable cpu_cv; - std::atomic should_stop{false}; - std::atomic cpu_thread_should_start{false}; - std::atomic cpu_thread_done{true}; - std::atomic cpu_thread_terminate{false}; -}; - -template -cpu_worker_thread_base_t::cpu_worker_thread_base_t() -{ - cpu_worker = std::thread(&cpu_worker_thread_base_t::cpu_worker_thread, this); -} - -template -cpu_worker_thread_base_t::~cpu_worker_thread_base_t() -{ - // Note: We don't call on_terminate() here since the derived object is already destroyed. 
- join_worker(); -} - -template -void cpu_worker_thread_base_t::cpu_worker_thread() -{ - while (!cpu_thread_terminate) { - { - std::unique_lock lock(cpu_mutex); - cpu_cv.wait(lock, [this] { return cpu_thread_should_start || cpu_thread_terminate; }); - - if (cpu_thread_terminate) break; - - cpu_thread_done = false; - cpu_thread_should_start = false; - } - - static_cast(this)->run_worker(); - - { - std::lock_guard lock(cpu_mutex); - cpu_thread_done = true; - } - cpu_cv.notify_all(); - } -} - -template -void cpu_worker_thread_base_t::request_termination() -{ - bool should_terminate = false; - { - std::lock_guard lock(cpu_mutex); - if (cpu_thread_terminate) return; - cpu_thread_terminate = true; - should_terminate = true; - static_cast(this)->on_terminate(); - } - - if (should_terminate) { - cpu_cv.notify_one(); - join_worker(); - } -} - -template -void cpu_worker_thread_base_t::join_worker() -{ - { - std::lock_guard lock(cpu_mutex); - if (!cpu_thread_terminate) { cpu_thread_terminate = true; } - } - cpu_cv.notify_one(); - - if (cpu_worker.joinable()) { cpu_worker.join(); } -} - -template -void cpu_worker_thread_base_t::start_cpu_solver() -{ - { - std::lock_guard lock(cpu_mutex); - cpu_thread_done = false; - cpu_thread_should_start = true; - static_cast(this)->on_start(); - } - cpu_cv.notify_one(); -} - -template -bool cpu_worker_thread_base_t::wait_for_cpu_solver() -{ - auto wait_start = std::chrono::high_resolution_clock::now(); - std::unique_lock lock(cpu_mutex); - cpu_cv.wait(lock, [this] { return cpu_thread_done || cpu_thread_terminate; }); - auto wait_end = std::chrono::high_resolution_clock::now(); - double wait_time = std::chrono::duration(wait_end - wait_start).count(); - if (wait_time > 1.0) { CUOPT_LOG_DEBUG("CPU thread wait time: %.2f seconds", wait_time); } - - return static_cast(this)->get_result(); -} - -} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip_heuristics/utilities/sort_csr.cuh 
b/cpp/src/mip_heuristics/utilities/sort_csr.cuh index b7c5634cdf..92e560dbb9 100644 --- a/cpp/src/mip_heuristics/utilities/sort_csr.cuh +++ b/cpp/src/mip_heuristics/utilities/sort_csr.cuh @@ -50,6 +50,7 @@ void sort_csr(optimization_problem_t& op_problem) op_problem.get_constraint_matrix_offsets().data() + 1, stream_view); RAFT_CHECK_CUDA(stream_view); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view)); } } // namespace linear_programming::detail diff --git a/cpp/src/pdlp/cpu_optimization_problem.cpp b/cpp/src/pdlp/cpu_optimization_problem.cpp index 406b0b6541..de1f74ed47 100644 --- a/cpp/src/pdlp/cpu_optimization_problem.cpp +++ b/cpp/src/pdlp/cpu_optimization_problem.cpp @@ -133,6 +133,14 @@ void cpu_optimization_problem_t::set_quadratic_objective_matrix( std::copy(Q_offsets, Q_offsets + size_offsets, Q_offsets_.begin()); } +template +void cpu_optimization_problem_t::set_quadratic_constraints( + std::vector::quadratic_constraint_t> + constraints) +{ + quadratic_constraints_ = std::move(constraints); +} + template void cpu_optimization_problem_t::set_variable_lower_bounds( const f_t* variable_lower_bounds, i_t size) @@ -494,6 +502,19 @@ bool cpu_optimization_problem_t::has_quadratic_objective() const return !Q_values_.empty(); } +template +const std::vector::quadratic_constraint_t>& +cpu_optimization_problem_t::get_quadratic_constraints() const +{ + return quadratic_constraints_; +} + +template +bool cpu_optimization_problem_t::has_quadratic_constraints() const +{ + return !quadratic_constraints_.empty(); +} + // ============================================================================== // Host Getters (return references to CPU memory) // ============================================================================== @@ -621,6 +642,12 @@ cpu_optimization_problem_t::to_optimization_problem(raft::handle_t con Q_offsets_.size()); } + if (!quadratic_constraints_.empty()) { + gpu_problem->set_quadratic_constraints( + std::vector::quadratic_constraint_t>( + 
quadratic_constraints_)); + } + // Set variable bounds if (!variable_lower_bounds_.empty()) { gpu_problem->set_variable_lower_bounds(variable_lower_bounds_.data(), @@ -740,6 +767,10 @@ void cpu_optimization_problem_t::write_to_mps(const std::string& mps_f false); } + if (!quadratic_constraints_.empty()) { + data_model_view.set_quadratic_constraints(quadratic_constraints_); + } + cuopt::mps_parser::write_mps(data_model_view, mps_file_path); } diff --git a/cpp/src/pdlp/cusparse_view.cu b/cpp/src/pdlp/cusparse_view.cu index 64ec44f5ef..2f541bd61a 100644 --- a/cpp/src/pdlp/cusparse_view.cu +++ b/cpp/src/pdlp/cusparse_view.cu @@ -153,6 +153,92 @@ cusparse_dn_mat_descr_wrapper_t::operator cusparseDnMatDescr_t() const return descr_; } +#if CUDA_VER_13_2_UP +cusparse_spmvop_descr_wrapper_t::cusparse_spmvop_descr_wrapper_t() + : descr_(nullptr), need_destruction_(false) +{ +} + +cusparse_spmvop_descr_wrapper_t::~cusparse_spmvop_descr_wrapper_t() +{ + if (need_destruction_) { RAFT_CUSPARSE_TRY_NO_THROW(cusparseSpMVOp_destroyDescr(descr_)); } +} + +cusparse_spmvop_descr_wrapper_t::cusparse_spmvop_descr_wrapper_t( + const cusparse_spmvop_descr_wrapper_t& other) + : descr_(other.descr_), need_destruction_(false) +{ +} + +cusparse_spmvop_descr_wrapper_t& cusparse_spmvop_descr_wrapper_t::operator=( + cusparse_spmvop_descr_wrapper_t&& other) +{ + if (need_destruction_) { RAFT_CUSPARSE_TRY(cusparseSpMVOp_destroyDescr(descr_)); } + descr_ = other.descr_; + need_destruction_ = other.need_destruction_; + other.need_destruction_ = false; + return *this; +} + + +void cusparse_spmvop_descr_wrapper_t::create(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseSpMatDescr_t matA, + cusparseDnVecDescr_t vecX, + cusparseDnVecDescr_t vecY, + cusparseDnVecDescr_t vecZ, + cudaDataType computeType, + void* buffer) +{ + if (need_destruction_) { RAFT_CUSPARSE_TRY(cusparseSpMVOp_destroyDescr(descr_)); } + RAFT_CUSPARSE_TRY(cusparseSpMVOp_createDescr( + handle, &descr_, opA, matA, vecX, 
vecY, vecZ, computeType, buffer)); + need_destruction_ = true; +} + +cusparse_spmvop_descr_wrapper_t::operator cusparseSpMVOpDescr_t() const { return descr_; } + +cusparse_spmvop_plan_wrapper_t::cusparse_spmvop_plan_wrapper_t() + : plan_(nullptr), need_destruction_(false) +{ +} + +cusparse_spmvop_plan_wrapper_t::~cusparse_spmvop_plan_wrapper_t() +{ + if (need_destruction_) { RAFT_CUSPARSE_TRY_NO_THROW(cusparseSpMVOp_destroyPlan(plan_)); } +} + +cusparse_spmvop_plan_wrapper_t::cusparse_spmvop_plan_wrapper_t( + const cusparse_spmvop_plan_wrapper_t& other) + : plan_(other.plan_), need_destruction_(false) +{ +} + +cusparse_spmvop_plan_wrapper_t& cusparse_spmvop_plan_wrapper_t::operator=( + cusparse_spmvop_plan_wrapper_t&& other) +{ + if (need_destruction_) { RAFT_CUSPARSE_TRY(cusparseSpMVOp_destroyPlan(plan_)); } + plan_ = other.plan_; + need_destruction_ = other.need_destruction_; + other.need_destruction_ = false; + return *this; +} + +void cusparse_spmvop_plan_wrapper_t::create(cusparseHandle_t handle, + cusparseSpMVOpDescr_t descr, + char* lto_buffer, + size_t lto_buffer_size) +{ + if (need_destruction_) { RAFT_CUSPARSE_TRY(cusparseSpMVOp_destroyPlan(plan_)); } + RAFT_CUSPARSE_TRY( + cusparseSpMVOp_createPlan(handle, descr, &plan_, lto_buffer, lto_buffer_size)); + need_destruction_ = true; +} + +cusparse_spmvop_plan_wrapper_t::operator cusparseSpMVOpPlan_t() const { return plan_; } + +#endif + #if CUDA_VER_12_4_UP struct dynamic_load_runtime { static void* get_cusparse_runtime_handle() @@ -304,6 +390,8 @@ cusparse_view_t::cusparse_view_t( A_T_indices_{op_problem_scaled.reverse_constraints}, buffer_non_transpose{0, handle_ptr->get_stream()}, buffer_transpose{0, handle_ptr->get_stream()}, + buffer_non_transpose_spmvop{0, handle_ptr->get_stream()}, + buffer_transpose_spmvop{0, handle_ptr->get_stream()}, buffer_transpose_batch{0, handle_ptr->get_stream()}, buffer_non_transpose_batch{0, handle_ptr->get_stream()}, buffer_transpose_batch_row_row_{0, 
handle_ptr->get_stream()}, @@ -407,8 +495,9 @@ cusparse_view_t::cusparse_view_t( _tmp_primal.data(), CUSPARSE_ORDER_COL); - primal_gradient.create(op_problem_scaled.n_variables, - current_saddle_point_state.get_primal_gradient().data()); + primal_gradient.create( + current_saddle_point_state.get_primal_gradient().size(), // It is 0 in cupdlpx + current_saddle_point_state.get_primal_gradient().data()); dual_gradient.create(op_problem_scaled.n_constraints, current_saddle_point_state.get_dual_gradient().data()); @@ -716,6 +805,8 @@ cusparse_view_t::cusparse_view_t( A_T_indices_{_A_T_indices}, buffer_non_transpose{0, handle_ptr->get_stream()}, buffer_transpose{0, handle_ptr->get_stream()}, + buffer_non_transpose_spmvop{0, handle_ptr->get_stream()}, + buffer_transpose_spmvop{0, handle_ptr->get_stream()}, buffer_transpose_batch{0, handle_ptr->get_stream()}, buffer_non_transpose_batch{0, handle_ptr->get_stream()}, buffer_transpose_batch_row_row_{0, handle_ptr->get_stream()}, @@ -925,6 +1016,8 @@ cusparse_view_t::cusparse_view_t( tmp_dual(existing_cusparse_view.tmp_dual), buffer_non_transpose{0, handle_ptr->get_stream()}, buffer_transpose{0, handle_ptr->get_stream()}, + buffer_non_transpose_spmvop{0, handle_ptr->get_stream()}, + buffer_transpose_spmvop{0, handle_ptr->get_stream()}, buffer_transpose_batch{0, handle_ptr->get_stream()}, buffer_non_transpose_batch{0, handle_ptr->get_stream()}, buffer_transpose_batch_row_row_{0, handle_ptr->get_stream()}, @@ -1030,6 +1123,7 @@ cusparse_view_t::cusparse_view_t( #endif } + // Empty constructor used in kkt restart to save memory template cusparse_view_t::cusparse_view_t( @@ -1040,6 +1134,8 @@ cusparse_view_t::cusparse_view_t( : handle_ptr_(handle_ptr), buffer_non_transpose{0, handle_ptr->get_stream()}, buffer_transpose{0, handle_ptr->get_stream()}, + buffer_non_transpose_spmvop{0, handle_ptr->get_stream()}, + buffer_transpose_spmvop{0, handle_ptr->get_stream()}, buffer_transpose_batch{0, handle_ptr->get_stream()}, 
buffer_non_transpose_batch{0, handle_ptr->get_stream()}, buffer_transpose_batch_row_row_{0, handle_ptr->get_stream()}, @@ -1082,6 +1178,39 @@ void cusparse_view_t::update_mixed_precision_matrices() } } +// Redirects the cuSPARSE CSR structure pointers from op_problem_scaled_ to the original problem +// so the duplicated row/column buffers can be freed. +template +void cusparse_view_t::redirect_cusparse_csr_structure_pointers( + const problem_t& original_problem) +{ + RAFT_CUSPARSE_TRY(cusparseCsrSetPointers(A, + const_cast(original_problem.offsets.data()), + const_cast(original_problem.variables.data()), + const_cast(A_.data()))); + + RAFT_CUSPARSE_TRY( + cusparseCsrSetPointers(A_T, + const_cast(original_problem.reverse_offsets.data()), + const_cast(original_problem.reverse_constraints.data()), + const_cast(A_T_.data()))); + + if constexpr (std::is_same_v) { + if (mixed_precision_enabled_) { + RAFT_CUSPARSE_TRY(cusparseCsrSetPointers(A_mixed_, + const_cast(original_problem.offsets.data()), + const_cast(original_problem.variables.data()), + A_float_.data())); + + RAFT_CUSPARSE_TRY( + cusparseCsrSetPointers(A_T_mixed_, + const_cast(original_problem.reverse_offsets.data()), + const_cast(original_problem.reverse_constraints.data()), + A_T_float_.data())); + } + } +} + // Mixed precision SpMV implementation: FP32 matrix with FP64 vectors and FP64 compute type size_t mixed_precision_spmv_buffersize(cusparseHandle_t handle, cusparseOperation_t opA, @@ -1148,6 +1277,66 @@ bool is_cusparse_runtime_mixed_precision_supported() return (major > 12) || (major == 12 && minor >= 5); } +// Creates SpMVOp plans. Must be called after scale_problem() so plans use the scaled matrix. 
+template +void cusparse_view_t::create_spmv_op_plans(bool is_reflected) +{ +#if CUDA_VER_13_2_UP + CUSPARSE_CHECK(cusparseSetStream(handle_ptr_->get_cusparse_handle(), handle_ptr_->get_stream())); + // Prepare buffers for At_y SpMVOp + size_t buffer_size_transpose = 0; + RAFT_CUSPARSE_TRY(cusparseSpMVOp_bufferSize(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + A_T, + dual_solution, + current_AtY, + current_AtY, + CUDA_R_64F, + &buffer_size_transpose)); + buffer_transpose_spmvop.resize(buffer_size_transpose, handle_ptr_->get_stream()); + + spmv_op_descr_A_t_.create(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + A_T, + dual_solution, + current_AtY, + current_AtY, + CUDA_R_64F, + buffer_transpose_spmvop.data()); + + char* lto_buffer = NULL; + size_t lto_buffer_size = 0; + spmv_op_plan_A_t_.create( + handle_ptr_->get_cusparse_handle(), spmv_op_descr_A_t_, lto_buffer, lto_buffer_size); + + // Only prepare buffers for A_x if we are using reflected_halpern + if (is_reflected) { + size_t buffer_size_non_transpose = 0; + RAFT_CUSPARSE_TRY(cusparseSpMVOp_bufferSize(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + A, + reflected_primal_solution, + dual_gradient, + dual_gradient, + CUDA_R_64F, + &buffer_size_non_transpose)); + buffer_non_transpose_spmvop.resize(buffer_size_non_transpose, handle_ptr_->get_stream()); + + spmv_op_descr_A_.create(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + A, + reflected_primal_solution, + dual_gradient, + dual_gradient, + CUDA_R_64F, + buffer_non_transpose_spmvop.data()); + + spmv_op_plan_A_.create( + handle_ptr_->get_cusparse_handle(), spmv_op_descr_A_, lto_buffer, lto_buffer_size); + } +#endif +} + #if MIP_INSTANTIATE_FLOAT || PDLP_INSTANTIATE_FLOAT template class cusparse_sp_mat_descr_wrapper_t; template class cusparse_dn_vec_descr_wrapper_t; diff --git a/cpp/src/pdlp/cusparse_view.hpp b/cpp/src/pdlp/cusparse_view.hpp index 
416a0b1e5f..ba14a1b555 100644 --- a/cpp/src/pdlp/cusparse_view.hpp +++ b/cpp/src/pdlp/cusparse_view.hpp @@ -20,6 +20,8 @@ #include +#define CUDA_VER_13_2_UP (CUDART_VERSION >= 13020) + namespace cuopt::linear_programming::detail { template @@ -79,6 +81,54 @@ class cusparse_dn_mat_descr_wrapper_t { bool need_destruction_; }; +#if CUDA_VER_13_2_UP +class cusparse_spmvop_descr_wrapper_t { + public: + cusparse_spmvop_descr_wrapper_t(); + ~cusparse_spmvop_descr_wrapper_t(); + + cusparse_spmvop_descr_wrapper_t(const cusparse_spmvop_descr_wrapper_t& other); + cusparse_spmvop_descr_wrapper_t& operator=(cusparse_spmvop_descr_wrapper_t&& other); + cusparse_spmvop_descr_wrapper_t& operator=(const cusparse_spmvop_descr_wrapper_t& other) = delete; + + void create(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseSpMatDescr_t matA, + cusparseDnVecDescr_t vecX, + cusparseDnVecDescr_t vecY, + cusparseDnVecDescr_t vecZ, + cudaDataType computeType, + void* buffer); + + operator cusparseSpMVOpDescr_t() const; + + private: + cusparseSpMVOpDescr_t descr_; + bool need_destruction_; +}; + +class cusparse_spmvop_plan_wrapper_t { + public: + cusparse_spmvop_plan_wrapper_t(); + ~cusparse_spmvop_plan_wrapper_t(); + + cusparse_spmvop_plan_wrapper_t(const cusparse_spmvop_plan_wrapper_t& other); + cusparse_spmvop_plan_wrapper_t& operator=(cusparse_spmvop_plan_wrapper_t&& other); + cusparse_spmvop_plan_wrapper_t& operator=(const cusparse_spmvop_plan_wrapper_t& other) = delete; + + void create(cusparseHandle_t handle, + cusparseSpMVOpDescr_t descr, + char* lto_buffer, + size_t lto_buffer_size); + + operator cusparseSpMVOpPlan_t() const; + + private: + cusparseSpMVOpPlan_t plan_; + bool need_destruction_; +}; +#endif + template class cusparse_view_t { public: @@ -172,6 +222,17 @@ class cusparse_view_t { rmm::device_uvector buffer_non_transpose; rmm::device_uvector buffer_transpose; + // SpMVOp buffers for A and A_T + rmm::device_uvector buffer_non_transpose_spmvop{0, 
handle_ptr_->get_stream()}; + rmm::device_uvector buffer_transpose_spmvop{0, handle_ptr_->get_stream()}; + +#if CUDA_VER_13_2_UP + // SpMVOp descriptors and plans for A and A_T (descr before plan so dtor destroys plan first) + cusparse_spmvop_descr_wrapper_t spmv_op_descr_A_; + cusparse_spmvop_plan_wrapper_t spmv_op_plan_A_; + cusparse_spmvop_descr_wrapper_t spmv_op_descr_A_t_; + cusparse_spmvop_plan_wrapper_t spmv_op_plan_A_t_; +#endif // reuse buffers for cusparse spmm rmm::device_uvector buffer_transpose_batch; rmm::device_uvector buffer_non_transpose_batch; @@ -208,6 +269,12 @@ class cusparse_view_t { // Update FP32 matrix copies after scaling (must be called after scale_problem()) void update_mixed_precision_matrices(); + + // Redirects the cuSPARSE CSR structure pointers from op_problem_scaled_ to the original problem + // so the duplicated row/column buffers can be freed. + void redirect_cusparse_csr_structure_pointers(const problem_t& original_problem); + // Creates SpMVOp plans. Must be called after scale_problem() so plans use the scaled matrix. 
+ void create_spmv_op_plans(bool is_reflected); }; // Mixed precision SpMV: FP32 matrix with FP64 vectors and FP64 compute type diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu index b618550f6e..c79249c45d 100644 --- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu +++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -22,10 +23,50 @@ #include #include +#include +#include +#include +#include #include namespace cuopt::linear_programming::detail { +template +struct weighted_square_op { + f_t weight; + HDI f_t operator()(f_t v) { return v * v * weight; } +}; + +template +struct rescaling_from_squared_norm_op { + HDI f_t operator()(f_t sum) { return f_t(1.0) / (raft::sqrt(sum) + f_t(1.0)); } +}; + +template +struct inverse_rescaling_op { + HDI f_t operator()(f_t v) + { + cuopt_assert(v != f_t(0), "Numerical error: rescaling should never equal 0"); + return v != f_t(0) ? 
f_t(1.0) / v : v; + } +}; + +template +__global__ void scaling_swap_rescaling_kernel(const swap_pair_t* swap_pairs, + i_t swap_count, + raft::device_span bound_rescaling, + raft::device_span objective_rescaling) +{ + const i_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (idx >= swap_count) { return; } + + const i_t left = swap_pairs[idx].left; + const i_t right = swap_pairs[idx].right; + + cuda::std::swap(bound_rescaling[left], bound_rescaling[right]); + cuda::std::swap(objective_rescaling[left], objective_rescaling[right]); +} + template pdlp_initial_scaling_strategy_t::pdlp_initial_scaling_strategy_t( raft::handle_t const* handle_ptr, @@ -37,6 +78,7 @@ pdlp_initial_scaling_strategy_t::pdlp_initial_scaling_strategy_t( rmm::device_uvector& A_T_indices, pdhg_solver_t* pdhg_solver_ptr, const pdlp_hyper_params::pdlp_hyper_params_t& hyper_params, + i_t original_batch_size, bool running_mip) : handle_ptr_(handle_ptr), stream_view_(handle_ptr_->get_stream()), @@ -51,8 +93,11 @@ pdlp_initial_scaling_strategy_t::pdlp_initial_scaling_strategy_t( running_mip_(running_mip), iteration_constraint_matrix_scaling_{static_cast(dual_size_h_), stream_view_}, iteration_variable_scaling_{static_cast(primal_size_h_), stream_view_}, - bound_rescaling_(f_t(1), stream_view_), - objective_rescaling_(f_t(1), stream_view_), + original_batch_size_(original_batch_size), + bound_rescaling_(static_cast(original_batch_size_), stream_view_), + objective_rescaling_(static_cast(original_batch_size_), stream_view_), + h_bound_rescaling_(static_cast(original_batch_size_), f_t(1)), + h_objective_rescaling_(static_cast(original_batch_size_), f_t(1)), cummulative_constraint_matrix_scaling_{static_cast(dual_size_h_), stream_view_}, cummulative_variable_scaling_{static_cast(primal_size_h_), stream_view_} { @@ -63,6 +108,7 @@ pdlp_initial_scaling_strategy_t::pdlp_initial_scaling_strategy_t( #endif if (!running_mip_) cuopt_assert(pdhg_solver_ptr_ != nullptr, "PDHG solver pointer is null"); 
+ cuopt_assert(original_batch_size_ > 0, "Original batch size must be positive"); // start with all one for scaling vectors RAFT_CUDA_TRY(cudaMemsetAsync( @@ -77,8 +123,17 @@ pdlp_initial_scaling_strategy_t::pdlp_initial_scaling_strategy_t( cummulative_variable_scaling_.begin(), cummulative_variable_scaling_.end(), f_t(1)); + thrust::fill( + handle_ptr_->get_thrust_policy(), bound_rescaling_.begin(), bound_rescaling_.end(), f_t(1)); + thrust::fill(handle_ptr_->get_thrust_policy(), + objective_rescaling_.begin(), + objective_rescaling_.end(), + f_t(1)); compute_scaling_vectors(number_of_ruiz_iterations, alpha); + + iteration_constraint_matrix_scaling_.resize(0, stream_view_); + iteration_variable_scaling_.resize(0, stream_view_); } template @@ -95,57 +150,37 @@ template void pdlp_initial_scaling_strategy_t::bound_objective_rescaling() { // TODO: test bound obj scaling w/ MIP - rmm::device_buffer d_temp_storage; - size_t bytes; - - auto main_op = [] HD(const thrust::tuple t) { - const f_t lower = thrust::get<0>(t); - const f_t upper = thrust::get<1>(t); - f_t sum = 0; - if (isfinite(lower) && (lower != upper)) sum += lower * lower; - if (isfinite(upper)) sum += upper * upper; - return sum; - }; - cub::DeviceReduce::TransformReduce( - nullptr, - bytes, - thrust::make_zip_iterator(op_problem_scaled_.constraint_lower_bounds.data(), - op_problem_scaled_.constraint_upper_bounds.data()), - bound_rescaling_.data(), - op_problem_scaled_.constraint_lower_bounds.size(), - cuda::std::plus<>{}, - main_op, - f_t(0), - stream_view_); + segmented_sum_handler_t segmented_sum_handler(stream_view_); - d_temp_storage.resize(bytes, stream_view_); + // ------- Constraints bounds scaling ------- + // This works whether we have different bounds per climber or not because of the + // problem_wrap_container + const i_t n_constrs = op_problem_scaled_.n_constraints; + const auto n_batches = original_batch_size_; + auto bound_input = thrust::make_transform_iterator( + 
thrust::make_zip_iterator(problem_wrap_container(op_problem_scaled_.constraint_lower_bounds), + problem_wrap_container(op_problem_scaled_.constraint_upper_bounds)), + rhs_sum_of_squares_t{}); + auto bound_output = thrust::make_transform_output_iterator(bound_rescaling_.data(), + rescaling_from_squared_norm_op{}); - cub::DeviceReduce::TransformReduce( - d_temp_storage.data(), - bytes, - thrust::make_zip_iterator(op_problem_scaled_.constraint_lower_bounds.data(), - op_problem_scaled_.constraint_upper_bounds.data()), - bound_rescaling_.data(), - op_problem_scaled_.constraint_lower_bounds.size(), - cuda::std::plus<>{}, - main_op, - f_t(0), - stream_view_); + segmented_sum_handler.segmented_sum_helper(bound_input, bound_output, n_batches, n_constrs); - h_bound_rescaling = f_t(1.0) / (std::sqrt(bound_rescaling_.value(stream_view_)) + f_t(1.0)); - bound_rescaling_.set_value_async(h_bound_rescaling, stream_view_); + h_bound_rescaling_ = cuopt::host_copy(bound_rescaling_, stream_view_); - detail::my_l2_weighted_norm(op_problem_scaled_.objective_coefficients, - hyper_params_.initial_primal_weight_c_scaling, - objective_rescaling_, - stream_view_); + // ------- Objective coefficients scaling ------- - // sqrt already applied - h_objective_rescaling = f_t(1.0) / (objective_rescaling_.value(stream_view_) + f_t(1.0)); - objective_rescaling_.set_value_async(h_objective_rescaling, stream_view_); + const i_t n_variables = op_problem_scaled_.n_variables; + auto objective_input = thrust::make_transform_iterator( + problem_wrap_container(op_problem_scaled_.objective_coefficients), + weighted_square_op{f_t(hyper_params_.initial_primal_weight_c_scaling)}); + auto objective_output = thrust::make_transform_output_iterator( + objective_rescaling_.data(), rescaling_from_squared_norm_op{}); - // Sync since we are using local variable - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + segmented_sum_handler.segmented_sum_helper( + objective_input, objective_output, n_batches, 
n_variables); + + h_objective_rescaling_ = cuopt::host_copy(objective_rescaling_, stream_view_); } template @@ -404,13 +439,78 @@ __global__ void scale_transposed_problem_kernel( template f_t pdlp_initial_scaling_strategy_t::get_h_bound_rescaling() const { - return h_bound_rescaling; + cuopt_assert(!h_bound_rescaling_.empty(), "Bound rescaling vector should not be empty"); + return h_bound_rescaling_[0]; } template f_t pdlp_initial_scaling_strategy_t::get_h_objective_rescaling() const { - return h_objective_rescaling; + cuopt_assert(!h_objective_rescaling_.empty(), "Objective rescaling vector should not be empty"); + return h_objective_rescaling_[0]; +} + +template +const rmm::device_uvector& +pdlp_initial_scaling_strategy_t::get_bound_rescaling_vector() const +{ + return bound_rescaling_; +} + +template +const rmm::device_uvector& +pdlp_initial_scaling_strategy_t::get_objective_rescaling_vector() const +{ + return objective_rescaling_; +} + +template +void pdlp_initial_scaling_strategy_t::swap_context( + const thrust::universal_host_pinned_vector>& swap_pairs) +{ + if (swap_pairs.empty()) { return; } + + const auto batch_size = static_cast(bound_rescaling_.size()); + cuopt_assert(batch_size == static_cast(objective_rescaling_.size()), + "Rescaling vectors must have the same size"); + cuopt_assert(h_bound_rescaling_.size() == static_cast(batch_size), + "Host/device bound rescaling sizes must match"); + cuopt_assert(h_objective_rescaling_.size() == static_cast(batch_size), + "Host/device objective rescaling sizes must match"); + for (const auto& pair : swap_pairs) { + cuopt_assert(pair.left < pair.right, "Left swap index must be less than right swap index"); + cuopt_assert(pair.right < batch_size, "Right swap index is out of bounds"); + } + + const auto [grid_size, block_size] = + kernel_config_from_batch_size(static_cast(swap_pairs.size())); + scaling_swap_rescaling_kernel + <<>>(thrust::raw_pointer_cast(swap_pairs.data()), + static_cast(swap_pairs.size()), + 
make_span(bound_rescaling_), + make_span(objective_rescaling_)); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + for (const auto& pair : swap_pairs) { + std::swap(h_bound_rescaling_[pair.left], h_bound_rescaling_[pair.right]); + std::swap(h_objective_rescaling_[pair.left], h_objective_rescaling_[pair.right]); + } +} + +template +void pdlp_initial_scaling_strategy_t::resize_context(i_t new_size) +{ + [[maybe_unused]] const auto batch_size = static_cast(bound_rescaling_.size()); + cuopt_assert(batch_size == static_cast(objective_rescaling_.size()), + "Rescaling vectors must have the same size"); + cuopt_assert(new_size > 0, "New size must be greater than 0"); + cuopt_assert(new_size < batch_size, "New size must be less than batch size"); + + bound_rescaling_.resize(new_size, stream_view_); + objective_rescaling_.resize(new_size, stream_view_); + h_bound_rescaling_.resize(new_size); + h_objective_rescaling_.resize(new_size); + original_batch_size_ = new_size; } template @@ -471,18 +571,19 @@ void pdlp_initial_scaling_strategy_t::scale_problem() stream_view_); } - // TODO later batch mode: handle different constraints bounds - raft::linalg::eltwiseMultiply( - const_cast&>(op_problem_scaled_.constraint_lower_bounds).data(), + cub::DeviceTransform::Transform( + cuda::std::make_tuple(op_problem_scaled_.constraint_lower_bounds.data(), + problem_wrap_container(cummulative_constraint_matrix_scaling_)), op_problem_scaled_.constraint_lower_bounds.data(), - cummulative_constraint_matrix_scaling_.data(), - dual_size_h_, + op_problem_scaled_.constraint_lower_bounds.size(), + cuda::std::multiplies{}, stream_view_); - raft::linalg::eltwiseMultiply( - const_cast&>(op_problem_scaled_.constraint_upper_bounds).data(), + cub::DeviceTransform::Transform( + cuda::std::make_tuple(op_problem_scaled_.constraint_upper_bounds.data(), + problem_wrap_container(cummulative_constraint_matrix_scaling_)), op_problem_scaled_.constraint_upper_bounds.data(), - cummulative_constraint_matrix_scaling_.data(), 
- dual_size_h_, + op_problem_scaled_.constraint_upper_bounds.size(), + cuda::std::multiplies{}, stream_view_); if (hyper_params_.bound_objective_rescaling && !running_mip_) { @@ -490,55 +591,48 @@ void pdlp_initial_scaling_strategy_t::scale_problem() bound_objective_rescaling(); #ifdef CUPDLP_DEBUG_MODE - printf("Bound rescaling %lf %lf\n", - bound_rescaling_.value(stream_view_), - objective_rescaling_.value(stream_view_)); + print("bound_rescaling", bound_rescaling_); + print("objective_rescaling", objective_rescaling_); #endif cub::DeviceTransform::Transform( cuda::std::make_tuple(op_problem_scaled_.constraint_lower_bounds.data(), - op_problem_scaled_.constraint_upper_bounds.data()), + op_problem_scaled_.constraint_upper_bounds.data(), + batch_wrapped_container(bound_rescaling_, dual_size_h_)), thrust::make_zip_iterator(op_problem_scaled_.constraint_lower_bounds.data(), op_problem_scaled_.constraint_upper_bounds.data()), op_problem_scaled_.constraint_upper_bounds.size(), - [bound_rescaling = bound_rescaling_.data()] __device__( - f_t constraint_lower_bound, f_t constraint_upper_bound) -> thrust::tuple { - return {constraint_lower_bound * *bound_rescaling, - constraint_upper_bound * *bound_rescaling}; + [] __device__(f_t constraint_lower_bound, + f_t constraint_upper_bound, + f_t bound_rescaling) -> thrust::tuple { + return {constraint_lower_bound * bound_rescaling, constraint_upper_bound * bound_rescaling}; }, stream_view_.value()); - cub::DeviceTransform::Transform( - op_problem_scaled_.variable_bounds.data(), - op_problem_scaled_.variable_bounds.data(), - op_problem_scaled_.variable_bounds.size(), - [bound_rescaling = bound_rescaling_.data(), - objective_rescaling = objective_rescaling_.data()] __device__(f_t2 variable_bounds) -> f_t2 { - return {variable_bounds.x * *bound_rescaling, variable_bounds.y * *bound_rescaling}; - }, - stream_view_); - - if (pdhg_solver_ptr_ && pdhg_solver_ptr_->get_new_bounds_idx().size() != 0) { + // In batch mode we don't scale the 
variable bounds (here) because they are shared across + // climbers. While the variable bounds are the same across climbers, there can be different + // bound rescaling factors for each climber. One solution would be to have per climber variable + // bounds but it's costly from a memory perspective and from a memory bandwidth perspective. + // Since the variable bounds are the same across climbers but only the scaling factor changes, + // we pass the scaling factor to PDHG later. In PDHG we take the (almost fully) scaled variable + // bounds and apply this missing scaling factor. + if (original_batch_size_ == 1) { cub::DeviceTransform::Transform( - cuda::std::make_tuple(pdhg_solver_ptr_->get_new_bounds_lower().data(), - pdhg_solver_ptr_->get_new_bounds_upper().data()), - thrust::make_zip_iterator(pdhg_solver_ptr_->get_new_bounds_lower().data(), - pdhg_solver_ptr_->get_new_bounds_upper().data()), - pdhg_solver_ptr_->get_new_bounds_idx().size(), - [bound_rescaling = bound_rescaling_.data()] __device__( - f_t lower, f_t upper) -> thrust::tuple { - return {lower * *bound_rescaling, upper * *bound_rescaling}; + op_problem_scaled_.variable_bounds.data(), + op_problem_scaled_.variable_bounds.data(), + op_problem_scaled_.variable_bounds.size(), + [bound_rescaling = bound_rescaling_.data()] __device__(f_t2 variable_bounds) -> f_t2 { + return {variable_bounds.x * *bound_rescaling, variable_bounds.y * *bound_rescaling}; }, stream_view_); } cub::DeviceTransform::Transform( - op_problem_scaled_.objective_coefficients.data(), + cuda::std::make_tuple(op_problem_scaled_.objective_coefficients.data(), + batch_wrapped_container(objective_rescaling_, primal_size_h_)), op_problem_scaled_.objective_coefficients.data(), op_problem_scaled_.objective_coefficients.size(), - [bound_rescaling = bound_rescaling_.data(), - objective_rescaling = objective_rescaling_.data()] __device__(f_t objective_coefficient) - -> f_t { return objective_coefficient * *objective_rescaling; }, +
cuda::std::multiplies{}, stream_view_.value()); } @@ -590,11 +684,13 @@ void pdlp_initial_scaling_strategy_t::scale_solutions( stream_view_); if (hyper_params_.bound_objective_rescaling && !running_mip_) { - cub::DeviceTransform::Transform(primal_solution.data(), - primal_solution.data(), - primal_solution.size(), - a_times_scalar(h_bound_rescaling), - stream_view_); + cub::DeviceTransform::Transform( + cuda::std::make_tuple(primal_solution.data(), + batch_wrapped_container(bound_rescaling_, primal_size_h_)), + primal_solution.data(), + primal_solution.size(), + cuda::std::multiplies{}, + stream_view_); } } @@ -615,11 +711,13 @@ void pdlp_initial_scaling_strategy_t::scale_solutions( stream_view_); if (hyper_params_.bound_objective_rescaling && !running_mip_) { - cub::DeviceTransform::Transform(dual_solution.data(), - dual_solution.data(), - dual_solution.size(), - a_times_scalar(h_objective_rescaling), - stream_view_); + cub::DeviceTransform::Transform( + cuda::std::make_tuple(dual_solution.data(), + batch_wrapped_container(objective_rescaling_, dual_size_h_)), + dual_solution.data(), + dual_solution.size(), + cuda::std::multiplies{}, + stream_view_); } } @@ -640,11 +738,13 @@ void pdlp_initial_scaling_strategy_t::scale_solutions( stream_view_); if (hyper_params_.bound_objective_rescaling && !running_mip_) { - cub::DeviceTransform::Transform(dual_slack.data(), - dual_slack.data(), - dual_slack.size(), - a_times_scalar{h_objective_rescaling}, - stream_view_); + cub::DeviceTransform::Transform( + cuda::std::make_tuple(dual_slack.data(), + batch_wrapped_container(objective_rescaling_, primal_size_h_)), + dual_slack.data(), + dual_slack.size(), + cuda::std::multiplies{}, + stream_view_); } } } @@ -706,13 +806,15 @@ void pdlp_initial_scaling_strategy_t::unscale_solutions( stream_view_); if (hyper_params_.bound_objective_rescaling && !running_mip_) { - cuopt_assert(h_bound_rescaling != f_t(0), - "Numerical error: bound_rescaling_ should never equal 0"); - 
cub::DeviceTransform::Transform(primal_solution.data(), - primal_solution.data(), - primal_solution.size(), - a_times_scalar(f_t(1.0) / h_bound_rescaling), - stream_view_); + cub::DeviceTransform::Transform( + cuda::std::make_tuple( + primal_solution.data(), + thrust::make_transform_iterator(batch_wrapped_container(bound_rescaling_, primal_size_h_), + inverse_rescaling_op{})), + primal_solution.data(), + primal_solution.size(), + cuda::std::multiplies{}, + stream_view_); } } @@ -733,13 +835,15 @@ void pdlp_initial_scaling_strategy_t::unscale_solutions( cuda::std::multiplies<>{}, stream_view_); if (hyper_params_.bound_objective_rescaling && !running_mip_) { - cuopt_assert(h_bound_rescaling != f_t(0), - "Numerical error: bound_rescaling_ should never equal 0"); - cub::DeviceTransform::Transform(dual_solution.data(), - dual_solution.data(), - dual_solution.size(), - a_times_scalar(f_t(1.0) / h_objective_rescaling), - stream_view_); + cub::DeviceTransform::Transform( + cuda::std::make_tuple(dual_solution.data(), + thrust::make_transform_iterator( + batch_wrapped_container(objective_rescaling_, dual_size_h_), + inverse_rescaling_op{})), + dual_solution.data(), + dual_solution.size(), + cuda::std::multiplies{}, + stream_view_); } } @@ -758,13 +862,15 @@ void pdlp_initial_scaling_strategy_t::unscale_solutions( batch_safe_div(), stream_view_); if (hyper_params_.bound_objective_rescaling && !running_mip_) { - cuopt_assert(h_bound_rescaling != f_t(0), - "Numerical error: bound_rescaling_ should never equal 0"); - cub::DeviceTransform::Transform(dual_slack.data(), - dual_slack.data(), - dual_slack.size(), - a_times_scalar{f_t(1.0) / h_objective_rescaling}, - stream_view_); + cub::DeviceTransform::Transform( + cuda::std::make_tuple(dual_slack.data(), + thrust::make_transform_iterator( + batch_wrapped_container(objective_rescaling_, primal_size_h_), + inverse_rescaling_op{})), + dual_slack.data(), + dual_slack.size(), + cuda::std::multiplies{}, + stream_view_); } } } diff --git 
a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh index 5a3dcfaca2..99d1472b6f 100644 --- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh +++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh @@ -9,6 +9,7 @@ #include #include +#include #include @@ -17,6 +18,9 @@ #include #include +#include +#include + namespace cuopt::linear_programming::detail { template @@ -51,6 +55,7 @@ class pdlp_initial_scaling_strategy_t { rmm::device_uvector& A_T_indices, pdhg_solver_t* pdhg_solver_ptr, const pdlp_hyper_params::pdlp_hyper_params_t& hyper_params, + i_t original_batch_size, bool running_mip = false); void scale_problem(); @@ -75,6 +80,10 @@ class pdlp_initial_scaling_strategy_t { f_t get_h_bound_rescaling() const; f_t get_h_objective_rescaling() const; + const rmm::device_uvector& get_bound_rescaling_vector() const; + const rmm::device_uvector& get_objective_rescaling_vector() const; + void swap_context(const thrust::universal_host_pinned_vector>& swap_pairs); + void resize_context(i_t new_size); void bound_objective_rescaling(); @@ -100,11 +109,12 @@ class pdlp_initial_scaling_strategy_t { rmm::device_uvector iteration_constraint_matrix_scaling_; rmm::device_uvector iteration_variable_scaling_; - rmm::device_scalar bound_rescaling_; - rmm::device_scalar objective_rescaling_; + i_t original_batch_size_; + rmm::device_uvector bound_rescaling_; + rmm::device_uvector objective_rescaling_; // Since we need it on the host - f_t h_bound_rescaling = std::numeric_limits::signaling_NaN(); - f_t h_objective_rescaling = std::numeric_limits::signaling_NaN(); + std::vector h_bound_rescaling_; + std::vector h_objective_rescaling_; rmm::device_uvector cummulative_constraint_matrix_scaling_; rmm::device_uvector cummulative_variable_scaling_; diff --git a/cpp/src/pdlp/optimization_problem.cu b/cpp/src/pdlp/optimization_problem.cu index 87ff9dab08..a6f0d30ea8 100644 --- 
a/cpp/src/pdlp/optimization_problem.cu +++ b/cpp/src/pdlp/optimization_problem.cu @@ -84,6 +84,7 @@ optimization_problem_t::optimization_problem_t( c_{other.get_objective_coefficients(), stream_view_}, objective_scaling_factor_{other.get_objective_scaling_factor()}, objective_offset_{other.get_objective_offset()}, + batch_objective_offsets_{other.get_batch_objective_offsets()}, Q_offsets_{other.get_quadratic_objective_offsets()}, Q_indices_{other.get_quadratic_objective_indices()}, Q_values_{other.get_quadratic_objective_values()}, @@ -97,7 +98,8 @@ optimization_problem_t::optimization_problem_t( problem_name_{other.get_problem_name()}, problem_category_{other.get_problem_category()}, var_names_{other.get_variable_names()}, - row_names_{other.get_row_names()} + row_names_{other.get_row_names()}, + quadratic_constraints_{other.get_quadratic_constraints()} { } @@ -167,6 +169,12 @@ void optimization_problem_t::set_objective_offset(f_t objective_offset objective_offset_ = objective_offset; } +template +void optimization_problem_t::set_batch_objective_offsets(const std::vector& offsets) +{ + batch_objective_offsets_ = offsets; +} + template void optimization_problem_t::set_quadratic_objective_matrix( const f_t* Q_values, @@ -197,6 +205,14 @@ void optimization_problem_t::set_quadratic_objective_matrix( // FIX ME:: check for positive semi definite matrix } +template +void optimization_problem_t::set_quadratic_constraints( + std::vector::quadratic_constraint_t> + constraints) +{ + quadratic_constraints_ = std::move(constraints); +} + template void optimization_problem_t::set_variable_lower_bounds(const f_t* variable_lower_bounds, i_t size) @@ -233,14 +249,17 @@ void optimization_problem_t::set_variable_types(const var_t* variable_ variable_types_.resize(size, stream_view_); raft::copy(variable_types_.data(), variable_types, size, stream_view_); - // Auto-detect problem category based on variable types - i_t n_integer = thrust::count_if(handle_ptr_->get_thrust_policy(), - 
variable_types_.begin(), - variable_types_.end(), - [] __device__(auto val) { return val == var_t::INTEGER; }); - if (n_integer == size) { + // Auto-detect problem category based on variable types. + // SEMI_CONTINUOUS vars will be reformulated into binary + continuous before solving, + // so SC vars are counted as discrete here (all discrete -> IP, some -> MIP, none -> LP). + i_t n_discrete = thrust::count_if( + handle_ptr_->get_thrust_policy(), + variable_types_.begin(), + variable_types_.end(), + [] __device__(auto val) { return val == var_t::INTEGER || val == var_t::SEMI_CONTINUOUS; }); + if (n_discrete == size) { problem_category_ = problem_category_t::IP; - } else if (n_integer > 0) { + } else if (n_discrete > 0) { problem_category_ = problem_category_t::MIP; } else { problem_category_ = problem_category_t::LP; @@ -420,6 +439,19 @@ f_t optimization_problem_t::get_objective_offset() const return objective_offset_; } +template +const std::vector& optimization_problem_t::get_batch_objective_offsets() + const noexcept +{ + return batch_objective_offsets_; +} + +template +std::vector& optimization_problem_t::get_batch_objective_offsets() noexcept +{ + return batch_objective_offsets_; +} + template const rmm::device_uvector& optimization_problem_t::get_variable_lower_bounds() const { @@ -548,6 +580,19 @@ bool optimization_problem_t::has_quadratic_objective() const return !Q_values_.empty(); } +template +const std::vector::quadratic_constraint_t>& +optimization_problem_t::get_quadratic_constraints() const +{ + return quadratic_constraints_; +} + +template +bool optimization_problem_t::has_quadratic_constraints() const +{ + return !quadratic_constraints_.empty(); +} + template raft::handle_t const* optimization_problem_t::get_handle_ptr() const noexcept { @@ -820,6 +865,10 @@ void optimization_problem_t::write_to_mps(const std::string& mps_file_ is_symmetrized); } + if (!quadratic_constraints_.empty()) { + data_model_view.set_quadratic_constraints(quadratic_constraints_); + } +
cuopt::mps_parser::write_mps(data_model_view, mps_file_path); } @@ -1032,6 +1081,7 @@ bool optimization_problem_t::is_equivalent( if (n_constraints_ != other.n_constraints_) { return false; } if (objective_scaling_factor_ != other.objective_scaling_factor_) { return false; } if (objective_offset_ != other.objective_offset_) { return false; } + if (batch_objective_offsets_ != other.batch_objective_offsets_) { return false; } if (problem_category_ != other.problem_category_) { return false; } if (A_.size() != other.A_.size()) { return false; } @@ -1473,6 +1523,11 @@ optimization_problem_t optimization_problem_t::convert other.set_maximize(maximize_); other.set_objective_offset(static_cast(objective_offset_)); other.set_objective_scaling_factor(static_cast(objective_scaling_factor_)); + if (!batch_objective_offsets_.empty()) { + std::vector converted(batch_objective_offsets_.begin(), + batch_objective_offsets_.end()); + other.set_batch_objective_offsets(converted); + } if (A_.size() > 0) { auto other_A = gpu_cast(A_, stream); @@ -1482,36 +1537,43 @@ optimization_problem_t optimization_problem_t::convert static_cast(A_indices_.size()), A_offsets_.data(), static_cast(A_offsets_.size())); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); } if (c_.size() > 0) { auto other_c = gpu_cast(c_, stream); other.set_objective_coefficients(other_c.data(), static_cast(other_c.size())); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); } if (b_.size() > 0) { auto other_b = gpu_cast(b_, stream); other.set_constraint_bounds(other_b.data(), static_cast(other_b.size())); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); } if (constraint_lower_bounds_.size() > 0) { auto other_clb = gpu_cast(constraint_lower_bounds_, stream); other.set_constraint_lower_bounds(other_clb.data(), static_cast(other_clb.size())); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); } if (constraint_upper_bounds_.size() > 0) { auto other_cub = gpu_cast(constraint_upper_bounds_, stream); 
other.set_constraint_upper_bounds(other_cub.data(), static_cast(other_cub.size())); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); } if (variable_lower_bounds_.size() > 0) { auto other_vlb = gpu_cast(variable_lower_bounds_, stream); other.set_variable_lower_bounds(other_vlb.data(), static_cast(other_vlb.size())); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); } if (variable_upper_bounds_.size() > 0) { auto other_vub = gpu_cast(variable_upper_bounds_, stream); other.set_variable_upper_bounds(other_vub.data(), static_cast(other_vub.size())); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); } if (variable_types_.size() > 0) { diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu index 74df7fee01..8e371062fe 100644 --- a/cpp/src/pdlp/pdhg.cu +++ b/cpp/src/pdlp/pdhg.cu @@ -30,8 +30,14 @@ #include +#include + #include +#include +#include +#include + namespace cuopt::linear_programming::detail { template @@ -41,7 +47,7 @@ pdhg_solver_t::pdhg_solver_t( bool is_legacy_batch_mode, // Batch mode with streams const std::vector& climber_strategies, const pdlp_hyper_params::pdlp_hyper_params_t& hyper_params, - const std::vector>& new_bounds, + const std::vector>& new_bounds, bool enable_mixed_precision_spmv) : batch_mode_(climber_strategies.size() > 1), handle_ptr_(handle_ptr), @@ -49,8 +55,11 @@ pdhg_solver_t::pdhg_solver_t( problem_ptr(&op_problem_scaled), primal_size_h_(problem_ptr->n_variables), dual_size_h_(problem_ptr->n_constraints), - current_saddle_point_state_{ - handle_ptr_, problem_ptr->n_variables, problem_ptr->n_constraints, climber_strategies.size()}, + current_saddle_point_state_{handle_ptr_, + problem_ptr->n_variables, + problem_ptr->n_constraints, + climber_strategies.size(), + hyper_params}, tmp_primal_{(climber_strategies.size() * problem_ptr->n_variables), stream_view_}, tmp_dual_{(climber_strategies.size() * problem_ptr->n_constraints), stream_view_}, potential_next_primal_solution_{(climber_strategies.size() * 
problem_ptr->n_variables), @@ -92,22 +101,30 @@ pdhg_solver_t::pdhg_solver_t( d_total_pdhg_iterations_{0, stream_view_}, climber_strategies_(climber_strategies), hyper_params_(hyper_params), + new_bounds_climber_id_{new_bounds.size(), stream_view_}, new_bounds_idx_{new_bounds.size(), stream_view_}, new_bounds_lower_{new_bounds.size(), stream_view_}, new_bounds_upper_{new_bounds.size(), stream_view_}, batch_size_divisor_(climber_strategies_.size()) { if (!new_bounds.empty()) { - cuopt_assert(new_bounds.size() == climber_strategies_.size(), - "New bounds size must be equal to climber strategies size"); + std::set> seen_bounds; + std::vector climber_id(new_bounds.size()); std::vector idx(new_bounds.size()); std::vector lower(new_bounds.size()); std::vector upper(new_bounds.size()); for (size_t i = 0; i < new_bounds.size(); ++i) { - idx[i] = std::get<0>(new_bounds[i]); - lower[i] = std::get<1>(new_bounds[i]); - upper[i] = std::get<2>(new_bounds[i]); + climber_id[i] = std::get<0>(new_bounds[i]); + idx[i] = std::get<1>(new_bounds[i]); + lower[i] = std::get<2>(new_bounds[i]); + upper[i] = std::get<3>(new_bounds[i]); + cuopt_assert(climber_id[i] >= 0, "new_bounds climber_id must be non-negative"); + cuopt_assert(climber_id[i] < static_cast(climber_strategies_.size()), + "new_bounds climber_id must be less than batch size"); + cuopt_assert(seen_bounds.insert({climber_id[i], idx[i]}).second, + "new_bounds cannot contain duplicate (climber_id, variable_index) entries"); } + raft::copy(new_bounds_climber_id_.data(), climber_id.data(), climber_id.size(), stream_view_); raft::copy(new_bounds_idx_.data(), idx.data(), idx.size(), stream_view_); raft::copy(new_bounds_lower_.data(), lower.data(), lower.size(), stream_view_); raft::copy(new_bounds_upper_.data(), upper.data(), upper.size(), stream_view_); @@ -130,21 +147,103 @@ pdhg_solver_t::pdhg_solver_t( } template -__global__ void pdhg_swap_bounds_kernel(const swap_pair_t* swap_pairs, - i_t swap_count, - raft::device_span 
new_bounds_idx, - raft::device_span new_bounds_lower, - raft::device_span new_bounds_upper) +struct new_bound_entry_t { + i_t var_idx; + f_t lower; + f_t upper; +}; + +template +using new_bounds_groups_t = std::vector>>; + +// new_bounds is stored as flat device arrays, but a climber can own any number of variable-bound +// overrides. During context swaps we need to swap whole climber payloads, and we cannot know from +// the flat device layout how many entries belong to each climber without first regrouping them. +// Bring the flat arrays to the host, put each entry into the group it belongs to, and return the +// groups. Then the group will be swapped before being copied back to the device. +template +new_bounds_groups_t copy_new_bounds_to_groups( + const rmm::device_uvector& new_bounds_climber_id, + const rmm::device_uvector& new_bounds_idx, + const rmm::device_uvector& new_bounds_lower, + const rmm::device_uvector& new_bounds_upper, + i_t batch_size, + rmm::cuda_stream_view stream_view) { - const i_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - if (idx >= swap_count) { return; } + cuopt_assert(new_bounds_climber_id.size() == new_bounds_idx.size(), + "New bounds climber id and index sizes must match"); + cuopt_assert(new_bounds_lower.size() == new_bounds_idx.size(), + "New bounds lower and index sizes must match"); + cuopt_assert(new_bounds_upper.size() == new_bounds_idx.size(), + "New bounds upper and index sizes must match"); + + const auto n_entries = new_bounds_idx.size(); + std::vector h_climber_id(n_entries); + std::vector h_idx(n_entries); + std::vector h_lower(n_entries); + std::vector h_upper(n_entries); + if (n_entries > 0) { + raft::copy(h_climber_id.data(), new_bounds_climber_id.data(), n_entries, stream_view); + raft::copy(h_idx.data(), new_bounds_idx.data(), n_entries, stream_view); + raft::copy(h_lower.data(), new_bounds_lower.data(), n_entries, stream_view); + raft::copy(h_upper.data(), new_bounds_upper.data(), n_entries, 
stream_view); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view)); + } + + new_bounds_groups_t groups(batch_size); + for (size_t i = 0; i < n_entries; ++i) { + cuopt_assert(h_climber_id[i] >= 0 && h_climber_id[i] < batch_size, + "new_bounds climber_id is out of active batch range"); + groups[h_climber_id[i]].push_back({h_idx[i], h_lower[i], h_upper[i]}); + } + return groups; +} + +template +void copy_groups_to_new_bounds(const new_bounds_groups_t& groups, + i_t group_count, + rmm::device_uvector& new_bounds_climber_id, + rmm::device_uvector& new_bounds_idx, + rmm::device_uvector& new_bounds_lower, + rmm::device_uvector& new_bounds_upper, + rmm::cuda_stream_view stream_view) +{ + size_t n_entries = 0; + for (i_t c = 0; c < group_count; ++c) { + n_entries += groups[c].size(); + } - const i_t left = swap_pairs[idx].left; - const i_t right = swap_pairs[idx].right; + cuopt_assert(n_entries == new_bounds_climber_id.size(), + "New bounds climber id size must match number of entries"); + cuopt_assert(n_entries == new_bounds_idx.size(), + "New bounds index size must match number of entries"); + cuopt_assert(n_entries == new_bounds_lower.size(), + "New bounds lower size must match number of entries"); + cuopt_assert(n_entries == new_bounds_upper.size(), + "New bounds upper size must match number of entries"); + + std::vector h_climber_id(n_entries); + std::vector h_idx(n_entries); + std::vector h_lower(n_entries); + std::vector h_upper(n_entries); + + size_t out_idx = 0; + for (i_t c = 0; c < group_count; ++c) { + for (const auto& entry : groups[c]) { + h_climber_id[out_idx] = c; + h_idx[out_idx] = entry.var_idx; + h_lower[out_idx] = entry.lower; + h_upper[out_idx] = entry.upper; + ++out_idx; + } + } - cuda::std::swap(new_bounds_idx[left], new_bounds_idx[right]); - cuda::std::swap(new_bounds_lower[left], new_bounds_lower[right]); - cuda::std::swap(new_bounds_upper[left], new_bounds_upper[right]); + if (n_entries > 0) { + raft::copy(new_bounds_climber_id.data(), 
h_climber_id.data(), n_entries, stream_view); + raft::copy(new_bounds_idx.data(), h_idx.data(), n_entries, stream_view); + raft::copy(new_bounds_lower.data(), h_lower.data(), n_entries, stream_view); + raft::copy(new_bounds_upper.data(), h_upper.data(), n_entries, stream_view); + } } template @@ -168,20 +267,64 @@ void pdhg_solver_t::swap_context( matrix_swap(reflected_dual_, dual_size_h_, swap_pairs); matrix_swap(dual_slack_, primal_size_h_, swap_pairs); current_saddle_point_state_.swap_context(swap_pairs); - if (new_bounds_idx_.size() != 0) { - const auto [grid_size, block_size] = - kernel_config_from_batch_size(static_cast(swap_pairs.size())); - pdhg_swap_bounds_kernel - <<>>(thrust::raw_pointer_cast(swap_pairs.data()), - static_cast(swap_pairs.size()), - make_span(new_bounds_idx_), - make_span(new_bounds_lower_), - make_span(new_bounds_upper_)); - RAFT_CUDA_TRY(cudaPeekAtLastError()); + // Swap per-climber scaled problem fields (objectives, constraint bounds) — all in COL-major + // during the convergence block when swap_context is invoked. 
+ if (problem_ptr->objective_coefficients.size() > static_cast(primal_size_h_)) { + matrix_swap(problem_ptr->objective_coefficients, primal_size_h_, swap_pairs); + } + if (problem_ptr->constraint_lower_bounds.size() > static_cast(dual_size_h_)) { + matrix_swap(problem_ptr->constraint_lower_bounds, dual_size_h_, swap_pairs); + matrix_swap(problem_ptr->constraint_upper_bounds, dual_size_h_, swap_pairs); } #ifdef CUPDLP_DEBUG_MODE std::cout << "Swap context for " << swap_pairs.size() << " pairs" << std::endl; +#endif +} + +template +void pdhg_solver_t::resize_and_swap_new_bounds_context( + const thrust::universal_host_pinned_vector>& swap_pairs, i_t new_size) +{ + if (new_bounds_climber_id_.size() == 0) { return; } + + const auto batch_size = static_cast(tmp_primal_.size() / primal_size_h_); + cuopt_assert(batch_size > 0, "Batch size must be greater than 0"); + cuopt_assert(new_size > 0, "New size must be greater than 0"); + cuopt_assert(new_size < batch_size, "New size must be less than batch size"); + + auto groups = copy_new_bounds_to_groups(new_bounds_climber_id_, + new_bounds_idx_, + new_bounds_lower_, + new_bounds_upper_, + batch_size, + stream_view_); + for (const auto& pair : swap_pairs) { + std::swap(groups[pair.left], groups[pair.right]); + } + + // We have just swapped the groups in the correct order and we know the new size + // We can thus properly compute on the first new_size climbers what will be the final number of + // entries + size_t n_entries = 0; + for (i_t c = 0; c < new_size; ++c) { + n_entries += groups[c].size(); + } + + new_bounds_climber_id_.resize(n_entries, stream_view_); + new_bounds_idx_.resize(n_entries, stream_view_); + new_bounds_lower_.resize(n_entries, stream_view_); + new_bounds_upper_.resize(n_entries, stream_view_); + + copy_groups_to_new_bounds(groups, + new_size, + new_bounds_climber_id_, + new_bounds_idx_, + new_bounds_lower_, + new_bounds_upper_, + stream_view_); +#ifdef CUPDLP_DEBUG_MODE + print("new_bounds_climber_id_", 
new_bounds_climber_id_); print("new_bounds_idx_", new_bounds_idx_); print("new_bounds_lower_", new_bounds_lower_); print("new_bounds_upper_", new_bounds_upper_); @@ -204,10 +347,12 @@ void pdhg_solver_t::resize_context(i_t new_size) reflected_dual_.resize(new_size * dual_size_h_, stream_view_); dual_slack_.resize(new_size * primal_size_h_, stream_view_); current_saddle_point_state_.resize_context(new_size); - if (new_bounds_idx_.size() != 0) { - new_bounds_idx_.resize(new_size, stream_view_); - new_bounds_lower_.resize(new_size, stream_view_); - new_bounds_upper_.resize(new_size, stream_view_); + if (problem_ptr->objective_coefficients.size() > static_cast(primal_size_h_)) { + problem_ptr->objective_coefficients.resize(new_size * primal_size_h_, stream_view_); + } + if (problem_ptr->constraint_lower_bounds.size() > static_cast(dual_size_h_)) { + problem_ptr->constraint_lower_bounds.resize(new_size * dual_size_h_, stream_view_); + problem_ptr->constraint_upper_bounds.resize(new_size * dual_size_h_, stream_view_); } batch_size_divisor_ = cuda::fast_mod_div(new_size); } @@ -299,6 +444,60 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_uvector +void pdhg_solver_t::spmvop_At_y() +{ +#if CUDA_VER_13_2_UP + RAFT_CUSPARSE_TRY(cusparseSetStream(handle_ptr_->get_cusparse_handle(), stream_view_.value())); + RAFT_CUSPARSE_TRY(cusparseSpMVOp(handle_ptr_->get_cusparse_handle(), + cusparse_view_.spmv_op_plan_A_t_, + reusable_device_scalar_value_1_.data(), + reusable_device_scalar_value_0_.data(), + cusparse_view_.dual_solution, + cusparse_view_.current_AtY, + cusparse_view_.current_AtY)); +#else + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view_.A_T, + cusparse_view_.dual_solution, + reusable_device_scalar_value_0_.data(), + cusparse_view_.current_AtY, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view_.buffer_transpose.data(), + 
stream_view_)); +#endif +} + +template +void pdhg_solver_t::spmvop_A_x() +{ +#if CUDA_VER_13_2_UP + RAFT_CUSPARSE_TRY(cusparseSetStream(handle_ptr_->get_cusparse_handle(), stream_view_.value())); + RAFT_CUSPARSE_TRY(cusparseSpMVOp(handle_ptr_->get_cusparse_handle(), + cusparse_view_.spmv_op_plan_A_, + reusable_device_scalar_value_1_.data(), + reusable_device_scalar_value_0_.data(), + cusparse_view_.reflected_primal_solution, + cusparse_view_.dual_gradient, + cusparse_view_.dual_gradient)); +#else + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view_.A, + cusparse_view_.reflected_primal_solution, + reusable_device_scalar_value_0_.data(), + cusparse_view_.dual_gradient, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view_.buffer_non_transpose.data(), + stream_view_)); +#endif +} + template void pdhg_solver_t::compute_At_y() { @@ -317,9 +516,10 @@ void pdhg_solver_t::compute_At_y() CUSPARSE_SPMV_CSR_ALG2, cusparse_view_.buffer_transpose_mixed_.data(), stream_view_); + } else { + spmvop_At_y(); } - } - if (!cusparse_view_.mixed_precision_enabled_) { + } else { RAFT_CUSPARSE_TRY( raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -365,9 +565,10 @@ void pdhg_solver_t::compute_A_x() CUSPARSE_SPMV_CSR_ALG2, cusparse_view_.buffer_non_transpose_mixed_.data(), stream_view_); + } else { + spmvop_A_x(); } - } - if (!cusparse_view_.mixed_precision_enabled_) { + } else { RAFT_CUSPARSE_TRY( raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -598,14 +799,16 @@ template struct primal_reflected_major_projection_bulk_op { using f_t2 = typename type_2::type; const f_t* primal_solution; - const f_t* objective_coefficients; + const f_t* objective_coefficients; // ROW-major when per_climber, else single-problem const f_t* current_AtY; const f_t2* 
variable_bounds; const f_t* primal_step_size; + const f_t* bound_rescaling; f_t* potential_next_primal; f_t* dual_slack; f_t* reflected_primal; cuda::fast_mod_div batch_size; + bool per_climber_objectives; HDI void operator()(size_t idx) { @@ -614,8 +817,9 @@ struct primal_reflected_major_projection_bulk_op { const f_t step_size = primal_step_size[batch_idx]; const f_t primal_val = primal_solution[idx]; - const f_t obj_coef = objective_coefficients[var_idx]; - const f_t aty_val = current_AtY[idx]; + const f_t obj_coef = + per_climber_objectives ? objective_coefficients[idx] : objective_coefficients[var_idx]; + const f_t aty_val = current_AtY[idx]; cuopt_assert(!isnan(step_size), "primal_step_size is NaN in primal_reflected_major_projection"); cuopt_assert(!isinf(step_size), "primal_step_size is Inf in primal_reflected_major_projection"); @@ -625,9 +829,12 @@ struct primal_reflected_major_projection_bulk_op { const f_t next = primal_val - step_size * (obj_coef - aty_val); - const f_t2 bounds = variable_bounds[var_idx]; - const f_t next_clamped = - cuda::std::max(cuda::std::min(next, get_upper(bounds)), get_lower(bounds)); + // Variable bounds are common across all climbers but their scaling factor changes. + // Instead of creating a matrix of variable bounds, we scale the bounds here. 
+ const f_t bound_scale = bound_rescaling[batch_idx]; + const f_t2 bounds = variable_bounds[var_idx]; + const f_t next_clamped = cuda::std::max(cuda::std::min(next, get_upper(bounds) * bound_scale), + get_lower(bounds) * bound_scale); potential_next_primal[idx] = next_clamped; dual_slack[idx] = (next_clamped - next) / step_size; @@ -642,12 +849,13 @@ template struct dual_reflected_major_projection_bulk_op { const f_t* dual_solution; const f_t* dual_gradient; - const f_t* constraint_lower_bounds; + const f_t* constraint_lower_bounds; // ROW-major when per_climber, else single-problem const f_t* constraint_upper_bounds; const f_t* dual_step_size; f_t* potential_next_dual; f_t* reflected_dual; cuda::fast_mod_div batch_size; + bool per_climber_constraints; HDI void operator()(size_t idx) { @@ -664,10 +872,11 @@ struct dual_reflected_major_projection_bulk_op { cuopt_assert(!isnan(current_dual), "dual_solution is NaN in dual_reflected_major_projection"); cuopt_assert(!isnan(Ax), "dual_gradient is NaN in dual_reflected_major_projection"); - const f_t tmp = current_dual / step_size - Ax; + const int bound_idx = per_climber_constraints ? 
idx : constraint_idx; + const f_t tmp = current_dual / step_size - Ax; const f_t tmp_proj = - cuda::std::max(-constraint_upper_bounds[constraint_idx], - cuda::std::min(tmp, -constraint_lower_bounds[constraint_idx])); + cuda::std::max(-constraint_upper_bounds[bound_idx], + cuda::std::min(tmp, -constraint_lower_bounds[bound_idx])); const f_t next_dual = (tmp - tmp_proj) * step_size; potential_next_dual[idx] = next_dual; @@ -682,12 +891,14 @@ template struct primal_reflected_projection_bulk_op { using f_t2 = typename type_2::type; const f_t* primal_solution; - const f_t* objective_coefficients; + const f_t* objective_coefficients; // ROW-major when per_climber, else single-problem const f_t* current_AtY; const f_t2* variable_bounds; const f_t* primal_step_size; + const f_t* bound_rescaling; f_t* reflected_primal; int batch_size; + bool per_climber_objectives; HDI void operator()(size_t idx) { @@ -696,8 +907,9 @@ struct primal_reflected_projection_bulk_op { const f_t step_size = primal_step_size[batch_idx]; const f_t primal_val = primal_solution[idx]; - const f_t obj_coef = objective_coefficients[var_idx]; - const f_t aty_val = current_AtY[idx]; + const f_t obj_coef = + per_climber_objectives ? objective_coefficients[idx] : objective_coefficients[var_idx]; + const f_t aty_val = current_AtY[idx]; cuopt_assert(!isnan(step_size), "primal_step_size is NaN in primal_reflected_projection"); cuopt_assert(!isnan(primal_val), "primal_solution is NaN in primal_reflected_projection"); @@ -707,8 +919,12 @@ struct primal_reflected_projection_bulk_op { f_t reflected = primal_val - step_size * (obj_coef - aty_val); - const f_t2 bounds = variable_bounds[var_idx]; - reflected = cuda::std::max(cuda::std::min(reflected, get_upper(bounds)), get_lower(bounds)); + // Variable bounds are common across all climbers but their scaling factor changes. + // Instead of creating a matrix of variable bounds, we scale the bounds here. 
+ const f_t bound_scale = bound_rescaling[batch_idx]; + const f_t2 bounds = variable_bounds[var_idx]; + reflected = cuda::std::max(cuda::std::min(reflected, get_upper(bounds) * bound_scale), + get_lower(bounds) * bound_scale); reflected_primal[idx] = f_t(2.0) * reflected - primal_val; @@ -723,11 +939,12 @@ struct dual_reflected_projection_bulk_op { const f_t* dual_solution; const f_t* dual_gradient; - const f_t* constraint_lower_bounds; + const f_t* constraint_lower_bounds; // ROW-major when per_climber, else single-problem const f_t* constraint_upper_bounds; const f_t* dual_step_size; f_t* reflected_dual; int batch_size; + bool per_climber_constraints; HDI void operator()(size_t idx) { @@ -743,10 +960,11 @@ struct dual_reflected_projection_bulk_op { cuopt_assert(!isinf(step_size), "dual_step_size is Inf in dual_reflected_projection"); cuopt_assert(step_size > f_t(0.0), "dual_step_size must be > 0"); - const f_t tmp = current_dual / step_size - dual_gradient[idx]; + const int bound_idx = per_climber_constraints ? 
idx : constraint_idx; + const f_t tmp = current_dual / step_size - dual_gradient[idx]; const f_t tmp_proj = - cuda::std::max(-constraint_upper_bounds[constraint_idx], - cuda::std::min(tmp, -constraint_lower_bounds[constraint_idx])); + cuda::std::max(-constraint_upper_bounds[bound_idx], + cuda::std::min(tmp, -constraint_lower_bounds[bound_idx])); const f_t next_dual = (tmp - tmp_proj) * step_size; reflected_dual[idx] = f_t(2.0) * next_dual - current_dual; @@ -758,6 +976,7 @@ struct dual_reflected_projection_bulk_op { template struct refine_primal_projection_major_bulk_op { + raft::device_span climber_id; raft::device_span idx; raft::device_span lower; raft::device_span upper; @@ -765,26 +984,31 @@ struct refine_primal_projection_major_bulk_op { raft::device_span objective; raft::device_span Aty; raft::device_span primal_step_size; + raft::device_span bound_rescaling; raft::device_span potential_next; raft::device_span dual_slack; raft::device_span reflected_primal; int batch_size; + bool per_climber_objectives; - HDI void operator()(size_t climber_id) + HDI void operator()(size_t entry_idx) { - i_t var_idx = idx[climber_id]; - f_t l = lower[climber_id]; - f_t u = upper[climber_id]; + i_t c = climber_id[entry_idx]; + i_t var_idx = idx[entry_idx]; + // Variable bounds are common across all climbers but their scaling factor changes. + // Instead of creating a matrix of variable bounds, we scale the bounds here. + f_t l = lower[entry_idx] * bound_rescaling[c]; + f_t u = upper[entry_idx] * bound_rescaling[c]; - size_t global_idx = (size_t)var_idx * batch_size + climber_id; + size_t global_idx = (size_t)var_idx * batch_size + c; - f_t x = current_primal[global_idx]; - f_t c = objective[var_idx]; - f_t y_aty = Aty[global_idx]; - f_t tau = primal_step_size[climber_id]; + f_t x = current_primal[global_idx]; + f_t objective_coeff = per_climber_objectives ? 
objective[global_idx] : objective[var_idx]; + f_t y_aty = Aty[global_idx]; + f_t tau = primal_step_size[c]; auto [next_clamped, delta_primal, reflected_primal_value] = - primal_reflected_major_projection_batch{}(x, c, y_aty, {l, u}, tau); + primal_reflected_major_projection_batch{}(x, objective_coeff, y_aty, {l, u}, tau); potential_next[global_idx] = next_clamped; dual_slack[global_idx] = delta_primal; @@ -794,6 +1018,7 @@ struct refine_primal_projection_major_bulk_op { template struct refine_primal_projection_bulk_op { + raft::device_span climber_id; raft::device_span idx; raft::device_span lower; raft::device_span upper; @@ -801,68 +1026,80 @@ struct refine_primal_projection_bulk_op { raft::device_span objective; raft::device_span Aty; raft::device_span primal_step_size; + raft::device_span bound_rescaling; raft::device_span reflected_primal; int batch_size; + bool per_climber_objectives; - HDI void operator()(size_t climber_id) + HDI void operator()(size_t entry_idx) { - i_t var_idx = idx[climber_id]; - f_t l = lower[climber_id]; - f_t u = upper[climber_id]; + i_t c = climber_id[entry_idx]; + i_t var_idx = idx[entry_idx]; + // Variable bounds are common across all climbers but their scaling factor changes. + // Instead of creating a matrix of variable bounds, we scale the bounds here. + f_t l = lower[entry_idx] * bound_rescaling[c]; + f_t u = upper[entry_idx] * bound_rescaling[c]; - size_t global_idx = (size_t)var_idx * batch_size + climber_id; + size_t global_idx = (size_t)var_idx * batch_size + c; - f_t x = current_primal[global_idx]; - f_t c = objective[var_idx]; - f_t y_aty = Aty[global_idx]; - f_t tau = primal_step_size[climber_id]; + f_t x = current_primal[global_idx]; + f_t objective_coeff = per_climber_objectives ? 
objective[global_idx] : objective[var_idx]; + f_t y_aty = Aty[global_idx]; + f_t tau = primal_step_size[c]; reflected_primal[global_idx] = - primal_reflected_projection_batch{}(x, c, y_aty, {l, u}, tau); + primal_reflected_projection_batch{}(x, objective_coeff, y_aty, {l, u}, tau); } }; template struct refine_initial_primal_projection_bulk_op { + raft::device_span climber_id; raft::device_span idx; raft::device_span lower; raft::device_span upper; + raft::device_span bound_rescaling; raft::device_span primal_solution; i_t n_variables; - HDI void operator()(size_t climber_id) + HDI void operator()(size_t entry_idx) { - i_t var_idx = idx[climber_id]; - f_t l = lower[climber_id]; - f_t u = upper[climber_id]; + i_t c = climber_id[entry_idx]; + i_t var_idx = idx[entry_idx]; + f_t l = lower[entry_idx] * bound_rescaling[c]; + f_t u = upper[entry_idx] * bound_rescaling[c]; // When refining, the solution is not yet transposed - size_t global_idx = (size_t)climber_id * n_variables + var_idx; + size_t global_idx = (size_t)c * n_variables + var_idx; using f_t2 = typename type_2::type; primal_solution[global_idx] = clamp{}(primal_solution[global_idx], {l, u}); } }; template -void pdhg_solver_t::refine_initial_primal_projection() +void pdhg_solver_t::refine_initial_primal_projection( + const rmm::device_uvector& bound_rescaling) { if (new_bounds_idx_.size() == 0) return; #ifdef CUPDLP_DEBUG_MODE + print("new_bounds_climber_id_", new_bounds_climber_id_); print("new_bounds_idx_", new_bounds_idx_); print("new_bounds_lower_", new_bounds_lower_); print("new_bounds_upper_", new_bounds_upper_); #endif - cuopt_assert(new_bounds_idx_.size() == climber_strategies_.size(), - "New bounds index size must be equal to climber strategies size"); - cuopt_assert(new_bounds_lower_.size() == climber_strategies_.size(), - "New bounds lower size must be equal to climber strategies size"); - cuopt_assert(new_bounds_upper_.size() == climber_strategies_.size(), - "New bounds upper size must be equal to 
climber strategies size"); - cub::DeviceFor::Bulk(climber_strategies_.size(), + cuopt_assert(new_bounds_climber_id_.size() == new_bounds_idx_.size(), + "New bounds climber id and index sizes must match"); + cuopt_assert(new_bounds_lower_.size() == new_bounds_idx_.size(), + "New bounds lower and index sizes must match"); + cuopt_assert(new_bounds_upper_.size() == new_bounds_idx_.size(), + "New bounds upper and index sizes must match"); + cub::DeviceFor::Bulk(new_bounds_idx_.size(), refine_initial_primal_projection_bulk_op{ + make_span(new_bounds_climber_id_), make_span(new_bounds_idx_), make_span(new_bounds_lower_), make_span(new_bounds_upper_), + make_span(bound_rescaling), make_span(current_saddle_point_state_.get_primal_solution()), problem_ptr->n_variables}, stream_view_.value()); @@ -872,6 +1109,7 @@ template void pdhg_solver_t::compute_next_primal_dual_solution_reflected( rmm::device_uvector& primal_step_size, rmm::device_uvector& dual_step_size, + const rmm::device_uvector& bound_rescaling, bool should_major) { raft::common::nvtx::range fun_scope("compute_next_primal_dual_solution_reflected"); @@ -897,45 +1135,53 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( primal_reflected_major_projection(primal_step_size.data()), stream_view_.value()); } else { - cub::DeviceFor::Bulk(potential_next_primal_solution_.size(), - primal_reflected_major_projection_bulk_op{ - current_saddle_point_state_.get_primal_solution().data(), - problem_ptr->objective_coefficients.data(), - current_saddle_point_state_.get_current_AtY().data(), - problem_ptr->variable_bounds.data(), - primal_step_size.data(), - potential_next_primal_solution_.data(), - dual_slack_.data(), - reflected_primal_.data(), - batch_size_divisor_}, - stream_view_.value()); + cub::DeviceFor::Bulk( + potential_next_primal_solution_.size(), + primal_reflected_major_projection_bulk_op{ + current_saddle_point_state_.get_primal_solution().data(), + problem_ptr->objective_coefficients.data(), + 
current_saddle_point_state_.get_current_AtY().data(), + problem_ptr->variable_bounds.data(), + primal_step_size.data(), + bound_rescaling.data(), + potential_next_primal_solution_.data(), + dual_slack_.data(), + reflected_primal_.data(), + batch_size_divisor_, + problem_ptr->objective_coefficients.size() > static_cast(primal_size_h_)}, + stream_view_.value()); } if (new_bounds_idx_.size() != 0) { #ifdef CUPDLP_DEBUG_MODE + print("new_bounds_climber_id_", new_bounds_climber_id_); print("new_bounds_idx_", new_bounds_idx_); print("new_bounds_lower_", new_bounds_lower_); print("new_bounds_upper_", new_bounds_upper_); #endif - cuopt_assert(new_bounds_idx_.size() == climber_strategies_.size(), - "New bounds index size must be equal to climber strategies size"); - cuopt_assert(new_bounds_lower_.size() == climber_strategies_.size(), - "New bounds lower size must be equal to climber strategies size"); - cuopt_assert(new_bounds_upper_.size() == climber_strategies_.size(), - "New bounds upper size must be equal to climber strategies size"); - cub::DeviceFor::Bulk(climber_strategies_.size(), - refine_primal_projection_major_bulk_op{ - make_span(new_bounds_idx_), - make_span(new_bounds_lower_), - make_span(new_bounds_upper_), - make_span(current_saddle_point_state_.get_primal_solution()), - make_span(problem_ptr->objective_coefficients), - make_span(current_saddle_point_state_.get_current_AtY()), - make_span(primal_step_size), - make_span(potential_next_primal_solution_), - make_span(dual_slack_), - make_span(reflected_primal_), - (int)climber_strategies_.size()}, - stream_view_.value()); + cuopt_assert(new_bounds_climber_id_.size() == new_bounds_idx_.size(), + "New bounds climber id and index sizes must match"); + cuopt_assert(new_bounds_lower_.size() == new_bounds_idx_.size(), + "New bounds lower and index sizes must match"); + cuopt_assert(new_bounds_upper_.size() == new_bounds_idx_.size(), + "New bounds upper and index sizes must match"); + cub::DeviceFor::Bulk( + 
new_bounds_idx_.size(), + refine_primal_projection_major_bulk_op{ + make_span(new_bounds_climber_id_), + make_span(new_bounds_idx_), + make_span(new_bounds_lower_), + make_span(new_bounds_upper_), + make_span(current_saddle_point_state_.get_primal_solution()), + make_span(problem_ptr->objective_coefficients), + make_span(current_saddle_point_state_.get_current_AtY()), + make_span(primal_step_size), + make_span(bound_rescaling), + make_span(potential_next_primal_solution_), + make_span(dual_slack_), + make_span(reflected_primal_), + (int)climber_strategies_.size(), + problem_ptr->objective_coefficients.size() > static_cast(primal_size_h_)}, + stream_view_.value()); } #ifdef CUPDLP_DEBUG_MODE print("potential_next_primal_solution_", potential_next_primal_solution_); @@ -957,17 +1203,19 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( dual_reflected_major_projection(dual_step_size.data()), stream_view_.value()); } else { - cub::DeviceFor::Bulk(potential_next_dual_solution_.size(), - dual_reflected_major_projection_bulk_op{ - current_saddle_point_state_.get_dual_solution().data(), - current_saddle_point_state_.get_dual_gradient().data(), - problem_ptr->constraint_lower_bounds.data(), - problem_ptr->constraint_upper_bounds.data(), - dual_step_size.data(), - potential_next_dual_solution_.data(), - reflected_dual_.data(), - batch_size_divisor_}, - stream_view_.value()); + cub::DeviceFor::Bulk( + potential_next_dual_solution_.size(), + dual_reflected_major_projection_bulk_op{ + current_saddle_point_state_.get_dual_solution().data(), + current_saddle_point_state_.get_dual_gradient().data(), + problem_ptr->constraint_lower_bounds.data(), + problem_ptr->constraint_upper_bounds.data(), + dual_step_size.data(), + potential_next_dual_solution_.data(), + reflected_dual_.data(), + batch_size_divisor_, + problem_ptr->constraint_lower_bounds.size() > static_cast(dual_size_h_)}, + stream_view_.value()); } #ifdef CUPDLP_DEBUG_MODE @@ -1004,41 +1252,49 @@ void 
pdhg_solver_t::compute_next_primal_dual_solution_reflected( primal_reflected_projection(primal_step_size.data()), stream_view_.value()); } else { - cub::DeviceFor::Bulk(reflected_primal_.size(), - primal_reflected_projection_bulk_op{ - current_saddle_point_state_.get_primal_solution().data(), - problem_ptr->objective_coefficients.data(), - current_saddle_point_state_.get_current_AtY().data(), - problem_ptr->variable_bounds.data(), - primal_step_size.data(), - reflected_primal_.data(), - (int)climber_strategies_.size()}, - stream_view_.value()); + cub::DeviceFor::Bulk( + reflected_primal_.size(), + primal_reflected_projection_bulk_op{ + current_saddle_point_state_.get_primal_solution().data(), + problem_ptr->objective_coefficients.data(), + current_saddle_point_state_.get_current_AtY().data(), + problem_ptr->variable_bounds.data(), + primal_step_size.data(), + bound_rescaling.data(), + reflected_primal_.data(), + (int)climber_strategies_.size(), + problem_ptr->objective_coefficients.size() > static_cast(primal_size_h_)}, + stream_view_.value()); } if (new_bounds_idx_.size() != 0) { #ifdef CUPDLP_DEBUG_MODE + print("new_bounds_climber_id_", new_bounds_climber_id_); print("new_bounds_idx_", new_bounds_idx_); print("new_bounds_lower_", new_bounds_lower_); print("new_bounds_upper_", new_bounds_upper_); #endif - cuopt_assert(new_bounds_idx_.size() == climber_strategies_.size(), - "New bounds index size must be equal to climber strategies size"); - cuopt_assert(new_bounds_lower_.size() == climber_strategies_.size(), - "New bounds lower size must be equal to climber strategies size"); - cuopt_assert(new_bounds_upper_.size() == climber_strategies_.size(), - "New bounds upper size must be equal to climber strategies size"); - cub::DeviceFor::Bulk(climber_strategies_.size(), - refine_primal_projection_bulk_op{ - make_span(new_bounds_idx_), - make_span(new_bounds_lower_), - make_span(new_bounds_upper_), - make_span(current_saddle_point_state_.get_primal_solution()), - 
make_span(problem_ptr->objective_coefficients), - make_span(current_saddle_point_state_.get_current_AtY()), - make_span(primal_step_size), - make_span(reflected_primal_), - (int)climber_strategies_.size()}, - stream_view_.value()); + cuopt_assert(new_bounds_climber_id_.size() == new_bounds_idx_.size(), + "New bounds climber id and index sizes must match"); + cuopt_assert(new_bounds_lower_.size() == new_bounds_idx_.size(), + "New bounds lower and index sizes must match"); + cuopt_assert(new_bounds_upper_.size() == new_bounds_idx_.size(), + "New bounds upper and index sizes must match"); + cub::DeviceFor::Bulk( + new_bounds_idx_.size(), + refine_primal_projection_bulk_op{ + make_span(new_bounds_climber_id_), + make_span(new_bounds_idx_), + make_span(new_bounds_lower_), + make_span(new_bounds_upper_), + make_span(current_saddle_point_state_.get_primal_solution()), + make_span(problem_ptr->objective_coefficients), + make_span(current_saddle_point_state_.get_current_AtY()), + make_span(primal_step_size), + make_span(bound_rescaling), + make_span(reflected_primal_), + (int)climber_strategies_.size(), + problem_ptr->objective_coefficients.size() > static_cast(primal_size_h_)}, + stream_view_.value()); } #ifdef CUPDLP_DEBUG_MODE print("reflected_primal_", reflected_primal_); @@ -1065,16 +1321,18 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( dual_reflected_projection(dual_step_size.data()), stream_view_.value()); } else { - cub::DeviceFor::Bulk(reflected_dual_.size(), - dual_reflected_projection_bulk_op{ - current_saddle_point_state_.get_dual_solution().data(), - current_saddle_point_state_.get_dual_gradient().data(), - problem_ptr->constraint_lower_bounds.data(), - problem_ptr->constraint_upper_bounds.data(), - dual_step_size.data(), - reflected_dual_.data(), - (int)climber_strategies_.size()}, - stream_view_.value()); + cub::DeviceFor::Bulk( + reflected_dual_.size(), + dual_reflected_projection_bulk_op{ + 
current_saddle_point_state_.get_dual_solution().data(), + current_saddle_point_state_.get_dual_gradient().data(), + problem_ptr->constraint_lower_bounds.data(), + problem_ptr->constraint_upper_bounds.data(), + dual_step_size.data(), + reflected_dual_.data(), + (int)climber_strategies_.size(), + problem_ptr->constraint_lower_bounds.size() > static_cast(dual_size_h_)}, + stream_view_.value()); } #ifdef CUPDLP_DEBUG_MODE print("reflected_dual_", reflected_dual_); @@ -1088,6 +1346,7 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( template void pdhg_solver_t::take_step(rmm::device_uvector& primal_step_size, rmm::device_uvector& dual_step_size, + const rmm::device_uvector& bound_rescaling, i_t iterations_since_last_restart, bool last_restart_was_average, i_t total_pdlp_iterations, @@ -1110,6 +1369,7 @@ void pdhg_solver_t::take_step(rmm::device_uvector& primal_step_si compute_next_primal_dual_solution_reflected( primal_step_size, dual_step_size, + bound_rescaling, is_major_iteration || ((total_pdlp_iterations + 2) % conditional_major(total_pdlp_iterations + 2)) == 0); } diff --git a/cpp/src/pdlp/pdhg.hpp b/cpp/src/pdlp/pdhg.hpp index 0a64e49efb..d16400bd3b 100644 --- a/cpp/src/pdlp/pdhg.hpp +++ b/cpp/src/pdlp/pdhg.hpp @@ -20,6 +20,9 @@ #include #include +#include +#include + namespace cuopt::linear_programming::detail { template class pdhg_solver_t { @@ -29,7 +32,7 @@ class pdhg_solver_t { bool is_legacy_batch_mode, const std::vector& climber_strategies, const pdlp_hyper_params::pdlp_hyper_params_t& hyper_params, - const std::vector>& new_bounds, + const std::vector>& new_bounds, bool enable_mixed_precision_spmv = false); saddle_point_state_t& get_saddle_point_state(); @@ -53,21 +56,25 @@ class pdhg_solver_t { i_t get_dual_size() const; void swap_context(const thrust::universal_host_pinned_vector>& swap_pairs); + void resize_and_swap_new_bounds_context( + const thrust::universal_host_pinned_vector>& swap_pairs, i_t new_size); void resize_context(i_t 
new_size); ping_pong_graph_t& get_graph_all(); + rmm::device_uvector& get_new_bounds_climber_id() { return new_bounds_climber_id_; } rmm::device_uvector& get_new_bounds_idx() { return new_bounds_idx_; } rmm::device_uvector& get_new_bounds_lower() { return new_bounds_lower_; } rmm::device_uvector& get_new_bounds_upper() { return new_bounds_upper_; } void take_step(rmm::device_uvector& primal_step_size, rmm::device_uvector& dual_step_size, + const rmm::device_uvector& bound_rescaling, // Only used in batch mode i_t iterations_since_last_restart, bool last_restart_was_average, i_t total_pdlp_iterations, bool is_major_iteration); void update_solution(cusparse_view_t& current_op_problem_evaluation_cusparse_view_); - void refine_initial_primal_projection(); + void refine_initial_primal_projection(const rmm::device_uvector& bound_rescaling); i_t total_pdhg_iterations_; @@ -78,15 +85,20 @@ class pdhg_solver_t { rmm::device_uvector& dual_step_size, i_t total_pdlp_iterations); void compute_next_dual_solution(rmm::device_uvector& dual_step_size); - void compute_next_primal_dual_solution_reflected(rmm::device_uvector& primal_step_size, - rmm::device_uvector& dual_step_size, - bool should_major); + void compute_next_primal_dual_solution_reflected( + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size, + const rmm::device_uvector& bound_rescaling, // Only used in batch mode + bool should_major); void compute_primal_projection_with_gradient(rmm::device_uvector& primal_step_size); void compute_primal_projection(rmm::device_uvector& primal_step_size); void compute_At_y(); void compute_A_x(); + void spmvop_At_y(); + void spmvop_A_x(); + void my_spmvop(f_t* alpha, f_t* A, f_t* x, f_t* beta, f_t* y, f_t* result); bool batch_mode_{false}; raft::handle_t const* handle_ptr_{nullptr}; rmm::cuda_stream_view stream_view_; @@ -128,6 +140,7 @@ class pdhg_solver_t { const std::vector& climber_strategies_; const pdlp_hyper_params::pdlp_hyper_params_t& hyper_params_; + 
rmm::device_uvector new_bounds_climber_id_; rmm::device_uvector new_bounds_idx_; rmm::device_uvector new_bounds_lower_; rmm::device_uvector new_bounds_upper_; diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 8e6e80e322..fd0cc9ffcd 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -35,10 +35,14 @@ #include #include +#include +#include #include +#include #include #include +#include #include namespace cuopt::linear_programming::detail { @@ -96,22 +100,64 @@ inline cublasStatus_t cublasGeam(cublasHandle_t handle, return cublasDgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); } +template +struct scale_bounds_by_scalar_op { + using f_t2 = typename type_2::type; + + HDI f_t2 operator()(thrust::tuple value) + { + const auto bounds = thrust::get<0>(value); + const auto bound_scale = thrust::get<1>(value); + return {get_lower(bounds) * bound_scale, get_upper(bounds) * bound_scale}; + } +}; + template -static size_t batch_size_handler(const problem_t& op_problem, - const pdlp_solver_settings_t& settings) +static i_t max_new_bounds_climber_id(const pdlp_solver_settings_t& settings) { - if (settings.new_bounds.empty()) { return 1; } + i_t max_climber_id = 0; + for (const auto& new_bound : settings.new_bounds) { + const auto climber_id = std::get<0>(new_bound); + cuopt_assert(climber_id >= 0, "new_bounds climber_id must be non-negative"); + max_climber_id = std::max(max_climber_id, climber_id); + } + return max_climber_id; +} + +template +static size_t batch_size_handler(const pdlp_solver_settings_t& settings) +{ + // Two inputs only: + // - fixed_batch_size > 0 : caller pre-sized the batch (fixed path). Per-climber problem data + // (objectives/offsets/constraint bounds) lives directly on the optimization_problem_t. + // new_bounds may still be provided as per-climber variable-bound overrides within the batch. + // - fixed_batch_size == 0 : splitting path. Batch size is derived from new_bounds. 
+ size_t batch_size; + if (settings.fixed_batch_size > 0) { + if (!settings.new_bounds.empty()) { + cuopt_assert(max_new_bounds_climber_id(settings) + 1 == settings.fixed_batch_size, + "new_bounds climber_id must be equal to fixed_batch_size"); + } + batch_size = static_cast(settings.fixed_batch_size); + } else { + batch_size = settings.new_bounds.empty() + ? 1 + : static_cast(max_new_bounds_climber_id(settings)) + 1; + } #ifdef BATCH_VERBOSE_MODE - std::cout << "Running batch PDLP with " << settings.new_bounds.size() << " problems" << std::endl; + if (batch_size > 1) { + std::cout << "Running batch PDLP with " << batch_size << " problems" << std::endl; + } #endif - return settings.new_bounds.size(); + return batch_size; } template pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, pdlp_solver_settings_t const& settings, bool is_legacy_batch_mode) - : climber_strategies_(batch_size_handler(op_problem, settings)), + : original_batch_size_(batch_size_handler(settings)), + climber_strategies_(original_batch_size_), batch_mode_(climber_strategies_.size() > 1), handle_ptr_(op_problem.handle_ptr), stream_view_(handle_ptr_->get_stream()), @@ -151,7 +197,8 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, op_problem_scaled_.reverse_offsets, op_problem_scaled_.reverse_constraints, &pdhg_solver_, - settings_.hyper_params}, + settings_.hyper_params, + static_cast(original_batch_size_)}, average_op_problem_evaluation_cusparse_view_{handle_ptr_, op_problem, unscaled_primal_avg_solution_, @@ -186,16 +233,17 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, is_legacy_batch_mode, climber_strategies_, settings_.hyper_params}, - average_termination_strategy_{handle_ptr_, - op_problem, - op_problem_scaled_, - average_op_problem_evaluation_cusparse_view_, - pdhg_solver_.get_cusparse_view(), - primal_size_h_, - dual_size_h_, - initial_scaling_strategy_, - settings_, - climber_strategies_}, + average_termination_strategy_{ + handle_ptr_, + op_problem, + op_problem_scaled_, + 
average_op_problem_evaluation_cusparse_view_, + pdhg_solver_.get_cusparse_view(), + settings_.hyper_params.never_restart_to_average ? 0 : primal_size_h_, + settings_.hyper_params.never_restart_to_average ? 0 : dual_size_h_, + initial_scaling_strategy_, + settings_, + climber_strategies_}, current_termination_strategy_{handle_ptr_, op_problem, op_problem_scaled_, @@ -214,6 +262,17 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, best_primal_solution_so_far{pdlp_termination_status_t::TimeLimit, stream_view_}, inside_mip_{false} { + cuopt_expects(!(settings_.first_primal_feasible && settings_.all_primal_feasible), + error_type_t::ValidationError, + "first_primal_feasible and all_primal_feasible are mutually exclusive"); + cuopt_expects(batch_mode_ || !settings_.all_primal_feasible, + error_type_t::ValidationError, + "all_primal_feasible only applies in batch mode"); + cuopt_expects(!(settings_.save_best_primal_so_far && batch_mode_), + error_type_t::ValidationError, + "save_best_primal_so_far is not supported in batch mode. Disable batch mode " + "(no fixed_batch_size and no new_bounds) or unset save_best_primal_so_far."); + // Set step_size initial scaling thrust::fill(handle_ptr_->get_thrust_policy(), step_size_.data(), @@ -291,18 +350,17 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, ? 
-std::numeric_limits::infinity() : std::numeric_limits::infinity(); op_problem.check_problem_representation(true, false); - op_problem_scaled_.check_problem_representation(true, false); - if (settings_.new_bounds.size() > 0) { + if (batch_mode_) { batch_solution_to_return_.get_additional_termination_informations().resize( - settings_.new_bounds.size()); - batch_solution_to_return_.get_terminations_status().resize(settings_.new_bounds.size()); + original_batch_size_); + batch_solution_to_return_.get_terminations_status().resize(original_batch_size_); batch_solution_to_return_.get_primal_solution().resize( - op_problem.n_variables * settings_.new_bounds.size(), stream_view_); + op_problem.n_variables * original_batch_size_, stream_view_); batch_solution_to_return_.get_dual_solution().resize( - op_problem.n_constraints * settings_.new_bounds.size(), stream_view_); + op_problem.n_constraints * original_batch_size_, stream_view_); batch_solution_to_return_.get_reduced_cost().resize( - op_problem.n_variables * settings_.new_bounds.size(), stream_view_); + op_problem.n_variables * original_batch_size_, stream_view_); } for (size_t i = 0; i < climber_strategies_.size(); ++i) { climber_strategies_[i].original_index = static_cast(i); @@ -331,32 +389,6 @@ void pdlp_solver_t::set_initial_k(i_t initial_k) initial_k_ = initial_k; } -template -void pdlp_solver_t::set_relative_dual_tolerance_factor(f_t dual_tolerance_factor) -{ - average_termination_strategy_.set_relative_dual_tolerance_factor(dual_tolerance_factor); - current_termination_strategy_.set_relative_dual_tolerance_factor(dual_tolerance_factor); -} - -template -void pdlp_solver_t::set_relative_primal_tolerance_factor(f_t primal_tolerance_factor) -{ - average_termination_strategy_.set_relative_primal_tolerance_factor(primal_tolerance_factor); - current_termination_strategy_.set_relative_primal_tolerance_factor(primal_tolerance_factor); -} - -template -f_t pdlp_solver_t::get_relative_dual_tolerance_factor() const -{ - 
return current_termination_strategy_.get_relative_dual_tolerance_factor(); -} - -template -f_t pdlp_solver_t::get_relative_primal_tolerance_factor() const -{ - return current_termination_strategy_.get_relative_primal_tolerance_factor(); -} - template void pdlp_solver_t::set_initial_primal_solution( const rmm::device_uvector& initial_primal_solution) @@ -403,28 +435,7 @@ std::optional> pdlp_solver_t } if (batch_mode_) { - // Set the termination status to TimeLimit for all climbers appart from the potentially - // already done ones - for (size_t i = 0; i < batch_solution_to_return_.get_terminations_status().size(); ++i) { - if (!current_termination_strategy_.is_done( - current_termination_strategy_.get_termination_status(i))) { - batch_solution_to_return_ - .get_terminations_status()[climber_strategies_[i].original_index] = - pdlp_termination_status_t::TimeLimit; - } - } - current_termination_strategy_.convert_gpu_terms_stats_to_host( - batch_solution_to_return_.get_additional_termination_informations()); - return optimization_problem_solution_t{ - batch_solution_to_return_.get_primal_solution(), - batch_solution_to_return_.get_dual_solution(), - batch_solution_to_return_.get_reduced_cost(), - get_filled_warmed_start_data(), - problem_ptr->objective_name, - problem_ptr->var_names, - problem_ptr->row_names, - std::move(batch_solution_to_return_.get_additional_termination_informations()), - std::move(batch_solution_to_return_.get_terminations_status())}; + return finalize_batch_return_with_limit_reached(pdlp_termination_status_t::TimeLimit); } #ifdef PDLP_VERBOSE_MODE @@ -461,28 +472,7 @@ std::optional> pdlp_solver_t #endif if (batch_mode_) { - // Set the termination status to IterationLimit for all climbers appart from the potentially - // already done ones - for (size_t i = 0; i < batch_solution_to_return_.get_terminations_status().size(); ++i) { - if (!current_termination_strategy_.is_done( - current_termination_strategy_.get_termination_status(i))) { - 
batch_solution_to_return_ - .get_terminations_status()[climber_strategies_[i].original_index] = - pdlp_termination_status_t::IterationLimit; - } - } - current_termination_strategy_.convert_gpu_terms_stats_to_host( - batch_solution_to_return_.get_additional_termination_informations()); - return optimization_problem_solution_t{ - batch_solution_to_return_.get_primal_solution(), - batch_solution_to_return_.get_dual_solution(), - batch_solution_to_return_.get_reduced_cost(), - get_filled_warmed_start_data(), - problem_ptr->objective_name, - problem_ptr->var_names, - problem_ptr->row_names, - std::move(batch_solution_to_return_.get_additional_termination_informations()), - std::move(batch_solution_to_return_.get_terminations_status())}; + return finalize_batch_return_with_limit_reached(pdlp_termination_status_t::IterationLimit); } return current_termination_strategy_.fill_return_problem_solution( @@ -507,28 +497,7 @@ std::optional> pdlp_solver_t #endif if (batch_mode_) { - // Set the termination status to ConcurrentLimit for all climbers appart from the potentially - // already done ones - for (size_t i = 0; i < batch_solution_to_return_.get_terminations_status().size(); ++i) { - if (!current_termination_strategy_.is_done( - current_termination_strategy_.get_termination_status(i))) { - batch_solution_to_return_ - .get_terminations_status()[climber_strategies_[i].original_index] = - pdlp_termination_status_t::ConcurrentLimit; - } - } - current_termination_strategy_.convert_gpu_terms_stats_to_host( - batch_solution_to_return_.get_additional_termination_informations()); - return optimization_problem_solution_t{ - batch_solution_to_return_.get_primal_solution(), - batch_solution_to_return_.get_dual_solution(), - batch_solution_to_return_.get_reduced_cost(), - get_filled_warmed_start_data(), - problem_ptr->objective_name, - problem_ptr->var_names, - problem_ptr->row_names, - std::move(batch_solution_to_return_.get_additional_termination_informations()), - 
std::move(batch_solution_to_return_.get_terminations_status())}; + return finalize_batch_return_with_limit_reached(pdlp_termination_status_t::ConcurrentLimit); } return current_termination_strategy_.fill_return_problem_solution( @@ -754,6 +723,95 @@ void pdlp_solver_t::print_final_termination_criteria( } } +template +void pdlp_solver_t::snapshot_climber_into_return(size_t i) +{ + const auto term = current_termination_strategy_.get_termination_status(i); + const i_t local_idx = climber_strategies_[i].original_index; + + batch_solution_to_return_.get_terminations_status()[local_idx] = term; + raft::copy(batch_solution_to_return_.get_primal_solution().data() + local_idx * primal_size_h_, + pdhg_solver_.get_potential_next_primal_solution().data() + i * primal_size_h_, + primal_size_h_, + stream_view_); + raft::copy(batch_solution_to_return_.get_dual_solution().data() + local_idx * dual_size_h_, + pdhg_solver_.get_potential_next_dual_solution().data() + i * dual_size_h_, + dual_size_h_, + stream_view_); + raft::copy(batch_solution_to_return_.get_reduced_cost().data() + local_idx * primal_size_h_, + current_termination_strategy_.get_convergence_information().get_reduced_cost().data() + + i * primal_size_h_, + primal_size_h_, + stream_view_); + auto& info = batch_solution_to_return_.get_additional_termination_informations()[local_idx]; + info.number_of_steps_taken = total_pdlp_iterations_; + info.total_number_of_attempted_steps = pdhg_solver_.get_total_pdhg_iterations(); + if (term != pdlp_termination_status_t::ConcurrentLimit) { info.solved_by = method_t::PDLP; } + if (sb_view_.is_valid()) { sb_view_.mark_solved(local_idx); } +} + +template +optimization_problem_solution_t pdlp_solver_t::finalize_batch_return() +{ + current_termination_strategy_.fill_gpu_terms_stats(total_pdlp_iterations_); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + current_termination_strategy_.convert_gpu_terms_stats_to_host( + 
batch_solution_to_return_.get_additional_termination_informations()); + return optimization_problem_solution_t{ + batch_solution_to_return_.get_primal_solution(), + batch_solution_to_return_.get_dual_solution(), + batch_solution_to_return_.get_reduced_cost(), + get_filled_warmed_start_data(), + problem_ptr->objective_name, + problem_ptr->var_names, + problem_ptr->row_names, + std::move(batch_solution_to_return_.get_additional_termination_informations()), + std::move(batch_solution_to_return_.get_terminations_status())}; +} + +template +optimization_problem_solution_t +pdlp_solver_t::finalize_batch_return_with_limit_reached( + pdlp_termination_status_t fallback_status) +{ + const bool accept_pf = settings_.first_primal_feasible || settings_.all_primal_feasible; + // Iterate over ACTIVE climbers (climber_strategies_.size()), not the original batch size. + // After climber removal/swapping the active arrays (current_termination_strategy_ and + // climber_strategies_) shrink, while batch_solution_to_return_.get_terminations_status() + // keeps its original size and is indexed by original_index. Looping up to the original size + // and reading current_termination_strategy_.get_termination_status(i) / climber_strategies_[i] + // would index past the end of the active arrays. Read with the active index `i`, write with + // the original index. 
+ for (size_t i = 0; i < climber_strategies_.size(); ++i) { + if (!current_termination_strategy_.is_done( + current_termination_strategy_.get_termination_status(i), accept_pf)) { + const auto original_index = climber_strategies_[i].original_index; + batch_solution_to_return_.get_terminations_status()[original_index] = fallback_status; + current_termination_strategy_.set_termination_status(i, fallback_status); + } + } + current_termination_strategy_.fill_gpu_terms_stats(total_pdlp_iterations_, true); + current_termination_strategy_.convert_gpu_terms_stats_to_host( + batch_solution_to_return_.get_additional_termination_informations()); + if (fallback_status != pdlp_termination_status_t::ConcurrentLimit) { + for (size_t i = 0; i < climber_strategies_.size(); ++i) { + const auto original_index = static_cast(climber_strategies_[i].original_index); + batch_solution_to_return_.get_additional_termination_informations()[original_index] + .solved_by = method_t::PDLP; + } + } + return optimization_problem_solution_t{ + batch_solution_to_return_.get_primal_solution(), + batch_solution_to_return_.get_dual_solution(), + batch_solution_to_return_.get_reduced_cost(), + get_filled_warmed_start_data(), + problem_ptr->objective_name, + problem_ptr->var_names, + problem_ptr->row_names, + std::move(batch_solution_to_return_.get_additional_termination_informations()), + std::move(batch_solution_to_return_.get_terminations_status())}; +} + template std::optional> pdlp_solver_t::check_batch_termination(const timer_t& timer) @@ -764,10 +822,13 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) [[maybe_unused]] const bool is_cupdlpx = is_cupdlpx_restart(settings_.hyper_params); cuopt_assert(is_cupdlpx, "Batch termination handling only supported with cuPDLPx restart"); + const bool accept_primal_feasible = + settings_.first_primal_feasible || settings_.all_primal_feasible; + #ifdef BATCH_VERBOSE_MODE for (size_t i = 0; i < 
current_termination_strategy_.get_terminations_status().size(); ++i) { const auto& term = current_termination_strategy_.get_termination_status(i); - if (current_termination_strategy_.is_done(term)) { + if (current_termination_strategy_.is_done(term, accept_primal_feasible)) { std::cout << "[BATCH MODE]: Climber " << i << " is done with " << optimization_problem_solution_t::get_termination_status_string(term) << " at step " << internal_solver_iterations_ << ". It's original index is " @@ -782,7 +843,7 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) // If PDLP has solved it to optimality we want to keep it and resolved both solvers having // solved the problem later if (current_termination_strategy_.is_done( - current_termination_strategy_.get_termination_status(i))) + current_termination_strategy_.get_termination_status(i), accept_primal_feasible)) continue; const i_t local_idx = climber_strategies_[i].original_index; if (sb_view_.is_solved(local_idx)) { @@ -797,71 +858,37 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) } } - // All are optimal, infeasible, or externally solved - if (current_termination_strategy_.all_done()) { - const auto original_batch_size = settings_.new_bounds.size(); + // first_primal_feasible: stop the whole batch as soon as any climber becomes primal feasible + // (Optimal or PrimalFeasible). 
Snapshot every climber's current iterate so that even non-PF + // climbers return their latest state + if (settings_.first_primal_feasible && + current_termination_strategy_.any_primal_feasible_or_optimal()) { + raft::common::nvtx::range fpf_scope("first_primal_feasible_batch_snapshot"); + for (size_t i = 0; i < current_termination_strategy_.get_terminations_status().size(); ++i) { + snapshot_climber_into_return(i); + } + return finalize_batch_return(); + } + + // All are optimal, infeasible, primal feasible (when accepted), or externally solved + if (current_termination_strategy_.all_done(accept_primal_feasible)) { // Some climber got removed from the batch while the optimization was running - if (original_batch_size != climber_strategies_.size()) { + if (original_batch_size_ != climber_strategies_.size()) { #ifdef BATCH_VERBOSE_MODE - std::cout << "Original batch size was " << original_batch_size << " but is now " + std::cout << "Original batch size was " << original_batch_size_ << " but is now " << climber_strategies_.size() << std::endl; #endif cuopt_assert(current_termination_strategy_.get_terminations_status().size() == climber_strategies_.size(), "Terminations status size mismatch"); for (size_t i = 0; i < current_termination_strategy_.get_terminations_status().size(); ++i) { - // Found one that is done - cuopt_assert(current_termination_strategy_.is_done( - current_termination_strategy_.get_termination_status(i)), - "Climber should be done"); - // Copy current climber solution information - batch_solution_to_return_.get_terminations_status()[climber_strategies_[i].original_index] = - current_termination_strategy_.get_termination_status(i); - raft::copy(batch_solution_to_return_.get_primal_solution().data() + - climber_strategies_[i].original_index * primal_size_h_, - pdhg_solver_.get_potential_next_primal_solution().data() + i * primal_size_h_, - primal_size_h_, - stream_view_); - raft::copy(batch_solution_to_return_.get_dual_solution().data() + - 
climber_strategies_[i].original_index * dual_size_h_, - pdhg_solver_.get_potential_next_dual_solution().data() + i * dual_size_h_, - dual_size_h_, - stream_view_); - raft::copy( - batch_solution_to_return_.get_reduced_cost().data() + - climber_strategies_[i].original_index * primal_size_h_, - current_termination_strategy_.get_convergence_information().get_reduced_cost().data() + - i * primal_size_h_, - primal_size_h_, - stream_view_); - batch_solution_to_return_ - .get_additional_termination_informations()[climber_strategies_[i].original_index] - .number_of_steps_taken = total_pdlp_iterations_; - batch_solution_to_return_ - .get_additional_termination_informations()[climber_strategies_[i].original_index] - .total_number_of_attempted_steps = pdhg_solver_.get_total_pdhg_iterations(); - if (current_termination_strategy_.get_termination_status(i) != - pdlp_termination_status_t::ConcurrentLimit) { - batch_solution_to_return_ - .get_additional_termination_informations()[climber_strategies_[i].original_index] - .solved_by = method_t::PDLP; - } - if (sb_view_.is_valid()) { sb_view_.mark_solved(climber_strategies_[i].original_index); } + cuopt_assert( + current_termination_strategy_.is_done( + current_termination_strategy_.get_termination_status(i), accept_primal_feasible), + "Climber should be done"); + snapshot_climber_into_return(i); } - current_termination_strategy_.fill_gpu_terms_stats(total_pdlp_iterations_); - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); - current_termination_strategy_.convert_gpu_terms_stats_to_host( - batch_solution_to_return_.get_additional_termination_informations()); - return optimization_problem_solution_t{ - batch_solution_to_return_.get_primal_solution(), - batch_solution_to_return_.get_dual_solution(), - batch_solution_to_return_.get_reduced_cost(), - get_filled_warmed_start_data(), - problem_ptr->objective_name, - problem_ptr->var_names, - problem_ptr->row_names, - 
std::move(batch_solution_to_return_.get_additional_termination_informations()), - std::move(batch_solution_to_return_.get_terminations_status())}; + return finalize_batch_return(); } if (sb_view_.is_valid()) { for (size_t i = 0; i < climber_strategies_.size(); ++i) { @@ -883,7 +910,7 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) for (size_t i = 0; i < current_termination_strategy_.get_terminations_status().size(); ++i) { // Found one that is done if (current_termination_strategy_.is_done( - current_termination_strategy_.get_termination_status(i))) { + current_termination_strategy_.get_termination_status(i), accept_primal_feasible)) { raft::common::nvtx::range fun_scope("remove_done_climber"); #ifdef BATCH_VERBOSE_MODE const bool externally_solved = (current_termination_strategy_.get_termination_status(i) == @@ -893,39 +920,7 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) << (externally_solved ? " [solved by DS]" : " [solved by PDLP]") << std::endl; #endif to_remove.emplace(i); - // Copy current climber solution information - batch_solution_to_return_.get_terminations_status()[climber_strategies_[i].original_index] = - current_termination_strategy_.get_termination_status(i); - raft::copy(batch_solution_to_return_.get_primal_solution().data() + - climber_strategies_[i].original_index * primal_size_h_, - pdhg_solver_.get_potential_next_primal_solution().data() + i * primal_size_h_, - primal_size_h_, - stream_view_); - raft::copy(batch_solution_to_return_.get_dual_solution().data() + - climber_strategies_[i].original_index * dual_size_h_, - pdhg_solver_.get_potential_next_dual_solution().data() + i * dual_size_h_, - dual_size_h_, - stream_view_); - raft::copy( - batch_solution_to_return_.get_reduced_cost().data() + - climber_strategies_[i].original_index * primal_size_h_, - current_termination_strategy_.get_convergence_information().get_reduced_cost().data() + - i * primal_size_h_, - primal_size_h_, - stream_view_); - 
batch_solution_to_return_ - .get_additional_termination_informations()[climber_strategies_[i].original_index] - .number_of_steps_taken = total_pdlp_iterations_; - batch_solution_to_return_ - .get_additional_termination_informations()[climber_strategies_[i].original_index] - .total_number_of_attempted_steps = pdhg_solver_.get_total_pdhg_iterations(); - if (current_termination_strategy_.get_termination_status(i) != - pdlp_termination_status_t::ConcurrentLimit) { - batch_solution_to_return_ - .get_additional_termination_informations()[climber_strategies_[i].original_index] - .solved_by = method_t::PDLP; - } - if (sb_view_.is_valid()) { sb_view_.mark_solved(climber_strategies_[i].original_index); } + snapshot_climber_into_return(i); } } if (to_remove.size() > 0) { @@ -1016,13 +1011,10 @@ std::optional> pdlp_solver_t // First check for pdlp_termination_reason_t::Optimality and handle the first primal feasible case if (settings_.first_primal_feasible) { - // Both primal feasible, return best objective - // TODO later batch mode: handle primal feasible here - cuopt_expects(!batch_mode_, - error_type_t::ValidationError, - "First primal feasible is not supported in batch mode"); - if (termination_average == pdlp_termination_status_t::PrimalFeasible && + if (!settings_.hyper_params.never_restart_to_average && + termination_average == pdlp_termination_status_t::PrimalFeasible && termination_current == pdlp_termination_status_t::PrimalFeasible) { + // Both primal feasible, return the one with the best overall residual const f_t current_overall_primal_residual = current_termination_strategy_.get_convergence_information() .get_l2_primal_residual() @@ -1065,7 +1057,8 @@ std::optional> pdlp_solver_t : pdhg_solver_.get_potential_next_dual_solution(), get_filled_warmed_start_data(), {termination_current}); - } else if (termination_average == pdlp_termination_status_t::PrimalFeasible) { + } else if (!settings_.hyper_params.never_restart_to_average && + termination_average == 
pdlp_termination_status_t::PrimalFeasible) { return average_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, @@ -1621,6 +1614,15 @@ void pdlp_solver_t::swap_context( make_span(primal_step_size_), make_span(dual_step_size_)); RAFT_CUDA_TRY(cudaPeekAtLastError()); + // Swap unscaled problem's per-climber fields (COL-major blocks) + if (problem_ptr->objective_coefficients.size() > static_cast(primal_size_h_)) { + matrix_swap(problem_ptr->objective_coefficients, primal_size_h_, swap_pairs); + } + if (problem_ptr->constraint_lower_bounds.size() > static_cast(dual_size_h_)) { + matrix_swap(problem_ptr->constraint_lower_bounds, dual_size_h_, swap_pairs); + matrix_swap(problem_ptr->constraint_upper_bounds, dual_size_h_, swap_pairs); + matrix_swap(problem_ptr->combined_bounds, dual_size_h_, swap_pairs); + } } template @@ -1636,6 +1638,16 @@ void pdlp_solver_t::resize_context(i_t new_size) step_size_.resize(new_size, stream_view_); primal_step_size_.resize(new_size, stream_view_); dual_step_size_.resize(new_size, stream_view_); + initial_scaling_strategy_.resize_context(new_size); + // Resize unscaled problem's per-climber fields (COL-major) + if (problem_ptr->objective_coefficients.size() > static_cast(primal_size_h_)) { + problem_ptr->objective_coefficients.resize(new_size * primal_size_h_, stream_view_); + } + if (problem_ptr->constraint_lower_bounds.size() > static_cast(dual_size_h_)) { + problem_ptr->constraint_lower_bounds.resize(new_size * dual_size_h_, stream_view_); + problem_ptr->constraint_upper_bounds.resize(new_size * dual_size_h_, stream_view_); + problem_ptr->combined_bounds.resize(new_size * dual_size_h_, stream_view_); + } climber_strategies_.resize(new_size); } @@ -1653,6 +1665,7 @@ void pdlp_solver_t::swap_all_context( swap_context(swap_pairs); step_size_strategy_.swap_context(swap_pairs); current_termination_strategy_.swap_context(swap_pairs); + initial_scaling_strategy_.swap_context(swap_pairs); for (const 
auto& pair : swap_pairs) { host_vector_swap(climber_strategies_, pair.left, pair.right); @@ -1666,7 +1679,7 @@ void pdlp_solver_t::resize_all_context(i_t new_size) { raft::common::nvtx::range fun_scope("resize_all_context"); - // Resize PDHG, its saddle point and its new bounds + // Resize PDHG and its saddle point pdhg_solver_.resize_context(new_size); // Resize restart strategy and its duality gap container restart_strategy_.resize_context(new_size); @@ -1717,10 +1730,14 @@ void pdlp_solver_t::resize_and_swap_all_context_loop( // No swap can happen if all climbers to remove are at the end if (!swap_pairs.empty()) { swap_all_context(swap_pairs); } + const i_t new_size = last + 1; cuopt_assert( - last + 1 == climber_strategies_.size() - climber_strategies_to_remove.size(), + new_size == climber_strategies_.size() - climber_strategies_to_remove.size(), "Last + 1 must be equal to climber_strategies_.size() - climber_strategies_to_remove.size()"); - resize_all_context(last + 1); + // New bounds are grouped per climber: one climber can own multiple entries + // We need both the swap pairs and the new size to perform the operation + pdhg_solver_.resize_and_swap_new_bounds_context(swap_pairs, new_size); + resize_all_context(new_size); #ifdef BATCH_VERBOSE_MODE std::cout << "Batch size is now " << climber_strategies_.size() << ". Climbers left: "; @@ -2078,6 +2095,43 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte } } +// Need to transpose the scaled problem fields between COL-major and ROW-major. +// In PDHG everything is ROW-major for faster SpMM. +// The scaled fields need to be transposed back to COL-major as we might need to swap and resize +// them. 
No op if the fields were not expanded +template +void pdlp_solver_t::transpose_problem_fields(bool to_row) +{ + auto transpose_field = [&](rmm::device_uvector& field, i_t rows) { + if (field.size() <= static_cast(rows)) return; + rmm::device_uvector transposed(field.size(), stream_view_); + auto batch_size = static_cast(climber_strategies_.size()); + auto input_ld = to_row ? &rows : &batch_size; + auto output_ld = to_row ? &batch_size : &rows; + CUBLAS_CHECK(cublasGeam(handle_ptr_->get_cublas_handle(), + CUBLAS_OP_T, + CUBLAS_OP_N, + *output_ld, + *input_ld, + reusable_device_scalar_value_1_.data(), + field.data(), + *input_ld, + reusable_device_scalar_value_0_.data(), + nullptr, + *output_ld, + transposed.data(), + *output_ld)); + raft::copy(field.data(), transposed.data(), field.size(), stream_view_); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + }; + + RAFT_CUBLAS_TRY(cublasSetStream(handle_ptr_->get_cublas_handle(), stream_view_)); + // We need to swap the scaled version because they can be dynamically resized and swapped. 
+ transpose_field(op_problem_scaled_.objective_coefficients, primal_size_h_); + transpose_field(op_problem_scaled_.constraint_lower_bounds, dual_size_h_); + transpose_field(op_problem_scaled_.constraint_upper_bounds, dual_size_h_); +} + // Tranpose all the data we use in termination condition and restart: // potential_next_primal_solution, potential_next_dual_solution, dual_slack template @@ -2155,6 +2209,8 @@ void pdlp_solver_t::transpose_primal_dual_to_row( dual_transposed.data(), dual_size_h_ * climber_strategies_.size(), stream_view_); + + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); } template @@ -2233,6 +2289,8 @@ void pdlp_solver_t::transpose_primal_dual_back_to_col( dual_transposed.data(), dual_size_h_ * climber_strategies_.size(), stream_view_); + + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); } template @@ -2258,10 +2316,24 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co compute_initial_primal_weight(); initial_scaling_strategy_.scale_problem(); + if constexpr (std::is_same_v) { + if (!batch_mode_ && !pdhg_solver_.get_cusparse_view().mixed_precision_enabled_) { + pdhg_solver_.get_cusparse_view().create_spmv_op_plans( + settings_.hyper_params.use_reflected_primal_dual); + } + } // Update FP32 matrix copies for mixed precision SpMV after scaling pdhg_solver_.get_cusparse_view().update_mixed_precision_matrices(); + // Redirect cuSPARSE descriptors to use the original problem's structural data (offsets, indices), + // then free the duplicated structural vectors from the scaled copy to save device memory. 
+ pdhg_solver_.get_cusparse_view().redirect_cusparse_csr_structure_pointers(*problem_ptr); + op_problem_scaled_.variables.resize(0, stream_view_); + op_problem_scaled_.offsets.resize(0, stream_view_); + op_problem_scaled_.reverse_constraints.resize(0, stream_view_); + op_problem_scaled_.reverse_offsets.resize(0, stream_view_); + if (!settings_.hyper_params.compute_initial_step_size_before_scaling && !settings_.get_initial_step_size().has_value()) compute_initial_step_size(); @@ -2374,15 +2446,34 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co // Project initial primal solution if (settings_.hyper_params.project_initial_primal) { using f_t2 = typename type_2::type; - cub::DeviceTransform::Transform( - cuda::std::make_tuple(pdhg_solver_.get_primal_solution().data(), - problem_wrap_container(op_problem_scaled_.variable_bounds)), - pdhg_solver_.get_primal_solution().data(), - pdhg_solver_.get_primal_solution().size(), - clamp(), - stream_view_.value()); + if (batch_mode_) { + // In batch mode variable_bounds are shared and only the bound rescaling is per climber. 
+ // Apply it here too so the initial point is projected into the correct scaled space + cub::DeviceTransform::Transform( + cuda::std::make_tuple( + pdhg_solver_.get_primal_solution().data(), + thrust::make_transform_iterator( + thrust::make_zip_iterator( + problem_wrap_container(op_problem_scaled_.variable_bounds), + batch_wrapped_container(initial_scaling_strategy_.get_bound_rescaling_vector(), + primal_size_h_)), + scale_bounds_by_scalar_op{})), + pdhg_solver_.get_primal_solution().data(), + pdhg_solver_.get_primal_solution().size(), + clamp(), + stream_view_.value()); + } else { + cub::DeviceTransform::Transform( + cuda::std::make_tuple(pdhg_solver_.get_primal_solution().data(), + problem_wrap_container(op_problem_scaled_.variable_bounds)), + pdhg_solver_.get_primal_solution().data(), + pdhg_solver_.get_primal_solution().size(), + clamp(), + stream_view_.value()); + } - pdhg_solver_.refine_initial_primal_projection(); + pdhg_solver_.refine_initial_primal_projection( + initial_scaling_strategy_.get_bound_rescaling_vector()); if (!settings_.hyper_params.never_restart_to_average) { cuopt_expects(!batch_mode_, @@ -2426,6 +2517,7 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co restart_strategy_.last_restart_duality_gap_.dual_solution_, dummy); } + transpose_problem_fields(/*to_row=*/true); } if (verbose) { @@ -2513,8 +2605,10 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co } } - // In case of batch mode, primal and dual matrices are in row format - // We need to transpose them to column format before doing any checks + // In case of batch mode, primal/dual iterates and scaled problem fields are ROW-major + // for PDHG. We transpose them back to COL for convergence/termination checks, and + // swap_context / resize_context (which assume COL layout for block-based swaps). 
+ // The unscaled problem fields (problem_ptr->) stay COL permanently if (batch_mode_) { rmm::device_uvector dummy(0, stream_view_); transpose_primal_dual_back_to_col(pdhg_solver_.get_potential_next_primal_solution(), @@ -2526,6 +2620,7 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co dummy); transpose_primal_dual_back_to_col( pdhg_solver_.get_primal_solution(), pdhg_solver_.get_dual_solution(), dummy); + transpose_problem_fields(/*to_row=*/false); } #ifdef CUPDLP_DEBUG_MODE @@ -2639,6 +2734,7 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co dummy); transpose_primal_dual_to_row( pdhg_solver_.get_primal_solution(), pdhg_solver_.get_dual_solution(), dummy); + transpose_problem_fields(/*to_row=*/true); } } @@ -2671,6 +2767,7 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co pdhg_solver_.get_saddle_point_state().get_current_AtY()); transpose_primal_dual_back_to_col( pdhg_solver_.get_primal_solution(), pdhg_solver_.get_dual_solution(), dummy); + transpose_problem_fields(/*to_row=*/false); } compute_fixed_error(has_restarted); // May set has_restarted to false if (batch_mode_) { @@ -2680,6 +2777,7 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co pdhg_solver_.get_saddle_point_state().get_current_AtY()); transpose_primal_dual_to_row( pdhg_solver_.get_primal_solution(), pdhg_solver_.get_dual_solution(), dummy); + transpose_problem_fields(/*to_row=*/true); } } halpern_update(); @@ -2708,12 +2806,14 @@ void pdlp_solver_t::take_adaptive_step(i_t total_pdlp_iterations, bool print("primal_step_size_", primal_step_size_); print("dual_step_size_", dual_step_size_); #endif - pdhg_solver_.take_step(primal_step_size_, - dual_step_size_, - restart_strategy_.get_iterations_since_last_restart(), - restart_strategy_.get_last_restart_was_average(), - total_pdlp_iterations, - is_major_iteration); + pdhg_solver_.take_step( + primal_step_size_, + dual_step_size_, + initial_scaling_strategy_.get_bound_rescaling_vector(), // Only used 
in batch mode + restart_strategy_.get_iterations_since_last_restart(), + restart_strategy_.get_last_restart_was_average(), + total_pdlp_iterations, + is_major_iteration); step_size_strategy_.compute_step_sizes( pdhg_solver_, primal_step_size_, dual_step_size_, total_pdlp_iterations); @@ -2736,7 +2836,13 @@ template void pdlp_solver_t::take_constant_step(bool is_major_iteration) { pdhg_solver_.take_step( - primal_step_size_, dual_step_size_, 0, false, total_pdlp_iterations_, is_major_iteration); + primal_step_size_, + dual_step_size_, + initial_scaling_strategy_.get_bound_rescaling_vector(), // Only used in batch mode + 0, + false, + total_pdlp_iterations_, + is_major_iteration); } template @@ -3015,7 +3121,6 @@ void pdlp_solver_t::compute_initial_primal_weight() // Here we use the combined bounds of the op_problem_scaled which may or may not be scaled yet // based on pdlp config - // TODO later batch mode: handle per problem objective coefficients and rhs detail::combine_constraint_bounds(op_problem_scaled_, op_problem_scaled_.combined_bounds); rmm::device_scalar c_vec_norm{0.0, stream_view_}; diff --git a/cpp/src/pdlp/pdlp.cuh b/cpp/src/pdlp/pdlp.cuh index d03430f150..9447eaeaf3 100644 --- a/cpp/src/pdlp/pdlp.cuh +++ b/cpp/src/pdlp/pdlp.cuh @@ -67,8 +67,6 @@ class pdlp_solver_t { f_t get_primal_weight_h(i_t id) const; f_t get_step_size_h(i_t id) const; i_t get_total_pdhg_iterations() const; - f_t get_relative_dual_tolerance_factor() const; - f_t get_relative_primal_tolerance_factor() const; detail::pdlp_termination_strategy_t& get_current_termination_strategy(); void swap_context(const thrust::universal_host_pinned_vector>& swap_pairs); @@ -87,7 +85,6 @@ class pdlp_solver_t { void set_initial_primal_weight(f_t initial_primal_weight); void set_initial_step_size(f_t initial_primal_weight); void set_initial_k(i_t initial_k); - void set_relative_dual_tolerance_factor(f_t dual_tolerance_factor); void set_relative_primal_tolerance_factor(f_t primal_tolerance_factor); 
using primal_quality_adapter_t = @@ -111,6 +108,13 @@ class pdlp_solver_t { std::optional> check_termination(const timer_t& timer); std::optional> check_batch_termination( const timer_t& timer); + // Snapshot the current iterate of climber `i` (batch-local index) into + // `batch_solution_to_return_` at its `original_index` slot + void snapshot_climber_into_return(size_t i); + // flush GPU termination stats into `batch_solution_to_return_` and construct the final solution. + optimization_problem_solution_t finalize_batch_return(); + optimization_problem_solution_t finalize_batch_return_with_limit_reached( + pdlp_termination_status_t limit_reached_status); std::optional> check_limits(const timer_t& timer); void record_best_primal_so_far(const detail::pdlp_termination_strategy_t& current, const detail::pdlp_termination_strategy_t& average, @@ -132,6 +136,11 @@ class pdlp_solver_t { void update_primal_dual_solutions(std::optional*> primal, std::optional*> dual); + // Initial number of climbers (derived from settings.fixed_batch_size / settings.new_bounds at + // ctor time). + // Stable throughout solving — use this whenever you need the ORIGINAL batch size, since + // `climber_strategies_` shrinks as climbers finish via resize_and_swap_all_context_loop. 
+ const size_t original_batch_size_; std::vector climber_strategies_; bool batch_mode_{false}; @@ -185,6 +194,7 @@ class pdlp_solver_t { pdlp_warm_start_data_t get_filled_warmed_start_data(); + void transpose_problem_fields(bool to_row); void transpose_primal_dual_to_row(rmm::device_uvector& primal_to_transpose, rmm::device_uvector& dual_to_transpose, rmm::device_uvector& dual_slack_to_transpose); diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index 2b10310260..17c7abcac5 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -39,6 +40,7 @@ #include #include #include +#include #include @@ -87,8 +89,8 @@ pdlp_restart_strategy_t::pdlp_restart_strategy_t( restart_triggered_{0, stream_view_}, candidate_is_avg_{0, stream_view_}, avg_duality_gap_{handle_ptr_, - is_cupdlpx_restart(hyper_params) ? 0 : primal_size, - is_cupdlpx_restart(hyper_params) ? 0 : dual_size, + hyper_params.never_restart_to_average ? 0 : primal_size, + hyper_params.never_restart_to_average ? 
0 : dual_size, climber_strategies, hyper_params}, current_duality_gap_{handle_ptr_, @@ -848,9 +850,9 @@ __global__ void kernel_compute_next_cupdlpx_primal_weight( if (index >= batch_size) { return; } const f_t relative_l2_dual_residual_value = - view.l2_dual_residual[index] / (f_t(1.0) + view.l2_norm_primal_linear_objective); + view.l2_dual_residual[index] / (f_t(1.0) + view.l2_norm_primal_linear_objective[index]); const f_t relative_l2_primal_residual_value = - view.l2_primal_residual[index] / (f_t(1.0) + view.l2_norm_primal_right_hand_side); + view.l2_primal_residual[index] / (f_t(1.0) + view.l2_norm_primal_right_hand_side[index]); cupdlpx_new_primal_weight_computation(view.primal_distance[index], view.dual_distance[index], @@ -2442,9 +2444,9 @@ pdlp_restart_strategy_t::make_cupdlpx_restart_view( v.l2_dual_residual = make_span(current_convergence_information.get_l2_dual_residual()); v.l2_primal_residual = make_span(current_convergence_information.get_l2_primal_residual()); v.l2_norm_primal_linear_objective = - current_convergence_information.get_relative_dual_tolerance_factor(); + make_span(current_convergence_information.get_l2_norm_primal_linear_objective()); v.l2_norm_primal_right_hand_side = - current_convergence_information.get_relative_primal_tolerance_factor(); + make_span(current_convergence_information.get_l2_norm_primal_right_hand_side()); v.step_size = make_span(step_size); v.primal_weight = make_span(primal_weight); v.primal_weight_error_sum = make_span(primal_weight_error_sum_); diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cuh b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cuh index 4274185191..0c00e50240 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cuh +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cuh @@ -88,8 +88,8 @@ class pdlp_restart_strategy_t { raft::device_span dual_distance; raft::device_span l2_dual_residual; raft::device_span l2_primal_residual; - f_t 
l2_norm_primal_linear_objective; - f_t l2_norm_primal_right_hand_side; + raft::device_span l2_norm_primal_linear_objective; + raft::device_span l2_norm_primal_right_hand_side; raft::device_span step_size; raft::device_span primal_weight; raft::device_span primal_weight_error_sum; diff --git a/cpp/src/pdlp/saddle_point.cu b/cpp/src/pdlp/saddle_point.cu index 157e7fa389..f740176a3c 100644 --- a/cpp/src/pdlp/saddle_point.cu +++ b/cpp/src/pdlp/saddle_point.cu @@ -7,6 +7,7 @@ #include +#include #include #include @@ -17,10 +18,12 @@ namespace cuopt::linear_programming::detail { template -saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handle_ptr, - const i_t primal_size, - const i_t dual_size, - const size_t batch_size) +saddle_point_state_t::saddle_point_state_t( + raft::handle_t const* handle_ptr, + const i_t primal_size, + const i_t dual_size, + const size_t batch_size, + const pdlp_hyper_params::pdlp_hyper_params_t& hyper_params) : primal_size_{primal_size}, dual_size_{dual_size}, primal_solution_{batch_size * primal_size, handle_ptr->get_stream()}, @@ -28,7 +31,9 @@ saddle_point_state_t::saddle_point_state_t(raft::handle_t const* handl delta_primal_{batch_size * primal_size, handle_ptr->get_stream()}, delta_dual_{batch_size * dual_size, handle_ptr->get_stream()}, // Primal gradient is only used in trust region restart mode which does not support batch mode - primal_gradient_{static_cast(primal_size), handle_ptr->get_stream()}, + primal_gradient_{ + !is_cupdlpx_restart(hyper_params) ? 
static_cast(primal_size) : 0, + handle_ptr->get_stream()}, dual_gradient_{batch_size * dual_size, handle_ptr->get_stream()}, current_AtY_{batch_size * primal_size, handle_ptr->get_stream()}, next_AtY_{batch_size * primal_size, handle_ptr->get_stream()} diff --git a/cpp/src/pdlp/saddle_point.hpp b/cpp/src/pdlp/saddle_point.hpp index 7e8f87fa25..eb6b8025cf 100644 --- a/cpp/src/pdlp/saddle_point.hpp +++ b/cpp/src/pdlp/saddle_point.hpp @@ -7,6 +7,8 @@ #pragma once +#include + #include #include @@ -64,7 +66,8 @@ class saddle_point_state_t { saddle_point_state_t(raft::handle_t const* handle_ptr, i_t primal_size, i_t dual_size, - size_t batch_size); + size_t batch_size, + const pdlp_hyper_params::pdlp_hyper_params_t& hyper_params); /** * @brief Copies the values of the solutions in another saddle_point_state_t diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 29a7f32db6..bb2d193e18 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -53,8 +54,14 @@ #include +#include + +#include +#include #include +#include #include +#include #define CUOPT_LOG_CONDITIONAL_INFO(condition, ...) 
\ if ((condition)) { CUOPT_LOG_INFO(__VA_ARGS__); } @@ -630,6 +637,7 @@ static optimization_problem_solution_t run_pdlp_solver_in_fp32( fs.per_constraint_residual = settings.per_constraint_residual; fs.save_best_primal_so_far = settings.save_best_primal_so_far; fs.first_primal_feasible = settings.first_primal_feasible; + fs.all_primal_feasible = settings.all_primal_feasible; fs.eliminate_dense_columns = settings.eliminate_dense_columns; fs.pdlp_precision = pdlp_precision_t::DefaultPrecision; fs.method = method_t::PDLP; @@ -846,10 +854,15 @@ optimization_problem_solution_t run_pdlp(detail::problem_t& } // Compute in double as some cases overflow when using size_t +// +// `per_climber_objectives` / `per_climber_constraint_bounds` tell the estimator whether the caller +// will expand these fields to (trial_batch_size * n_{vars,constraints}). template static double batch_pdlp_memory_estimator(const optimization_problem_t& problem, double trial_batch_size, - bool collect_solutions = false) + bool per_climber_objectives = false, + bool per_climber_constraint_bounds = false, + bool collect_solutions = false) { double total_memory = 0.0; // In PDLP we store the scaled version of the problem which contains all of those @@ -857,12 +870,30 @@ static double batch_pdlp_memory_estimator(const optimization_problem_t total_memory += problem.get_constraint_matrix_offsets().size() * sizeof(i_t); total_memory += problem.get_constraint_matrix_values().size() * sizeof(f_t); total_memory *= 2.0; // To account for the A_t matrix - total_memory += problem.get_objective_coefficients().size() * sizeof(f_t); + + // Internally we always have a scaled and an unscaled version of the objective coefficients + if (per_climber_objectives) { + total_memory += 2.0 * trial_batch_size * problem.get_n_variables() * sizeof(f_t); + } else { + total_memory += 2.0 * problem.get_objective_coefficients().size() * sizeof(f_t); + } + total_memory += problem.get_constraint_bounds().size() * sizeof(f_t); 
total_memory += problem.get_variable_lower_bounds().size() * sizeof(f_t); total_memory += problem.get_variable_upper_bounds().size() * sizeof(f_t); - total_memory += problem.get_constraint_lower_bounds().size() * sizeof(f_t); - total_memory += problem.get_constraint_upper_bounds().size() * sizeof(f_t); + + // Per-climber constraint bounds expansion adds 2 * trial_batch_size * n_constraints. Strong + // branching never expands these, so the flag guards the cost. + // 2.0 because we have scaled and unscaled + if (per_climber_constraint_bounds) { + total_memory += + 2.0 * trial_batch_size * problem.get_constraint_lower_bounds().size() * sizeof(f_t); + total_memory += + 2.0 * trial_batch_size * problem.get_constraint_upper_bounds().size() * sizeof(f_t); + } else { + total_memory += 2.0 * problem.get_constraint_lower_bounds().size() * sizeof(f_t); + total_memory += 2.0 * problem.get_constraint_upper_bounds().size() * sizeof(f_t); + } // Batch data estimator @@ -909,34 +940,306 @@ static double batch_pdlp_memory_estimator(const optimization_problem_t return total_memory; } +// We need to custom craft a solver settings for the batch mode as we need a specific set of values +// We override iteration limit and pdlp tolerance unless the user has specified otherwise template -optimization_problem_solution_t run_batch_pdlp( - optimization_problem_t& problem, pdlp_solver_settings_t const& settings) +static void apply_batch_settings_overrides( + const pdlp_solver_settings_t& original_settings, + pdlp_solver_settings_t& batch_settings) { - // Hyper parameter than can be changed, I have put what I believe to be the best + constexpr int batch_iteration_limit = 100000; + constexpr f_t pdlp_tolerance = 1e-4; + + const pdlp_solver_settings_t default_settings{}; + + auto override_or_keep_given = + [&](const auto& given_value, const auto& default_value, const auto& override_value) { + return given_value == default_value ? 
override_value : given_value; + }; + + batch_settings.method = cuopt::linear_programming::method_t::PDLP; + batch_settings.presolver = presolver_t::None; + batch_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; + batch_settings.detect_infeasibility = false; + batch_settings.iteration_limit = override_or_keep_given( + original_settings.iteration_limit, default_settings.iteration_limit, batch_iteration_limit); + batch_settings.inside_mip = true; + // Override the tolerances unless the user has specified otherwise + // Only risk is overriding a user intentionally wanting to use numeric_limits::max() as an + // iteration limit + batch_settings.tolerances.absolute_dual_tolerance = + override_or_keep_given(original_settings.tolerances.absolute_dual_tolerance, + default_settings.tolerances.absolute_dual_tolerance, + pdlp_tolerance); + batch_settings.tolerances.relative_dual_tolerance = + override_or_keep_given(original_settings.tolerances.relative_dual_tolerance, + default_settings.tolerances.relative_dual_tolerance, + pdlp_tolerance); + batch_settings.tolerances.absolute_primal_tolerance = + override_or_keep_given(original_settings.tolerances.absolute_primal_tolerance, + default_settings.tolerances.absolute_primal_tolerance, + pdlp_tolerance); + batch_settings.tolerances.relative_primal_tolerance = + override_or_keep_given(original_settings.tolerances.relative_primal_tolerance, + default_settings.tolerances.relative_primal_tolerance, + pdlp_tolerance); + batch_settings.tolerances.absolute_gap_tolerance = + override_or_keep_given(original_settings.tolerances.absolute_gap_tolerance, + default_settings.tolerances.absolute_gap_tolerance, + pdlp_tolerance); + batch_settings.tolerances.relative_gap_tolerance = + override_or_keep_given(original_settings.tolerances.relative_gap_tolerance, + default_settings.tolerances.relative_gap_tolerance, + pdlp_tolerance); + constexpr bool pdlp_primal_dual_init = true; constexpr bool primal_weight_init = true; - constexpr bool 
use_initial_pdlp_iterations = true; - bool use_optimal_batch_size = false; - constexpr int batch_iteration_limit = 100000; - constexpr f_t pdlp_tolerance = 1e-5; + constexpr bool use_initial_pdlp_iterations = false; + if (original_settings.has_initial_primal_solution() && pdlp_primal_dual_init) { + batch_settings.set_initial_primal_solution( + original_settings.get_initial_primal_solution().data(), + original_settings.get_initial_primal_solution().size(), + original_settings.get_initial_primal_solution().stream()); + } + if (original_settings.has_initial_dual_solution() && pdlp_primal_dual_init) { + batch_settings.set_initial_dual_solution( + original_settings.get_initial_dual_solution().data(), + original_settings.get_initial_dual_solution().size(), + original_settings.get_initial_dual_solution().stream()); + } + // Step size doesn't change anyways, just to save the compute + if (original_settings.get_initial_step_size().has_value()) { + batch_settings.set_initial_step_size(original_settings.get_initial_step_size().value()); + } + if (original_settings.get_initial_primal_weight().has_value() && primal_weight_init) { + batch_settings.set_initial_primal_weight(original_settings.get_initial_primal_weight().value()); + } + if (original_settings.get_initial_pdlp_iteration().has_value() && use_initial_pdlp_iterations) { + batch_settings.set_initial_pdlp_iteration( + original_settings.get_initial_pdlp_iteration().value()); + } +} + +// Fixed-path helper: caller pre-sized the batch via fixed_batch_size and pre-expanded any +// per-climber problem fields directly on the optimization_problem_t (objective_coefficients, +// constraint_lower_bounds, constraint_upper_bounds, batch_objective_offsets_). A single +// solve_lp call runs the batch — no memory heuristics, no sub-batching. 
+template +static optimization_problem_solution_t run_batch_pdlp_fixed( + optimization_problem_t& problem, pdlp_solver_settings_t const& settings) +{ + cuopt_expects(settings.fixed_batch_size > 0, + error_type_t::ValidationError, + "run_batch_pdlp_fixed requires fixed_batch_size > 0"); + + const size_t n_vars = static_cast(problem.get_n_variables()); + const size_t n_constraints = static_cast(problem.get_n_constraints()); + const size_t bs = static_cast(settings.fixed_batch_size); + + const size_t obj_size = problem.get_objective_coefficients().size(); + const size_t clb_size = problem.get_constraint_lower_bounds().size(); + const size_t cub_size = problem.get_constraint_upper_bounds().size(); + const size_t off_size = problem.get_batch_objective_offsets().size(); + + cuopt_expects( + obj_size == n_vars || obj_size == bs * n_vars, + error_type_t::ValidationError, + "run_batch_pdlp fixed path: objective_coefficients size (%zu) must equal n_variables " + "(%zu, shared across climbers) or fixed_batch_size * n_variables (%zu, per-climber).", + obj_size, + n_vars, + bs * n_vars); + + cuopt_expects( + clb_size == n_constraints || clb_size == bs * n_constraints, + error_type_t::ValidationError, + "run_batch_pdlp fixed path: constraint_lower_bounds size (%zu) must equal n_constraints " + "(%zu, shared across climbers) or fixed_batch_size * n_constraints (%zu, per-climber).", + clb_size, + n_constraints, + bs * n_constraints); + + cuopt_expects( + cub_size == n_constraints || cub_size == bs * n_constraints, + error_type_t::ValidationError, + "run_batch_pdlp fixed path: constraint_upper_bounds size (%zu) must equal n_constraints " + "(%zu, shared across climbers) or fixed_batch_size * n_constraints (%zu, per-climber).", + cub_size, + n_constraints, + bs * n_constraints); + + // The lower/upper sweep in pdhg.cu (`if (constraint_lower_bounds.size() > dual_size_h_)`) keys + // off the lower-bound array only and assumes the upper-bound array follows. 
Reject any layout + // where one is shared and the other is per-climber. + cuopt_expects(clb_size == cub_size, + error_type_t::ValidationError, + "run_batch_pdlp fixed path: constraint_lower_bounds (%zu) and " + "constraint_upper_bounds (%zu) must have the same size (both shared or both " + "per-climber).", + clb_size, + cub_size); + + cuopt_expects(off_size == 0 || off_size == bs, + error_type_t::ValidationError, + "run_batch_pdlp fixed path: batch_objective_offsets size (%zu) must be 0 (no " + "per-climber offsets) or fixed_batch_size (%zu).", + off_size, + bs); + + pdlp_solver_settings_t batch_settings = settings; + apply_batch_settings_overrides(settings, batch_settings); + + return solve_lp(problem, + batch_settings, + /*problem_checking=*/false, + /*use_pdlp_solver_mode=*/true, + /*is_batch_mode=*/true); +} + +template +static void validate_new_bounds(const optimization_problem_t& problem, + pdlp_solver_settings_t const& settings) +{ + std::set> seen_bounds; + i_t last_climber_id = -1; + for (const auto& new_bound : settings.new_bounds) { + const auto climber_id = std::get<0>(new_bound); + const auto var_idx = std::get<1>(new_bound); + const auto lower = std::get<2>(new_bound); + const auto upper = std::get<3>(new_bound); + + cuopt_expects( + climber_id >= 0, error_type_t::ValidationError, "new_bounds climber_id must be non-negative"); + if (settings.fixed_batch_size > 0) { + cuopt_expects(climber_id < settings.fixed_batch_size, + error_type_t::ValidationError, + "new_bounds climber_id must be less than fixed_batch_size"); + } + if (climber_id != last_climber_id) { + cuopt_expects(climber_id > last_climber_id, + error_type_t::ValidationError, + "new_bounds climber_id entries must be sorted ascending and grouped"); + last_climber_id = climber_id; + } + cuopt_expects(var_idx >= 0 && var_idx < problem.get_n_variables(), + error_type_t::ValidationError, + "new_bounds variable_index must be in [0, n_variables)"); + cuopt_expects(!std::isnan(lower) && 
!std::isnan(upper), + error_type_t::ValidationError, + "new_bounds lower and upper bounds must not be NaN"); + cuopt_expects(lower <= upper, + error_type_t::ValidationError, + "new_bounds lower bound must be less than or equal to upper bound"); + cuopt_expects(seen_bounds.insert({climber_id, var_idx}).second, + error_type_t::ValidationError, + "new_bounds cannot contain duplicate (climber_id, variable_index) entries"); + } +} + +// Returns the batch size implied by per-climber variable-bound overrides. +template +static size_t new_bounds_batch_size(const std::vector>& new_bounds) +{ + cuopt_assert(!new_bounds.empty(), "Batch size should be greater than 0"); + i_t max_climber_id = 0; + for (const auto& new_bound : new_bounds) { + const auto climber_id = std::get<0>(new_bound); + cuopt_assert(climber_id >= 0, "new_bounds climber_id must be non-negative"); + max_climber_id = std::max(max_climber_id, climber_id); + } + return static_cast(max_climber_id) + 1; +} + +template +static void validate_splitting_new_bounds( + const std::vector>& new_bounds, size_t batch_size) +{ + cuopt_expects(new_bounds.size() == batch_size, + error_type_t::ValidationError, + "run_batch_pdlp splitting path requires exactly one new_bounds entry per climber"); + for (size_t i = 0; i < batch_size; ++i) { + cuopt_expects(std::get<0>(new_bounds[i]) == static_cast(i), + error_type_t::ValidationError, + "run_batch_pdlp splitting path requires new_bounds sorted by climber_id with no " + "missing climbers"); + } +} +template +static size_t max_memory_batch_size(const optimization_problem_t& problem, + bool per_climber_objectives, + bool per_climber_constraint_bounds, + bool collect_solutions, + size_t memory_max_batch_size) +{ + size_t st_free_mem, st_total_mem; + RAFT_CUDA_TRY(cudaMemGetInfo(&st_free_mem, &st_total_mem)); + const double free_mem = static_cast(st_free_mem); + const double total_mem = static_cast(st_total_mem); + + while (memory_max_batch_size > 0) { + const double mem_est = 
batch_pdlp_memory_estimator(problem, + memory_max_batch_size, + per_climber_objectives, + per_climber_constraint_bounds, + collect_solutions); + if (mem_est <= free_mem) { break; } +#ifdef BATCH_VERBOSE_MODE + std::cout << "Memory estimate: " << mem_est << std::endl; + std::cout << "Memory max batch size: " << memory_max_batch_size << std::endl; + std::cout << "Free memory: " << free_mem << std::endl; + std::cout << "Total memory: " << total_mem << std::endl; + std::cout << "--------------------------------" << std::endl; +#endif + memory_max_batch_size--; + } + return memory_max_batch_size; +} + +// Splitting-path helper: strong-branching flow. +// By default will try to run with the full batch size +// If the memory is too high, it will use the optimal batch size heuristic and split the batch into +// sub-batches +template +static optimization_problem_solution_t run_batch_pdlp_splitting( + optimization_problem_t& problem, pdlp_solver_settings_t const& settings) +{ rmm::cuda_stream_view stream = problem.get_handle_ptr()->get_stream(); + const i_t n_vars = problem.get_n_variables(); + const i_t n_constraints = problem.get_n_constraints(); - rmm::device_uvector initial_primal(0, stream); - rmm::device_uvector initial_dual(0, stream); - f_t initial_step_size = std::numeric_limits::signaling_NaN(); - f_t initial_primal_weight = std::numeric_limits::signaling_NaN(); - i_t initial_pdlp_iteration = -1; + // Splitting path only supports un-expanded problems + per-climber variable-bound overrides. + cuopt_expects(problem.get_objective_coefficients().size() == static_cast(n_vars), + error_type_t::ValidationError, + "run_batch_pdlp splitting path requires un-expanded objective_coefficients " + "(size == n_variables). 
Set fixed_batch_size and pre-expand on the " + "optimization_problem_t to use the fixed path for per-climber problem data."); + cuopt_expects(problem.get_constraint_lower_bounds().size() == static_cast(n_constraints), + error_type_t::ValidationError, + "run_batch_pdlp splitting path requires un-expanded constraint_lower_bounds " + "(size == n_constraints)."); + cuopt_expects(problem.get_constraint_upper_bounds().size() == static_cast(n_constraints), + error_type_t::ValidationError, + "run_batch_pdlp splitting path requires un-expanded constraint_upper_bounds " + "(size == n_constraints)."); + cuopt_expects(problem.get_batch_objective_offsets().size() == 0, + error_type_t::ValidationError, + "run_batch_pdlp splitting path does not support per-climber objective offsets. " + "Use the fixed path (set fixed_batch_size) instead."); cuopt_assert(settings.new_bounds.size() > 0, "Batch size should be greater than 0"); - const size_t max_batch_size = settings.new_bounds.size(); + const size_t max_batch_size = new_bounds_batch_size(settings.new_bounds); size_t memory_max_batch_size = max_batch_size; + validate_splitting_new_bounds(settings.new_bounds, max_batch_size); - // Check if we don't hit the limit using max_batch_size const bool collect_solutions = settings.generate_batch_primal_dual_solution; + // Strong branching never expands per-climber objectives or constraint bounds. 
const double memory_estimate = - batch_pdlp_memory_estimator(problem, max_batch_size, collect_solutions); + batch_pdlp_memory_estimator(problem, + max_batch_size, + /*per_climber_objectives=*/false, + /*per_climber_constraint_bounds=*/false, + collect_solutions); size_t st_free_mem, st_total_mem; RAFT_CUDA_TRY(cudaMemGetInfo(&st_free_mem, &st_total_mem)); const double free_mem = static_cast(st_free_mem); @@ -948,25 +1251,17 @@ optimization_problem_solution_t run_batch_pdlp( std::cout << "Total memory: " << total_mem << std::endl; #endif + bool use_optimal_batch_size = false; + // If the memory estimate is too high, we need to use the optimal batch size heuristic if (memory_estimate > free_mem) { use_optimal_batch_size = true; - // Decrement batch size iteratively until we find a batch size that fits - while (memory_max_batch_size > 1) { - const double memory_estimate = - batch_pdlp_memory_estimator(problem, memory_max_batch_size, collect_solutions); - if (memory_estimate <= free_mem) { break; } -#ifdef BATCH_VERBOSE_MODE - std::cout << "Memory estimate: " << memory_estimate << std::endl; - std::cout << "Memory max batch size: " << memory_max_batch_size << std::endl; - std::cout << "Free memory: " << free_mem << std::endl; - std::cout << "Total memory: " << total_mem << std::endl; - std::cout << "--------------------------------" << std::endl; -#endif - memory_max_batch_size--; - } - const double min_estimate = - batch_pdlp_memory_estimator(problem, memory_max_batch_size, collect_solutions); - if (min_estimate > free_mem) { + memory_max_batch_size = max_memory_batch_size(problem, + /*per_climber_objectives=*/false, + /*per_climber_constraint_bounds=*/false, + collect_solutions, + memory_max_batch_size); + // Can't even fit one PDLP + if (memory_max_batch_size == 0) { return optimization_problem_solution_t(pdlp_termination_status_t::NumericalError, stream); } @@ -975,39 +1270,10 @@ optimization_problem_solution_t run_batch_pdlp( size_t optimal_batch_size = 
use_optimal_batch_size ? detail::optimal_batch_size_handler(problem, memory_max_batch_size) : max_batch_size; - if (settings.sub_batch_size > 0) { optimal_batch_size = settings.sub_batch_size; } + if (settings.fixed_batch_size > 0) { optimal_batch_size = settings.fixed_batch_size; } cuopt_assert(optimal_batch_size != 0 && optimal_batch_size <= max_batch_size, "Optimal batch size should be between 1 and max batch size"); - const bool warm_start_from_settings = settings.has_initial_primal_solution() || - settings.has_initial_dual_solution() || - settings.get_initial_step_size().has_value() || - settings.get_initial_primal_weight().has_value() || - settings.get_initial_pdlp_iteration().has_value(); - - if (warm_start_from_settings) { -#ifdef BATCH_VERBOSE_MODE - std::cout << "Using warm start from settings" << std::endl; -#endif - if (settings.has_initial_primal_solution() && pdlp_primal_dual_init) { - initial_primal = rmm::device_uvector(settings.get_initial_primal_solution(), - settings.get_initial_primal_solution().stream()); - } - if (settings.has_initial_dual_solution() && pdlp_primal_dual_init) { - initial_dual = rmm::device_uvector(settings.get_initial_dual_solution(), - settings.get_initial_dual_solution().stream()); - } - if (settings.get_initial_step_size().has_value() && pdlp_primal_dual_init) { - initial_step_size = *settings.get_initial_step_size(); - } - if (settings.get_initial_primal_weight().has_value() && primal_weight_init) { - initial_primal_weight = *settings.get_initial_primal_weight(); - } - if (settings.get_initial_pdlp_iteration().has_value() && use_initial_pdlp_iterations) { - initial_pdlp_iteration = *settings.get_initial_pdlp_iteration(); - } - } - rmm::device_uvector full_primal_solution( (collect_solutions) ? 
problem.get_n_variables() * max_batch_size : 0, stream); rmm::device_uvector full_dual_solution( @@ -1020,47 +1286,35 @@ optimization_problem_solution_t run_batch_pdlp( full_info; std::vector full_status; - pdlp_solver_settings_t batch_settings = settings; - const auto original_new_bounds = batch_settings.new_bounds; - batch_settings.method = cuopt::linear_programming::method_t::PDLP; - batch_settings.presolver = presolver_t::None; - batch_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; - batch_settings.detect_infeasibility = false; - batch_settings.iteration_limit = batch_iteration_limit; - batch_settings.inside_mip = true; - batch_settings.tolerances.absolute_dual_tolerance = pdlp_tolerance; - batch_settings.tolerances.relative_dual_tolerance = pdlp_tolerance; - batch_settings.tolerances.absolute_primal_tolerance = pdlp_tolerance; - batch_settings.tolerances.relative_primal_tolerance = pdlp_tolerance; - batch_settings.tolerances.absolute_gap_tolerance = pdlp_tolerance; - batch_settings.tolerances.relative_gap_tolerance = pdlp_tolerance; - if (initial_primal.size() > 0) { - batch_settings.set_initial_primal_solution( - initial_primal.data(), initial_primal.size(), initial_primal.stream()); - } - if (initial_dual.size() > 0) { - batch_settings.set_initial_dual_solution( - initial_dual.data(), initial_dual.size(), initial_dual.stream()); - } - if (!std::isnan(initial_step_size)) { batch_settings.set_initial_step_size(initial_step_size); } - if (initial_pdlp_iteration != -1) { - batch_settings.set_initial_pdlp_iteration(initial_pdlp_iteration); - } - if (!std::isnan(initial_primal_weight)) { - batch_settings.set_initial_primal_weight(initial_primal_weight); - } + pdlp_solver_settings_t batch_settings = settings; + const auto original_new_bounds = batch_settings.new_bounds; + apply_batch_settings_overrides(settings, batch_settings); for (size_t i = 0; i < max_batch_size; i += optimal_batch_size) { const size_t current_batch_size = std::min(optimal_batch_size, 
max_batch_size - i); - // Only take the new bounds from [i, i + current_batch_size) - batch_settings.new_bounds = std::vector>( - original_new_bounds.begin() + i, original_new_bounds.begin() + i + current_batch_size); + batch_settings.new_bounds.clear(); + for (size_t c = 0; c < current_batch_size; ++c) { + const auto& new_bound = original_new_bounds[i + c]; + batch_settings.new_bounds.emplace_back(static_cast(c), + std::get<1>(new_bound), + std::get<2>(new_bound), + std::get<3>(new_bound)); + } if (!settings.shared_sb_solved.empty()) { batch_settings.shared_sb_solved = settings.shared_sb_solved.subspan(i, current_batch_size); } - auto sol = solve_lp(problem, batch_settings); + auto sol = solve_lp(problem, + batch_settings, + /*problem_checking=*/false, + /*use_pdlp_solver_mode=*/true, + /*is_batch_mode=*/true); + + // solve_lp swallows cuopt::logic_error and surfaces it via error_status on the returned + // solution. If we kept aggregating, the final batched solution we build below would be + // constructed without forwarding that error_status, silently dropping the error + if (sol.get_error_status().get_error_type() != error_type_t::Success) { return sol; } if (collect_solutions) { raft::copy(full_primal_solution.data() + i * problem.get_n_variables(), @@ -1093,6 +1347,55 @@ optimization_problem_solution_t run_batch_pdlp( std::move(full_status)); } +template +optimization_problem_solution_t run_batch_pdlp( + optimization_problem_t& problem, pdlp_solver_settings_t const& settings) +{ + validate_new_bounds(problem, settings); + + // Fixed path: caller has pre-sized the batch (via fixed_batch_size) and pre-expanded any + // per-climber problem fields directly on the optimization_problem_t. One solve_lp, no memory + // heuristics. + if (settings.fixed_batch_size > 0) { return run_batch_pdlp_fixed(problem, settings); } + // Splitting path: strong-branching flow. Auto-picks batch size and sub-batches based on memory. 
+ return run_batch_pdlp_splitting(problem, settings); +} + +// At this stage, the problem shouldn't already be expanded +// The results of this function should be used as the settings.fixed_batch_size, to expand the +// problem fields and call run_batch_pdlp +template +size_t compute_optimal_batch_size(const optimization_problem_t& problem, + bool per_climber_objectives, + bool per_climber_constraint_bounds, + bool collect_solutions) +{ + // Find the maximum batch size that can be used without exceeding the free memory + + // Since we decerement iteratively, we don't want to use std::numeric_limits::max() + // Even if 20K fits in memory it will never be an optimal batch size, it's just to have a + // reasonable upper bound + constexpr size_t max_batch_size = 20000; + const size_t memory_max_batch_size = max_memory_batch_size(problem, + per_climber_objectives, + per_climber_constraint_bounds, + collect_solutions, + max_batch_size); +#ifdef BATCH_VERBOSE_MODE + std::cout << "Memory max batch size: " << memory_max_batch_size << std::endl; +#endif + + // We now know the maximum batch size that can be used without exceeding the free memory + // Now find the optimal batch size [0, memory_max_batch_size] + + const size_t optimal_batch_size = static_cast( + detail::optimal_batch_size_handler(problem, static_cast(memory_max_batch_size))); +#ifdef BATCH_VERBOSE_MODE + std::cout << "Optimal batch size: " << optimal_batch_size << std::endl; +#endif + return optimal_batch_size; +} + template optimization_problem_solution_t batch_pdlp_solve( raft::handle_t const* handle_ptr, @@ -1112,15 +1415,16 @@ optimization_problem_solution_t batch_pdlp_solve( // Lower bounds can sometimes generate infeasible instances that we struggle to detect constexpr bool only_upper = false; - int batch_size = only_upper ? 
fractional.size() : fractional.size() * 2; for (size_t i = 0; i < fractional.size(); ++i) - settings.new_bounds.push_back({fractional[i], + settings.new_bounds.push_back({static_cast(i), + fractional[i], mps_model.get_variable_lower_bounds()[fractional[i]], std::floor(root_soln_x[i])}); if (!only_upper) { for (size_t i = 0; i < fractional.size(); i++) - settings.new_bounds.push_back({fractional[i], + settings.new_bounds.push_back({static_cast(i + fractional.size()), + fractional[i], std::ceil(root_soln_x[i]), mps_model.get_variable_upper_bounds()[fractional[i]]}); } @@ -1159,9 +1463,11 @@ optimization_problem_solution_t run_concurrent( // Copy the settings so that we can set the concurrent halt pointer pdlp_solver_settings_t settings_pdlp(settings); - // Set the concurrent halt pointer - global_concurrent_halt = 0; - settings_pdlp.concurrent_halt = &global_concurrent_halt; + // Use a local halt flag only when the caller did not provide one. + if (settings_pdlp.concurrent_halt == nullptr) { + global_concurrent_halt = 0; + settings_pdlp.concurrent_halt = &global_concurrent_halt; + } // Make sure allocations are done on the original stream problem.handle_ptr->sync_stream(); @@ -1184,12 +1490,20 @@ optimization_problem_solution_t run_concurrent( std::tuple, dual_simplex::lp_status_t, f_t, f_t, f_t>> sol_dual_simplex_ptr; std::thread dual_simplex_thread; + std::exception_ptr dual_simplex_exception; + auto request_concurrent_halt = [&settings_pdlp]() { + if (settings_pdlp.concurrent_halt != nullptr) { settings_pdlp.concurrent_halt->store(1); } + }; if (!settings.inside_mip) { - dual_simplex_thread = std::thread(run_dual_simplex_thread, - std::ref(dual_simplex_problem), - std::ref(settings_pdlp), - std::ref(sol_dual_simplex_ptr), - std::ref(timer)); + dual_simplex_thread = std::thread([&]() { + try { + run_dual_simplex_thread( + dual_simplex_problem, settings_pdlp, sol_dual_simplex_ptr, timer); + } catch (...) 
{ + dual_simplex_exception = std::current_exception(); + request_concurrent_halt(); + } + }); } // Create a thread for barrier. // The barrier handle is owned here so that its destructor runs on the @@ -1199,25 +1513,28 @@ optimization_problem_solution_t run_concurrent( std::unique_ptr< std::tuple, dual_simplex::lp_status_t, f_t, f_t, f_t>> sol_barrier_ptr; + std::exception_ptr barrier_exception; auto barrier_thread = std::thread([&]() { - auto call_barrier_thread = [&]() { - rmm::cuda_stream_view barrier_stream = rmm::cuda_stream_per_thread; - barrier_handle_ptr = std::make_unique(barrier_stream); - auto barrier_problem = dual_simplex_problem; - barrier_problem.handle_ptr = barrier_handle_ptr.get(); - - run_barrier_thread(std::ref(barrier_problem), - std::ref(settings_pdlp), - std::ref(sol_barrier_ptr), - std::ref(timer)); - }; - if (settings.num_gpus > 1) { - problem.handle_ptr->sync_stream(); - raft::device_setter device_setter(1); // Scoped variable - CUOPT_LOG_DEBUG("Barrier device: %d", device_setter.get_current_device()); - call_barrier_thread(); - } else { - call_barrier_thread(); + try { + auto call_barrier_thread = [&]() { + rmm::cuda_stream_view barrier_stream = rmm::cuda_stream_per_thread; + barrier_handle_ptr = std::make_unique(barrier_stream); + auto barrier_problem = dual_simplex_problem; + barrier_problem.handle_ptr = barrier_handle_ptr.get(); + + run_barrier_thread(barrier_problem, settings_pdlp, sol_barrier_ptr, timer); + }; + if (settings.num_gpus > 1) { + problem.handle_ptr->sync_stream(); + raft::device_setter device_setter(1); // Scoped variable + CUOPT_LOG_DEBUG("Barrier device: %d", device_setter.get_current_device()); + call_barrier_thread(); + } else { + call_barrier_thread(); + } + } catch (...) { + barrier_exception = std::current_exception(); + request_concurrent_halt(); } }); @@ -1234,19 +1551,22 @@ optimization_problem_solution_t run_concurrent( try { sol_pdlp = run_pdlp(problem, settings_pdlp, timer, is_batch_mode); } catch (...) 
{ - pdlp_exception = std::current_exception(); - *settings_pdlp.concurrent_halt = 1; - std::rethrow_exception(pdlp_exception); + pdlp_exception = std::current_exception(); + request_concurrent_halt(); } // Wait for dual simplex thread to finish - if (!settings.inside_mip) { dual_simplex_thread.join(); } + if (dual_simplex_thread.joinable()) { dual_simplex_thread.join(); } - barrier_thread.join(); + if (barrier_thread.joinable()) { barrier_thread.join(); } // At this point, it is safe to destroy the barrier context since we're outside of any PDLP graph // capture. barrier_handle_ptr.reset(); + if (pdlp_exception) { std::rethrow_exception(pdlp_exception); } + if (dual_simplex_exception) { std::rethrow_exception(dual_simplex_exception); } + if (barrier_exception) { std::rethrow_exception(barrier_exception); } + // copy the dual simplex solution to the device auto sol_dual_simplex = !settings.inside_mip @@ -1396,8 +1716,10 @@ optimization_problem_solution_t solve_lp( raft::common::nvtx::range fun_scope("Check problem representation"); // This is required as user might forget to set some fields problem_checking_t::check_problem_representation(op_problem); - // In batch PDLP for strong branching, the initial solutions will be by design out of bounds - if (settings.new_bounds.size() == 0) + // In batch PDLP for strong branching, the initial solutions will be by design out of bounds. + // Batch mode also disables this check: fixed_batch_size > 0 means the caller has already + // expanded per-climber fields on the problem, which would fail single-problem size checks. 
+ if (settings.new_bounds.size() == 0 && settings.fixed_batch_size == 0) problem_checking_t::check_initial_solution_representation(op_problem, settings); } @@ -1416,6 +1738,7 @@ optimization_problem_solution_t solve_lp( return optimization_problem_solution_t(pdlp_termination_status_t::PrimalInfeasible, op_problem.get_handle_ptr()->get_stream()); } + validate_new_bounds(op_problem, settings); auto lp_timer = cuopt::timer_t(settings.time_limit); detail::problem_t problem(op_problem); @@ -1631,11 +1954,10 @@ cuopt::linear_programming::optimization_problem_t mps_data_model_to_op } if (data_model.get_variable_types().size() != 0) { std::vector enum_variable_types(data_model.get_variable_types().size()); - std::transform( - data_model.get_variable_types().cbegin(), - data_model.get_variable_types().cend(), - enum_variable_types.begin(), - [](const auto val) -> var_t { return val == 'I' ? var_t::INTEGER : var_t::CONTINUOUS; }); + std::transform(data_model.get_variable_types().cbegin(), + data_model.get_variable_types().cend(), + enum_variable_types.begin(), + detail::char_to_var_type); op_problem.set_variable_types(enum_variable_types.data(), enum_variable_types.size()); } @@ -1742,6 +2064,7 @@ std::unique_ptr> solve_lp( "problem_interface cannot be null"); // Check if remote execution is enabled (always uses CPU backend) +#ifdef CUOPT_ENABLE_GRPC if (is_remote_execution_enabled()) { cuopt_expects(!is_batch_mode, error_type_t::ValidationError, @@ -1753,6 +2076,11 @@ std::unique_ptr> solve_lp( "Remote execution requires CPU memory backend"); return solve_lp_remote(*cpu_prob, settings); } +#else + cuopt_expects(!is_remote_execution_enabled(), + error_type_t::ValidationError, + "Remote execution was requested, but this build was compiled without gRPC support"); +#endif // Local execution - dispatch to appropriate overload based on problem type auto* cpu_prob = dynamic_cast*>(problem_interface); @@ -1770,51 +2098,60 @@ std::unique_ptr> solve_lp( return 
std::make_unique>(std::move(gpu_solution)); } -#define INSTANTIATE(F_TYPE) \ - template optimization_problem_solution_t solve_lp( \ - optimization_problem_t& op_problem, \ - pdlp_solver_settings_t const& settings, \ - bool problem_checking, \ - bool use_pdlp_solver_mode, \ - bool is_batch_mode); \ - \ - template optimization_problem_solution_t solve_lp( \ - raft::handle_t const* handle_ptr, \ - const cuopt::mps_parser::mps_data_model_t& mps_data_model, \ - pdlp_solver_settings_t const& settings, \ - bool problem_checking, \ - bool use_pdlp_solver_mode); \ - \ - template std::unique_ptr> solve_lp( \ - cpu_optimization_problem_t&, \ - pdlp_solver_settings_t const&, \ - bool, \ - bool, \ - bool); \ - \ - template std::unique_ptr> solve_lp( \ - optimization_problem_interface_t*, \ - pdlp_solver_settings_t const&, \ - bool, \ - bool, \ - bool); \ - \ - template optimization_problem_solution_t solve_lp_with_method( \ - detail::problem_t& problem, \ - pdlp_solver_settings_t const& settings, \ - const timer_t& timer, \ - bool is_batch_mode); \ - \ - template optimization_problem_solution_t batch_pdlp_solve( \ - raft::handle_t const* handle_ptr, \ - const cuopt::mps_parser::mps_data_model_t& mps_data_model, \ - const std::vector& fractional, \ - const std::vector& root_soln_x, \ - pdlp_solver_settings_t const& settings); \ - \ - template optimization_problem_t mps_data_model_to_optimization_problem( \ - raft::handle_t const* handle_ptr, \ - const cuopt::mps_parser::mps_data_model_t& data_model); \ +#define INSTANTIATE(F_TYPE) \ + template optimization_problem_solution_t solve_lp( \ + optimization_problem_t& op_problem, \ + pdlp_solver_settings_t const& settings, \ + bool problem_checking, \ + bool use_pdlp_solver_mode, \ + bool is_batch_mode); \ + \ + template optimization_problem_solution_t solve_lp( \ + raft::handle_t const* handle_ptr, \ + const cuopt::mps_parser::mps_data_model_t& mps_data_model, \ + pdlp_solver_settings_t const& settings, \ + bool problem_checking, \ + 
bool use_pdlp_solver_mode); \ + \ + template std::unique_ptr> solve_lp( \ + cpu_optimization_problem_t&, \ + pdlp_solver_settings_t const&, \ + bool, \ + bool, \ + bool); \ + \ + template std::unique_ptr> solve_lp( \ + optimization_problem_interface_t*, \ + pdlp_solver_settings_t const&, \ + bool, \ + bool, \ + bool); \ + \ + template optimization_problem_solution_t solve_lp_with_method( \ + detail::problem_t& problem, \ + pdlp_solver_settings_t const& settings, \ + const timer_t& timer, \ + bool is_batch_mode); \ + \ + template optimization_problem_solution_t batch_pdlp_solve( \ + raft::handle_t const* handle_ptr, \ + const cuopt::mps_parser::mps_data_model_t& mps_data_model, \ + const std::vector& fractional, \ + const std::vector& root_soln_x, \ + pdlp_solver_settings_t const& settings); \ + \ + template optimization_problem_solution_t run_batch_pdlp( \ + optimization_problem_t& problem, \ + pdlp_solver_settings_t const& settings); \ + \ + template size_t compute_optimal_batch_size(const optimization_problem_t& problem, \ + bool per_climber_objectives, \ + bool per_climber_constraint_bounds, \ + bool collect_solutions); \ + \ + template optimization_problem_t mps_data_model_to_optimization_problem( \ + raft::handle_t const* handle_ptr, \ + const cuopt::mps_parser::mps_data_model_t& data_model); \ template void set_pdlp_solver_mode(pdlp_solver_settings_t& settings); #if MIP_INSTANTIATE_FLOAT diff --git a/cpp/src/pdlp/solve.cuh b/cpp/src/pdlp/solve.cuh index 984454b6f9..8aea524570 100644 --- a/cpp/src/pdlp/solve.cuh +++ b/cpp/src/pdlp/solve.cuh @@ -15,6 +15,11 @@ namespace cuopt::linear_programming { +namespace detail { +template +class problem_t; +} // namespace detail + template cuopt::linear_programming::optimization_problem_t mps_data_model_to_optimization_problem( raft::handle_t const* handle_ptr, @@ -27,6 +32,80 @@ cuopt::linear_programming::optimization_problem_solution_t solve_lp_wi const timer_t& timer, bool is_batch_mode = false); +/** + * @brief Entry 
point for batch PDLP. Solves multiple LPs sharing the same constraint + * matrix structure in a single batched GPU run. + * + * Two call contexts are supported: + * + * 1. Strong-branching path: + * The caller passes an un-expanded optimization_problem_t plus per-climber + * variable bounds in settings.new_bounds. Each bound entry has shape + * (climber_id, variable_index, lower, upper); several entries may target + * the same climber. The batch size is max(climber_id) + 1. run_batch_pdlp + * auto-picks the optimal sub-batch size and may loop over sub-batches, + * managing memory pressure internally. + * See pdlp_test.cu:strong_branching_user_api for a full example. + * + * 2. Fixed-batch path (settings.fixed_batch_size > 0): + * The caller has already sized the batch (typically via + * compute_optimal_batch_size below) and pre-expanded the per-climber problem + * fields directly on the optimization_problem_t (objective_coefficients, + * constraint_lower_bounds, constraint_upper_bounds, batch_objective_offsets_). + * run_batch_pdlp performs a single solve_lp with no memory-aware sub-batching. + * See pdlp_test.cu:big_batch_fixed_path for a full example. + * + * @param problem The optimization problem (un-expanded for case 1, pre-expanded for case 2). + * @param settings Solver settings + * @return The batched solution. + * + * @code + * // Case 1: Strong branching (auto batch sizing) + * pdlp_solver_settings_t settings; + * // Per-climber variable bounds: (climber_id, variable_index, lower, upper). 
+ * settings.new_bounds.push_back({0, branch_var, lower_bound, down_bound}); + * settings.new_bounds.push_back({1, branch_var, up_bound, upper_bound}); + * auto solution = run_batch_pdlp(problem, settings); + * @endcode + * + * @code + * // Case 2: Fixed batch (caller-managed expansion) + * size_t batch_size = compute_optimal_batch_size(problem, + * per_climber_objectives, + * per_climber_constraint_bounds); + * expand_problem_in_place(problem, batch_size); // caller fills the per-climber fields + * // Shouldn't use the set_X API as it will change the problem n_variables and n_constraints + * // Instead, directly use get_X() = X to set the values + * pdlp_solver_settings_t settings; + * settings.fixed_batch_size = batch_size; + * auto solution = run_batch_pdlp(problem, settings); + * @endcode + */ +template +cuopt::linear_programming::optimization_problem_solution_t run_batch_pdlp( + cuopt::linear_programming::optimization_problem_t& problem, + pdlp_solver_settings_t const& settings); + +/** + @brief Compute the optimal batch size for the problem. + @param problem The problem to compute the optimal batch size for. + @param per_climber_objectives Whether the problem will per-climber objectives (resulting in a + larger memory footprint). + @param per_climber_constraint_bounds Whether the problem will have per-climber constraint bounds + (resulting in a larger memory footprint). + @param collect_solutions Whether the problem has per-climber solutions (only for testing, by + default we don't need to collect solution vectors). + @return The optimal batch size for the problem. + @note At this stage, the problem shouldn't already be expanded. The results of this function + should be used as the fixed_batch_size to expand the problem and call run_batch_pdlp. 
+*/ +template +size_t compute_optimal_batch_size( + const cuopt::linear_programming::optimization_problem_t& problem, + bool per_climber_objectives, + bool per_climber_constraint_bounds, + bool collect_solutions = false); // Only for testing + template void set_pdlp_solver_mode(pdlp_solver_settings_t& settings); diff --git a/cpp/src/pdlp/solver_settings.cu b/cpp/src/pdlp/solver_settings.cu index ac2564bb16..28e7428fac 100644 --- a/cpp/src/pdlp/solver_settings.cu +++ b/cpp/src/pdlp/solver_settings.cu @@ -10,7 +10,6 @@ #include #include #include -#include #include #include @@ -19,6 +18,7 @@ #include #include +#include namespace cuopt::linear_programming { @@ -296,23 +296,22 @@ void pdlp_solver_settings_t::set_pdlp_warm_start_data( "last_restart_duality_gap_dual_solution cannot be null"); pdlp_warm_start_data_view_.current_primal_solution_ = - cuopt::mps_parser::span(current_primal_solution, primal_size); + std::span(current_primal_solution, primal_size); pdlp_warm_start_data_view_.current_dual_solution_ = - cuopt::mps_parser::span(current_dual_solution, dual_size); + std::span(current_dual_solution, dual_size); pdlp_warm_start_data_view_.initial_primal_average_ = - cuopt::mps_parser::span(initial_primal_average, primal_size); + std::span(initial_primal_average, primal_size); pdlp_warm_start_data_view_.initial_dual_average_ = - cuopt::mps_parser::span(initial_dual_average, dual_size); - pdlp_warm_start_data_view_.current_ATY_ = - cuopt::mps_parser::span(current_ATY, primal_size); + std::span(initial_dual_average, dual_size); + pdlp_warm_start_data_view_.current_ATY_ = std::span(current_ATY, primal_size); pdlp_warm_start_data_view_.sum_primal_solutions_ = - cuopt::mps_parser::span(sum_primal_solutions, primal_size); + std::span(sum_primal_solutions, primal_size); pdlp_warm_start_data_view_.sum_dual_solutions_ = - cuopt::mps_parser::span(sum_dual_solutions, dual_size); + std::span(sum_dual_solutions, dual_size); 
pdlp_warm_start_data_view_.last_restart_duality_gap_primal_solution_ = - cuopt::mps_parser::span(last_restart_duality_gap_primal_solution, primal_size); + std::span(last_restart_duality_gap_primal_solution, primal_size); pdlp_warm_start_data_view_.last_restart_duality_gap_dual_solution_ = - cuopt::mps_parser::span(last_restart_duality_gap_dual_solution, dual_size); + std::span(last_restart_duality_gap_dual_solution, dual_size); pdlp_warm_start_data_view_.initial_primal_weight_ = initial_primal_weight; pdlp_warm_start_data_view_.initial_step_size_ = initial_step_size; pdlp_warm_start_data_view_.total_pdlp_iterations_ = total_pdlp_iterations; diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index d17a88dd29..c95ed67ca6 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ -28,6 +28,9 @@ #include +#include +#include + #include namespace cuopt::linear_programming::detail { diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu index ab0c921cc7..a6d6d14d96 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.cu +++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu @@ -25,6 +25,10 @@ #include #include +#include +#include +#include +#include #include #include @@ -38,7 +42,7 @@ convergence_information_t::convergence_information_t( i_t primal_size, i_t dual_size, const std::vector& climber_strategies, - const pdlp_hyper_params::pdlp_hyper_params_t& hyper_params) + const pdlp_solver_settings_t& settings) : batch_mode_(climber_strategies.size() > 1), handle_ptr_(handle_ptr), stream_view_(handle_ptr_->get_stream()), @@ -46,15 +50,16 @@ convergence_information_t::convergence_information_t( dual_size_h_(dual_size), problem_ptr(&op_problem), op_problem_cusparse_view_(cusparse_view), - 
l2_norm_primal_linear_objective_{0.0, stream_view_}, - l2_norm_primal_right_hand_side_{0.0, stream_view_}, // TODO later batch mode: per problem rhs + l2_norm_primal_linear_objective_{climber_strategies.size(), stream_view_}, + l2_norm_primal_right_hand_side_{climber_strategies.size(), stream_view_}, + objective_offsets_{climber_strategies.size(), stream_view_}, primal_objective_{climber_strategies.size(), stream_view_}, dual_objective_{climber_strategies.size(), stream_view_}, reduced_cost_dual_objective_{f_t(0.0), stream_view_}, l2_primal_residual_{climber_strategies.size(), stream_view_}, l2_dual_residual_{climber_strategies.size(), stream_view_}, - linf_primal_residual_{0.0, stream_view_}, - linf_dual_residual_{0.0, stream_view_}, + linf_primal_residual_{climber_strategies.size(), stream_view_}, + linf_dual_residual_{climber_strategies.size(), stream_view_}, nb_violated_constraints_{0, stream_view_}, gap_{climber_strategies.size(), stream_view_}, abs_objective_{climber_strategies.size(), stream_view_}, @@ -62,18 +67,20 @@ convergence_information_t::convergence_information_t( dual_residual_{climber_strategies.size() * primal_size_h_, stream_view_}, reduced_cost_{climber_strategies.size() * primal_size_h_, stream_view_}, bound_value_{static_cast(std::max(primal_size_h_, dual_size_h_)), stream_view_}, - primal_slack_{(hyper_params.use_reflected_primal_dual) + primal_slack_{(settings.hyper_params.use_reflected_primal_dual) ? 
static_cast(dual_size_h_ * climber_strategies.size()) : 0, stream_view_}, reusable_device_scalar_value_1_{1.0, stream_view_}, reusable_device_scalar_value_0_{0.0, stream_view_}, reusable_device_scalar_value_neg_1_{-1.0, stream_view_}, + segmented_sum_handler_{stream_view_}, dual_dot_{climber_strategies.size(), stream_view_}, sum_primal_slack_{climber_strategies.size(), stream_view_}, climber_strategies_(climber_strategies), - hyper_params_(hyper_params) + hyper_params_(settings.hyper_params) { + // Zero-init per-climber scalars RAFT_CUDA_TRY(cudaMemsetAsync( primal_objective_.data(), 0, sizeof(f_t) * primal_objective_.size(), stream_view_)); RAFT_CUDA_TRY( @@ -81,35 +88,133 @@ convergence_information_t::convergence_information_t( RAFT_CUDA_TRY(cudaMemsetAsync(gap_.data(), 0, sizeof(f_t) * gap_.size(), stream_view_)); RAFT_CUDA_TRY( cudaMemsetAsync(abs_objective_.data(), 0, sizeof(f_t) * abs_objective_.size(), stream_view_)); - RAFT_CUDA_TRY(cudaMemsetAsync( l2_dual_residual_.data(), 0, sizeof(f_t) * l2_dual_residual_.size(), stream_view_)); RAFT_CUDA_TRY(cudaMemsetAsync( l2_primal_residual_.data(), 0, sizeof(f_t) * l2_primal_residual_.size(), stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync( + linf_primal_residual_.data(), 0, sizeof(f_t) * linf_primal_residual_.size(), stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync( + linf_dual_residual_.data(), 0, sizeof(f_t) * linf_dual_residual_.size(), stream_view_)); + + init_objective_offsets(); + init_reduction_storage(); + init_l2_norms(); - combine_constraint_bounds(*problem_ptr, - primal_residual_, - batch_mode_); // primal_residual_ will contain abs max of bounds when - // finite, otherwise 0 //just reused allocated mem here + // Zero the residual workspace (reused each iteration by compute_convergence_information). 
+ RAFT_CUDA_TRY(cudaMemsetAsync( + primal_residual_.data(), 0.0, sizeof(f_t) * primal_residual_.size(), stream_view_)); + RAFT_CUDA_TRY( + cudaMemsetAsync(dual_residual_.data(), 0.0, sizeof(f_t) * dual_residual_.size(), stream_view_)); +} - // TODO later batch mode: different objective coefficients - // constant throughout solving, so precompute - my_l2_norm( - problem_ptr->objective_coefficients, l2_norm_primal_linear_objective_, handle_ptr_); +// --------------------------------------------------------------------------- +// init_objective_offsets: fill the per-climber objective_offsets_ device vector. +// - Non-batch: single entry = scalar problem offset. +// - Batch with user-provided per-climber offsets: copy from host vector. +// - Batch without per-climber offsets: replicate the scalar problem offset. +// --------------------------------------------------------------------------- +template +void convergence_information_t::init_objective_offsets() +{ + const auto* original = (problem_ptr != nullptr) ? problem_ptr->original_problem_ptr : nullptr; + if (original != nullptr && !original->get_batch_objective_offsets().empty()) { + const auto& h_offsets = original->get_batch_objective_offsets(); + cuopt_assert(h_offsets.size() == climber_strategies_.size(), + "batch_objective_offsets size must equal batch size"); + raft::copy(objective_offsets_.data(), h_offsets.data(), h_offsets.size(), stream_view_); + } else { + thrust::fill(handle_ptr_->get_thrust_policy(), + objective_offsets_.begin(), + objective_offsets_.end(), + problem_ptr->presolve_data.objective_offset); + } +} +// --------------------------------------------------------------------------- +// init_l2_norms: precompute the L2 norms of objective coefficients and RHS +// (constraint bounds) used in the relative termination criteria. 
+// +// In batch mode the problem fields may be single-problem-sized (splitting path, +// only variable bounds differ) or batch-expanded (fixed path, per-climber +// objectives / constraint bounds). Both cases are handled: +// - Single-problem: compute the norm once, broadcast to all climbers. +// - Batch-expanded: compute per-climber via segmented reduce. +// --------------------------------------------------------------------------- +template +void convergence_information_t::init_l2_norms() +{ + const size_t obj_size = problem_ptr->objective_coefficients.size(); + const bool per_climber_objectives = obj_size > static_cast(primal_size_h_); + const size_t cstr_size = problem_ptr->constraint_lower_bounds.size(); + const bool per_climber_constraints = cstr_size > static_cast(dual_size_h_); + + // --- Objective L2 norm --- + if (!per_climber_objectives) { + // Shared objective coefficients: cublasnrm2 → single entry. + my_l2_norm( + problem_ptr->objective_coefficients, l2_norm_primal_linear_objective_, handle_ptr_); + // Broadcast in case we are in batch mode, else is a no op anyways + thrust::fill(handle_ptr_->get_thrust_policy(), + l2_norm_primal_linear_objective_.begin(), + l2_norm_primal_linear_objective_.end(), + l2_norm_primal_linear_objective_.element(0, stream_view_)); + } else { + // Per-climber objective coefficients: Segmented reduce: one segment per climber. 
+ segmented_sum_handler_.segmented_sum_helper( + thrust::make_transform_iterator(problem_ptr->objective_coefficients.data(), + power_two_func_t{}), + thrust::make_transform_output_iterator(l2_norm_primal_linear_objective_.data(), + sqrt_func_t{}), + climber_strategies_.size(), + primal_size_h_); + } + + // --- RHS L2 norm (constraint bounds) --- if (hyper_params_.initial_primal_weight_combined_bounds) { cuopt_expects(!batch_mode_, error_type_t::ValidationError, "Batch mode not supported with initial_primal_weight_combined_bounds"); - my_l2_norm(primal_residual_, l2_norm_primal_right_hand_side_, handle_ptr_); + combine_constraint_bounds(*problem_ptr, primal_residual_); + my_l2_norm(primal_residual_.data(), + l2_norm_primal_right_hand_side_.data(), + primal_residual_.size(), + handle_ptr_); } else { - // TODO later batch mode: different constraints bounds - compute_sum_bounds(problem_ptr->constraint_lower_bounds, - problem_ptr->constraint_upper_bounds, - l2_norm_primal_right_hand_side_, - handle_ptr_->get_stream()); + if (!per_climber_constraints) { + // Shared constraint bounds: compute_sum_bounds gives sum-of-squares (matching the original + // formula). + compute_sum_bounds(problem_ptr->constraint_lower_bounds, + problem_ptr->constraint_upper_bounds, + l2_norm_primal_right_hand_side_.data(), + handle_ptr_->get_stream()); + // Broadcast in case we are in batch mode, else is a no op anyways + thrust::fill(handle_ptr_->get_thrust_policy(), + l2_norm_primal_right_hand_side_.begin(), + l2_norm_primal_right_hand_side_.end(), + l2_norm_primal_right_hand_side_.element(0, stream_view_)); + } else { + // Per-climber constraint bounds: Segmented reduce. 
+ segmented_sum_handler_.segmented_sum_helper( + thrust::make_transform_iterator( + thrust::make_zip_iterator(problem_ptr->constraint_lower_bounds.data(), + problem_ptr->constraint_upper_bounds.data()), + rhs_sum_of_squares_t{}), + thrust::make_transform_output_iterator(l2_norm_primal_right_hand_side_.data(), + sqrt_func_t{}), + climber_strategies_.size(), + dual_size_h_); + } } +} +// --------------------------------------------------------------------------- +// init_reduction_storage: allocate and size the temporary buffers used by +// cub::DeviceReduce and cub::DeviceSegmentedReduce throughout solving. +// --------------------------------------------------------------------------- +template +void convergence_information_t::init_reduction_storage() +{ void* d_temp_storage = NULL; size_t temp_storage_bytes_1 = 0; cub::DeviceReduce::Sum(d_temp_storage, @@ -129,71 +234,6 @@ convergence_information_t::convergence_information_t( size_of_buffer_ = std::max({temp_storage_bytes_1, temp_storage_bytes_2}); this->rmm_tmp_buffer_ = rmm::device_buffer{size_of_buffer_, stream_view_}; - - if (batch_mode_) { - // Pass down any input pointer of the right type, actual pointer does not matter - size_t byte_needed = 0; - - cub::DeviceSegmentedReduce::Sum( - nullptr, - byte_needed, - thrust::make_transform_iterator(dual_dot_.data(), power_two_func_t{}), - dual_dot_.data(), - climber_strategies_.size(), - dual_size, - stream_view_); - dot_product_bytes_ = std::max(dot_product_bytes_, byte_needed); - - cub::DeviceSegmentedReduce::Sum( - nullptr, - byte_needed, - thrust::make_transform_iterator(dual_dot_.data(), power_two_func_t{}), - dual_dot_.data(), - climber_strategies_.size(), - primal_size, - stream_view_); - dot_product_bytes_ = std::max(dot_product_bytes_, byte_needed); - - cub::DeviceSegmentedReduce::Sum( - nullptr, - byte_needed, - thrust::make_transform_iterator(thrust::make_zip_iterator(dual_dot_.data(), dual_dot_.data()), - tuple_multiplies{}), - dual_dot_.data(), - 
climber_strategies_.size(), - primal_size, - stream_view_); - dot_product_bytes_ = std::max(dot_product_bytes_, byte_needed); - - cub::DeviceSegmentedReduce::Sum(nullptr, - byte_needed, - dual_dot_.data(), - dual_dot_.data(), - climber_strategies_.size(), - dual_size_h_, - stream_view_); - dot_product_bytes_ = std::max(dot_product_bytes_, byte_needed); - - cub::DeviceSegmentedReduce::Sum( - nullptr, - dot_product_bytes_, - thrust::make_transform_iterator( - thrust::make_zip_iterator(dual_dot_.data(), - problem_wrap_container(problem_ptr->objective_coefficients)), - tuple_multiplies{}), - primal_objective_.data(), - climber_strategies_.size(), - primal_size_h_, - stream_view_); - dot_product_bytes_ = std::max(dot_product_bytes_, byte_needed); - - dot_product_storage_.resize(dot_product_bytes_, stream_view_); - } - - RAFT_CUDA_TRY(cudaMemsetAsync( - primal_residual_.data(), 0.0, sizeof(f_t) * primal_residual_.size(), stream_view_)); - RAFT_CUDA_TRY( - cudaMemsetAsync(dual_residual_.data(), 0.0, sizeof(f_t) * dual_residual_.size(), stream_view_)); } template @@ -204,10 +244,15 @@ __global__ void convergence_information_swap_device_vectors_kernel( raft::device_span dual_objective, raft::device_span l2_primal_residual, raft::device_span l2_dual_residual, + raft::device_span linf_primal_residual, + raft::device_span linf_dual_residual, raft::device_span gap, raft::device_span abs_objective, raft::device_span dual_dot, - raft::device_span sum_primal_slack) + raft::device_span sum_primal_slack, + raft::device_span objective_offsets, + raft::device_span l2_norm_primal_linear_objective, + raft::device_span l2_norm_primal_right_hand_side) { const i_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); if (idx >= swap_count) { return; } @@ -218,10 +263,15 @@ __global__ void convergence_information_swap_device_vectors_kernel( cuda::std::swap(dual_objective[left], dual_objective[right]); cuda::std::swap(l2_primal_residual[left], l2_primal_residual[right]); 
cuda::std::swap(l2_dual_residual[left], l2_dual_residual[right]); + cuda::std::swap(linf_primal_residual[left], linf_primal_residual[right]); + cuda::std::swap(linf_dual_residual[left], linf_dual_residual[right]); cuda::std::swap(gap[left], gap[right]); cuda::std::swap(abs_objective[left], abs_objective[right]); cuda::std::swap(dual_dot[left], dual_dot[right]); cuda::std::swap(sum_primal_slack[left], sum_primal_slack[right]); + cuda::std::swap(objective_offsets[left], objective_offsets[right]); + cuda::std::swap(l2_norm_primal_linear_objective[left], l2_norm_primal_linear_objective[right]); + cuda::std::swap(l2_norm_primal_right_hand_side[left], l2_norm_primal_right_hand_side[right]); } template @@ -252,10 +302,15 @@ void convergence_information_t::swap_context( make_span(dual_objective_), make_span(l2_primal_residual_), make_span(l2_dual_residual_), + make_span(linf_primal_residual_), + make_span(linf_dual_residual_), make_span(gap_), make_span(abs_objective_), make_span(dual_dot_), - make_span(sum_primal_slack_)); + make_span(sum_primal_slack_), + make_span(objective_offsets_), + make_span(l2_norm_primal_linear_objective_), + make_span(l2_norm_primal_right_hand_side_)); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -276,36 +331,40 @@ void convergence_information_t::resize_context(i_t new_size) dual_objective_.resize(new_size, stream_view_); l2_primal_residual_.resize(new_size, stream_view_); l2_dual_residual_.resize(new_size, stream_view_); + linf_primal_residual_.resize(new_size, stream_view_); + linf_dual_residual_.resize(new_size, stream_view_); + l2_norm_primal_linear_objective_.resize(new_size, stream_view_); + l2_norm_primal_right_hand_side_.resize(new_size, stream_view_); + if (objective_offsets_.size() > 1) { objective_offsets_.resize(new_size, stream_view_); } gap_.resize(new_size, stream_view_); abs_objective_.resize(new_size, stream_view_); dual_dot_.resize(new_size, stream_view_); sum_primal_slack_.resize(new_size, stream_view_); } -template -void 
convergence_information_t::set_relative_dual_tolerance_factor( - f_t dual_tolerance_factor) -{ - l2_norm_primal_linear_objective_.set_value_async(dual_tolerance_factor, stream_view_); -} - template void convergence_information_t::set_relative_primal_tolerance_factor( f_t primal_tolerance_factor) { - l2_norm_primal_right_hand_side_.set_value_async(primal_tolerance_factor, stream_view_); + cub::DeviceTransform::Transform(thrust::make_constant_iterator(primal_tolerance_factor), + l2_norm_primal_right_hand_side_.data(), + l2_norm_primal_right_hand_side_.size(), + cuda::std::identity{}, + stream_view_); } template -f_t convergence_information_t::get_relative_dual_tolerance_factor() const +const rmm::device_uvector& +convergence_information_t::get_l2_norm_primal_linear_objective() const { - return l2_norm_primal_linear_objective_.value(stream_view_); + return l2_norm_primal_linear_objective_; } template -f_t convergence_information_t::get_relative_primal_tolerance_factor() const +const rmm::device_uvector& +convergence_information_t::get_l2_norm_primal_right_hand_side() const { - return l2_norm_primal_right_hand_side_.value(stream_view_); + return l2_norm_primal_right_hand_side_; } template @@ -368,14 +427,11 @@ void convergence_information_t::compute_convergence_information( if (!batch_mode_) my_l2_norm(primal_residual_, l2_primal_residual_, handle_ptr_); else { - cub::DeviceSegmentedReduce::Sum( - dot_product_storage_.data(), - dot_product_bytes_, + segmented_sum_handler_.segmented_sum_helper( thrust::make_transform_iterator(primal_residual_.data(), power_two_func_t{}), l2_primal_residual_.data(), climber_strategies_.size(), - dual_size_h_, - stream_view_); + dual_size_h_); cub::DeviceTransform::Transform( l2_primal_residual_.data(), l2_primal_residual_.data(), @@ -389,34 +445,25 @@ void convergence_information_t::compute_convergence_information( #endif // If per_constraint_residual is false we still need to perform the l2 since it's used in kkt if 
(settings.per_constraint_residual) { - // TODO later batch mode: handle per_constraint_residual here - cuopt_expects(!batch_mode_, - error_type_t::ValidationError, - "Batch mode not supported for per_constraint_residual"); - // Compute the linf of (residual_i - rel * b_i) if (settings.save_best_primal_so_far) { const i_t zero_int = 0; nb_violated_constraints_.set_value_async(zero_int, handle_ptr_->get_stream()); } + // We may be solving a batch of problems so have a bigger primal_residual_ vector but not have + // per climber combined bounds (if it's the same across climbers) So we need to use a wrapped + // iterator to iterate over the combined bounds + cuopt_assert(primal_residual_.size() % combined_bounds.size() == 0, + "primal_residual_.size() must be divisible by combined_bounds.size()"); auto transform_iter = thrust::make_transform_iterator( - thrust::make_zip_iterator(primal_residual_.cbegin(), combined_bounds.cbegin()), + thrust::make_zip_iterator(primal_residual_.cbegin(), problem_wrap_container(combined_bounds)), relative_residual_t{settings.tolerances.relative_primal_tolerance}); - void* d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - RAFT_CUDA_TRY(cub::DeviceReduce::Max(d_temp_storage, - temp_storage_bytes, - transform_iter, - linf_primal_residual_.data(), - primal_residual_.size(), - stream_view_)); - rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); - RAFT_CUDA_TRY(cub::DeviceReduce::Max(temp_buf.data(), - temp_storage_bytes, - transform_iter, - linf_primal_residual_.data(), - primal_residual_.size(), - stream_view_)); + segmented_sum_handler_.segmented_reduce_helper(transform_iter, + linf_primal_residual_.data(), + climber_strategies_.size(), + dual_size_h_, + cuda::maximum<>{}, + std::numeric_limits::lowest()); } compute_dual_residual(op_problem_cusparse_view_, @@ -432,14 +479,11 @@ void convergence_information_t::compute_convergence_information( if (!batch_mode_) my_l2_norm(dual_residual_, l2_dual_residual_, handle_ptr_);
else { - cub::DeviceSegmentedReduce::Sum( - dot_product_storage_.data(), - dot_product_bytes_, + segmented_sum_handler_.segmented_sum_helper( thrust::make_transform_iterator(dual_residual_.data(), power_two_func_t{}), l2_dual_residual_.data(), climber_strategies_.size(), - primal_size_h_, - stream_view_); + primal_size_h_); cub::DeviceTransform::Transform( l2_dual_residual_.data(), l2_dual_residual_.data(), @@ -452,32 +496,17 @@ void convergence_information_t::compute_convergence_information( #endif // If per_constraint_residual is false we still need to perform the l2 since it's used in kkt if (settings.per_constraint_residual) { - // TODO later batch mode: handle per_constraint_residual here - cuopt_expects(!batch_mode_, - error_type_t::ValidationError, - "Batch mode not supported for per_constraint_residual"); - // Compute the linf of (residual_i - rel * c_i) - { - auto transform_iter = thrust::make_transform_iterator( - thrust::make_zip_iterator(dual_residual_.cbegin(), objective_coefficients.cbegin()), - relative_residual_t{settings.tolerances.relative_dual_tolerance}); - void* d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - cub::DeviceReduce::Max(d_temp_storage, - temp_storage_bytes, - transform_iter, - linf_dual_residual_.data(), - dual_residual_.size(), - stream_view_); - rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); - cub::DeviceReduce::Max(temp_buf.data(), - temp_storage_bytes, - transform_iter, - linf_dual_residual_.data(), - dual_residual_.size(), - stream_view_); - } + auto transform_iter = thrust::make_transform_iterator( + thrust::make_zip_iterator(dual_residual_.cbegin(), + problem_wrap_container(objective_coefficients)), + relative_residual_t{settings.tolerances.relative_dual_tolerance}); + segmented_sum_handler_.segmented_reduce_helper(transform_iter, + linf_dual_residual_.data(), + climber_strategies_.size(), + primal_size_h_, + cuda::maximum<>{}, + std::numeric_limits::lowest()); } const auto [grid_size, block_size] 
= kernel_config_from_batch_size(climber_strategies_.size()); @@ -577,13 +606,13 @@ void convergence_information_t::compute_primal_residual( template __global__ void apply_objective_scaling_and_offset(raft::device_span objective, f_t objective_scaling_factor, - f_t objective_offset, + raft::device_span objective_offsets, int batch_size) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx >= batch_size) { return; } - objective[idx] = objective_scaling_factor * (objective[idx] + objective_offset); + objective[idx] = objective_scaling_factor * (objective[idx] + objective_offsets[idx]); } template @@ -602,27 +631,24 @@ void convergence_information_t::compute_primal_objective( primal_objective_.data(), stream_view_)); } else { - cub::DeviceSegmentedReduce::Sum( - dot_product_storage_.data(), - dot_product_bytes_, + segmented_sum_handler_.segmented_sum_helper( thrust::make_transform_iterator( thrust::make_zip_iterator(primal_solution.data(), problem_wrap_container(problem_ptr->objective_coefficients)), tuple_multiplies{}), primal_objective_.data(), climber_strategies_.size(), - primal_size_h_, - stream_view_); + primal_size_h_); } - // primal_objective = 1 * (primal_objective + 0) = primal_objective - if (problem_ptr->presolve_data.objective_scaling_factor != 1 || - problem_ptr->presolve_data.objective_offset != 0) { + // Apply per-climber objective scaling and offset. objective_offsets_ is always populated + // (defaults to the scalar problem offset replicated, or user-specified per-climber offsets). 
+ { const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size()); apply_objective_scaling_and_offset<<>>( make_span(primal_objective_), problem_ptr->presolve_data.objective_scaling_factor, - problem_ptr->presolve_data.objective_offset, + make_span(objective_offsets_), climber_strategies_.size()); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -774,24 +800,16 @@ void convergence_information_t::compute_dual_objective( dual_size_h_, stream_view_); } else { - cub::DeviceSegmentedReduce::Sum( - dot_product_storage_.data(), - dot_product_bytes_, + segmented_sum_handler_.segmented_sum_helper( thrust::make_transform_iterator( thrust::make_zip_iterator(dual_slack.data(), primal_solution.data()), tuple_multiplies{}), dual_dot_.data(), climber_strategies_.size(), - primal_size_h_, - stream_view_); - - cub::DeviceSegmentedReduce::Sum(dot_product_storage_.data(), - dot_product_bytes_, - primal_slack_.data(), - sum_primal_slack_.data(), - climber_strategies_.size(), - dual_size_h_, - stream_view_); + primal_size_h_); + + segmented_sum_handler_.segmented_sum_helper( + primal_slack_.data(), sum_primal_slack_.data(), climber_strategies_.size(), dual_size_h_); } cub::DeviceTransform::Transform( @@ -802,14 +820,13 @@ void convergence_information_t::compute_dual_objective( stream_view_); } - // dual_objective = 1 * (dual_objective + 0) = dual_objective - if (problem_ptr->presolve_data.objective_scaling_factor != 1 || - problem_ptr->presolve_data.objective_offset != 0) { + // Apply per-climber objective scaling and offset. 
+ { const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size()); apply_objective_scaling_and_offset<<>>( make_span(dual_objective_), problem_ptr->presolve_data.objective_scaling_factor, - problem_ptr->presolve_data.objective_offset, + make_span(objective_offsets_), climber_strategies_.size()); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -912,14 +929,14 @@ const rmm::device_uvector& convergence_information_t::get_l2_dual } template -const rmm::device_scalar& +const rmm::device_uvector& convergence_information_t::get_relative_linf_primal_residual() const { return linf_primal_residual_; } template -const rmm::device_scalar& +const rmm::device_uvector& convergence_information_t::get_relative_linf_dual_residual() const { return linf_dual_residual_; @@ -942,18 +959,16 @@ template f_t convergence_information_t::get_relative_l2_primal_residual_value( i_t climber_strategy_id) const { - // TODO later batch mode: handle per climber rhs return l2_primal_residual_.element(climber_strategy_id, stream_view_) / - (f_t(1.0) + l2_norm_primal_right_hand_side_.value(stream_view_)); + (f_t(1.0) + l2_norm_primal_right_hand_side_.element(climber_strategy_id, stream_view_)); } template f_t convergence_information_t::get_relative_l2_dual_residual_value( i_t climber_strategy_id) const { - // TODO later batch mode: handle per climber objective return l2_dual_residual_.element(climber_strategy_id, stream_view_) / - (f_t(1.0) + l2_norm_primal_linear_objective_.value(stream_view_)); + (f_t(1.0) + l2_norm_primal_linear_objective_.element(climber_strategy_id, stream_view_)); } template @@ -963,15 +978,15 @@ typename convergence_information_t::view_t convergence_information_t #include +#include #include @@ -34,7 +35,7 @@ class convergence_information_t { i_t primal_size, i_t dual_size, const std::vector& climber_strategies, - const pdlp_hyper_params::pdlp_hyper_params_t& hyper_params); + const pdlp_solver_settings_t& settings); void 
compute_convergence_information( pdhg_solver_t& current_pdhg_solver, @@ -54,17 +55,16 @@ class convergence_information_t { const rmm::device_uvector& get_dual_objective() const; const rmm::device_uvector& get_l2_primal_residual() const; const rmm::device_uvector& get_l2_dual_residual() const; - const rmm::device_scalar& get_relative_linf_primal_residual() const; - const rmm::device_scalar& get_relative_linf_dual_residual() const; + const rmm::device_uvector& get_relative_linf_primal_residual() const; + const rmm::device_uvector& get_relative_linf_dual_residual() const; const rmm::device_uvector& get_gap() const; f_t get_relative_gap_value(i_t climber_strategy_id = 0) const; f_t get_relative_l2_primal_residual_value(i_t climber_strategy_id = 0) const; f_t get_relative_l2_dual_residual_value(i_t climber_strategy_id = 0) const; - void set_relative_dual_tolerance_factor(f_t dual_tolerance_factor); void set_relative_primal_tolerance_factor(f_t primal_tolerance_factor); - f_t get_relative_dual_tolerance_factor() const; - f_t get_relative_primal_tolerance_factor() const; + const rmm::device_uvector& get_l2_norm_primal_linear_objective() const; + const rmm::device_uvector& get_l2_norm_primal_right_hand_side() const; struct view_t { i_t primal_size; @@ -74,16 +74,16 @@ class convergence_information_t { f_t* l_inf_norm_primal_linear_objective; f_t* l_inf_norm_primal_right_hand_side; - f_t* l2_norm_primal_linear_objective; - f_t* l2_norm_primal_right_hand_side; + raft::device_span l2_norm_primal_linear_objective; + raft::device_span l2_norm_primal_right_hand_side; raft::device_span primal_objective; raft::device_span dual_objective; raft::device_span l2_primal_residual; raft::device_span l2_dual_residual; - f_t* relative_l_inf_primal_residual; - f_t* relative_l_inf_dual_residual; + raft::device_span relative_l_inf_primal_residual; + raft::device_span relative_l_inf_dual_residual; raft::device_span gap; raft::device_span abs_objective; @@ -143,6 +143,11 @@ class 
convergence_information_t { void compute_reduced_costs_dual_objective_contribution(); + // Ctor helpers — each handles both batch and non-batch internally. + void init_objective_offsets(); + void init_l2_norms(); + void init_reduction_storage(); + const bool batch_mode_{false}; raft::handle_t const* handle_ptr_{nullptr}; @@ -155,8 +160,13 @@ class convergence_information_t { problem_t* problem_ptr; cusparse_view_t& op_problem_cusparse_view_; - rmm::device_scalar l2_norm_primal_linear_objective_; - rmm::device_scalar l2_norm_primal_right_hand_side_; + rmm::device_uvector l2_norm_primal_linear_objective_; + rmm::device_uvector l2_norm_primal_right_hand_side_; + + // Per-climber objective offsets. Always populated: + // - Non-batch mode: size = 1 with problem's scalar offset + // - Batch mode: size = batch_size, either per-climber (from settings) or replicated + rmm::device_uvector objective_offsets_; rmm::device_uvector primal_objective_; rmm::device_uvector dual_objective_; @@ -166,9 +176,10 @@ class convergence_information_t { // Useful in per constraint mode // To compute residual we check: residual[i] < absolute_tolerance + relative_tolerance * rhs[i] // Which can be rewritten as: residual[i] - relative_tolerance * rhs[i] < absolute_tolerance - // We thus store l_inf(residual_i - rel * b/c_i) ran over all the constraints - rmm::device_scalar linf_primal_residual_; - rmm::device_scalar linf_dual_residual_; + // We thus store l_inf(residual_i - rel * b/c_i) ran over all the constraints. + // Per-climber in batch mode (size = climber_strategies_.size()); size 1 in non-batch mode. 
+ rmm::device_uvector linf_primal_residual_; + rmm::device_uvector linf_dual_residual_; // Useful for best_primal_so_far rmm::device_scalar nb_violated_constraints_; @@ -190,8 +201,7 @@ class convergence_information_t { const rmm::device_scalar reusable_device_scalar_value_1_; const rmm::device_scalar reusable_device_scalar_value_0_; const rmm::device_scalar reusable_device_scalar_value_neg_1_; - rmm::device_buffer dot_product_storage_; - size_t dot_product_bytes_{0}; + segmented_sum_handler_t segmented_sum_handler_; rmm::device_uvector dual_dot_; rmm::device_uvector sum_primal_slack_; diff --git a/cpp/src/pdlp/termination_strategy/infeasibility_information.cu b/cpp/src/pdlp/termination_strategy/infeasibility_information.cu index dbb35b732d..9268e17910 100644 --- a/cpp/src/pdlp/termination_strategy/infeasibility_information.cu +++ b/cpp/src/pdlp/termination_strategy/infeasibility_information.cu @@ -15,6 +15,8 @@ #include +#include + #include #include #include @@ -24,6 +26,14 @@ #include #include +#include +#include +#include +#include +#include +#include +#include + namespace cuopt::linear_programming::detail { template infeasibility_information_t::infeasibility_information_t( @@ -71,11 +81,11 @@ infeasibility_information_t::infeasibility_information_t( (!infeasibility_detection) ? 0 : static_cast(dual_size_h_), stream_view_}, homogenous_dual_upper_bounds_{ (!infeasibility_detection) ? 0 : static_cast(dual_size_h_), stream_view_}, - primal_slack_{(is_cupdlpx_restart(hyper_params)) + primal_slack_{(is_cupdlpx_restart(hyper_params) && infeasibility_detection) ? static_cast(dual_size_h_ * climber_strategies.size()) : 0, stream_view_}, - dual_slack_{(is_cupdlpx_restart(hyper_params)) + dual_slack_{(is_cupdlpx_restart(hyper_params) && infeasibility_detection) ? 
static_cast(primal_size_h_ * climber_strategies.size()) : 0, stream_view_}, diff --git a/cpp/src/pdlp/termination_strategy/termination_strategy.cu b/cpp/src/pdlp/termination_strategy/termination_strategy.cu index 5a621daaef..d1a88799d6 100644 --- a/cpp/src/pdlp/termination_strategy/termination_strategy.cu +++ b/cpp/src/pdlp/termination_strategy/termination_strategy.cu @@ -36,13 +36,8 @@ pdlp_termination_strategy_t::pdlp_termination_strategy_t( : handle_ptr_(handle_ptr), stream_view_(handle_ptr_->get_stream()), problem_ptr(&op_problem), - convergence_information_{handle_ptr_, - op_problem, - cusparse_view, - primal_size, - dual_size, - climber_strategies, - settings.hyper_params}, + convergence_information_{ + handle_ptr_, op_problem, cusparse_view, primal_size, dual_size, climber_strategies, settings}, infeasibility_information_{handle_ptr_, op_problem, scaled_op_problem, @@ -91,13 +86,6 @@ void pdlp_termination_strategy_t::resize_context(i_t new_size) termination_status_.resize(new_size); } -template -void pdlp_termination_strategy_t::set_relative_dual_tolerance_factor( - f_t dual_tolerance_factor) -{ - convergence_information_.set_relative_dual_tolerance_factor(dual_tolerance_factor); -} - template void pdlp_termination_strategy_t::set_relative_primal_tolerance_factor( f_t primal_tolerance_factor) @@ -105,18 +93,6 @@ void pdlp_termination_strategy_t::set_relative_primal_tolerance_factor convergence_information_.set_relative_primal_tolerance_factor(primal_tolerance_factor); } -template -f_t pdlp_termination_strategy_t::get_relative_dual_tolerance_factor() const -{ - return convergence_information_.get_relative_dual_tolerance_factor(); -} - -template -f_t pdlp_termination_strategy_t::get_relative_primal_tolerance_factor() const -{ - return convergence_information_.get_relative_primal_tolerance_factor(); -} - template pdlp_termination_status_t pdlp_termination_strategy_t::get_termination_status( i_t id) const @@ -257,15 +233,14 @@ __global__ void 
check_termination_criteria_kernel( printf( "Primal residual : convergence_information.linf_relative_primal_resiprimal %lf < " "tolerance.absolute_primal_tolerance %lf\n", - *convergence_information.relative_l_inf_primal_residual, + convergence_information.relative_l_inf_primal_residual[idx], tolerance.absolute_primal_tolerance); printf( "Dual residual : convergence_information.linf_relative_dual_residual %lf < " "tolerance.absolute_dual_tolerance %lf\n", - *convergence_information.relative_l_inf_dual_residual, + convergence_information.relative_l_inf_dual_residual[idx], tolerance.absolute_dual_tolerance); } else { - // TODO later batch mode: per problem rhs printf( "Primal residual %lf <= %lf [%d] (tolerance.absolute_primal_tolerance %lf + " "tolerance.relative_primal_tolerance %lf * " @@ -273,14 +248,14 @@ __global__ void check_termination_criteria_kernel( convergence_information.l2_primal_residual[idx], tolerance.absolute_primal_tolerance + tolerance.relative_primal_tolerance * - *convergence_information.l2_norm_primal_right_hand_side, + convergence_information.l2_norm_primal_right_hand_side[idx], convergence_information.l2_primal_residual[idx] <= tolerance.absolute_primal_tolerance + tolerance.relative_primal_tolerance * - *convergence_information.l2_norm_primal_right_hand_side, + convergence_information.l2_norm_primal_right_hand_side[idx], tolerance.absolute_primal_tolerance, tolerance.relative_primal_tolerance, - *convergence_information.l2_norm_primal_right_hand_side); + convergence_information.l2_norm_primal_right_hand_side[idx]); printf( "Dual residual %lf <= %lf [%d] (tolerance.absolute_dual_tolerance %lf + " "tolerance.relative_dual_tolerance %lf * " @@ -288,14 +263,14 @@ __global__ void check_termination_criteria_kernel( convergence_information.l2_dual_residual[idx], tolerance.absolute_dual_tolerance + tolerance.relative_dual_tolerance * - *convergence_information.l2_norm_primal_linear_objective, + 
convergence_information.l2_norm_primal_linear_objective[idx], convergence_information.l2_dual_residual[idx] <= tolerance.absolute_dual_tolerance + tolerance.relative_dual_tolerance * - *convergence_information.l2_norm_primal_linear_objective, + convergence_information.l2_norm_primal_linear_objective[idx], tolerance.absolute_dual_tolerance, tolerance.relative_dual_tolerance, - *convergence_information.l2_norm_primal_linear_objective); + convergence_information.l2_norm_primal_linear_objective[idx]); } if (infeasibility_detection) { printf( @@ -325,10 +300,10 @@ __global__ void check_termination_criteria_kernel( // test if respect constraints if (per_constraint_residual) { // In residual we store l_inf(residual_i - rel * b/c_i) - const bool primal_feasible = *convergence_information.relative_l_inf_primal_residual <= + const bool primal_feasible = convergence_information.relative_l_inf_primal_residual[idx] <= tolerance.absolute_primal_tolerance; // First check for optimality - if (*convergence_information.relative_l_inf_dual_residual <= + if (convergence_information.relative_l_inf_dual_residual[idx] <= tolerance.absolute_dual_tolerance && primal_feasible && optimal_gap) { termination_status[idx] = (i_t)pdlp_termination_status_t::Optimal; @@ -337,16 +312,18 @@ __global__ void check_termination_criteria_kernel( { termination_status[idx] = (i_t)pdlp_termination_status_t::PrimalFeasible; return; + } else { + termination_status[idx] = (i_t)pdlp_termination_status_t::NoTermination; } } else { const bool primal_feasible = convergence_information.l2_primal_residual[idx] <= tolerance.absolute_primal_tolerance + tolerance.relative_primal_tolerance * - *convergence_information.l2_norm_primal_right_hand_side; + convergence_information.l2_norm_primal_right_hand_side[idx]; if (convergence_information.l2_dual_residual[idx] <= tolerance.absolute_dual_tolerance + tolerance.relative_dual_tolerance * - *convergence_information.l2_norm_primal_linear_objective && + 
convergence_information.l2_norm_primal_linear_objective[idx] && primal_feasible && optimal_gap) { termination_status[idx] = (i_t)pdlp_termination_status_t::Optimal; return; @@ -393,20 +370,35 @@ bool pdlp_termination_strategy_t::all_optimal_status() const template __host__ __device__ bool pdlp_termination_strategy_t::is_done( - pdlp_termination_status_t termination_status) + pdlp_termination_status_t termination_status, bool accept_primal_feasible) { return termination_status == pdlp_termination_status_t::Optimal || termination_status == pdlp_termination_status_t::PrimalInfeasible || termination_status == pdlp_termination_status_t::DualInfeasible || - termination_status == pdlp_termination_status_t::ConcurrentLimit; + termination_status == pdlp_termination_status_t::ConcurrentLimit || + (accept_primal_feasible && + termination_status == pdlp_termination_status_t::PrimalFeasible); } template -bool pdlp_termination_strategy_t::all_done() const +bool pdlp_termination_strategy_t::all_done(bool accept_primal_feasible) const { - return std::all_of( + return std::all_of(termination_status_.cbegin(), + termination_status_.cend(), + [accept_primal_feasible](i_t termination_status) { + return is_done((pdlp_termination_status_t)termination_status, + accept_primal_feasible); + }); +} + +template +bool pdlp_termination_strategy_t::any_primal_feasible_or_optimal() const +{ + return std::any_of( termination_status_.cbegin(), termination_status_.cend(), [](i_t termination_status) { - return is_done((pdlp_termination_status_t)termination_status); + const auto status = static_cast(termination_status); + return status == pdlp_termination_status_t::Optimal || + status == pdlp_termination_status_t::PrimalFeasible; }); } @@ -436,32 +428,40 @@ __global__ void fill_gpu_terms_stats_kernel( f_t>::gpu_batch_additional_termination_information_t::view_t additional_termination_information, typename convergence_information_t::view_t convergence_information_view, - i_t number_of_steps_taken) + 
i_t number_of_steps_taken, + bool accept_primal_feasible, + bool per_constraint_residual, + bool force_all) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx >= termination_status.size()) { return; } // TODO later batch mode: add infeasibility information here - // TODO later batch mode: handle per climber rhs and objective - // Will be removed store its data in the struct - if (pdlp_termination_strategy_t::is_done( - (pdlp_termination_status_t)termination_status[idx])) { + // Snapshot stats for climbers that just terminated + if (force_all || pdlp_termination_strategy_t::is_done( + (pdlp_termination_status_t)termination_status[idx], accept_primal_feasible)) { const i_t original_index = original_indices[idx]; additional_termination_information.number_of_steps_taken[original_index] = number_of_steps_taken; additional_termination_information.total_number_of_attempted_steps[original_index] = number_of_steps_taken; + // When `per_constraint_residual` is on the primary primal/dual residual stat exposed to + // the user is the per-row `relative_l_inf_*_residual` (the quantity the kernel actually + // checks against the tolerances), mirroring the non-batch `fill_return_problem_solution` + // path. Otherwise the classic L2 residual is reported. additional_termination_information.l2_primal_residual[original_index] = - convergence_information_view.l2_primal_residual[idx]; + per_constraint_residual ? convergence_information_view.relative_l_inf_primal_residual[idx] + : convergence_information_view.l2_primal_residual[idx]; additional_termination_information.l2_relative_primal_residual[original_index] = convergence_information_view.l2_primal_residual[idx] / - (f_t(1.0) + *convergence_information_view.l2_norm_primal_right_hand_side); + (f_t(1.0) + convergence_information_view.l2_norm_primal_right_hand_side[idx]); additional_termination_information.l2_dual_residual[original_index] = - convergence_information_view.l2_dual_residual[idx]; + per_constraint_residual ? 
convergence_information_view.relative_l_inf_dual_residual[idx] + : convergence_information_view.l2_dual_residual[idx]; additional_termination_information.l2_relative_dual_residual[original_index] = convergence_information_view.l2_dual_residual[idx] / - (f_t(1.0) + *convergence_information_view.l2_norm_primal_linear_objective); + (f_t(1.0) + convergence_information_view.l2_norm_primal_linear_objective[idx]); additional_termination_information.primal_objective[original_index] = convergence_information_view.primal_objective[idx]; additional_termination_information.dual_objective[original_index] = @@ -474,23 +474,30 @@ __global__ void fill_gpu_terms_stats_kernel( } template -void pdlp_termination_strategy_t::fill_gpu_terms_stats(i_t number_of_iterations) +void pdlp_termination_strategy_t::fill_gpu_terms_stats(i_t number_of_iterations, + bool force_all) { typename convergence_information_t::view_t convergence_information_view = convergence_information_.view(); - // Update original index pinned view so that we can read it safely from the kernel + // Refresh the local->original index map so the kernel can write to original-index space. + // `climber_strategies_` is reordered by `swap_context`, so this must be rebuilt each call. 
for (size_t i = 0; i < climber_strategies_.size(); ++i) { original_index_[i] = climber_strategies_[i].original_index; } + const bool accept_primal_feasible = + settings_.first_primal_feasible || settings_.all_primal_feasible; const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size()); fill_gpu_terms_stats_kernel<<>>( make_span(termination_status_), make_span(original_index_), gpu_batch_additional_termination_information_.view(), convergence_information_view, - number_of_iterations); + number_of_iterations, + accept_primal_feasible, + settings_.per_constraint_residual, + force_all); RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); } @@ -501,6 +508,9 @@ void pdlp_termination_strategy_t::convert_gpu_terms_stats_to_host( typename optimization_problem_solution_t::additional_termination_information_t>& additional_termination_informations) { + cuopt_assert(additional_termination_informations.size() == + gpu_batch_additional_termination_information_.number_of_steps_taken.size(), + "Additional termination informations size mismatch"); for (size_t i = 0; i < additional_termination_informations.size(); ++i) { additional_termination_informations[i].number_of_steps_taken = gpu_batch_additional_termination_information_.number_of_steps_taken[i]; @@ -558,9 +568,7 @@ pdlp_termination_strategy_t::fill_return_problem_solution( raft::copy(&term_stats_vector[i].l2_primal_residual, (settings_.per_constraint_residual) - ? convergence_information_view - .relative_l_inf_primal_residual // TODO later batch mode: handle per climber - // overall residual + ? convergence_information_view.relative_l_inf_primal_residual.data() + i : convergence_information_view.l2_primal_residual.data() + i, 1, stream_view_); @@ -570,7 +578,7 @@ pdlp_termination_strategy_t::fill_return_problem_solution( raft::copy(&term_stats_vector[i].l2_dual_residual, (settings_.per_constraint_residual) - ? convergence_information_view.relative_l_inf_dual_residual + ? 
convergence_information_view.relative_l_inf_dual_residual.data() + i : convergence_information_view.l2_dual_residual.data() + i, 1, stream_view_); diff --git a/cpp/src/pdlp/termination_strategy/termination_strategy.hpp b/cpp/src/pdlp/termination_strategy/termination_strategy.hpp index efb7a41d7b..5cd43d7be7 100644 --- a/cpp/src/pdlp/termination_strategy/termination_strategy.hpp +++ b/cpp/src/pdlp/termination_strategy/termination_strategy.hpp @@ -56,7 +56,30 @@ class pdlp_termination_strategy_t { objective_coefficients // Only useful if per_constraint_residual ); - // Only useful in batch mode to store information of removed climber faster + // Pinned-memory mirror of `optimization_problem_solution_t::additional_termination_information_t` + // for the whole batch. Used only in batch mode. + // + // Why we need this: + // The convergence stats (primal/dual residuals, objectives, gap, ...) live on the device for + // every climber. When a climber terminates, we need those stats on the host. Doing one + // device->host copy per field per climber would be too slow, especially since climbers may + // terminate at different iterations and their device-side arrays get permuted/shrunk by + // `swap_context` / `resize_context` as the batch evolves. + // Instead, `fill_gpu_terms_stats_kernel` writes every field of every just-terminated climber + // into these pinned vectors at a single, stable slot: the climber's *original* batch index + // (see `original_index_` below). The host eventually bulk-copies the pinned vectors into the + // user- facing `std::vector` in + // `convert_gpu_terms_stats_to_host` without having to know anything about the current + // device-side ordering. + // + // Sizing / indexing invariants: + // - Allocated once with `batch_size == original_batch_size_` and never resized; slot `k` + // always corresponds to original climber `k`, regardless of how many climbers have been + // removed or how device-side arrays have been swapped. 
+ // - `fill_gpu_terms_stats_kernel` must be called every time we want to capture the latest + // numbers for any climber that just became `is_done`, because the underlying device-side + // residual/objective arrays are reshuffled by `swap_context` / `resize_context` and would + // otherwise be lost on the next batch resize. struct gpu_batch_additional_termination_information_t { gpu_batch_additional_termination_information_t(size_t batch_size) : number_of_steps_taken(batch_size), @@ -128,23 +151,37 @@ class pdlp_termination_strategy_t { void swap_context(const thrust::universal_host_pinned_vector>& swap_pairs); void resize_context(i_t new_size); - void fill_gpu_terms_stats(i_t number_of_iterations); + // Snapshot the device-side convergence stats for every climber that just became `is_done` into + // the pinned `gpu_batch_additional_termination_information_` mirror, indexed by the climber's + // original batch index. Must be called before any subsequent `swap_context` / + // `resize_context`, otherwise the underlying device-side stats arrays get permuted/truncated + // and the corresponding climber's numbers are lost. + void fill_gpu_terms_stats(i_t number_of_iterations, bool force_all = false); + + // Bulk-copy the pinned `gpu_batch_additional_termination_information_` mirror into the user- + // facing host vector `additional_termination_informations`, slot-by-slot. + // + // Both `additional_termination_informations` and the pinned mirror are sized to + // `original_batch_size_` and indexed by *original* climber id, so this is a straight 1:1 copy. + // No remapping via `original_index_` is needed here -- the kernel already wrote into + // original-index space when filling the pinned mirror. + // + // Must be called before doing the final return. 
void convert_gpu_terms_stats_to_host( std::vector< typename optimization_problem_solution_t::additional_termination_information_t>& additional_termination_informations); - void set_relative_dual_tolerance_factor(f_t dual_tolerance_factor); void set_relative_primal_tolerance_factor(f_t primal_tolerance_factor); - f_t get_relative_dual_tolerance_factor() const; - f_t get_relative_primal_tolerance_factor() const; pdlp_termination_status_t get_termination_status(i_t id) const; void set_termination_status(i_t id, pdlp_termination_status_t status); std::vector get_terminations_status(); bool all_optimal_status() const; - bool all_done() const; - static __host__ __device__ bool is_done(pdlp_termination_status_t term); + bool all_done(bool accept_primal_feasible = false) const; + bool any_primal_feasible_or_optimal() const; + static __host__ __device__ bool is_done(pdlp_termination_status_t term, + bool accept_primal_feasible = false); bool has_optimal_status() const; i_t nb_optimal_solutions() const; i_t get_optimal_solution_id() const; @@ -186,7 +223,14 @@ class pdlp_termination_strategy_t { thrust::universal_host_pinned_vector termination_status_; const pdlp_solver_settings_t& settings_; + // Pinned-memory mirror of the per-climber stats. See the docs on + // `gpu_batch_additional_termination_information_t` above. Sized to `original_batch_size_` and + // never resized; slot `k` always corresponds to original climber `k`. gpu_batch_additional_termination_information_t gpu_batch_additional_termination_information_; + // Maps a *current* (post-removal) climber slot `i` to its *original* batch index. + // Refreshed before each `fill_gpu_terms_stats` from `climber_strategies_[i].original_index`. + // The kernel uses it as a destination remap so that the pinned mirror stays in original-index + // space across resizes/swaps. 
thrust::universal_host_pinned_vector original_index_; const std::vector& climber_strategies_; diff --git a/cpp/src/pdlp/translate.hpp b/cpp/src/pdlp/translate.hpp index b143a206d4..cb2bb3bbba 100644 --- a/cpp/src/pdlp/translate.hpp +++ b/cpp/src/pdlp/translate.hpp @@ -16,6 +16,80 @@ namespace cuopt::linear_programming { +template +static dual_simplex::user_problem_t cuopt_problem_to_simplex_problem( + raft::handle_t const* handle_ptr, const optimization_problem_interface_t& problem) +{ + dual_simplex::user_problem_t user_problem(handle_ptr); + + int m = problem.get_n_constraints(); + int n = problem.get_n_variables(); + auto A_values = problem.get_constraint_matrix_values_host(); + auto A_indices = problem.get_constraint_matrix_indices_host(); + auto A_offsets = problem.get_constraint_matrix_offsets_host(); + user_problem.num_rows = m; + user_problem.num_cols = n; + user_problem.objective = problem.get_objective_coefficients_host(); + + dual_simplex::csr_matrix_t csr_A(m, n, static_cast(A_values.size())); + csr_A.x = std::move(A_values); + csr_A.j = std::move(A_indices); + csr_A.row_start = std::move(A_offsets); + + csr_A.to_compressed_col(user_problem.A); + + user_problem.rhs.resize(m); + user_problem.row_sense.resize(m); + user_problem.range_rows.clear(); + user_problem.range_value.clear(); + + auto constraint_lower_bounds = problem.get_constraint_lower_bounds_host(); + auto constraint_upper_bounds = problem.get_constraint_upper_bounds_host(); + + for (int i = 0; i < m; ++i) { + const f_t constraint_lower_bound = constraint_lower_bounds[i]; + const f_t constraint_upper_bound = constraint_upper_bounds[i]; + if (constraint_lower_bound == constraint_upper_bound) { + user_problem.row_sense[i] = 'E'; + user_problem.rhs[i] = constraint_lower_bound; + } else if (constraint_upper_bound == std::numeric_limits::infinity()) { + user_problem.row_sense[i] = 'G'; + user_problem.rhs[i] = constraint_lower_bound; + } else if (constraint_lower_bound == 
-std::numeric_limits::infinity()) { + user_problem.row_sense[i] = 'L'; + user_problem.rhs[i] = constraint_upper_bound; + } else { + user_problem.row_sense[i] = 'E'; + user_problem.rhs[i] = constraint_lower_bound; + user_problem.range_rows.push_back(i); + user_problem.range_value.push_back(constraint_upper_bound - constraint_lower_bound); + } + } + user_problem.num_range_rows = user_problem.range_rows.size(); + user_problem.lower = problem.get_variable_lower_bounds_host(); + user_problem.upper = problem.get_variable_upper_bounds_host(); + user_problem.problem_name = problem.get_problem_name(); + user_problem.row_names = problem.get_row_names(); + user_problem.col_names = problem.get_variable_names(); + user_problem.obj_constant = problem.get_objective_offset(); + user_problem.obj_scale = problem.get_sense() ? f_t(-1) : f_t(1); + user_problem.var_types.resize(n); + + auto variable_types = problem.get_variable_types_host(); + for (int j = 0; j < n; ++j) { + user_problem.var_types[j] = + variable_types[j] == var_t::CONTINUOUS + ? 
cuopt::linear_programming::dual_simplex::variable_type_t::CONTINUOUS + : cuopt::linear_programming::dual_simplex::variable_type_t::INTEGER; + } + + user_problem.Q_offsets = problem.get_quadratic_objective_offsets(); + user_problem.Q_indices = problem.get_quadratic_objective_indices(); + user_problem.Q_values = problem.get_quadratic_objective_values(); + + return user_problem; +} + template static dual_simplex::user_problem_t cuopt_problem_to_simplex_problem( raft::handle_t const* handle_ptr, detail::problem_t& model) @@ -76,7 +150,11 @@ static dual_simplex::user_problem_t cuopt_problem_to_simplex_problem( if (model.row_names.size() > 0) { user_problem.row_names.resize(m); for (int i = 0; i < m; ++i) { - user_problem.row_names[i] = model.row_names[i]; + if (i < (int)model.row_names.size()) { + user_problem.row_names[i] = model.row_names[i]; + } else { + user_problem.row_names[i] = "c" + std::to_string(i); + } } } if (model.var_names.size() > 0) { diff --git a/cpp/src/pdlp/utilities/problem_checking.cu b/cpp/src/pdlp/utilities/problem_checking.cu index b10850de27..35483c9c8e 100644 --- a/cpp/src/pdlp/utilities/problem_checking.cu +++ b/cpp/src/pdlp/utilities/problem_checking.cu @@ -70,8 +70,17 @@ void problem_checking_t::check_initial_primal_representation( thrust::make_counting_iterator(0) + op_problem.get_n_variables(), [lower_bounds = make_span(op_problem.get_variable_lower_bounds()), upper_bounds = make_span(op_problem.get_variable_upper_bounds()), + variable_types = make_span(op_problem.get_variable_types()), assignment_span = make_span(primal_initial_solution), int_tol = 1e-8] __device__(i_t idx) -> bool { + if (variable_types[idx] == var_t::SEMI_CONTINUOUS) { + const bool is_off = assignment_span[idx] >= -int_tol && + assignment_span[idx] <= int_tol; + const bool is_on = + assignment_span[idx] >= lower_bounds[idx] - int_tol && + assignment_span[idx] <= upper_bounds[idx] + int_tol; + return !is_off && !is_on; + } return assignment_span[idx] < lower_bounds[idx] 
- int_tol || assignment_span[idx] > upper_bounds[idx] + int_tol; }), @@ -217,6 +226,33 @@ void problem_checking_t::check_problem_representation( op_problem.get_objective_coefficients().size(), op_problem.get_variable_upper_bounds().size()); } + if (!op_problem.get_variable_types().is_empty()) { + cuopt_expects( + op_problem.get_variable_types().size() == op_problem.get_objective_coefficients().size(), + error_type_t::ValidationError, + "Sizes for vectors related to the variables are not the same. The objective " + "vector has size %zu and the variable types vector has size %zu.", + op_problem.get_objective_coefficients().size(), + op_problem.get_variable_types().size()); + + if (!op_problem.get_variable_lower_bounds().is_empty() && + !op_problem.get_variable_upper_bounds().is_empty()) { + const bool sc_bounds_valid = thrust::all_of( + op_problem.get_handle_ptr()->get_thrust_policy(), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator( + static_cast(op_problem.get_variable_types().size())), + [var_types = make_span(op_problem.get_variable_types()), + var_lb = make_span(op_problem.get_variable_lower_bounds()), + var_ub = make_span(op_problem.get_variable_upper_bounds())] __device__(i_t i) -> bool { + return var_types[i] != var_t::SEMI_CONTINUOUS || + (var_lb[i] >= f_t(0) && var_lb[i] <= var_ub[i]); + }); + cuopt_expects(sc_bounds_valid, + error_type_t::ValidationError, + "Semi-continuous variable must satisfy 0 <= lower bound <= upper bound."); + } + } // Check constraints sizes cuopt_expects( diff --git a/cpp/src/pdlp/utils.cuh b/cpp/src/pdlp/utils.cuh index 138c9c2ab9..3f589da470 100644 --- a/cpp/src/pdlp/utils.cuh +++ b/cpp/src/pdlp/utils.cuh @@ -24,6 +24,9 @@ #include #include +#include +#include +#include #include #include @@ -213,66 +216,87 @@ static inline auto problem_wrap_container(const rmm::device_uvector& in) problem_wrapped_iterator(in.data(), in.size())); } +// Used when one scalar applies to each contiguous problem block in a batched 
vector: +// [problem_0 block][problem_1 block]... +template +struct batch_wrapped_iterator { + batch_wrapped_iterator(const f_t* problem_input, int problem_size) + : problem_input_(problem_input), problem_size_(problem_size) + { + } + HDI f_t operator()(int id) { return problem_input_[id / problem_size_]; } + + const f_t* problem_input_; + // TODO use i_t + int problem_size_; +}; + +template +static inline auto batch_wrapped_container(const rmm::device_uvector& in, int problem_size) +{ + return thrust::make_transform_iterator(thrust::make_counting_iterator(0), + batch_wrapped_iterator(in.data(), problem_size)); +} + template struct power_two_func_t { HDI f_t operator()(f_t val) { return val * val; } }; +template +struct sqrt_func_t { + HDI f_t operator()(f_t val) { return raft::sqrt(val); } +}; + +// Per-element contribution to the sum-of-squares used to form the L2 norm of the RHS. +// Mirrors compute_sum_bounds' main_op: add lower^2 only when finite and lower != upper, +// and add upper^2 when finite. 
+template +struct rhs_sum_of_squares_t { + HDI f_t operator()(const thrust::tuple& t) const + { + const f_t lower = thrust::get<0>(t); + const f_t upper = thrust::get<1>(t); + f_t sum = f_t(0); + if (isfinite(lower) && (lower != upper)) sum += lower * lower; + if (isfinite(upper)) sum += upper * upper; + return sum; + } +}; + template void inline combine_constraint_bounds(const problem_t& op_problem, - rmm::device_uvector& combined_bounds, - bool batch_mode = false) + rmm::device_uvector& combined_bounds) { - if (!batch_mode) { - combined_bounds.resize(op_problem.n_constraints, op_problem.handle_ptr->get_stream()); - if (combined_bounds.size() > 0) { - raft::linalg::binaryOp(combined_bounds.data(), - op_problem.constraint_lower_bounds.data(), - op_problem.constraint_upper_bounds.data(), - op_problem.n_constraints, - combine_finite_abs_bounds(), - op_problem.handle_ptr->get_stream()); - } - } else { - // In batch mode we use combined_constraint_bounds in convergeance_information to fill the - // primal residual which will be bigger - cuopt_assert(combined_bounds.size() % op_problem.n_constraints == 0, - "combined_bounds size must be a multiple of op_problem.n_constraints"); - // TODO later batch mode: different constraint bounds - cub::DeviceTransform::Transform( - cuda::std::make_tuple(problem_wrap_container(op_problem.constraint_lower_bounds), - problem_wrap_container(op_problem.constraint_upper_bounds)), - combined_bounds.data(), - combined_bounds.size(), - combine_finite_abs_bounds(), - op_problem.handle_ptr->get_stream()); - } + cuopt_assert( + op_problem.constraint_lower_bounds.size() == op_problem.constraint_upper_bounds.size(), + "constraint_lower_bounds and constraint_upper_bounds must have the same size"); + combined_bounds.resize(op_problem.constraint_lower_bounds.size(), + op_problem.handle_ptr->get_stream()); + cub::DeviceTransform::Transform(cuda::std::make_tuple(op_problem.constraint_lower_bounds.data(), + op_problem.constraint_upper_bounds.data()), + 
combined_bounds.data(), + combined_bounds.size(), + combine_finite_abs_bounds(), + op_problem.handle_ptr->get_stream()); } template void inline compute_sum_bounds(const rmm::device_uvector& constraint_lower_bounds, const rmm::device_uvector& constraint_upper_bounds, - rmm::device_scalar& out, + f_t* out, rmm::cuda_stream_view stream_view) { rmm::device_buffer d_temp_storage; size_t bytes = 0; - auto main_op = [] HD(const thrust::tuple t) { - const f_t lower = thrust::get<0>(t); - const f_t upper = thrust::get<1>(t); - f_t sum = f_t(0); - if (isfinite(lower) && (lower != upper)) sum += lower * lower; - if (isfinite(upper)) sum += upper * upper; - return sum; - }; cub::DeviceReduce::TransformReduce( nullptr, bytes, thrust::make_zip_iterator(constraint_lower_bounds.data(), constraint_upper_bounds.data()), - out.data(), + thrust::make_transform_output_iterator(out, sqrt_func_t{}), constraint_lower_bounds.size(), cuda::std::plus<>{}, - main_op, + rhs_sum_of_squares_t{}, f_t(0), stream_view); @@ -282,20 +306,24 @@ void inline compute_sum_bounds(const rmm::device_uvector& constraint_lower_ d_temp_storage.data(), bytes, thrust::make_zip_iterator(constraint_lower_bounds.data(), constraint_upper_bounds.data()), - out.data(), + thrust::make_transform_output_iterator(out, sqrt_func_t{}), constraint_lower_bounds.size(), cuda::std::plus<>{}, - main_op, + rhs_sum_of_squares_t{}, f_t(0), stream_view); - - const f_t res = std::sqrt(out.value(stream_view)); - out.set_value_async(res, stream_view); - - // Sync since we are using local variable RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view)); } +template +void inline compute_sum_bounds(const rmm::device_uvector& constraint_lower_bounds, + const rmm::device_uvector& constraint_upper_bounds, + rmm::device_scalar& out, + rmm::cuda_stream_view stream_view) +{ + compute_sum_bounds(constraint_lower_bounds, constraint_upper_bounds, out.data(), stream_view); +} + template struct violation { violation() {} @@ -550,7 +578,8 @@ void inline 
my_l2_norm(const rmm::device_uvector& input_vector, } template -void inline my_l2_weighted_norm(const rmm::device_uvector& input_vector, +void inline my_l2_weighted_norm(const f_t* input_vector, + size_t size, f_t weight, rmm::device_scalar& result, rmm::cuda_stream_view stream) @@ -558,8 +587,8 @@ void inline my_l2_weighted_norm(const rmm::device_uvector& input_vector, auto fin_op = [] __device__(f_t in) { return raft::sqrt(in); }; auto main_op = [weight] __device__(f_t in, i_t _) { return in * in * weight; }; raft::linalg::reduce(result.data(), - input_vector.data(), - (i_t)input_vector.size(), + input_vector, + (i_t)size, 1, f_t(0.0), stream, @@ -569,6 +598,15 @@ void inline my_l2_weighted_norm(const rmm::device_uvector& input_vector, fin_op); } +template +void inline my_l2_weighted_norm(rmm::device_uvector& input_vector, + f_t weight, + rmm::device_scalar& result, + rmm::cuda_stream_view stream) +{ + my_l2_weighted_norm(input_vector.data(), input_vector.size(), weight, result, stream); +} + template struct is_nan_or_inf { __device__ bool operator()(const f_t x) { return isnan(x) || isinf(x); } @@ -579,9 +617,9 @@ template struct relative_residual_t { __device__ f_t operator()(const thrust::tuple& t) const { - const f_t residual = thrust::get<0>(t); + const f_t residual = raft::abs(thrust::get<0>(t)); // Rhs for either primal (b) and dual (c) - const f_t rhs = thrust::get<1>(t); + const f_t rhs = raft::abs(thrust::get<1>(t)); // Used for best primal so far, count how many constraints are violated if (abs_.has_value() && nb_violated_constraints_.has_value()) { @@ -614,6 +652,7 @@ void inline my_inf_norm(const rmm::device_uvector& input_vector, cub::DeviceReduce::Max(d_temp, temp_bytes, abs_iter, result, n, stream); rmm::device_buffer temp_buf(temp_bytes, stream); cub::DeviceReduce::Max(temp_buf.data(), temp_bytes, abs_iter, result, n, stream); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } template diff --git a/cpp/src/routing/data_model_view.cu 
b/cpp/src/routing/data_model_view.cu index f18251fb82..4b8bdd446f 100644 --- a/cpp/src/routing/data_model_view.cu +++ b/cpp/src/routing/data_model_view.cu @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -294,20 +294,46 @@ void data_model_view_t::set_skip_first_trips(bool const* skip_first_tr template void data_model_view_t::add_vehicle_order_match(const i_t vehicle_id, i_t const* orders, - const i_t norders) + const i_t norders, + bool validate_input) { + cuopt_expects(vehicle_id >= 0 && vehicle_id < fleet_size_, + error_type_t::ValidationError, + "vehicle_id in vehicle_order_match must be in [0, fleet size)"); + cuopt_expects(norders > 0, + error_type_t::ValidationError, + "number of orders in vehicle_order_match must be positive"); cuopt_expects( orders != nullptr, error_type_t::ValidationError, "vehicle_order_match cannot be null"); + if (validate_input) { + cuopt_expects( + detail::check_min_max_values(orders, norders, 0, num_orders_, handle_ptr_->get_stream()), + error_type_t::ValidationError, + "orders in vehicle_order_match must be in [0, num_orders]"); + } vehicle_order_match_[vehicle_id] = raft::device_span(orders, norders); } template void data_model_view_t::add_order_vehicle_match(const i_t order_id, i_t const* vehicles, - const i_t nvehicles) + const i_t nvehicles, + bool validate_input) { + cuopt_expects(order_id >= 0 && order_id < num_orders_, + error_type_t::ValidationError, + "order_id in order_vehicle_match must be in [0, num_orders)"); + cuopt_expects(nvehicles > 0, + error_type_t::ValidationError, + "number of vehicles in order_vehicle_match must be positive"); cuopt_expects( vehicles != nullptr, error_type_t::ValidationError, "order_vehicle_match cannot be null"); + if 
(validate_input) { + cuopt_expects(detail::check_min_max_values( + vehicles, nvehicles, 0, fleet_size_ - 1, handle_ptr_->get_stream()), + error_type_t::ValidationError, + "vehicles in order_vehicle_match must be in [0, fleet size)"); + } order_vehicle_match_[order_id] = raft::device_span(vehicles, nvehicles); } diff --git a/cpp/src/routing/ges/lexicographic_search/node_stack.cuh b/cpp/src/routing/ges/lexicographic_search/node_stack.cuh index 19e06a6e2c..0f0263261e 100644 --- a/cpp/src/routing/ges/lexicographic_search/node_stack.cuh +++ b/cpp/src/routing/ges/lexicographic_search/node_stack.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -123,8 +123,8 @@ struct node_stack_t { double transit_time_forward; double latest_arrival_forward; double unavoidable_wait_forward; - f_t departure_forward; - f_t excess_forward; + double departure_forward; + double excess_forward; i_t intra_idx; i_t from_idx; // TODO later we might use multiple node inheritence, but for now this will be in shared memory diff --git a/cpp/src/routing/ges_solver.cu b/cpp/src/routing/ges_solver.cu index 194f73b99e..a660f84909 100644 --- a/cpp/src/routing/ges_solver.cu +++ b/cpp/src/routing/ges_solver.cu @@ -16,8 +16,6 @@ #include "adapters/assignment_adapter.cuh" #include "ges/guided_ejection_search.cuh" -#include - namespace cuopt { namespace routing { diff --git a/cpp/src/routing/local_search/compute_compatible.cu b/cpp/src/routing/local_search/compute_compatible.cu index 8386cb087b..457e970632 100644 --- a/cpp/src/routing/local_search/compute_compatible.cu +++ b/cpp/src/routing/local_search/compute_compatible.cu @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -9,6 +9,8 @@ #include "compute_compatible.cuh" #include "local_search.cuh" +#include +#include #include #include diff --git a/cpp/src/routing/route/break_route.cuh b/cpp/src/routing/route/break_route.cuh index 68ab015646..1d5b3472f9 100644 --- a/cpp/src/routing/route/break_route.cuh +++ b/cpp/src/routing/route/break_route.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -17,6 +17,8 @@ #include +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/route/capacity_route.cuh b/cpp/src/routing/route/capacity_route.cuh index a39ef46a93..388e573c1c 100644 --- a/cpp/src/routing/route/capacity_route.cuh +++ b/cpp/src/routing/route/capacity_route.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -17,6 +17,9 @@ #include #include + +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/route/dimensions_route.cuh b/cpp/src/routing/route/dimensions_route.cuh index d1131ea550..bc08ba9819 100644 --- a/cpp/src/routing/route/dimensions_route.cuh +++ b/cpp/src/routing/route/dimensions_route.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -30,6 +30,8 @@ #include #include +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/route/distance_route.cuh b/cpp/src/routing/route/distance_route.cuh index e01c552080..a5f98c13ce 100644 --- a/cpp/src/routing/route/distance_route.cuh +++ b/cpp/src/routing/route/distance_route.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -17,6 +17,8 @@ #include +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/route/mismatch_route.cuh b/cpp/src/routing/route/mismatch_route.cuh index d72f01735a..78975750e0 100644 --- a/cpp/src/routing/route/mismatch_route.cuh +++ b/cpp/src/routing/route/mismatch_route.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -15,6 +15,8 @@ #include +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/route/pdp_route.cuh b/cpp/src/routing/route/pdp_route.cuh index dc9b8ad699..dd20e2fec3 100644 --- a/cpp/src/routing/route/pdp_route.cuh +++ b/cpp/src/routing/route/pdp_route.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -17,6 +17,8 @@ #include +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/route/prize_route.cuh b/cpp/src/routing/route/prize_route.cuh index 0330d14590..80b27061b5 100644 --- a/cpp/src/routing/route/prize_route.cuh +++ b/cpp/src/routing/route/prize_route.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -17,6 +17,8 @@ #include +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/route/route.cuh b/cpp/src/routing/route/route.cuh index e6367a4836..b624acb903 100644 --- a/cpp/src/routing/route/route.cuh +++ b/cpp/src/routing/route/route.cuh @@ -11,6 +11,8 @@ #include +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/route/service_time_route.cuh b/cpp/src/routing/route/service_time_route.cuh index b35e53c2d8..03c48b2e42 100644 --- a/cpp/src/routing/route/service_time_route.cuh +++ b/cpp/src/routing/route/service_time_route.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -15,6 +15,8 @@ #include #include +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/route/tasks_route.cuh b/cpp/src/routing/route/tasks_route.cuh index 6da9e4372a..3624d647e7 100644 --- a/cpp/src/routing/route/tasks_route.cuh +++ b/cpp/src/routing/route/tasks_route.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -15,6 +15,8 @@ #include +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/route/time_route.cuh b/cpp/src/routing/route/time_route.cuh index bb5ec653e1..21448c4273 100644 --- a/cpp/src/routing/route/time_route.cuh +++ b/cpp/src/routing/route/time_route.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -17,6 +17,8 @@ #include +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/route/tsp_route.cuh b/cpp/src/routing/route/tsp_route.cuh index ee1ba5370c..9b7eeeee56 100644 --- a/cpp/src/routing/route/tsp_route.cuh +++ b/cpp/src/routing/route/tsp_route.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -16,6 +16,8 @@ #include +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/route/vehicle_fixed_cost_route.cuh b/cpp/src/routing/route/vehicle_fixed_cost_route.cuh index 83ea5db481..1e246fbb6e 100644 --- a/cpp/src/routing/route/vehicle_fixed_cost_route.cuh +++ b/cpp/src/routing/route/vehicle_fixed_cost_route.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -14,6 +14,8 @@ #include +#include + namespace cuopt { namespace routing { namespace detail { diff --git a/cpp/src/routing/solution/route_node_map.cuh b/cpp/src/routing/solution/route_node_map.cuh index 25a6c4919b..a4a1b171aa 100644 --- a/cpp/src/routing/solution/route_node_map.cuh +++ b/cpp/src/routing/solution/route_node_map.cuh @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -8,6 +8,7 @@ #pragma once #include +#include #include #include #include diff --git a/cpp/src/routing/structures.hpp b/cpp/src/routing/structures.hpp index 3ee0a6245a..72ee165891 100644 --- a/cpp/src/routing/structures.hpp +++ b/cpp/src/routing/structures.hpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -32,7 +32,7 @@ class __attribute__((aligned(4))) NodeInfo { constexpr NodeInfo(i_t node, i_t location, node_type_t node_type) { cuopt_assert(node < (1 << 17), "node id should be less than 131072"); - cuopt_assert(location < (1 << 15), "location id should be less than 32678"); + cuopt_assert(location < (1 << 15), "location id should be less than 32768"); number_ = (uint32_t)node << 17 | (uint32_t)location << 2 | (uint32_t)node_type; cuopt_assert(is_valid(), "Corner case in NodeInfo struct!"); diff --git a/cpp/src/routing/utilities/check_input.cu b/cpp/src/routing/utilities/check_input.cu index e902f2d460..eccc3179bb 100644 --- a/cpp/src/routing/utilities/check_input.cu +++ b/cpp/src/routing/utilities/check_input.cu @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/utilities/copy_helpers.hpp b/cpp/src/utilities/copy_helpers.hpp index 36a4659059..6aa9efbab8 100644 --- a/cpp/src/utilities/copy_helpers.hpp +++ b/cpp/src/utilities/copy_helpers.hpp @@ -14,6 +14,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/utilities/cuda_helpers.cuh b/cpp/src/utilities/cuda_helpers.cuh index 946099648d..eccf8e1538 100644 --- a/cpp/src/utilities/cuda_helpers.cuh +++ b/cpp/src/utilities/cuda_helpers.cuh @@ -16,8 +16,7 @@ #include #include #include -#include -#include +#include #include namespace cuopt { @@ -175,24 +174,49 @@ HDI To bit_cast(const From& src) return *(To*)(&src); } +/** + * @brief Raises the dynamic shared-memory limit for a CUDA kernel, with caching. + * + * Calls cudaFuncSetAttribute(cudaFuncAttributeMaxDynamicSharedMemorySize) only when + * @p dynamic_request_size exceeds the previously set limit for @p function. The + * per-kernel high-water mark is stored in a process-wide cache so that repeated + * calls with the same or smaller sizes are cheap shared-lock reads. 
+ * + * Thread safety: safe to call concurrently from multiple host threads. + * + * @param function Host pointer to the __global__ kernel function. + * @param dynamic_request_size Requested dynamic shared memory in bytes. + * A value of 0 is a no-op and always returns true. + * @return true if the attribute was successfully set (or was already sufficient). + * @return false if cudaFuncSetAttribute failed (e.g. size exceeds device limit); + * the sticky CUDA error is consumed so it cannot surface later. + */ template inline bool set_shmem_of_kernel(Function* function, size_t dynamic_request_size) { - static std::mutex mtx; + static std::shared_mutex mtx; static std::unordered_map shmem_sizes; if (dynamic_request_size != 0) { dynamic_request_size = raft::alignTo(dynamic_request_size, size_t(1024)); - size_t current_size = shmem_sizes[function]; - if (dynamic_request_size > current_size) { - std::lock_guard lock(mtx); - current_size = shmem_sizes[function]; - if (dynamic_request_size > current_size) { - cudaFuncSetAttribute( - function, cudaFuncAttributeMaxDynamicSharedMemorySize, dynamic_request_size); + { + std::shared_lock rlock(mtx); + auto it = shmem_sizes.find(function); + if (it != shmem_sizes.end() && dynamic_request_size <= it->second) { return true; } + } + + std::unique_lock wlock(mtx); + size_t current_size = shmem_sizes.count(function) ? 
shmem_sizes[function] : 0; + if (dynamic_request_size > current_size) { + auto err = cudaFuncSetAttribute( + function, cudaFuncAttributeMaxDynamicSharedMemorySize, dynamic_request_size); + if (err == cudaSuccess) { shmem_sizes[function] = dynamic_request_size; - return (cudaSuccess == cudaGetLastError()); + return true; + } else { + cudaGetLastError(); // clear sticky error so later RAFT_CHECK_CUDA doesn't catch it + return false; } } } @@ -216,25 +240,10 @@ DI void sorted_insert(T* array, T item, int curr_size, int max_size) inline size_t get_device_memory_size() { - // Otherwise, we need to get the free memory from the device size_t free_mem, total_mem; - cudaMemGetInfo(&free_mem, &total_mem); - - auto res = rmm::mr::get_current_device_resource(); - auto limiting_adaptor = - dynamic_cast*>(res); - // Did we specifiy an explicit memory limit? - if (limiting_adaptor) { - printf("limiting_adaptor->get_allocation_limit(): %fMiB\n", - limiting_adaptor->get_allocation_limit() / (double)1e6); - printf("used_mem: %fMiB\n", limiting_adaptor->get_allocated_bytes() / (double)1e6); - printf("free_mem: %fMiB\n", - (limiting_adaptor->get_allocation_limit() - limiting_adaptor->get_allocated_bytes()) / - (double)1e6); - return std::min(total_mem, limiting_adaptor->get_allocation_limit()); - } else { - return total_mem; - } + RAFT_CUDA_TRY(cudaMemGetInfo(&free_mem, &total_mem)); + // TODO (bdice): Restore limiting adaptor check after updating CCCL to support resource_cast + return total_mem; } } // namespace cuopt diff --git a/cpp/src/utilities/omp_helpers.hpp b/cpp/src/utilities/omp_helpers.hpp index f6e66472dd..a13b9ec887 100644 --- a/cpp/src/utilities/omp_helpers.hpp +++ b/cpp/src/utilities/omp_helpers.hpp @@ -54,6 +54,15 @@ class omp_mutex_t { std::unique_ptr mutex; }; +// Empty class with the same methods as `omp_mutex_t`. This is mainly used for cleanly disabling +// the `omp_mutex_t` via type alias (`lock` and `unlock` are replaced by NOOPs). 
+class fake_omp_mutex_t { + public: + static void lock() {} + static void unlock() {} + static bool try_lock() { return true; } +}; + // Wrapper for omp atomic operations. See // https://www.openmp.org/spec-html/5.1/openmpsu105.html. template @@ -79,44 +88,118 @@ class omp_atomic_t { T operator--() { return fetch_sub(T(1)) - 1; } T operator--(int) { return fetch_sub(T(1)); } - T load() const + // Possible values for memory order: relaxed, acquire, seq_cst + T load(std::memory_order memory_order = std::memory_order::seq_cst) const { T res; + if (memory_order == std::memory_order::relaxed) { +#pragma omp atomic read relaxed + res = val; + } else if (memory_order == std::memory_order::acquire) { +#pragma omp atomic read acquire + res = val; + } else { #pragma omp atomic read - res = val; + res = val; + } return res; } - void store(T new_val) + // Possible values for memory order: relaxed, release, seq_cst + void store(T new_val, std::memory_order memory_order = std::memory_order::seq_cst) { + if (memory_order == std::memory_order::relaxed) { +#pragma omp atomic write relaxed + val = new_val; + } else if (memory_order == std::memory_order::release) { +#pragma omp atomic write release + val = new_val; + } else { #pragma omp atomic write - val = new_val; + val = new_val; + } } - T exchange(T other) + T exchange(T other, std::memory_order memory_order = std::memory_order::seq_cst) { T old; + if (memory_order == std::memory_order::relaxed) { +#pragma omp atomic capture relaxed + { + old = val; + val = other; + } + } else if (memory_order == std::memory_order::acquire) { +#pragma omp atomic capture acquire + { + old = val; + val = other; + } + } else if (memory_order == std::memory_order::release) { +#pragma omp atomic capture release + { + old = val; + val = other; + } + } else if (memory_order == std::memory_order::acq_rel) { +#pragma omp atomic capture acq_rel + { + old = val; + val = other; + } + } else { #pragma omp atomic capture - { - old = val; - val = other; + { + 
old = val; + val = other; + } } return old; } - T fetch_add(T inc) + T fetch_add(T inc, std::memory_order memory_order = std::memory_order::seq_cst) { T old; + if (memory_order == std::memory_order::relaxed) { +#pragma omp atomic capture relaxed + { + old = val; + val += inc; + } + } else if (memory_order == std::memory_order::acquire) { +#pragma omp atomic capture acquire + { + old = val; + val += inc; + } + } else if (memory_order == std::memory_order::release) { +#pragma omp atomic capture release + { + old = val; + val += inc; + } + } else if (memory_order == std::memory_order::acq_rel) { +#pragma omp atomic capture acq_rel + { + old = val; + val += inc; + } + } else { #pragma omp atomic capture - { - old = val; - val += inc; + { + old = val; + val += inc; + } } return old; } T fetch_sub(T inc) { return fetch_add(-inc); } + // Get the underlying value without atomics + T& underlying() { return val; } + T underlying() const { return val; } + private: T val; diff --git a/cpp/src/utilities/producer_sync.hpp b/cpp/src/utilities/producer_sync.hpp index dfc316c24a..afb91a11b6 100644 --- a/cpp/src/utilities/producer_sync.hpp +++ b/cpp/src/utilities/producer_sync.hpp @@ -71,7 +71,7 @@ class producer_sync_t { return registration_complete_; } - /** + /** WARNING: Do not use this within OpenMP. This will cause a deadlock! * Wait until: * 1. registration_complete() has been called, AND * 2. All registered producers have work units >= target_work_units diff --git a/cpp/src/utilities/version_info.cpp b/cpp/src/utilities/version_info.cpp index ec9db5130b..54eb8f48bf 100644 --- a/cpp/src/utilities/version_info.cpp +++ b/cpp/src/utilities/version_info.cpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -165,33 +165,46 @@ static double get_available_memory_gb() void print_version_info() { + bool has_gpu = true; int device_id = 0; - cudaGetDevice(&device_id); - cudaDeviceProp device_prop; - cudaGetDeviceProperties(&device_prop, device_id); - cudaUUID_t uuid = device_prop.uuid; + cudaDeviceProp device_prop{}; char uuid_str[37] = {0}; - snprintf(uuid_str, - sizeof(uuid_str), - "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", - uuid.bytes[0], - uuid.bytes[1], - uuid.bytes[2], - uuid.bytes[3], - uuid.bytes[4], - uuid.bytes[5], - uuid.bytes[6], - uuid.bytes[7], - uuid.bytes[8], - uuid.bytes[9], - uuid.bytes[10], - uuid.bytes[11], - uuid.bytes[12], - uuid.bytes[13], - uuid.bytes[14], - uuid.bytes[15]); - int version = 0; - cudaRuntimeGetVersion(&version); + int version = 0; + + if (cudaGetDevice(&device_id) != cudaSuccess) { + CUOPT_LOG_WARN("No CUDA device available, skipping GPU info"); + has_gpu = false; + } + if (has_gpu && cudaGetDeviceProperties(&device_prop, device_id) != cudaSuccess) { + CUOPT_LOG_WARN("Failed to query CUDA device properties"); + has_gpu = false; + } + if (has_gpu) { + cudaUUID_t uuid = device_prop.uuid; + snprintf(uuid_str, + sizeof(uuid_str), + "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", + (unsigned char)uuid.bytes[0], + (unsigned char)uuid.bytes[1], + (unsigned char)uuid.bytes[2], + (unsigned char)uuid.bytes[3], + (unsigned char)uuid.bytes[4], + (unsigned char)uuid.bytes[5], + (unsigned char)uuid.bytes[6], + (unsigned char)uuid.bytes[7], + (unsigned char)uuid.bytes[8], + (unsigned char)uuid.bytes[9], + (unsigned char)uuid.bytes[10], + (unsigned char)uuid.bytes[11], + (unsigned char)uuid.bytes[12], + (unsigned char)uuid.bytes[13], + (unsigned char)uuid.bytes[14], + (unsigned char)uuid.bytes[15]); + if (cudaRuntimeGetVersion(&version) != cudaSuccess) { + CUOPT_LOG_WARN("Failed to query CUDA runtime version"); + version = 0; + } + 
} int major = version / 1000; int minor = (version % 1000) / 10; CUOPT_LOG_INFO("cuOpt version: %d.%d.%d, git hash: %s, host arch: %s, device archs: %s", @@ -206,13 +219,15 @@ void print_version_info() get_physical_cores(), std::thread::hardware_concurrency(), get_available_memory_gb()); - CUOPT_LOG_INFO("CUDA %d.%d, device: %s (ID %d), VRAM: %.2f GiB", - major, - minor, - device_prop.name, - device_id, - (double)device_prop.totalGlobalMem / (1024.0 * 1024.0 * 1024.0)); - CUOPT_LOG_INFO("CUDA device UUID: %s\n", uuid_str); + if (has_gpu) { + CUOPT_LOG_INFO("CUDA %d.%d, device: %s (ID %d), VRAM: %.2f GiB", + major, + minor, + device_prop.name, + device_id, + (double)device_prop.totalGlobalMem / (1024.0 * 1024.0 * 1024.0)); + CUOPT_LOG_INFO("CUDA device UUID: %s\n", uuid_str); + } } } // namespace cuopt diff --git a/cpp/src/utilities/work_unit_scheduler.cpp b/cpp/src/utilities/work_unit_scheduler.cpp index b0e5c5f12f..37744fe088 100644 --- a/cpp/src/utilities/work_unit_scheduler.cpp +++ b/cpp/src/utilities/work_unit_scheduler.cpp @@ -15,18 +15,13 @@ * limitations under the License. 
*/ -#include "work_unit_scheduler.hpp" - -#include "work_limit_context.hpp" +#include +#include #include #include #include -#include - -#include - namespace cuopt { work_unit_scheduler_t::work_unit_scheduler_t(double sync_interval) : sync_interval_(sync_interval) diff --git a/cpp/src/utilities/work_unit_scheduler.hpp b/cpp/src/utilities/work_unit_scheduler.hpp index 84e7b95fab..8d238c28a6 100644 --- a/cpp/src/utilities/work_unit_scheduler.hpp +++ b/cpp/src/utilities/work_unit_scheduler.hpp @@ -16,7 +16,8 @@ */ #pragma once -#include +#include + #include #include @@ -56,14 +57,14 @@ class work_unit_scheduler_t { double sync_interval_; std::vector> contexts_; - size_t barrier_generation_{0}; + omp_atomic_t barrier_generation_{0}; double current_sync_target_{0}; // Sync callback - executed when all contexts reach sync point sync_callback_t sync_callback_; // Shutdown flag - prevents threads from entering barriers after termination is signaled - std::atomic shutdown_{false}; + omp_atomic_t shutdown_{false}; }; // RAII helper for registering multiple contexts with automatic cleanup diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index a73a3361ce..2c1aa5be73 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -1,4 +1,4 @@ -# cmake-format: off +# cmake-format: off # SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # cmake-format: on @@ -34,15 +34,12 @@ endif() set(CUOPT_TEST_DIR ${CMAKE_CURRENT_SOURCE_DIR}) # ################################################################ ------------------------------------------------------------------ +# ConfigureTest(NAME source1.cu source2.cu [LABELS label1 label2 ...]) +# +# LABELS sets CTest labels for selective local test execution via `ctest -L