Add cusparseSpMVOp backend with compile-time auto-detection (#88)

ZedongPeng · web-flow · commit 7bb53e8d82b2 · 2026-03-16T22:11:02.000-04:00
* working SpMVop

* update

* improve SpMVOp via cupdlpx_spmv_ctx_t

* update feasibility polishing spmv

* Apply formatter

* fix spmvop version issue

* add more CUDA versions to CI

* CI: update cuda-toolkit setting

* update build.yml

* update build.yml

* update SpMV backend in readme

* update version to v0.2.8
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -13,6 +13,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest, windows-latest]
+        cuda: ["12.4.0", "12.5.0", "12.6.0", "12.8.0", "12.9.0", "13.0.0", "13.1.0"]
     runs-on: ${{ matrix.os }}
 
     steps:
@@ -21,7 +22,8 @@ jobs:
       - uses: Jimver/cuda-toolkit@v0.2.30
         id: cuda-toolkit
         with:
-          cuda: "13.1.0"
+          cuda: ${{ matrix.cuda }}
+          linux-local-args: '["--toolkit"]'
 
       - name: CUDA info
         run: |
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -8,10 +8,11 @@ project(cupdlpx LANGUAGES C CXX CUDA)
 
 set(CUPDLPX_VERSION_MAJOR 0)
 set(CUPDLPX_VERSION_MINOR 2)
-set(CUPDLPX_VERSION_PATCH 7)
+set(CUPDLPX_VERSION_PATCH 8)
 
 set(CUPDLPX_VERSION "${CUPDLPX_VERSION_MAJOR}.${CUPDLPX_VERSION_MINOR}.${CUPDLPX_VERSION_PATCH}")
 add_compile_definitions(CUPDLPX_VERSION="${CUPDLPX_VERSION}")
+add_compile_definitions(CUSPARSE_ENABLE_EXPERIMENTAL_API)
 
 if (WIN32)
     set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
diff --git a/README.md b/README.md
@@ -29,6 +29,9 @@ Our work is presented in two papers:
 * **GPU:** NVIDIA GPU with CUDA 12.4+.
 * **Build Tools:** CMake (≥ 3.20), GCC, NVCC.
 
+> **SpMV backend** is selected automatically at compile time based on the cuSPARSE version:
+> - `cusparseSpMV` — CUDA 12.4 through the CUDA 13.1 base release (cuSPARSE < 12.7.3)
+> - `cusparseSpMVOp` — CUDA 13.1 Update 1 and later (cuSPARSE ≥ 12.7.3)
 
 ### Build from Source
 Clone the repository and compile the project using CMake.
diff --git a/internal/cusparse_compat.h b/internal/cusparse_compat.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <cusparse.h>
+
+// cusparseSpMVOp_bufferSize was introduced in cuSPARSE 12.7.3 (CUDA 13.1 Update 1).
+// CUSPARSE_VERSION encoding: major*1000 + minor*100 + patch (e.g. 12.7.3 -> 12703).
+#if defined(CUSPARSE_VERSION) && CUSPARSE_VERSION >= 12703
+#define CUPDLPX_HAS_SPMVOP 1
+#else
+#define CUPDLPX_HAS_SPMVOP 0
+#endif
+
+#if !CUPDLPX_HAS_SPMVOP
+// The SpMVOp types were added to cusparse.h before the functions
+// (e.g. CUDA 13.1 base has the types but not the functions).
+// Only provide fallback typedefs for cuSPARSE versions that lack them entirely.
+#if !defined(CUSPARSE_VERSION) || CUSPARSE_VERSION < 12700
+typedef void *cusparseSpMVOpDescr_t;
+typedef void *cusparseSpMVOpPlan_t;
+#endif
+#endif
diff --git a/internal/internal_types.h b/internal/internal_types.h
@@ -17,6 +17,7 @@ limitations under the License.
 #pragma once
 
 #include "cupdlpx_types.h"
+#include "cusparse_compat.h"
 #include <cublas_v2.h>
 #include <cusparse.h>
 #include <stdbool.h>
@@ -114,19 +115,7 @@ typedef struct
 
     cusparseHandle_t sparse_handle;
     cublasHandle_t blas_handle;
-    size_t spmv_buffer_size;
-    size_t primal_spmv_buffer_size;
-    size_t dual_spmv_buffer_size;
-    void *primal_spmv_buffer;
-    void *dual_spmv_buffer;
-    void *spmv_buffer;
-
-    cusparseSpMatDescr_t matA;
-    cusparseSpMatDescr_t matAt;
-    cusparseDnVecDescr_t vec_primal_sol;
-    cusparseDnVecDescr_t vec_dual_sol;
-    cusparseDnVecDescr_t vec_primal_prod;
-    cusparseDnVecDescr_t vec_dual_prod;
+    void *spmv_ctx;
 
     double *ones_primal_d;
     double *ones_dual_d;
diff --git a/internal/utils.h b/internal/utils.h
@@ -16,6 +16,7 @@ limitations under the License.
 
 #pragma once
 
+#include "cusparse_compat.h"
 #include "internal_types.h"
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
@@ -63,9 +64,6 @@ extern "C"
 
 #define THREADS_PER_BLOCK 256
 
-    extern const double HOST_ONE;
-    extern const double HOST_ZERO;
-
     void *safe_malloc(size_t size);
 
     void *safe_calloc(size_t num, size_t size);
@@ -79,6 +77,45 @@ extern "C"
                                            int max_iterations,
                                            double tolerance);
 
+    bool cupdlpx_use_spmvop_by_default(void);
+
+    void cupdlpx_spmv_buffer_size(cusparseHandle_t sparse_handle,
+                                  cusparseSpMatDescr_t mat,
+                                  cusparseDnVecDescr_t vec_x,
+                                  cusparseDnVecDescr_t vec_y,
+                                  size_t *buffer_size);
+
+    void cupdlpx_spmv_prepare(cusparseHandle_t sparse_handle,
+                              cusparseSpMatDescr_t mat,
+                              cusparseDnVecDescr_t vec_x,
+                              cusparseDnVecDescr_t vec_y,
+                              void *buffer,
+                              void **descr,
+                              void **plan);
+
+    void cupdlpx_spmv_release(void *descr, void *plan);
+
+    void cupdlpx_spmv_execute(cusparseHandle_t sparse_handle,
+                              cusparseSpMatDescr_t mat,
+                              cusparseDnVecDescr_t vec_x,
+                              cusparseDnVecDescr_t vec_y,
+                              void *buffer,
+                              void *plan);
+
+    void *cupdlpx_spmv_ctx_create(cusparseHandle_t sparse_handle,
+                                  const cu_sparse_matrix_csr_t *A,
+                                  const cu_sparse_matrix_csr_t *AT,
+                                  const double *ax_x_init,
+                                  double *ax_y_init,
+                                  const double *atx_x_init,
+                                  double *atx_y_init);
+
+    void cupdlpx_spmv_ctx_destroy(void *ctx);
+
+    void cupdlpx_spmv_Ax(cusparseHandle_t sparse_handle, void *ctx, const double *x, double *y);
+
+    void cupdlpx_spmv_ATx(cusparseHandle_t sparse_handle, void *ctx, const double *x, double *y);
+
     void compute_interaction_and_movement(pdhg_solver_state_t *solver_state, double *interaction, double *movement);
 
     bool should_do_adaptive_restart(pdhg_solver_state_t *solver_state,
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build"
 
 [project]
 name = "cupdlpx"
-version = "0.2.7"
+version = "0.2.8"
 description = "Python bindings for cuPDLPx (GPU-accelerated first-order LP solver)"
 readme = "README.md"
 license = { text = "Apache-2.0" }
diff --git a/python/README.md b/python/README.md
@@ -18,6 +18,11 @@ It provides a high-level, Pythonic API for constructing, modifying, and solving
 - An NVIDIA GPU with CUDA support (≥12.4 required)  
 - A C/C++ toolchain with GCC and NVCC  
 
+> **SpMV backend** is selected automatically at compile time based on the cuSPARSE version:
+> - `cusparseSpMV` — CUDA 12.4 through the CUDA 13.1 base release (cuSPARSE < 12.7.3)
+> - `cusparseSpMVOp` — CUDA 13.1 Update 1 and later (cuSPARSE ≥ 12.7.3)
+
+
 ### Install
 Install from PyPI:
 
@@ -262,4 +267,4 @@ or
 
 ```python
 m.setWarmStart(primal=None, dual=None)
-```
+```
diff --git a/python/cupdlpx/PDLP.py b/python/cupdlpx/PDLP.py
@@ -60,4 +60,4 @@
     # presolve
     "Presolve": "presolve",
     "MatrixZeroTol": "matrix_zero_tol",
-}
+}
diff --git a/src/feasibility_polish.cu b/src/feasibility_polish.cu
@@ -341,6 +341,14 @@ static pdhg_solver_state_t *initialize_primal_feas_polish_state(const pdhg_solve
     primal_state->relative_objective_gap = 0.0;
     primal_state->objective_gap = 0.0;
 
+    primal_state->spmv_ctx = cupdlpx_spmv_ctx_create(primal_state->sparse_handle,
+                                                     primal_state->constraint_matrix,
+                                                     primal_state->constraint_matrix_t,
+                                                     primal_state->pdhg_primal_solution,
+                                                     primal_state->primal_product,
+                                                     primal_state->pdhg_dual_solution,
+                                                     primal_state->dual_product);
+
     return primal_state;
 }
 
@@ -372,6 +380,7 @@ void primal_feas_polish_state_free(pdhg_solver_state_t *state)
     SAFE_CUDA_FREE(state->dual_residual);
     SAFE_CUDA_FREE(state->delta_primal_solution);
     SAFE_CUDA_FREE(state->delta_dual_solution);
+    cupdlpx_spmv_ctx_destroy(state->spmv_ctx);
     free(state);
 }
 
@@ -473,6 +482,15 @@ static pdhg_solver_state_t *initialize_dual_feas_polish_state(const pdhg_solver_
     dual_state->absolute_primal_residual = 0.0;
     dual_state->relative_objective_gap = 0.0;
     dual_state->objective_gap = 0.0;
+
+    dual_state->spmv_ctx = cupdlpx_spmv_ctx_create(dual_state->sparse_handle,
+                                                   dual_state->constraint_matrix,
+                                                   dual_state->constraint_matrix_t,
+                                                   dual_state->pdhg_primal_solution,
+                                                   dual_state->primal_product,
+                                                   dual_state->pdhg_dual_solution,
+                                                   dual_state->dual_product);
+
     return dual_state;
 }
 
@@ -514,6 +532,7 @@ void dual_feas_polish_state_free(pdhg_solver_state_t *state)
     SAFE_CUDA_FREE(state->dual_residual);
     SAFE_CUDA_FREE(state->delta_primal_solution);
     SAFE_CUDA_FREE(state->delta_dual_solution);
+    cupdlpx_spmv_ctx_destroy(state->spmv_ctx);
     free(state);
 }
 
diff --git a/src/solver.cu b/src/solver.cu
diff --git a/src/spmv_backend.cu b/src/spmv_backend.cu
diff --git a/src/utils.cu b/src/utils.cu