22 changes: 21 additions & 1 deletion .github/workflows/ci.yml
@@ -69,7 +69,7 @@ jobs:
- name: Test
run: |
source /opt/intel/oneapi/setvars.sh
- ./build/test/gtest/spblas-tests
+ ONEMKL_DEVICE_SELECTOR=*:cpu ./build/test/gtest/spblas-tests

macos:
runs-on: 'macos-latest'
@@ -111,3 +111,23 @@ jobs:
shell: bash -l {0}
run: |
./build/test/gtest/spblas-tests

intel-llvm-gpu:
runs-on: 'gpu_intel'
steps:
- uses: actions/checkout@v4
- name: CMake
shell: bash -l {0}
run: |
module load intel-oneapi-compilers intel-oneapi-dpl intel-oneapi-mkl cmake
cmake -B build -DCMAKE_CXX_COMPILER=icpx -DENABLE_ONEMKL_SYCL=ON
- name: Build
shell: bash -l {0}
run: |
module load intel-oneapi-compilers intel-oneapi-dpl intel-oneapi-mkl
make -C build -j `nproc`
- name: Test
shell: bash -l {0}
run: |
module load intel-oneapi-compilers intel-oneapi-dpl intel-oneapi-mkl
ONEMKL_DEVICE_SELECTOR=level_zero:gpu ./build/test/gtest/spblas-gpu-tests
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -22,7 +22,7 @@ include(FetchContent)
if (ENABLE_ONEMKL_SYCL)
find_package(MKL REQUIRED)
target_link_libraries(spblas INTERFACE MKL::MKL_SYCL) # SYCL APIs
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPBLAS_ENABLE_ONEMKL_SYCL")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -DSPBLAS_ENABLE_ONEMKL_SYCL")
endif()

if (ENABLE_ARMPL)
70 changes: 70 additions & 0 deletions include/spblas/vendor/onemkl_sycl/mkl_allocator.hpp
@@ -0,0 +1,70 @@
#pragma once

#include <sycl/sycl.hpp>

namespace spblas {
namespace mkl {

template <typename T, std::size_t Alignment = 0>
class mkl_allocator {
public:
using value_type = T;
using pointer = T*;
using const_pointer = const T*;
using reference = T&;
using const_reference = const T&;
using size_type = std::size_t;
using difference_type = std::ptrdiff_t;
Comment on lines +8 to +17

Contributor:

Why do we have a typename T on the allocator? Don't we have the potential for creating all sorts of things?

Contributor (Author):

It currently follows the std::allocator design. In the operation, we will use allocator<char> to create the workspace.
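A minimal sketch of that workspace pattern, assuming the allocator is used roughly like std::allocator for raw bytes (run_with_workspace and workspace_bytes are illustrative names, not part of the PR):

#include <cstddef>
#include <spblas/vendor/onemkl_sycl/mkl_allocator.hpp>

// Hypothetical sketch: the operation allocates a raw byte workspace through
// mkl_allocator<char> and releases it when done.
void run_with_workspace(spblas::mkl::mkl_allocator<char> alloc,
                        std::size_t workspace_bytes) {
  char* workspace = alloc.allocate(workspace_bytes); // device USM bytes
  // ... hand `workspace` to a routine that needs temporary scratch space ...
  alloc.deallocate(workspace, workspace_bytes);
}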


mkl_allocator() noexcept {
auto* queue = new sycl::queue{sycl::default_selector_v};
Contributor:

Elsewhere we are using sycl::cpu_selector_v right now, but we should probably switch to this default selector.

Contributor:

Ah, I looked further and it seems you are handling that in spmv by using the queue that the state object is constructed with.

Contributor:

@yhmtsai @BenBrock @upsj @YvanMokwinski and any others interested in the design of the state/policy/allocator interactions here: we should probably huddle sometime soon and discuss ownership and interaction of the queue (stream) that indicates device intent between the two objects that will be input into each operation:

  1. Execution policy -- we originally designed around the idea that the policy should hold the queue (stream) for the operation.
  2. State -- holds an optional allocator and any other stateful objects to be reused. The allocator needs a queue (stream), so should the allocator/state take in an execution policy? If the state has its own queue (stream), it seems possible to end up with multiple queues in the spmv operation -- one from the policy and one from the state. Which one is to be used? What if they are different? For sycl::queues, it will affect the ordering of operations; for streams, if you always use the default stream it shouldn't matter so much, but as soon as a user creates their own stream it could go bad.
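A minimal illustrative sketch of one possible resolution, under the assumption that the policy owns the single queue and the state only borrows it (execution_policy and make_state are hypothetical names, and spmv_state_t is only available when SPBLAS_ENABLE_ONEMKL_SYCL is defined):

#include <sycl/sycl.hpp>
#include <spblas/spblas.hpp>

// Hypothetical: the policy owns the one sycl::queue used for ordering.
struct execution_policy {
  sycl::queue q{sycl::default_selector_v};
};

// The state borrows the policy's queue through the non-owning
// spmv_state_t(sycl::queue*) constructor, so the operation never sees two
// competing queues.
spblas::spmv_state_t make_state(execution_policy& policy) {
  return spblas::spmv_state_t{&policy.q};
}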

queue_manager_ =
std::move(std::shared_ptr<sycl::queue>{queue, [](sycl::queue* q) {
q->wait_and_throw();
delete q;
}});
}

// taking a shallow copy of queue from elsewhere, so we don't own destruction
mkl_allocator(sycl::queue* q) noexcept
: queue_manager_(q, [](sycl::queue* q) {}) {}
Comment on lines +29 to +30

Contributor:

Suggested change -- replace

  mkl_allocator(sycl::queue* q) noexcept
      : queue_manager_(q, [](sycl::queue* q) {}) {}

with

  /* taking a shallow copy of queue from elsewhere, so we don't own destruction */
  mkl_allocator(sycl::queue* q) noexcept
      : queue_manager_(q, [](sycl::queue* q) {}) {}

template <typename U>
mkl_allocator(const mkl_allocator<U, Alignment>& other) noexcept
: queue_manager_(other.queue_) {}

mkl_allocator(const mkl_allocator&) = default;
mkl_allocator& operator=(const mkl_allocator&) = default;
~mkl_allocator() = default;

using is_always_equal = std::false_type;

pointer allocate(std::size_t size) {
return sycl::malloc_device<value_type>(size, *(this->queue()));
}

void deallocate(pointer ptr, std::size_t n = 0) {
if (ptr != nullptr) {
sycl::free(ptr, *(this->queue()));
}
}
Comment on lines +42 to +50

Contributor:

It seems that allocate/deallocate should be templated, and not the allocator class, right?
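A minimal sketch of that alternative, purely illustrative (byte_allocator is not part of this PR): the class itself is untemplated and allocate/deallocate are member templates.

#include <cstddef>
#include <sycl/sycl.hpp>

class byte_allocator {
public:
  explicit byte_allocator(sycl::queue* q) : queue_(q) {}

  // Typed allocation without templating the whole class.
  template <typename T>
  T* allocate(std::size_t n) {
    return sycl::malloc_device<T>(n, *queue_);
  }

  template <typename T>
  void deallocate(T* ptr, std::size_t /*n*/ = 0) {
    if (ptr != nullptr) {
      sycl::free(ptr, *queue_);
    }
  }

private:
  sycl::queue* queue_; // non-owning, as in the shallow-copy constructor above
};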


bool operator==(const mkl_allocator&) const = default;
bool operator!=(const mkl_allocator&) const = default;

template <typename U>
struct rebind {
using other = mkl_allocator<U, Alignment>;
};

sycl::queue* queue() const noexcept {
return queue_manager_.get();
}

private:
// using shared_ptr to support copy constructor
std::shared_ptr<sycl::queue> queue_manager_;
};

} // namespace mkl
} // namespace spblas
36 changes: 31 additions & 5 deletions include/spblas/vendor/onemkl_sycl/spmv_impl.hpp
@@ -2,6 +2,7 @@

#include <oneapi/mkl.hpp>

#include "mkl_allocator.hpp"
#include <spblas/detail/log.hpp>
#include <spblas/detail/operation_info_t.hpp>
#include <spblas/detail/ranges.hpp>
@@ -24,28 +25,53 @@

namespace spblas {

class spmv_state_t {
public:
spmv_state_t() : spmv_state_t(mkl::mkl_allocator<char>{}) {}

spmv_state_t(sycl::queue* q) : spmv_state_t(mkl::mkl_allocator<char>{q}) {}

spmv_state_t(mkl::mkl_allocator<char> alloc) : alloc_(alloc) {}

sycl::queue* queue() {
return alloc_.queue();
}

private:
mkl::mkl_allocator<char> alloc_;
};
Comment on lines +28 to +42

Contributor:

Should we switch to using uint8_t instead of char, to signal the 1 byte = 8 bits intent?

Contributor:

Again, I'm not sure we want a template T on the allocator, but rather on the allocate/deallocate member functions, right?


template <matrix A, vector X, vector Y>
requires((__detail::has_csr_base<A> || __detail::has_csc_base<A>) &&
__detail::has_contiguous_range_base<X> &&
__ranges::contiguous_range<Y>)
- void multiply(A&& a, X&& x, Y&& y) {
+ void multiply(spmv_state_t& state, A&& a, X&& x, Y&& y) {
log_trace("");
auto a_base = __detail::get_ultimate_base(a);
auto x_base = __detail::get_ultimate_base(x);

auto alpha_optional = __detail::get_scaling_factor(a, x);
tensor_scalar_t<A> alpha = alpha_optional.value_or(1);

- sycl::queue q(sycl::cpu_selector_v);
+ auto q_ptr = state.queue();

- auto a_handle = __mkl::create_matrix_handle(q, a_base);
+ auto a_handle = __mkl::create_matrix_handle(*q_ptr, a_base);
auto a_transpose = __mkl::get_transpose(a);

- oneapi::mkl::sparse::gemv(q, a_transpose, alpha, a_handle,
+ oneapi::mkl::sparse::gemv(*q_ptr, a_transpose, alpha, a_handle,
__ranges::data(x_base), 0.0, __ranges::data(y))
.wait();

- oneapi::mkl::sparse::release_matrix_handle(q, &a_handle).wait();
+ oneapi::mkl::sparse::release_matrix_handle(*q_ptr, &a_handle).wait();
}

template <matrix A, vector X, vector Y>
requires((__detail::has_csr_base<A> || __detail::has_csc_base<A>) &&
__detail::has_contiguous_range_base<X> &&
__ranges::contiguous_range<Y>)
void multiply(A&& a, X&& x, Y&& y) {
spmv_state_t state;
multiply(state, a, x, y);
}

} // namespace spblas
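A brief usage sketch of the two overloads above (illustrative only; the matrix and vector arguments are assumed to already satisfy the library's concepts, and spmv_state_t is only available with SPBLAS_ENABLE_ONEMKL_SYCL):

#include <sycl/sycl.hpp>
#include <spblas/spblas.hpp>

// Hypothetical helper: reuse one state (one queue / one allocator) across
// repeated SpMV calls, then fall back to the stateless overload.
template <typename Matrix, typename VecIn, typename VecOut>
void run_spmv(sycl::queue& q, Matrix&& a, VecIn&& x, VecOut&& y) {
  spblas::spmv_state_t state{&q};   // non-owning: the caller's queue is reused
  spblas::multiply(state, a, x, y); // stateful overload added in this PR

  spblas::multiply(a, x, y);        // stateless overload: default-constructed
                                    // state creates its own default-selector queue
}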
8 changes: 8 additions & 0 deletions test/gtest/CMakeLists.txt
@@ -21,3 +21,11 @@ target_link_libraries(spblas-tests spblas fmt GTest::gtest_main)

include(GoogleTest)
gtest_discover_tests(spblas-tests)

# unify it together after cusparse
if(ENABLE_ONEMKL_SYCL)
add_executable(spblas-gpu-tests rocsparse/spmv_test.cpp)
target_include_directories(spblas-gpu-tests PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(spblas-gpu-tests spblas fmt GTest::gtest_main)
gtest_discover_tests(spblas-gpu-tests)
endif()
59 changes: 59 additions & 0 deletions test/gtest/onemkl/device_vector.hpp
@@ -0,0 +1,59 @@
#pragma once
#include <iterator>
#include <memory>
#include <spblas/vendor/onemkl_sycl/mkl_allocator.hpp>
#include <sycl/sycl.hpp>
#include <vector>

namespace thrust {

template <typename InputIt, typename OutputIt>
requires(std::contiguous_iterator<InputIt> &&
std::contiguous_iterator<OutputIt>)
OutputIt copy(InputIt first, InputIt last, OutputIt d_first) {
sycl::queue queue(sycl::default_selector_v);
using input_value_type = typename std::iterator_traits<InputIt>::value_type;
using output_value_type = typename std::iterator_traits<OutputIt>::value_type;
input_value_type* first_ptr = std::to_address(first);
output_value_type* d_first_ptr = std::to_address(d_first);
auto num = std::distance(first, last);
queue.memcpy(d_first_ptr, first_ptr, num * sizeof(input_value_type))
.wait_and_throw();
return d_first + num;
}

// incomplete implementation of a thrust-like vector for oneMKL, just for test usage
template <typename ValueType>
class device_vector {
public:
device_vector(std::vector<ValueType> host_vector)
: alloc_{}, size_(host_vector.size()), ptr_(nullptr) {
ptr_ = alloc_.allocate(size_);
thrust::copy(host_vector.begin(), host_vector.end(), ptr_);
}

~device_vector() {
alloc_.deallocate(ptr_, size_);
ptr_ = nullptr;
}

ValueType* begin() {
return ptr_;
}

ValueType* end() {
return ptr_ + size_;
}

// just to give data().get()
std::shared_ptr<ValueType> data() {
return std::shared_ptr<ValueType>(ptr_, [](ValueType* ptr) {});
}

private:
spblas::mkl::mkl_allocator<ValueType> alloc_;
std::size_t size_;
ValueType* ptr_;
};

} // namespace thrust
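A small usage sketch of this shim (a hypothetical test snippet, not part of the diff):

#include <cstddef>
#include <vector>
#include "onemkl/device_vector.hpp"

// Copy host data to the device and expose raw pointers through the
// thrust-like data().get() idiom.
void example_usage() {
  std::vector<float> host{1.0f, 2.0f, 3.0f};
  thrust::device_vector<float> dev(host);

  float* raw = dev.data().get();
  std::size_t n = static_cast<std::size_t>(dev.end() - dev.begin());
  (void)raw;
  (void)n;
}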
5 changes: 5 additions & 0 deletions test/gtest/rocsparse/spmv_test.cpp
@@ -3,7 +3,12 @@
#include <spblas/spblas.hpp>

#include <gtest/gtest.h>

#ifdef SPBLAS_ENABLE_ONEMKL_SYCL
#include "onemkl/device_vector.hpp"
#else
Comment on lines +6 to +9

Contributor:

So this generalizes the rocsparse device spmv_test into a single device spmv_test.cpp for rocsparse/mkl_sycl and, in the future, others? Should we rename the rocsparse/ folder to device/ or accelerator/?

Contributor (Author):

Yes, I put it into #40 (changing it to device). I tend to avoid making the same changes in two PRs because it can make it hard to exchange ideas during review within the same PR. I can move that change into here if this PR moves more quickly.

Contributor:

Can I suggest we shift it to thrust_device? Then if we have other device examples that are specific to, say, SYCL, ROCm, or CUDA, we could have sycl_device, rocm_device, or cuda_device folders as well.

Contributor (Author):

Do the different folders here contain tests with different vector allocation, or backend-specific functions? If you mean backend-specific functions, I might put them into device/<vendor>: device contains the uniform tests for all backends, and device/<vendor> contains the tests for a specific vendor. Both ways are clear, so I do not mind choosing the other one unless it makes the CMake setup worse (I do not think it would, but just in case).

#include <thrust/device_vector.h>
#endif

using value_t = float;
using index_t = spblas::index_t;