Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions src/opencl/cl_algo_registry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,15 @@
#include <core/registry.hpp>
#include <core/top.hpp>

#include <opencl/cl_m_extract_row.hpp>
#include <opencl/cl_m_reduce.hpp>
#include <opencl/cl_mxmT_masked.hpp>
#include <opencl/cl_mxv.hpp>
#include <opencl/cl_v_assign.hpp>
#include <opencl/cl_v_count_mf.hpp>
#include <opencl/cl_v_eadd.hpp>
#include <opencl/cl_v_eadd_fdb.hpp>
#include <opencl/cl_v_emult.hpp>
#include <opencl/cl_v_map.hpp>
#include <opencl/cl_v_reduce.hpp>
#include <opencl/cl_vxm.hpp>
Expand Down Expand Up @@ -93,6 +95,16 @@ namespace spla {
g_registry->add(MAKE_KEY_CL_0("mxmT_masked", INT), std::make_shared<Algo_mxmT_masked_cl<T_INT>>());
g_registry->add(MAKE_KEY_CL_0("mxmT_masked", UINT), std::make_shared<Algo_mxmT_masked_cl<T_UINT>>());
g_registry->add(MAKE_KEY_CL_0("mxmT_masked", FLOAT), std::make_shared<Algo_mxmT_masked_cl<T_FLOAT>>());

// algorthm m_extract_row
g_registry->add(MAKE_KEY_CL_0("m_extract_row", INT), std::make_shared<Algo_m_extract_row_cl<T_INT>>());
g_registry->add(MAKE_KEY_CL_0("m_extract_row", UINT), std::make_shared<Algo_m_extract_row_cl<T_UINT>>());
g_registry->add(MAKE_KEY_CL_0("m_extract_row", FLOAT), std::make_shared<Algo_m_extract_row_cl<T_FLOAT>>());

// algorthm v_emult
g_registry->add(MAKE_KEY_CL_0("v_emult", INT), std::make_shared<Algo_v_emult_cl<T_INT>>());
g_registry->add(MAKE_KEY_CL_0("v_emult", UINT), std::make_shared<Algo_v_emult_cl<T_UINT>>());
g_registry->add(MAKE_KEY_CL_0("v_emult", FLOAT), std::make_shared<Algo_v_emult_cl<T_FLOAT>>());
}

}// namespace spla
120 changes: 120 additions & 0 deletions src/opencl/cl_m_extract_row.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
/**********************************************************************************/
/* This file is part of spla project */
/* https://github.com/SparseLinearAlgebra/spla */
/**********************************************************************************/
/* MIT License */
/* */
/* Copyright (c) 2025 SparseLinearAlgebra */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining a copy */
/* of this software and associated documentation files (the "Software"), to deal */
/* in the Software without restriction, including without limitation the rights */
/* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */
/* copies of the Software, and to permit persons to whom the Software is */
/* furnished to do so, subject to the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be included in all */
/* copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */
/* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */
/* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */
/* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */
/* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */
/* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */
/* SOFTWARE. */
/**********************************************************************************/

#ifndef SPLA_CL_M_EXTRACT_ROW_HPP
#define SPLA_CL_M_EXTRACT_ROW_HPP

#include <schedule/schedule_tasks.hpp>

#include <core/dispatcher.hpp>
#include <core/registry.hpp>
#include <core/tmatrix.hpp>
#include <core/top.hpp>
#include <core/tscalar.hpp>
#include <core/ttype.hpp>
#include <core/tvector.hpp>

#include <opencl/cl_counter.hpp>
#include <opencl/cl_debug.hpp>
#include <opencl/cl_formats.hpp>
#include <opencl/cl_program_builder.hpp>
#include <opencl/generated/auto_m_extract_row.hpp>

namespace spla {

template<typename T>
class Algo_m_extract_row_cl final : public RegistryAlgo {
public:
~Algo_m_extract_row_cl() override = default;

std::string get_name() override {
return "m_extract_row";
}

std::string get_description() override {
return "opencl extract row from matrix";
}

Status execute(const DispatchContext& ctx) override {
auto t = ctx.task.template cast_safe<ScheduleTask_m_extract_row>();

ref_ptr<TVector<T>> r = t->r.template cast_safe<TVector<T>>();
ref_ptr<TMatrix<T>> M = t->M.template cast_safe<TMatrix<T>>();
auto op_apply = t->op_apply.template cast_safe<TOpUnary<T, T>>();

r->validate_wd(FormatVector::AccDense);
M->validate_rw(FormatMatrix::AccCsr);

auto* p_cl_r = r->template get<CLDenseVec<T>>();
auto* p_cl_M = M->template get<CLCsr<T>>();
auto* p_cl_acc = get_acc_cl();
auto& queue = p_cl_acc->get_queue_default();

// get the row boundaries from M->Ap
uint row_bounds[2];
cl::Buffer cl_row_bounds(p_cl_acc->get_context(),
CL_MEM_READ_ONLY | CL_MEM_HOST_READ_ONLY | CL_MEM_USE_HOST_PTR,
sizeof(row_bounds), row_bounds);

queue.enqueueCopyBuffer(p_cl_M->Ap, cl_row_bounds, t->index * sizeof(uint), 0, sizeof(row_bounds));
queue.finish();

std::shared_ptr<CLProgram> program;
ensure_kernel(op_apply, program);

auto kernel = program->make_kernel("extract_row");
kernel.setArg(0, p_cl_r->Ax);
kernel.setArg(1, p_cl_M->Ax);
kernel.setArg(2, p_cl_M->Aj);
kernel.setArg(3, row_bounds[1]);

// amount of elements in the row
const uint n = row_bounds[1] - row_bounds[0] - 1;

cl::NDRange global(p_cl_acc->get_default_wgs() * div_up_clamp(n, p_cl_acc->get_default_wgs(), 1u, 1024u));
cl::NDRange local(p_cl_acc->get_default_wgs());
queue.enqueueNDRangeKernel(kernel, cl::NDRange(row_bounds[0]), global, local);

return Status::Ok;
}

private:
void ensure_kernel(const ref_ptr<TOpUnary<T, T>>& op_apply, std::shared_ptr<CLProgram>& program) {
CLProgramBuilder program_builder;
program_builder
.set_name("m_extract_row")
.add_type("TYPE", get_ttype<T>().template as<Type>())
.add_op("OP_APPLY", op_apply.template as<OpUnary>())
.set_source(source_m_extract_row)
.acquire();
program = program_builder.get_program();
}
};

}// namespace spla

#endif//SPLA_CL_M_EXTRACT_ROW_HPP
128 changes: 128 additions & 0 deletions src/opencl/cl_v_emult.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
/**********************************************************************************/
/* This file is part of spla project */
/* https://github.com/JetBrains-Research/spla */
/**********************************************************************************/
/* MIT License */
/* */
/* Copyright (c) 2025 SparseLinearAlgebra */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining a copy */
/* of this software and associated documentation files (the "Software"), to deal */
/* in the Software without restriction, including without limitation the rights */
/* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */
/* copies of the Software, and to permit persons to whom the Software is */
/* furnished to do so, subject to the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be included in all */
/* copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */
/* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */
/* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */
/* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */
/* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */
/* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */
/* SOFTWARE. */
/**********************************************************************************/

#ifndef SPLA_CL_VECTOR_EMULT_HPP
#define SPLA_CL_VECTOR_EMULT_HPP

#include <schedule/schedule_tasks.hpp>

#include <core/dispatcher.hpp>
#include <core/registry.hpp>
#include <core/top.hpp>
#include <core/tscalar.hpp>
#include <core/ttype.hpp>
#include <core/tvector.hpp>

#include <opencl/cl_counter.hpp>
#include <opencl/cl_fill.hpp>
#include <opencl/cl_formats.hpp>
#include <opencl/generated/auto_vector_emult.hpp>

namespace spla {

template<typename T>
class Algo_v_emult_cl final : public RegistryAlgo {
public:
~Algo_v_emult_cl() override = default;

std::string get_name() override {
return "v_emult";
}

std::string get_description() override {
return "parallel vector element-wise mult on opencl device";
}

Status execute(const DispatchContext& ctx) override {
auto t = ctx.task.template cast_safe<ScheduleTask_v_emult>();
ref_ptr<TVector<T>> u = t->u.template cast_safe<TVector<T>>();
ref_ptr<TVector<T>> v = t->v.template cast_safe<TVector<T>>();

if (u->is_valid(FormatVector::AccDense) && v->is_valid(FormatVector::AccDense)) {
return execute_dn2dn(ctx);
}

return execute_dn2dn(ctx);
}

private:
Status execute_dn2dn(const DispatchContext& ctx) {
TIME_PROFILE_SCOPE("cl/vector_emult_dn2dn");

auto t = ctx.task.template cast_safe<ScheduleTask_v_emult>();
ref_ptr<TVector<T>> r = t->r.template cast_safe<TVector<T>>();
ref_ptr<TVector<T>> u = t->u.template cast_safe<TVector<T>>();
ref_ptr<TVector<T>> v = t->v.template cast_safe<TVector<T>>();
ref_ptr<TOpBinary<T, T, T>> op = t->op.template cast_safe<TOpBinary<T, T, T>>();

std::shared_ptr<CLProgram> program;
if (!ensure_kernel(op, program)) return Status::CompilationError;

r->validate_wd(FormatVector::AccDense);
u->validate_rw(FormatVector::AccDense);
v->validate_rw(FormatVector::AccDense);

auto* p_cl_r = r->template get<CLDenseVec<T>>();
const auto* p_cl_u = u->template get<CLDenseVec<T>>();
const auto* p_cl_v = v->template get<CLDenseVec<T>>();
auto* p_cl_acc = get_acc_cl();
auto& queue = p_cl_acc->get_queue_default();

const uint n = r->get_n_rows();

auto kernel = program->make_kernel("dense_to_dense");
kernel.setArg(0, p_cl_r->Ax);
kernel.setArg(1, p_cl_u->Ax);
kernel.setArg(2, p_cl_v->Ax);
kernel.setArg(3, n);
kernel.setArg(4, r->get_fill_value());

cl::NDRange global(p_cl_acc->get_default_wgs() * div_up_clamp(n, p_cl_acc->get_default_wgs(), 1u, 1024u));
cl::NDRange local(p_cl_acc->get_default_wgs());
queue.enqueueNDRangeKernel(kernel, cl::NullRange, global, local);

return Status::Ok;
}

bool ensure_kernel(const ref_ptr<TOpBinary<T, T, T>>& op, std::shared_ptr<CLProgram>& program) {
CLProgramBuilder program_builder;
program_builder
.set_name("vector_emult")
.add_type("TYPE", get_ttype<T>().template as<Type>())
.add_op("OP_BINARY", op.template as<OpBinary>())
.set_source(source_vector_emult)
.acquire();

program = program_builder.get_program();

return true;
}
};

}// namespace spla

#endif
23 changes: 23 additions & 0 deletions src/opencl/generated/auto_m_extract_row.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
////////////////////////////////////////////////////////////////////
// Copyright (c) 2021 - 2025 SparseLinearAlgebra
// Autogenerated file, do not modify
////////////////////////////////////////////////////////////////////

#pragma once

static const char source_m_extract_row[] = R"(


__kernel void extract_row(__global TYPE* g_rx,
__global const TYPE* g_Ax,
__global const uint* g_Aj,
const uint n) {
const uint gid = get_global_id(0);
const uint gsize = get_global_size(0);

for (uint i = gid; i < n; i += gsize) {
g_rx[g_Aj[i]] = OP_APPLY(g_Ax[i]);
}
}

)";
27 changes: 27 additions & 0 deletions src/opencl/generated/auto_vector_emult.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
////////////////////////////////////////////////////////////////////
// Copyright (c) 2021 - 2025 SparseLinearAlgebra
// Autogenerated file, do not modify
////////////////////////////////////////////////////////////////////

#pragma once

static const char source_vector_emult[] = R"(



__kernel void dense_to_dense(__global TYPE* g_rx,
__global const TYPE* g_ux,
__global const TYPE* g_vx,
const uint n,
const TYPE fill_value) {
const uint gid = get_global_id(0);
const uint gsize = get_global_size(0);

for (uint i = gid; i < n; i += gsize) {
TYPE u = g_ux[i];
TYPE v = g_vx[i];
g_rx[i] = u != fill_value && v != fill_value ? OP_BINARY(u, v) : fill_value;
}
}

)";
40 changes: 40 additions & 0 deletions src/opencl/kernels/m_extract_row.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/**********************************************************************************/
/* This file is part of spla project */
/* https://github.com/SparseLinearAlgebra/spla */
/**********************************************************************************/
/* MIT License */
/* */
/* Copyright (c) 2025 SparseLinearAlgebra */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining a copy */
/* of this software and associated documentation files (the "Software"), to deal */
/* in the Software without restriction, including without limitation the rights */
/* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */
/* copies of the Software, and to permit persons to whom the Software is */
/* furnished to do so, subject to the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be included in all */
/* copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */
/* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */
/* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */
/* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */
/* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */
/* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */
/* SOFTWARE. */
/**********************************************************************************/

#include "common_def.cl"

__kernel void extract_row(__global TYPE* g_rx,
__global const TYPE* g_Ax,
__global const uint* g_Aj,
const uint n) {
const uint gid = get_global_id(0);
const uint gsize = get_global_size(0);

for (uint i = gid; i < n; i += gsize) {
g_rx[g_Aj[i]] = OP_APPLY(g_Ax[i]);
}
}
Loading