Skip to content

KernelIntrinsics#403

Open
christiangnrd wants to merge 6 commits into JuliaGPU:master from
christiangnrd:intrinsics
Open

KernelIntrinsics#403
christiangnrd wants to merge 6 commits into JuliaGPU:master from
christiangnrd:intrinsics

Conversation

@christiangnrd
Copy link
Member

No description provided.

@github-actions
Copy link
Contributor

github-actions bot commented Nov 18, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic master) to apply these changes.

Click here to view the suggested changes.
diff --git a/lib/cl/device.jl b/lib/cl/device.jl
index 64bb1f9..cdfa4d5 100644
--- a/lib/cl/device.jl
+++ b/lib/cl/device.jl
@@ -315,7 +315,7 @@ function sub_group_size(d::Device)
     end
 end
 function sub_group_shuffle_supported_types(d::Device)
-    if "cl_khr_subgroup_shuffle" in d.extensions
+    return if "cl_khr_subgroup_shuffle" in d.extensions
         res = [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Float32]
         "cl_khr_fp16" in d.extensions && push!(res, Float16)
         "cl_khr_fp64" in d.extensions && push!(res, Float64)
diff --git a/lib/intrinsics/src/synchronization.jl b/lib/intrinsics/src/synchronization.jl
index 7d88b69..c8abe6d 100644
--- a/lib/intrinsics/src/synchronization.jl
+++ b/lib/intrinsics/src/synchronization.jl
@@ -156,7 +156,9 @@ export barrier, work_group_barrier, sub_group_barrier
                     MemorySemantics.SequentiallyConsistent | mem_fence_flags_to_semantics(flags))
 
 @inline sub_group_barrier(flags, scope = memory_scope_sub_group) =
-    control_barrier(Scope.Subgroup, cl_scope_to_spirv(scope),
-                    MemorySemantics.SequentiallyConsistent | mem_fence_flags_to_semantics(flags))
+    control_barrier(
+    Scope.Subgroup, cl_scope_to_spirv(scope),
+    MemorySemantics.SequentiallyConsistent | mem_fence_flags_to_semantics(flags)
+)
 
 barrier(flags) = work_group_barrier(flags)
diff --git a/src/OpenCLKernels.jl b/src/OpenCLKernels.jl
index 626b7fa..4dcfbdc 100644
--- a/src/OpenCLKernels.jl
+++ b/src/OpenCLKernels.jl
@@ -129,9 +129,9 @@ end
 
 KI.argconvert(::OpenCLBackend, arg) = kernel_convert(arg)
 
-function KI.kernel_function(::OpenCLBackend, f::F, tt::TT=Tuple{}; name = nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::OpenCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
     kern = clfunction(f, tt; name, kwargs...)
-    KI.Kernel{OpenCLBackend, typeof(kern)}(OpenCLBackend(), kern)
+    return KI.Kernel{OpenCLBackend, typeof(kern)}(OpenCLBackend(), kern)
 end
 
 function (obj::KI.Kernel{OpenCLBackend})(args...; numworkgroups = 1, workgroupsize = 1)
@@ -147,15 +147,15 @@ function (obj::KI.Kernel{OpenCLBackend})(args...; numworkgroups = 1, workgroupsi
 end
 
 
-function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:OpenCLBackend}; max_work_items::Int=typemax(Int))::Int
+function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:OpenCLBackend}; max_work_items::Int = typemax(Int))::Int
     wginfo = cl.work_group_info(kernel.kern.fun, cl.device())
-    Int(min(wginfo.size, max_work_items))
+    return Int(min(wginfo.size, max_work_items))
 end
 function KI.max_work_group_size(::OpenCLBackend)::Int
-    Int(cl.device().max_work_group_size)
+    return Int(cl.device().max_work_group_size)
 end
 function KI.multiprocessor_count(::OpenCLBackend)::Int
-    Int(cl.device().max_compute_units)
+    return Int(cl.device().max_compute_units)
 end
 
 ## Indexing Functions
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index 407faec..bac459c 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -166,82 +166,82 @@ end
     @test call_on_device(OpenCL.mad, x, y, z) ≈ x * y + z
 end
 
-if cl.sub_groups_supported(cl.device())
-
-struct SubgroupData
-    sub_group_size::UInt32
-    max_sub_group_size::UInt32
-    num_sub_groups::UInt32
-    sub_group_id::UInt32
-    sub_group_local_id::UInt32
-end
-function test_subgroup_kernel(results)
-    i = get_global_id(1)
-
-    if i <= length(results)
-        @inbounds results[i] = SubgroupData(
-            get_sub_group_size(),
-            get_max_sub_group_size(),
-            get_num_sub_groups(),
-            get_sub_group_id(),
-            get_sub_group_local_id()
-        )
-    end
-    return
-end
-
-@testset "Sub-groups" begin
-    sg_size = cl.sub_group_size(cl.device())
-
-    @testset "Indexing intrinsics" begin
-        # Test with small kernel
-        sg_n = 2
-        local_size = sg_size * sg_n
-        numworkgroups = 2
-        N = local_size * numworkgroups
-
-        results = CLVector{SubgroupData}(undef, N)
-        kernel = @opencl launch = false test_subgroup_kernel(results)
-
-        kernel(results; local_size, global_size=N)
-
-        host_results = Array(results)
-
-        # Verify results make sense
-        for (i, sg_data) in enumerate(host_results)
-            @test sg_data.sub_group_size == sg_size
-            @test sg_data.max_sub_group_size == sg_size
-            @test sg_data.num_sub_groups == sg_n
-
-            # Group ID should be 1-based
-            expected_sub_group = div(((i - 1) % local_size), sg_size) + 1
-            @test sg_data.sub_group_id == expected_sub_group
-
-            # Local ID should be 1-based within group
-            expected_sg_local = ((i - 1) % sg_size) + 1
-            @test sg_data.sub_group_local_id == expected_sg_local
-        end
-    end
-
-    @testset "shuffle idx" begin
-        function shfl_idx_kernel(d)
-            i = get_sub_group_local_id()
-            j = get_sub_group_size() - i + 1
-
-            d[i] = sub_group_shuffle(d[i], j)
-
-            return
-        end
-
-        @testset for T in cl.sub_group_shuffle_supported_types(cl.device())
-            a = rand(T, sg_size)
-            d_a = CLArray(a)
-            @opencl local_size = sg_size global_size = sg_size shfl_idx_kernel(d_a)
-            @test Array(d_a) == reverse(a)
-        end
-    end
-end
-end # if cl.sub_groups_supported(cl.device())
+        if cl.sub_groups_supported(cl.device())
+
+            struct SubgroupData
+                sub_group_size::UInt32
+                max_sub_group_size::UInt32
+                num_sub_groups::UInt32
+                sub_group_id::UInt32
+                sub_group_local_id::UInt32
+            end
+            function test_subgroup_kernel(results)
+                i = get_global_id(1)
+
+                if i <= length(results)
+                    @inbounds results[i] = SubgroupData(
+                        get_sub_group_size(),
+                        get_max_sub_group_size(),
+                        get_num_sub_groups(),
+                        get_sub_group_id(),
+                        get_sub_group_local_id()
+                    )
+                end
+                return
+            end
+
+            @testset "Sub-groups" begin
+                sg_size = cl.sub_group_size(cl.device())
+
+                @testset "Indexing intrinsics" begin
+                    # Test with small kernel
+                    sg_n = 2
+                    local_size = sg_size * sg_n
+                    numworkgroups = 2
+                    N = local_size * numworkgroups
+
+                    results = CLVector{SubgroupData}(undef, N)
+                    kernel = @opencl launch = false test_subgroup_kernel(results)
+
+                    kernel(results; local_size, global_size = N)
+
+                    host_results = Array(results)
+
+                    # Verify results make sense
+                    for (i, sg_data) in enumerate(host_results)
+                        @test sg_data.sub_group_size == sg_size
+                        @test sg_data.max_sub_group_size == sg_size
+                        @test sg_data.num_sub_groups == sg_n
+
+                        # Group ID should be 1-based
+                        expected_sub_group = div(((i - 1) % local_size), sg_size) + 1
+                        @test sg_data.sub_group_id == expected_sub_group
+
+                        # Local ID should be 1-based within group
+                        expected_sg_local = ((i - 1) % sg_size) + 1
+                        @test sg_data.sub_group_local_id == expected_sg_local
+                    end
+                end
+
+                @testset "shuffle idx" begin
+                    function shfl_idx_kernel(d)
+                        i = get_sub_group_local_id()
+                        j = get_sub_group_size() - i + 1
+
+                        d[i] = sub_group_shuffle(d[i], j)
+
+                        return
+                    end
+
+                    @testset for T in cl.sub_group_shuffle_supported_types(cl.device())
+                        a = rand(T, sg_size)
+                        d_a = CLArray(a)
+                        @opencl local_size = sg_size global_size = sg_size shfl_idx_kernel(d_a)
+                        @test Array(d_a) == reverse(a)
+                    end
+                end
+            end
+        end # if cl.sub_groups_supported(cl.device())
 
 @testset "SIMD - $N x $T" for N in simd_ns, T in float_types
     # codegen emits i48 here, which SPIR-V doesn't support

@codecov
Copy link

codecov bot commented Nov 18, 2025

Codecov Report

❌ Patch coverage is 90.00000% with 3 lines in your changes missing coverage. Please review.
✅ Project coverage is 82.46%. Comparing base (a1f79db) to head (de7c136).

Files with missing lines Patch % Lines
src/OpenCLKernels.jl 88.88% 2 Missing ⚠️
src/compiler/compilation.jl 91.66% 1 Missing ⚠️
Additional details and impacted files
@@            Coverage Diff             @@
##           master     #403      +/-   ##
==========================================
+ Coverage   80.84%   82.46%   +1.62%     
==========================================
  Files          12       11       -1     
  Lines         736      730       -6     
==========================================
+ Hits          595      602       +7     
+ Misses        141      128      -13     

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant