Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 60 additions & 2 deletions devops/scripts/benchmarks/benches/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ def git_url(self) -> str:
return "https://github.com/intel/compute-benchmarks.git"

def git_hash(self) -> str:
# Feb 17, 2026
return "1ef6c0e6f3ca2e937f86a080594d268b1b895c16"
# Feb 20, 2026
return "27a3133d298a95af35e69173b06f6030ae33d742"

def setup(self) -> None:
if options.sycl is None:
Expand Down Expand Up @@ -351,6 +351,7 @@ def createTorchSingleQueueBench(variant_name: str, **kwargs):
"kernelName": "Add",
"kernelParamsNum": 5,
"kernelSubmitPattern": "Single",
"UseEvents": 0,
},
)

Expand Down Expand Up @@ -415,6 +416,7 @@ def createTorchMultiQueueBench(variant_name: str, **kwargs):
kernelsPerQueue=20,
useProfiling=0,
measureCompletion=measure_completion,
UseEvents=0,
),
createTorchMultiQueueBench(
"medium",
Expand All @@ -423,6 +425,7 @@ def createTorchMultiQueueBench(variant_name: str, **kwargs):
kernelsPerQueue=10,
useProfiling=0,
measureCompletion=measure_completion,
UseEvents=0,
),
createTorchMultiQueueBench(
"small",
Expand All @@ -431,6 +434,7 @@ def createTorchMultiQueueBench(variant_name: str, **kwargs):
kernelsPerQueue=4,
useProfiling=0,
measureCompletion=measure_completion,
UseEvents=0,
),
]

Expand Down Expand Up @@ -490,21 +494,25 @@ def createTorchMemoryReuseBench(variant_name: str, **kwargs):
"Int32Large",
kernelBatchSize=4096,
kernelDataType="Int32",
UseEvents=0,
),
createTorchMemoryReuseBench(
"Int32Medium",
kernelBatchSize=512,
kernelDataType="Int32",
UseEvents=0,
),
createTorchMemoryReuseBench(
"FloatLarge",
kernelBatchSize=4096,
kernelDataType="Float",
UseEvents=0,
),
createTorchMemoryReuseBench(
"FloatMedium",
kernelBatchSize=512,
kernelDataType="Float",
UseEvents=0,
),
]

Expand Down Expand Up @@ -573,6 +581,7 @@ def createTorchGraphSingleQueueBench(variant_name: str, **kwargs):
kernelGroupsCount=10,
kernelBatchSize=10,
useProfiling=0,
UseEvents=0,
),
createTorchGraphSingleQueueBench(
"medium",
Expand All @@ -582,6 +591,7 @@ def createTorchGraphSingleQueueBench(variant_name: str, **kwargs):
kernelGroupsCount=32,
kernelBatchSize=32,
useProfiling=0,
UseEvents=0,
),
createTorchGraphSingleQueueBench(
"large",
Expand All @@ -591,6 +601,41 @@ def createTorchGraphSingleQueueBench(variant_name: str, **kwargs):
kernelGroupsCount=64,
kernelBatchSize=64,
useProfiling=0,
UseEvents=0,
),
]

# Add TorchGraphMultiQueue benchmarks
# Sweep every runtime except UR, crossed with every profiler type.
# (UR is excluded — presumably unsupported for this torch bench; confirm.)
for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES):
    for profiler_type in list(PROFILERS):

        def createTorchGraphMultiQueueBench(variant_name: str, **kwargs):
            # Build one TorchGraphMultiQueue variant. The launch geometry
            # and event/profiling switches are fixed for all variants and
            # are merged AFTER **kwargs, so they override any caller value.
            # NOTE(review): sibling torch benches spell these options
            # kernelWGCount / kernelWGSize / useProfiling, while this one
            # uses workgroupCount / workgroupSize / Profiling — verify the
            # names match what the compute-benchmarks binary expects.
            return TorchGraphMultiQueue(
                self,
                runtime,
                variant_name,
                profiler_type,
                **{
                    **kwargs,
                    "workgroupCount": 512,
                    "workgroupSize": 256,
                    "Profiling": 0,
                    "UseEvents": 0,
                },
            )

        # Three size variants differing only in kernels submitted per queue.
        benches += [
            createTorchGraphMultiQueueBench(
                "small",
                kernelsPerQueue=10,
            ),
            createTorchGraphMultiQueueBench(
                "medium",
                kernelsPerQueue=32,
            ),
            createTorchGraphMultiQueueBench(
                "large",
                kernelsPerQueue=64,
            ),
        ]

Expand Down Expand Up @@ -1215,6 +1260,19 @@ def __init__(
)


class TorchGraphMultiQueue(TorchBenchmark):
    """Torch benchmark wrapping the ``KernelSubmitGraphMultiQueue`` test.

    A thin variant class: it only pins the underlying compute-benchmarks
    test name and forwards everything else to :class:`TorchBenchmark`.
    """

    def __init__(
        self, suite, runtime: RUNTIMES, variant_name: str, profiler_type, **kwargs
    ):
        # Forward all arguments unchanged, inserting the fixed test name.
        super().__init__(
            suite, runtime, "KernelSubmitGraphMultiQueue", variant_name,
            profiler_type, **kwargs,
        )

class QueueInOrderMemcpy(ComputeBenchmark):
def __init__(self, bench, isCopyOnly, source, destination, size, profiler_type):
self._is_copy_only = isCopyOnly
Expand Down
45 changes: 30 additions & 15 deletions devops/scripts/benchmarks/tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,24 +212,24 @@ def test_submit_kernel(self):

def test_torch_l0(self):
self._checkCase(
"torch_benchmark_l0 KernelSubmitSingleQueue kernelBatchSize 512, kernelDataType Int32, kernelName Add, kernelParamsNum 5, kernelSubmitPattern Single, kernelWGCount 4096, kernelWGSize 512",
"torch_benchmark_l0 KernelSubmitSingleQueue UseEvents 0, kernelBatchSize 512, kernelDataType Int32, kernelName Add, kernelParamsNum 5, kernelSubmitPattern Single, kernelWGCount 4096, kernelWGSize 512",
"KernelSubmitSingleQueue Int32Large",
{"pytorch", "L0"},
"--test=KernelSubmitSingleQueue.*--profilerType=timer",
)
self._checkCase(
"torch_benchmark_l0 KernelSubmitSingleQueue kernelBatchSize 512, kernelDataType Int32, kernelName Add, kernelParamsNum 5, kernelSubmitPattern Single, kernelWGCount 4096, kernelWGSize 512 CPU count",
"torch_benchmark_l0 KernelSubmitSingleQueue UseEvents 0, kernelBatchSize 512, kernelDataType Int32, kernelName Add, kernelParamsNum 5, kernelSubmitPattern Single, kernelWGCount 4096, kernelWGSize 512 CPU count",
"KernelSubmitSingleQueue Int32Large, CPU count",
{"pytorch", "L0"},
"--test=KernelSubmitSingleQueue.*--profilerType=cpuCounter",
)
self._checkCase(
"torch_benchmark_l0 KernelSubmitMultiQueue kernelWGCount 4096, kernelWGSize 512, kernelsPerQueue 20, measureCompletion 0, useProfiling 0",
"torch_benchmark_l0 KernelSubmitMultiQueue UseEvents 0, kernelWGCount 4096, kernelWGSize 512, kernelsPerQueue 20, measureCompletion 0, useProfiling 0",
"KernelSubmitMultiQueue large",
{"pytorch", "L0"},
)
self._checkCase(
"torch_benchmark_l0 KernelSubmitMultiQueue kernelWGCount 4096, kernelWGSize 512, kernelsPerQueue 20, measureCompletion 1, useProfiling 0 CPU count",
"torch_benchmark_l0 KernelSubmitMultiQueue UseEvents 0, kernelWGCount 4096, kernelWGSize 512, kernelsPerQueue 20, measureCompletion 1, useProfiling 0 CPU count",
"KernelSubmitMultiQueue large with measure completion, CPU count",
{"pytorch", "L0"},
)
Expand All @@ -244,24 +244,29 @@ def test_torch_l0(self):
{"pytorch", "L0"},
)
self._checkCase(
"torch_benchmark_l0 KernelSubmitMemoryReuse kernelBatchSize 4096, kernelDataType Int32",
"torch_benchmark_l0 KernelSubmitMemoryReuse UseEvents 0, kernelBatchSize 4096, kernelDataType Int32",
"KernelSubmitMemoryReuse Int32Large",
{"pytorch", "L0"},
)
self._checkCase(
"torch_benchmark_l0 KernelSubmitGraphSingleQueue kernelBatchSize 10, kernelGroupsCount 10, kernelName Add, kernelWGCount 512, kernelWGSize 256, useProfiling 0 CPU count",
"torch_benchmark_l0 KernelSubmitGraphSingleQueue UseEvents 0, kernelBatchSize 10, kernelGroupsCount 10, kernelName Add, kernelWGCount 512, kernelWGSize 256, useProfiling 0 CPU count",
"KernelSubmitGraphSingleQueue small, CPU count",
{"pytorch", "L0"},
)
self._checkCase(
"torch_benchmark_l0 KernelSubmitGraphMultiQueue Profiling 0, UseEvents 0, kernelsPerQueue 64, workgroupCount 512, workgroupSize 256 CPU count",
"KernelSubmitGraphMultiQueue large, CPU count",
{"pytorch", "L0"},
)

def test_torch_sycl(self):
self._checkCase(
"torch_benchmark_sycl KernelSubmitSingleQueue kernelBatchSize 512, kernelDataType Mixed, kernelName Add, kernelParamsNum 5, kernelSubmitPattern Single, kernelWGCount 512, kernelWGSize 256",
"torch_benchmark_sycl KernelSubmitSingleQueue UseEvents 0, kernelBatchSize 512, kernelDataType Mixed, kernelName Add, kernelParamsNum 5, kernelSubmitPattern Single, kernelWGCount 512, kernelWGSize 256",
"KernelSubmitSingleQueue MixedMedium",
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_sycl KernelSubmitMultiQueue kernelWGCount 512, kernelWGSize 256, kernelsPerQueue 10, measureCompletion 1, useProfiling 0",
"torch_benchmark_sycl KernelSubmitMultiQueue UseEvents 0, kernelWGCount 512, kernelWGSize 256, kernelsPerQueue 10, measureCompletion 1, useProfiling 0",
"KernelSubmitMultiQueue medium with measure completion",
{"pytorch", "SYCL"},
)
Expand All @@ -281,24 +286,29 @@ def test_torch_sycl(self):
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_sycl KernelSubmitMemoryReuse kernelBatchSize 4096, kernelDataType Float",
"torch_benchmark_sycl KernelSubmitMemoryReuse UseEvents 0, kernelBatchSize 4096, kernelDataType Float",
"KernelSubmitMemoryReuse FloatLarge",
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_sycl KernelSubmitGraphSingleQueue kernelBatchSize 32, kernelGroupsCount 32, kernelName Add, kernelWGCount 512, kernelWGSize 256, useProfiling 0",
"torch_benchmark_sycl KernelSubmitGraphSingleQueue UseEvents 0, kernelBatchSize 32, kernelGroupsCount 32, kernelName Add, kernelWGCount 512, kernelWGSize 256, useProfiling 0",
"KernelSubmitGraphSingleQueue medium",
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_sycl KernelSubmitGraphMultiQueue Profiling 0, UseEvents 0, kernelsPerQueue 32, workgroupCount 512, workgroupSize 256 CPU count",
"KernelSubmitGraphMultiQueue medium, CPU count",
{"pytorch", "SYCL"},
)

def test_torch_syclpreview(self):
self._checkCase(
"torch_benchmark_syclpreview KernelSubmitSingleQueue kernelBatchSize 512, kernelDataType Mixed, kernelName Add, kernelParamsNum 5, kernelSubmitPattern Single, kernelWGCount 256, kernelWGSize 128",
"torch_benchmark_syclpreview KernelSubmitSingleQueue UseEvents 0, kernelBatchSize 512, kernelDataType Mixed, kernelName Add, kernelParamsNum 5, kernelSubmitPattern Single, kernelWGCount 256, kernelWGSize 128",
"KernelSubmitSingleQueue MixedSmall",
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_syclpreview KernelSubmitMultiQueue kernelWGCount 256, kernelWGSize 128, kernelsPerQueue 4, measureCompletion 1, useProfiling 0",
"torch_benchmark_syclpreview KernelSubmitMultiQueue UseEvents 0, kernelWGCount 256, kernelWGSize 128, kernelsPerQueue 4, measureCompletion 1, useProfiling 0",
"KernelSubmitMultiQueue small with measure completion",
{"pytorch", "SYCL"},
)
Expand All @@ -318,20 +328,25 @@ def test_torch_syclpreview(self):
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_syclpreview KernelSubmitMemoryReuse kernelBatchSize 512, kernelDataType Float",
"torch_benchmark_syclpreview KernelSubmitMemoryReuse UseEvents 0, kernelBatchSize 512, kernelDataType Float",
"KernelSubmitMemoryReuse FloatMedium",
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_syclpreview KernelSubmitMemoryReuse kernelBatchSize 512, kernelDataType Float CPU count",
"torch_benchmark_syclpreview KernelSubmitMemoryReuse UseEvents 0, kernelBatchSize 512, kernelDataType Float CPU count",
"KernelSubmitMemoryReuse FloatMedium, CPU count",
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_syclpreview KernelSubmitGraphSingleQueue kernelBatchSize 64, kernelGroupsCount 64, kernelName Add, kernelWGCount 512, kernelWGSize 256, useProfiling 0",
"torch_benchmark_syclpreview KernelSubmitGraphSingleQueue UseEvents 0, kernelBatchSize 64, kernelGroupsCount 64, kernelName Add, kernelWGCount 512, kernelWGSize 256, useProfiling 0",
"KernelSubmitGraphSingleQueue large",
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_syclpreview KernelSubmitGraphMultiQueue Profiling 0, UseEvents 0, kernelsPerQueue 10, workgroupCount 512, workgroupSize 256",
"KernelSubmitGraphMultiQueue small",
{"pytorch", "SYCL"},
)


if __name__ == "__main__":
Expand Down