Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 60 additions & 2 deletions devops/scripts/benchmarks/benches/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ def git_url(self) -> str:
return "https://github.com/intel/compute-benchmarks.git"

def git_hash(self) -> str:
# Feb 17, 2026
return "1ef6c0e6f3ca2e937f86a080594d268b1b895c16"
# Feb 20, 2026
return "27a3133d298a95af35e69173b06f6030ae33d742"

def setup(self) -> None:
if options.sycl is None:
Expand Down Expand Up @@ -351,6 +351,7 @@ def createTorchSingleQueueBench(variant_name: str, **kwargs):
"kernelName": "Add",
"kernelParamsNum": 5,
"kernelSubmitPattern": "Single",
"UseEvents": 0,
},
)

Expand Down Expand Up @@ -415,6 +416,7 @@ def createTorchMultiQueueBench(variant_name: str, **kwargs):
kernelsPerQueue=20,
useProfiling=0,
measureCompletion=measure_completion,
UseEvents=0,
),
createTorchMultiQueueBench(
"medium",
Expand All @@ -423,6 +425,7 @@ def createTorchMultiQueueBench(variant_name: str, **kwargs):
kernelsPerQueue=10,
useProfiling=0,
measureCompletion=measure_completion,
UseEvents=0,
),
createTorchMultiQueueBench(
"small",
Expand All @@ -431,6 +434,7 @@ def createTorchMultiQueueBench(variant_name: str, **kwargs):
kernelsPerQueue=4,
useProfiling=0,
measureCompletion=measure_completion,
UseEvents=0,
),
]

Expand Down Expand Up @@ -490,21 +494,25 @@ def createTorchMemoryReuseBench(variant_name: str, **kwargs):
"Int32Large",
kernelBatchSize=4096,
kernelDataType="Int32",
UseEvents=0,
),
createTorchMemoryReuseBench(
"Int32Medium",
kernelBatchSize=512,
kernelDataType="Int32",
UseEvents=0,
),
createTorchMemoryReuseBench(
"FloatLarge",
kernelBatchSize=4096,
kernelDataType="Float",
UseEvents=0,
),
createTorchMemoryReuseBench(
"FloatMedium",
kernelBatchSize=512,
kernelDataType="Float",
UseEvents=0,
),
]

Expand Down Expand Up @@ -573,6 +581,7 @@ def createTorchGraphSingleQueueBench(variant_name: str, **kwargs):
kernelGroupsCount=10,
kernelBatchSize=10,
useProfiling=0,
UseEvents=0,
),
createTorchGraphSingleQueueBench(
"medium",
Expand All @@ -582,6 +591,7 @@ def createTorchGraphSingleQueueBench(variant_name: str, **kwargs):
kernelGroupsCount=32,
kernelBatchSize=32,
useProfiling=0,
UseEvents=0,
),
createTorchGraphSingleQueueBench(
"large",
Expand All @@ -591,6 +601,41 @@ def createTorchGraphSingleQueueBench(variant_name: str, **kwargs):
kernelGroupsCount=64,
kernelBatchSize=64,
useProfiling=0,
UseEvents=0,
),
]

# Add TorchGraphMultiQueue benchmarks
# Sweep every runtime except UR, crossed with every profiler type.
# (UR is excluded — presumably unsupported for this torch bench; confirm.)
for runtime in filter(lambda x: x != RUNTIMES.UR, RUNTIMES):
    for profiler_type in list(PROFILERS):

        def createTorchGraphMultiQueueBench(variant_name: str, **kwargs):
            # Build one TorchGraphMultiQueue variant. The launch geometry
            # and event/profiling switches are fixed for all variants and
            # are merged AFTER **kwargs, so they override any caller value.
            # NOTE(review): sibling torch benches spell these options
            # kernelWGCount / kernelWGSize / useProfiling, while this one
            # uses workgroupCount / workgroupSize / Profiling — verify the
            # names match what the compute-benchmarks binary expects.
            return TorchGraphMultiQueue(
                self,
                runtime,
                variant_name,
                profiler_type,
                **{
                    **kwargs,
                    "workgroupCount": 512,
                    "workgroupSize": 256,
                    "Profiling": 0,
                    "UseEvents": 0,
                },
            )

        # Three size variants differing only in kernels submitted per queue.
        benches += [
            createTorchGraphMultiQueueBench(
                "small",
                kernelsPerQueue=10,
            ),
            createTorchGraphMultiQueueBench(
                "medium",
                kernelsPerQueue=32,
            ),
            createTorchGraphMultiQueueBench(
                "large",
                kernelsPerQueue=64,
            ),
        ]

Expand Down Expand Up @@ -1215,6 +1260,19 @@ def __init__(
)


class TorchGraphMultiQueue(TorchBenchmark):
    """Torch benchmark wrapping the ``KernelSubmitGraphMultiQueue`` test.

    A thin variant class: it only pins the underlying compute-benchmarks
    test name and forwards everything else to :class:`TorchBenchmark`.
    """

    def __init__(
        self, suite, runtime: RUNTIMES, variant_name: str, profiler_type, **kwargs
    ):
        # Forward all arguments unchanged, inserting the fixed test name.
        super().__init__(
            suite, runtime, "KernelSubmitGraphMultiQueue", variant_name,
            profiler_type, **kwargs,
        )

class QueueInOrderMemcpy(ComputeBenchmark):
def __init__(self, bench, isCopyOnly, source, destination, size, profiler_type):
self._is_copy_only = isCopyOnly
Expand Down
45 changes: 30 additions & 15 deletions devops/scripts/benchmarks/tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,24 +212,24 @@ def test_submit_kernel(self):

def test_torch_l0(self):
self._checkCase(
"torch_benchmark_l0 KernelSubmitSingleQueue kernelBatchSize 512, kernelDataType Int32, kernelName Add, kernelParamsNum 5, kernelSubmitPattern Single, kernelWGCount 4096, kernelWGSize 512",
"torch_benchmark_l0 KernelSubmitSingleQueue UseEvents 0, kernelBatchSize 512, kernelDataType Int32, kernelName Add, kernelParamsNum 5, kernelSubmitPattern Single, kernelWGCount 4096, kernelWGSize 512",
"KernelSubmitSingleQueue Int32Large",
{"pytorch", "L0"},
"--test=KernelSubmitSingleQueue.*--profilerType=timer",
)
self._checkCase(
"torch_benchmark_l0 KernelSubmitSingleQueue kernelBatchSize 512, kernelDataType Int32, kernelName Add, kernelParamsNum 5, kernelSubmitPattern Single, kernelWGCount 4096, kernelWGSize 512 CPU count",
"torch_benchmark_l0 KernelSubmitSingleQueue UseEvents 0, kernelBatchSize 512, kernelDataType Int32, kernelName Add, kernelParamsNum 5, kernelSubmitPattern Single, kernelWGCount 4096, kernelWGSize 512 CPU count",
"KernelSubmitSingleQueue Int32Large, CPU count",
{"pytorch", "L0"},
"--test=KernelSubmitSingleQueue.*--profilerType=cpuCounter",
)
self._checkCase(
"torch_benchmark_l0 KernelSubmitMultiQueue kernelWGCount 4096, kernelWGSize 512, kernelsPerQueue 20, measureCompletion 0, useProfiling 0",
"torch_benchmark_l0 KernelSubmitMultiQueue UseEvents 0, kernelWGCount 4096, kernelWGSize 512, kernelsPerQueue 20, measureCompletion 0, useProfiling 0",
"KernelSubmitMultiQueue large",
{"pytorch", "L0"},
)
self._checkCase(
"torch_benchmark_l0 KernelSubmitMultiQueue kernelWGCount 4096, kernelWGSize 512, kernelsPerQueue 20, measureCompletion 1, useProfiling 0 CPU count",
"torch_benchmark_l0 KernelSubmitMultiQueue UseEvents 0, kernelWGCount 4096, kernelWGSize 512, kernelsPerQueue 20, measureCompletion 1, useProfiling 0 CPU count",
"KernelSubmitMultiQueue large with measure completion, CPU count",
{"pytorch", "L0"},
)
Expand All @@ -244,24 +244,29 @@ def test_torch_l0(self):
{"pytorch", "L0"},
)
self._checkCase(
"torch_benchmark_l0 KernelSubmitMemoryReuse kernelBatchSize 4096, kernelDataType Int32",
"torch_benchmark_l0 KernelSubmitMemoryReuse UseEvents 0, kernelBatchSize 4096, kernelDataType Int32",
"KernelSubmitMemoryReuse Int32Large",
{"pytorch", "L0"},
)
self._checkCase(
"torch_benchmark_l0 KernelSubmitGraphSingleQueue kernelBatchSize 10, kernelGroupsCount 10, kernelName Add, kernelWGCount 512, kernelWGSize 256, useProfiling 0 CPU count",
"torch_benchmark_l0 KernelSubmitGraphSingleQueue UseEvents 0, kernelBatchSize 10, kernelGroupsCount 10, kernelName Add, kernelWGCount 512, kernelWGSize 256, useProfiling 0 CPU count",
"KernelSubmitGraphSingleQueue small, CPU count",
{"pytorch", "L0"},
)
self._checkCase(
"torch_benchmark_l0 KernelSubmitGraphMultiQueue Profiling 0, UseEvents 0, kernelsPerQueue 64, workgroupCount 512, workgroupSize 256 CPU count",
"KernelSubmitGraphMultiQueue large, CPU count",
{"pytorch", "L0"},
)

def test_torch_sycl(self):
self._checkCase(
"torch_benchmark_sycl KernelSubmitSingleQueue kernelBatchSize 512, kernelDataType Mixed, kernelName Add, kernelParamsNum 5, kernelSubmitPattern Single, kernelWGCount 512, kernelWGSize 256",
"torch_benchmark_sycl KernelSubmitSingleQueue UseEvents 0, kernelBatchSize 512, kernelDataType Mixed, kernelName Add, kernelParamsNum 5, kernelSubmitPattern Single, kernelWGCount 512, kernelWGSize 256",
"KernelSubmitSingleQueue MixedMedium",
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_sycl KernelSubmitMultiQueue kernelWGCount 512, kernelWGSize 256, kernelsPerQueue 10, measureCompletion 1, useProfiling 0",
"torch_benchmark_sycl KernelSubmitMultiQueue UseEvents 0, kernelWGCount 512, kernelWGSize 256, kernelsPerQueue 10, measureCompletion 1, useProfiling 0",
"KernelSubmitMultiQueue medium with measure completion",
{"pytorch", "SYCL"},
)
Expand All @@ -281,24 +286,29 @@ def test_torch_sycl(self):
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_sycl KernelSubmitMemoryReuse kernelBatchSize 4096, kernelDataType Float",
"torch_benchmark_sycl KernelSubmitMemoryReuse UseEvents 0, kernelBatchSize 4096, kernelDataType Float",
"KernelSubmitMemoryReuse FloatLarge",
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_sycl KernelSubmitGraphSingleQueue kernelBatchSize 32, kernelGroupsCount 32, kernelName Add, kernelWGCount 512, kernelWGSize 256, useProfiling 0",
"torch_benchmark_sycl KernelSubmitGraphSingleQueue UseEvents 0, kernelBatchSize 32, kernelGroupsCount 32, kernelName Add, kernelWGCount 512, kernelWGSize 256, useProfiling 0",
"KernelSubmitGraphSingleQueue medium",
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_sycl KernelSubmitGraphMultiQueue Profiling 0, UseEvents 0, kernelsPerQueue 32, workgroupCount 512, workgroupSize 256 CPU count",
"KernelSubmitGraphMultiQueue medium, CPU count",
{"pytorch", "SYCL"},
)

def test_torch_syclpreview(self):
self._checkCase(
"torch_benchmark_syclpreview KernelSubmitSingleQueue kernelBatchSize 512, kernelDataType Mixed, kernelName Add, kernelParamsNum 5, kernelSubmitPattern Single, kernelWGCount 256, kernelWGSize 128",
"torch_benchmark_syclpreview KernelSubmitSingleQueue UseEvents 0, kernelBatchSize 512, kernelDataType Mixed, kernelName Add, kernelParamsNum 5, kernelSubmitPattern Single, kernelWGCount 256, kernelWGSize 128",
"KernelSubmitSingleQueue MixedSmall",
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_syclpreview KernelSubmitMultiQueue kernelWGCount 256, kernelWGSize 128, kernelsPerQueue 4, measureCompletion 1, useProfiling 0",
"torch_benchmark_syclpreview KernelSubmitMultiQueue UseEvents 0, kernelWGCount 256, kernelWGSize 128, kernelsPerQueue 4, measureCompletion 1, useProfiling 0",
"KernelSubmitMultiQueue small with measure completion",
{"pytorch", "SYCL"},
)
Expand All @@ -318,20 +328,25 @@ def test_torch_syclpreview(self):
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_syclpreview KernelSubmitMemoryReuse kernelBatchSize 512, kernelDataType Float",
"torch_benchmark_syclpreview KernelSubmitMemoryReuse UseEvents 0, kernelBatchSize 512, kernelDataType Float",
"KernelSubmitMemoryReuse FloatMedium",
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_syclpreview KernelSubmitMemoryReuse kernelBatchSize 512, kernelDataType Float CPU count",
"torch_benchmark_syclpreview KernelSubmitMemoryReuse UseEvents 0, kernelBatchSize 512, kernelDataType Float CPU count",
"KernelSubmitMemoryReuse FloatMedium, CPU count",
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_syclpreview KernelSubmitGraphSingleQueue kernelBatchSize 64, kernelGroupsCount 64, kernelName Add, kernelWGCount 512, kernelWGSize 256, useProfiling 0",
"torch_benchmark_syclpreview KernelSubmitGraphSingleQueue UseEvents 0, kernelBatchSize 64, kernelGroupsCount 64, kernelName Add, kernelWGCount 512, kernelWGSize 256, useProfiling 0",
"KernelSubmitGraphSingleQueue large",
{"pytorch", "SYCL"},
)
self._checkCase(
"torch_benchmark_syclpreview KernelSubmitGraphMultiQueue Profiling 0, UseEvents 0, kernelsPerQueue 10, workgroupCount 512, workgroupSize 256",
"KernelSubmitGraphMultiQueue small",
{"pytorch", "SYCL"},
)


if __name__ == "__main__":
Expand Down