Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/flydsl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ jobs:
docker exec flydsl_test bash -c "rm -rf /tmp/aiter && git clone --depth 1 --recursive --shallow-submodules https://github.com/ROCm/aiter.git /tmp/aiter"
docker exec flydsl_test bash -c "python3 -c \"from pathlib import Path; src = Path('/tmp/aiter/requirements.txt'); dst = Path('/tmp/aiter/requirements-flydsl-ci.txt'); lines = [line for line in src.read_text().splitlines() if line.strip() and not line.strip().startswith('flydsl==')]; dst.write_text('\\n'.join(lines) + '\\n')\" && python3 -m pip install -r /tmp/aiter/requirements-flydsl-ci.txt"
docker exec flydsl_test bash -c "python3 -m pip uninstall -y triton pytorch-triton pytorch-triton-rocm triton-rocm amd-triton || true"
docker exec flydsl_test bash -c "python3 -m pip install --extra-index-url https://pypi.amd.com/triton/rocm-7.2.0/simple/ 'triton==3.7.0+amd.rocm7.2.0.gitd1660454'"
docker exec flydsl_test bash -c "python3 -m pip install --extra-index-url https://pypi.amd.com/triton/rocm-7.2.0/simple/ 'triton==3.7.0+amd.rocm7.2.0.gitd0d77a509'"
docker exec flydsl_test bash -c "python3 -c 'import triton; assert tuple(map(int, triton.__version__.split(\"+\", 1)[0].split(\".\")[:3])) >= (3, 6, 0), triton.__version__; print(\"Installed triton\", triton.__version__)'"

- name: Run tests
Expand Down
50 changes: 34 additions & 16 deletions tests/kernels/test_vec_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@
pytest.skip("CUDA/ROCm not available. Skipping GPU benchmarks.", allow_module_level=True)


def _validate_vec_width(vec_width: int):
if vec_width <= 0 or (vec_width not in (1, 2, 4) and vec_width % 4 != 0):
raise ValueError("vec_width must be 1, 2, 4, or a positive multiple of 4")


@flyc.kernel
def vecAddKernel(
A: fx.Tensor,
Expand All @@ -37,6 +42,10 @@ def vecAddKernel(
tid = fx.thread_idx.x

tile_elems = block_dim * vec_width
# CDNA buffer load/store atoms are emitted as up to 128-bit operations.
# Wider per-thread vectors are handled as multiple 128-bit chunks.
copy_width = 4 if vec_width > 4 else vec_width
chunks_per_thread = vec_width // copy_width
Comment on lines +45 to +48

# Wrap in buffer-descriptor-backed tensors for AMD buffer load/store
A = fx.rocdl.make_buffer_tensor(A)
Expand All @@ -51,23 +60,25 @@ def vecAddKernel(
tB = fx.slice(tB, (None, bid))
tC = fx.slice(tC, (None, bid))

tA = fx.logical_divide(tA, fx.make_layout(vec_width, 1))
tB = fx.logical_divide(tB, fx.make_layout(vec_width, 1))
tC = fx.logical_divide(tC, fx.make_layout(vec_width, 1))
tA = fx.logical_divide(tA, fx.make_layout(copy_width, 1))
tB = fx.logical_divide(tB, fx.make_layout(copy_width, 1))
tC = fx.logical_divide(tC, fx.make_layout(copy_width, 1))

copyAtom = fx.make_copy_atom(fx.rocdl.BufferCopy128b(), fx.Float32)
copyAtom = fx.make_copy_atom(fx.rocdl.BufferCopy(copy_width * fx.Float32.width), fx.Float32)

rA = fx.make_rmem_tensor(vec_width, fx.Float32)
rB = fx.make_rmem_tensor(vec_width, fx.Float32)
rC = fx.make_rmem_tensor(vec_width, fx.Float32)
rA = fx.make_rmem_tensor(copy_width, fx.Float32)
rB = fx.make_rmem_tensor(copy_width, fx.Float32)
rC = fx.make_rmem_tensor(copy_width, fx.Float32)

fx.copy_atom_call(copyAtom, fx.slice(tA, (None, tid)), rA)
fx.copy_atom_call(copyAtom, fx.slice(tB, (None, tid)), rB)
for chunk in fx.range_constexpr(chunks_per_thread):
chunk_idx = chunk * block_dim + tid
fx.copy_atom_call(copyAtom, fx.slice(tA, (None, chunk_idx)), rA)
fx.copy_atom_call(copyAtom, fx.slice(tB, (None, chunk_idx)), rB)

vC = fx.arith.addf(fx.memref_load_vec(rA), fx.memref_load_vec(rB))
fx.memref_store_vec(vC, rC)
vC = fx.arith.addf(fx.memref_load_vec(rA), fx.memref_load_vec(rB))
fx.memref_store_vec(vC, rC)

fx.copy_atom_call(copyAtom, rC, fx.slice(tC, (None, tid)))
fx.copy_atom_call(copyAtom, rC, fx.slice(tC, (None, chunk_idx)))


@flyc.jit
Expand Down Expand Up @@ -118,13 +129,15 @@ def torch_launch():
}


def benchmark_vector_add(vec_width: int = 4):
def benchmark_vector_add(vec_width: int = 4, *, size_multiplier: int = 10000, run_benchmark: bool = True):
"""Benchmark vector addition kernel performance."""

_validate_vec_width(vec_width)

THREADS_PER_BLOCK = 256
VEC_WIDTH = vec_width
TILE_ELEMS = THREADS_PER_BLOCK * VEC_WIDTH
SIZE = TILE_ELEMS * 10000 # align to tile boundary
SIZE = TILE_ELEMS * size_multiplier # align to tile boundary

print("\n" + "=" * 80)
print("Benchmark: Vector Addition (C = A + B) - flydsl API")
Expand All @@ -148,6 +161,8 @@ def benchmark_vector_add(vec_width: int = 4):

error = checkAllclose(c_dev, a_dev + b_dev)
print(f" Correctness: max error = {error:.2e}")
if not run_benchmark:
return error < 1e-5

def kernel_launch():
vecAdd(tA, b_dev, c_dev, SIZE, SIZE, THREADS_PER_BLOCK, VEC_WIDTH, stream=stream)
Expand Down Expand Up @@ -178,13 +193,16 @@ def kernel_launch():
return error < 1e-5


def test_benchmark_vector_add():
@pytest.mark.parametrize("vec_width", [4, 8, 16])
def test_benchmark_vector_add(vec_width):
"""Pytest wrapper for vector addition benchmark."""
print("\n" + "=" * 80)
print("ROCm GPU Benchmark - Vector Addition with flydsl API")
print(f"GPU: {get_rocm_arch()}")
print("=" * 80)
assert benchmark_vector_add(), "Vector addition benchmark failed correctness check"
assert benchmark_vector_add(
vec_width=vec_width, size_multiplier=1024, run_benchmark=False
), "Vector addition benchmark failed correctness check"


if __name__ == "__main__":
Expand Down