ROCm · jhinpan · May 26, 2026 · May 28, 2026 · May 28, 2026 · May 29, 2026
diff --git a/.github/workflows/flydsl.yaml b/.github/workflows/flydsl.yaml
@@ -162,7 +162,7 @@ jobs:
           docker exec flydsl_test bash -c "rm -rf /tmp/aiter && git clone --depth 1 --recursive --shallow-submodules https://github.com/ROCm/aiter.git /tmp/aiter"
           docker exec flydsl_test bash -c "python3 -c \"from pathlib import Path; src = Path('/tmp/aiter/requirements.txt'); dst = Path('/tmp/aiter/requirements-flydsl-ci.txt'); lines = [line for line in src.read_text().splitlines() if line.strip() and not line.strip().startswith('flydsl==')]; dst.write_text('\\n'.join(lines) + '\\n')\" && python3 -m pip install -r /tmp/aiter/requirements-flydsl-ci.txt"
           docker exec flydsl_test bash -c "python3 -m pip uninstall -y triton pytorch-triton pytorch-triton-rocm triton-rocm amd-triton || true"
-          docker exec flydsl_test bash -c "python3 -m pip install --extra-index-url https://pypi.amd.com/triton/rocm-7.2.0/simple/ 'triton==3.7.0+amd.rocm7.2.0.gitd1660454'"
+          docker exec flydsl_test bash -c "python3 -m pip install --extra-index-url https://pypi.amd.com/triton/rocm-7.2.0/simple/ 'triton==3.7.0+amd.rocm7.2.0.gitd0d77a509'"
           docker exec flydsl_test bash -c "python3 -c 'import triton; assert tuple(map(int, triton.__version__.split(\"+\", 1)[0].split(\".\")[:3])) >= (3, 6, 0), triton.__version__; print(\"Installed triton\", triton.__version__)'"
 
       - name: Run tests

diff --git a/tests/kernels/test_vec_add.py b/tests/kernels/test_vec_add.py
@@ -25,6 +25,11 @@
     pytest.skip("CUDA/ROCm not available. Skipping GPU benchmarks.", allow_module_level=True)
 
 
+def _validate_vec_width(vec_width: int):
+    if vec_width <= 0 or (vec_width not in (1, 2, 4) and vec_width % 4 != 0):
+        raise ValueError("vec_width must be 1, 2, 4, or a positive multiple of 4")
+
+
 @flyc.kernel
 def vecAddKernel(
     A: fx.Tensor,
@@ -37,6 +42,10 @@ def vecAddKernel(
     tid = fx.thread_idx.x
 
     tile_elems = block_dim * vec_width
+    # CDNA buffer load/store atoms are emitted as up to 128-bit operations.
+    # Wider per-thread vectors are handled as multiple 128-bit chunks.
+    copy_width = 4 if vec_width > 4 else vec_width
+    chunks_per_thread = vec_width // copy_width
 
     # Wrap in buffer-descriptor-backed tensors for AMD buffer load/store
     A = fx.rocdl.make_buffer_tensor(A)
@@ -51,23 +60,25 @@ def vecAddKernel(
     tB = fx.slice(tB, (None, bid))
     tC = fx.slice(tC, (None, bid))
 
-    tA = fx.logical_divide(tA, fx.make_layout(vec_width, 1))
-    tB = fx.logical_divide(tB, fx.make_layout(vec_width, 1))
-    tC = fx.logical_divide(tC, fx.make_layout(vec_width, 1))
+    tA = fx.logical_divide(tA, fx.make_layout(copy_width, 1))
+    tB = fx.logical_divide(tB, fx.make_layout(copy_width, 1))
+    tC = fx.logical_divide(tC, fx.make_layout(copy_width, 1))
 
-    copyAtom = fx.make_copy_atom(fx.rocdl.BufferCopy128b(), fx.Float32)
+    copyAtom = fx.make_copy_atom(fx.rocdl.BufferCopy(copy_width * fx.Float32.width), fx.Float32)
 
-    rA = fx.make_rmem_tensor(vec_width, fx.Float32)
-    rB = fx.make_rmem_tensor(vec_width, fx.Float32)
-    rC = fx.make_rmem_tensor(vec_width, fx.Float32)
+    rA = fx.make_rmem_tensor(copy_width, fx.Float32)
+    rB = fx.make_rmem_tensor(copy_width, fx.Float32)
+    rC = fx.make_rmem_tensor(copy_width, fx.Float32)
 
-    fx.copy_atom_call(copyAtom, fx.slice(tA, (None, tid)), rA)
-    fx.copy_atom_call(copyAtom, fx.slice(tB, (None, tid)), rB)
+    for chunk in fx.range_constexpr(chunks_per_thread):
+        chunk_idx = chunk * block_dim + tid
+        fx.copy_atom_call(copyAtom, fx.slice(tA, (None, chunk_idx)), rA)
+        fx.copy_atom_call(copyAtom, fx.slice(tB, (None, chunk_idx)), rB)
 
-    vC = fx.arith.addf(fx.memref_load_vec(rA), fx.memref_load_vec(rB))
-    fx.memref_store_vec(vC, rC)
+        vC = fx.arith.addf(fx.memref_load_vec(rA), fx.memref_load_vec(rB))
+        fx.memref_store_vec(vC, rC)
 
-    fx.copy_atom_call(copyAtom, rC, fx.slice(tC, (None, tid)))
+        fx.copy_atom_call(copyAtom, rC, fx.slice(tC, (None, chunk_idx)))
 
 
 @flyc.jit
@@ -118,13 +129,15 @@ def torch_launch():
     }
 
 
-def benchmark_vector_add(vec_width: int = 4):
+def benchmark_vector_add(vec_width: int = 4, *, size_multiplier: int = 10000, run_benchmark: bool = True):
     """Benchmark vector addition kernel performance."""
 
+    _validate_vec_width(vec_width)
+
     THREADS_PER_BLOCK = 256
     VEC_WIDTH = vec_width
     TILE_ELEMS = THREADS_PER_BLOCK * VEC_WIDTH
-    SIZE = TILE_ELEMS * 10000  # align to tile boundary
+    SIZE = TILE_ELEMS * size_multiplier  # align to tile boundary
 
     print("\n" + "=" * 80)
     print("Benchmark: Vector Addition (C = A + B) - flydsl API")
@@ -148,6 +161,8 @@ def benchmark_vector_add(vec_width: int = 4):
 
     error = checkAllclose(c_dev, a_dev + b_dev)
     print(f"  Correctness: max error = {error:.2e}")
+    if not run_benchmark:
+        return error < 1e-5
 
     def kernel_launch():
         vecAdd(tA, b_dev, c_dev, SIZE, SIZE, THREADS_PER_BLOCK, VEC_WIDTH, stream=stream)
@@ -178,13 +193,16 @@ def kernel_launch():
     return error < 1e-5
 
 
-def test_benchmark_vector_add():
+@pytest.mark.parametrize("vec_width", [4, 8, 16])
+def test_benchmark_vector_add(vec_width):
     """Pytest wrapper for vector addition benchmark."""
     print("\n" + "=" * 80)
     print("ROCm GPU Benchmark - Vector Addition with flydsl API")
     print(f"GPU: {get_rocm_arch()}")
     print("=" * 80)
-    assert benchmark_vector_add(), "Vector addition benchmark failed correctness check"
+    assert benchmark_vector_add(
+        vec_width=vec_width, size_multiplier=1024, run_benchmark=False
+    ), "Vector addition benchmark failed correctness check"
 
 
 if __name__ == "__main__":