hw-native-sys · vegetabledoww · May 30, 2026 · May 30, 2026
diff --git a/docs/dfx/tensor-dump.md b/docs/dfx/tensor-dump.md
@@ -21,15 +21,18 @@ saw, without the timing distortion of inline printing.
   dispatch, outputs snapshotted after FIN; `INOUT` tensors at both
   stages.
 - **Logical shape preserved.** Records carry dtype, shape,
-  `raw_shape`, offsets, and `is_contiguous` so non-contiguous views
-  are reconstructable.
-- **Manifest + binary payload.** A single JSON manifest plus one
-  `.bin` payload per run; each manifest entry has `bin_offset` /
-  `bin_size` into the payload.
-- **Cross-architecture.** Same `--dump-tensor` flag, same on-disk
-  format on `a2a3` and `a5`. Both runtimes are wired through.
-
-Enable in one line:
+  `strides`, `start_offset`, and `is_contiguous` so logical views are
+  reconstructable.
+- **Manifest + binary payload.** `--dump-tensor` writes a JSON
+  manifest plus one `.bin` payload per run; tensor entries carry
+  `bin_offset` / `bin_size`, while scalar entries stay manifest-only.
+- **Unified scalar args.** Scalar callable slots are emitted in
+  `tensor_dump.json` as zero-dim records with `kind: scalar` and
+  `stage: before_dispatch`; there is no separate args-only manifest.
+- **Cross-architecture.** Same flags and on-disk layout family on
+  `a2a3` and `a5`. Both runtimes are wired through.
+
+Enable dump capture in one line:
 
 ```bash
 python tests/st/<case>/test_<name>.py -p a5sim --dump-tensor
@@ -40,7 +43,6 @@ python tests/st/<case>/test_<name>.py -p a5sim --dump-tensor
 ### 3.1 Enable Tensor Dump
 
 ```bash
-# Standalone runner
 python tests/st/<case>/test_<name>.py -p a5sim --dump-tensor
 python tests/st/<case>/test_<name>.py -p a2a3 -d 0 --dump-tensor
 
@@ -49,16 +51,12 @@ pytest tests/st/<case> --platform a5sim --dump-tensor
 pytest examples/a5/host_build_graph/vector_example --platform a5sim --dump-tensor
 ```
 
-The flag flips `CallConfig::enable_dump_tensor`. The host then
+`--dump-tensor` flips `CallConfig::enable_dump_tensor`. The host
 allocates dump storage, publishes its base address through
-`kernel_args.dump_data_base`, and sets
-`PROFILING_FLAG_DUMP_TENSOR` in each worker handshake's
-`enable_profiling_flag`. The on-device AICPU kernel reads both:
-the storage base via `set_platform_dump_base()` and the enable bit
-via `set_enable_dump_tensor(GET_PROFILING_FLAG(...))`. AICore
-executors read the same handshake bit to insert a
-`pipe_barrier(PIPE_ALL)` before FIN when dump is on, so
-`AFTER_COMPLETION` snapshots see the kernel's final writes.
+`kernel_args.dump_data_base`, and sets `PROFILING_FLAG_DUMP_TENSOR`
+in the worker profiling bitmask. AICPU reads the storage base via
+`set_platform_dump_base()` and the enable state via
+`set_dump_tensor_enabled(...)`.
 
 ### 3.2 Select Specific Task Tensors
 
@@ -107,12 +105,15 @@ The dump artifacts land under the per-task output prefix
 ```text
 <output_prefix>/
 └── tensor_dump/
-    ├── tensor_dump.json
-    └── tensor_dump.bin
+    ├── tensor_dump.json  # unified tensor/scalar manifest (`--dump-tensor`)
+    └── tensor_dump.bin   # raw tensor payload (`--dump-tensor`)
 ```
 
 Filenames are fixed (no per-file timestamp) — the directory is the
-per-task uniqueness boundary.
+per-task uniqueness boundary. `--dump-tensor` emits both files in the
+same `tensor_dump/` directory.
+
+#### `tensor_dump.json` — Unified manifest
 
 `tensor_dump.json` is the manifest; its `bin_file` field points at
 the sibling binary payload.
@@ -139,15 +140,14 @@ Example manifest (one input tensor captured before dispatch):
   "tensors": [
     {
       "task_id": "0x0000000200000a00",
-      "subtask_id": 1,
       "role": "input",
       "stage": "before_dispatch",
-      "func_id": 0,
       "arg_index": 0,
+      "kind": "tensor",
       "dtype": "float32",
       "shape": [16384],
-      "raw_shape": [16384],
-      "offsets": [0],
+      "strides": [1],
+      "start_offset": 0,
       "is_contiguous": true,
       "truncated": false,
       "overwritten": false,
@@ -160,15 +160,19 @@ Example manifest (one input tensor captured before dispatch):
 
 Key fields:
 
-- `task_id` / `subtask_id` / `func_id` — runtime task identity. Use
-  to correlate with swimlane / PMU output. `subtask_id` distinguishes
-  AIC / AIV0 / AIV1 within a single task.
-- `arg_index` — position in the formal callable signature.
+- `task_id` — runtime task identity. Use to correlate with swimlane / PMU
+  output.
+- `arg_index` — position in the formal callable signature. For scalar
+  entries this equals `payload.tensor_count + scalar_index`.
 - `role` / `stage` — `input` / `output` / `inout`, captured
   `before_dispatch` / `after_completion`.
-- `dtype` / `shape` / `raw_shape` / `offsets` / `is_contiguous` —
-  view geometry. `bin_size` is `numel × elem_size` of the *logical*
-  view, gathered if non-contiguous.
+- `kind` — `tensor` for arena-backed payloads, `scalar` for
+  zero-dim `before_dispatch` scalar args stored directly in the
+  manifest.
+- `dtype` / `shape` / `strides` / `start_offset` /
+  `is_contiguous` — logical view geometry. Tensor `bin_size` is
+  `numel × elem_size` of the *logical* view, gathered if
+  non-contiguous; scalar entries have `bin_size = 0`.
 - `bin_offset` — byte offset into `tensor_dump.bin` where the
   payload starts.
 - `truncated` / `overwritten` — set when the tensor exceeded arena
@@ -187,7 +191,7 @@ uses its `bin_file` field to find the payload:
 python -m simpler_setup.tools.dump_viewer
 
 # Filter and save matching tensors to human-readable .txt files
-python -m simpler_setup.tools.dump_viewer --func 0 --stage before --role input --export
+python -m simpler_setup.tools.dump_viewer --stage before --role input --export
 
 # Export one specific entry by its manifest index
 python -m simpler_setup.tools.dump_viewer --index 42
@@ -204,8 +208,8 @@ spreadsheet.
 
 ### 3.4 Add dump support to a new test
 
-Only `host_build_graph` needs explicit wiring; other runtimes pick
-up metadata automatically.
+Only `host_build_graph` needs explicit wiring; other runtimes derive
+tensor view metadata automatically.
 
 ```cpp
 // In orchestration C++ (host_build_graph only)
@@ -234,13 +238,17 @@ What you can read out of `tensor_dump.json` + `tensor_dump.bin`:
 
 - **Per-task input snapshots** (`role: input`, `stage:
   before_dispatch`) — what each kernel was given.
+- **Per-dispatch scalar args** (`kind: scalar`, `stage:
+  before_dispatch`) — the raw callable-slot scalar values AICPU
+  handed to the kernel.
 - **Per-task output snapshots** (`role: output`, `stage:
   after_completion`) — what each kernel produced. The barrier
   ensures these reflect the kernel's final writes.
 - **`INOUT` deltas** — same arg captured at both stages; diff
   before vs after to see exactly what the kernel modified.
-- **Non-contiguous view reconstruction** — `raw_shape` / `offsets`
-  / `is_contiguous` plus the gathered logical-contiguous payload.
+- **Logical view reconstruction** — `shape` / `strides` /
+  `start_offset` / `is_contiguous` plus the gathered
+  logical-contiguous payload.
-- **Per-task identity** — `task_id` / `subtask_id` / `func_id`
-  correlates dump entries with swimlane and PMU rows.
+- **Per-task identity** — `task_id` correlates dump entries with
+  swimlane and PMU rows.
 - **Loss accounting** — `truncated` / `overwritten` per-record
@@ -318,8 +326,9 @@ Each runtime's scheduler dispatch code calls
 ```
 
 `dump_tensors_for_task` walks the formal callable signature,
-matches each non-scalar slot to a `TensorDumpInfo` (dtype + shape + offsets + device address), and calls `dump_tensor_record` for
-slots that match the current stage.
+matches each non-scalar slot to a `TensorDumpInfo` (dtype + shape +
+strides + start offset + device address), and calls
+`dump_tensor_record` for slots that match the current stage.
 
 When dump is enabled, AICore executors also issue
 `pipe_barrier(PIPE_ALL)` after kernel execution and before writing
@@ -564,7 +573,9 @@ Tensor Dump is opt-in and zero-overhead when disabled — without
 AICore skip the dump-specific code paths. The `pipe_barrier(PIPE_ALL)`
 before FIN is also gated on the same handshake bit.
 
-When enabled, the per-task overhead is dominated by:
+With `--dump-tensor`, AICPU records full `BEFORE_DISPATCH` /
+`AFTER_COMPLETION` tensor payloads plus manifest-only
+`BEFORE_DISPATCH` scalar args. The per-task overhead is dominated by:
 
 - The `BEFORE_DISPATCH` / `AFTER_COMPLETION` payload memcpy into
   the per-thread arena (contiguous fast-path; logical traversal for

diff --git a/docs/testing.md b/docs/testing.md
@@ -72,7 +72,7 @@ If a module is pure C++ with no Python binding, test in **ut-cpp** (`tests/ut/cp
 
 Scene tests support advanced CLI options for benchmarking, profiling, and runtime control. These work identically in both pytest and standalone mode.
 
-> "Profiling" is the umbrella for three parallel diagnostics sub-features: `--enable-l2-swimlane` (L2 swimlane), `--dump-tensor` (per-task tensor I/O), and `--enable-pmu` (PMU CSV). They are independent and can be combined.
+> "Profiling" is the umbrella for three parallel diagnostics sub-features: `--enable-l2-swimlane` (L2 swimlane), `--dump-tensor` (unified tensor/scalar dump), and `--enable-pmu` (PMU CSV). They are independent and can be combined.
 
 ### pytest
 
@@ -91,7 +91,7 @@ pytest --platform a2a3sim --log-level debug                        # verbose C++
 python test_xxx.py -p a2a3sim                                    # default: 1 round + golden
 python test_xxx.py -p a2a3 -d 0 --rounds 100 --skip-golden       # benchmark mode
 python test_xxx.py -p a2a3 --enable-l2-swimlane                         # L2 swimlane (first round)
-python test_xxx.py -p a2a3 --dump-tensor                         # dump per-task tensor I/O
+python test_xxx.py -p a2a3 --dump-tensor                         # dump unified tensor/scalar artifacts
 python test_xxx.py -p a2a3 --enable-pmu 4                        # PMU CSV (MEMORY)
 python test_xxx.py -p a2a3sim --build                            # compile runtime from source
 python test_xxx.py -p a2a3sim --log-level debug                  # verbose C++ logging
@@ -110,7 +110,7 @@ python test_xxx.py -p a2a3sim --log-level debug                  # verbose C++ l
 | `--manual` | | `exclude` | `exclude`/`include`/`only` for manual cases |
 | `--skip-golden` | | false | Skip golden comparison (for benchmarking) |
 | `--enable-l2-swimlane [PERF_LEVEL]` | | `0` | Enable L2 swimlane collection on first round only. The flag takes an integer perf_level 0–4 (bare = 4); see [docs/dfx/l2-swimlane-profiling.md](dfx/l2-swimlane-profiling.md#31-enable-l2-swimlane) for the level table. Each test case gets its own `outputs/<case>_<ts>/` directory under which `l2_perf_records.json` lands; parallel runs never collide. |
-| `--dump-tensor` | | false | Dump per-task tensor I/O during runtime execution |
+| `--dump-tensor` | | false | Dump tensors plus scalar args into unified runtime artifacts |
 | `--enable-pmu [EVENT_TYPE]` | | `0` | Enable a2a3 PMU CSV collection. Bare flag selects `PIPE_UTILIZATION` (`2`); pass an event type such as `4` for `MEMORY`. |
 | `--build` | | false | Compile runtime from source (not pre-built) |
 | `--exitfirst` | `-x` | false | Stop on first failing test (fail-fast, primarily for CI) |
@@ -325,7 +325,7 @@ A single file can declare both L2 and L3 classes; they're grouped by `(runtime,
 Each test case sets its own `CallConfig.output_prefix` (chosen by `scene_test.py::_build_output_prefix` as `outputs/<ClassName>_<case>_<YYYYMMDD_HHMMSS>/`). The C++ runtime writes all diagnostic artifacts under that prefix with fixed filenames:
 
 - `outputs/<case>_<ts>/l2_perf_records.json` — swimlane (`--enable-l2-swimlane`)
-- `outputs/<case>_<ts>/tensor_dump/` — tensor dump (`--dump-tensor`)
+- `outputs/<case>_<ts>/tensor_dump/` — unified tensor/scalar dump artifacts (`--dump-tensor`)
 - `outputs/<case>_<ts>/pmu.csv` — PMU counters (`--enable-pmu`)
 
 Because each case gets its own directory, parallel runs (xdist workers, L3 case fanout, L2 device fanout) can never collide on filename — there is no per-file timestamp, no env-var scoping, and no post-run flatten step. `CallConfig::validate()` throws if any diagnostic flag is enabled but `output_prefix` is empty; `scene_test.py::run_class_cases` always fills it from the case label.

diff --git a/simpler_setup/tools/dump_viewer.py b/simpler_setup/tools/dump_viewer.py
@@ -11,7 +11,6 @@
 
 Filters (freely combinable):
     --task   Filter by task_id (hex, e.g. 0x0000000200000a00)
-    --func   Filter by func_id (int)
     --stage  Filter by stage (before / after)
     --role   Filter by role (input / output / inout)
     --arg    Filter by arg_index (int)
@@ -26,11 +25,11 @@
     # List all tensors in a specific dump dir
     python -m simpler_setup.tools.dump_viewer outputs/<case>_<ts>/tensor_dump/
 
-    # List before-dispatch inputs of func_id=3 (latest dir)
-    python -m simpler_setup.tools.dump_viewer --func 3 --stage before --role input
+    # List before-dispatch inputs of task_id 0x... (latest dir)
+    python -m simpler_setup.tools.dump_viewer --task 0x0000000200000a00 --stage before --role input
 
     # Export them to txt
-    python -m simpler_setup.tools.dump_viewer outputs/<case>/tensor_dump/ --func 3 --stage before --export
+    python -m simpler_setup.tools.dump_viewer outputs/<case>/tensor_dump/ --stage before --export
 
     # Export a specific tensor by index
     python -m simpler_setup.tools.dump_viewer outputs/<case>_<ts>/tensor_dump/ --index 42 --export
@@ -94,17 +93,15 @@ def tensor_filename(t: dict) -> str:
     role_map = {"input": "in", "output": "out", "inout": "inout"}
     stage_str = stage_map.get(t["stage"], t["stage"])
     role_str = role_map.get(t["role"], t["role"])
-    return f"task_{t['task_id']}_s{t['subtask_id']}_{stage_str}_{role_str}{t['arg_index']}.txt"
+    return f"task_{t['task_id']}_{stage_str}_{role_str}{t['arg_index']}.txt"
 
 
 def write_tensor(tensor: dict, bin_path: Path, out):
     t = tensor
     out.write(f"# task_id: {t['task_id']}\n")
-    out.write(f"# subtask_id: {t['subtask_id']}\n")
     out.write(f"# role: {t['role']}\n")
     out.write(f"# stage: {t['stage']}\n")
     out.write(f"# arg_index: {t['arg_index']}\n")
-    out.write(f"# func_id: {t['func_id']}\n")
     out.write(f"# dtype: {t['dtype']}\n")
     out.write(f"# is_contiguous: {t['is_contiguous']}\n")
     out.write(f"# shape: {t['shape']}\n")
@@ -182,15 +179,15 @@ def collect_valid_values(tensors: list, field: str) -> list:
 
 def list_tensors(tensors: list):
     print(
-        f"{'idx':>6}  {'task_id':>18}  {'s':>1}  {'stage':>7}  {'role':>5}"
-        f"  {'arg':>3}  {'func':>4}  {'dtype':>8}  {'shape':<20}  {'bytes':>10}"
+        f"{'idx':>6}  {'task_id':>18}  {'stage':>7}  {'role':>5}"
+        f"  {'arg':>3}  {'dtype':>8}  {'shape':<20}  {'bytes':>10}"
     )
-    print("-" * 100)
+    print("-" * 90)
     for i, t in enumerate(tensors):
         stage_short = "before" if t["stage"] == "before_dispatch" else "after"
         print(
-            f"{i:>6}  {t['task_id']:>18}  {t['subtask_id']:>1}  {stage_short:>7}  {t['role']:>5}  "
-            f"{t['arg_index']:>3}  {t['func_id']:>4}  {t['dtype']:>8}  {str(t['shape']):<20}  {t['bin_size']:>10}"
+            f"{i:>6}  {t['task_id']:>18}  {stage_short:>7}  {t['role']:>5}  "
+            f"{t['arg_index']:>3}  {t['dtype']:>8}  {str(t['shape']):<20}  {t['bin_size']:>10}"
         )
 
 
@@ -218,14 +215,6 @@ def _apply_filters(tensors: list, args: argparse.Namespace) -> list:
             sys.exit(1)
         filtered = [t for t in filtered if t["task_id"] == args.task]
 
-    if args.func is not None:
-        valid = collect_valid_values(filtered, "func_id")
-        if str(args.func) not in valid:
-            print(f"Error: --func {args.func} not found in current selection.", file=sys.stderr)
-            print(f"  Valid func_ids: {', '.join(valid)}", file=sys.stderr)
-            sys.exit(1)
-        filtered = [t for t in filtered if t["func_id"] == args.func]
-
     if args.stage:
         stage_map = {"before": "before_dispatch", "after": "after_completion"}
         if args.stage not in stage_map:
@@ -263,7 +252,6 @@ def main():
         help="Path to outputs/<case>_<ts>/tensor_dump directory (default: latest outputs/*/tensor_dump dir)",
     )
     parser.add_argument("--task", "-t", help="Filter by task_id (e.g. 0x0000000200000a00)")
-    parser.add_argument("--func", "-f", type=int, help="Filter by func_id")
     parser.add_argument("--stage", "-s", help="Filter by stage (before / after)")
     parser.add_argument("--role", "-r", help="Filter by role (input / output / inout)")
     parser.add_argument("--arg", "-a", type=int, help="Filter by arg_index")
@@ -272,12 +260,12 @@ def main():
     args = parser.parse_args()
 
     dump_dir = _resolve_dump_dir(args.dump_dir)
-    manifest_files = list(dump_dir.glob("*.json"))
-    if not manifest_files:
-        print(f"Error: no manifest JSON found in {dump_dir}", file=sys.stderr)
+    manifest_path = dump_dir / "tensor_dump.json"
+    if not manifest_path.exists():
+        print(f"Error: tensor_dump.json not found in {dump_dir}", file=sys.stderr)
         sys.exit(1)
 
-    with open(manifest_files[0]) as f:
+    with open(manifest_path) as f:
         manifest = json.load(f)
 
     bin_path = dump_dir / manifest.get("bin_file", "tensors.bin")
@@ -298,7 +286,7 @@ def main():
         sys.exit(1)
 
     # --- Export or list ---
-    has_filters = any([args.task, args.func is not None, args.stage, args.role, args.arg is not None])
+    has_filters = any([args.task, args.stage, args.role, args.arg is not None])
 
     if args.export or args.index is not None:
         for t in filtered: