16 changes: 14 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -221,11 +221,23 @@ jobs:
- name: Check A5 file changes
id: check
run: |
if git diff --name-only ${{ github.event.pull_request.base.sha }}...${{ github.event.pull_request.head.sha }} \
| grep -qE '^(src/a5/|examples/a5/|tests/(st|device_tests)/a5/)'; then
FILES=$(git diff --name-only ${{ github.event.pull_request.base.sha }}...${{ github.event.pull_request.head.sha }})

# Skip A5 only when ALL changed files are confined to a2a3-only or non-code paths.
# Shared code (src/common/, python/, examples/scripts/, build files) affects A5.
A2A3_ONLY='^(src/a2a3/|examples/a2a3/|tests/(st|device_tests)/a2a3/)'
NON_CODE='^(docs/|\.docs/|\.claude/|KNOWN_ISSUES\.md$|\.gitignore$|README\.md$|\.pre-commit-config\.yaml$)'

# Filter out a2a3-only and non-code files; if anything remains, it may affect A5
REMAINING=$(echo "$FILES" | grep -vE "$A2A3_ONLY" | grep -vE "$NON_CODE" || true)

if [ -n "$REMAINING" ]; then
echo "a5_changed=true" >> "$GITHUB_OUTPUT"
echo "Files affecting A5:"
echo "$REMAINING"
else
echo "a5_changed=false" >> "$GITHUB_OUTPUT"
echo "All changes are a2a3-only or non-code; skipping A5"
fi
# TODO: Uncomment when a5 hardware runner is available.
# Add the "a5" label to the runner, matching [self-hosted, a5] below.
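Outside the workflow, the skip decision made by the shell step above can be sketched in Python. This is a minimal illustration of the two-pass `grep -vE` filter; the function name `a5_affected` is hypothetical:

```python
import re

# Patterns copied from the workflow step: a2a3-only code paths and non-code paths.
A2A3_ONLY = re.compile(r"^(src/a2a3/|examples/a2a3/|tests/(st|device_tests)/a2a3/)")
NON_CODE = re.compile(
    r"^(docs/|\.docs/|\.claude/|KNOWN_ISSUES\.md$|\.gitignore$|README\.md$|\.pre-commit-config\.yaml$)"
)

def a5_affected(changed_files):
    """True when at least one changed file is neither a2a3-only nor non-code."""
    return any(not (A2A3_ONLY.match(f) or NON_CODE.match(f)) for f in changed_files)
```

As in the workflow, a file like `src/common/queue.h` survives both filters and triggers the A5 job, while a PR touching only `docs/` and `src/a2a3/` skips it.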
53 changes: 30 additions & 23 deletions docs/getting-started.md
@@ -14,26 +14,30 @@ The pto-isa dependency will be automatically cloned when you first run an example
The pto-isa repository provides header files needed for kernel compilation on the `a2a3` (hardware) platform.

The test framework automatically handles PTO_ISA_ROOT setup:

1. Checks if `PTO_ISA_ROOT` is already set
2. If not, clones pto-isa to `examples/scripts/_deps/pto-isa` on first run
3. Passes the resolved path to the kernel compiler
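The three-step resolution above can be sketched as a small helper. This is an illustration of the documented behavior, not the framework's actual code; the function name and signature are assumptions:

```python
import os
import subprocess
from pathlib import Path

def resolve_pto_isa_root(deps_dir: Path, protocol: str = "ssh") -> Path:
    """Sketch of the documented PTO_ISA_ROOT resolution order (illustrative name)."""
    env_root = os.environ.get("PTO_ISA_ROOT")
    if env_root:                      # 1. honor an already-set PTO_ISA_ROOT
        return Path(env_root)
    dest = deps_dir / "pto-isa"
    if not dest.exists():             # 2. clone on first run
        url = ("git@github.com:PTO-ISA/pto-isa.git" if protocol == "ssh"
               else "https://github.com/PTO-ISA/pto-isa.git")
        subprocess.run(["git", "clone", "--branch", "main", url, str(dest)], check=True)
    return dest                       # 3. resolved path is passed to the kernel compiler
```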

**Automatic Setup (Recommended):**
Just run your example; pto-isa will be cloned automatically on the first run:

```bash
python examples/scripts/run_example.py -k examples/a2a3/host_build_graph/vector_example/kernels \
-g examples/a2a3/host_build_graph/vector_example/golden.py \
-p a2a3sim
```

By default, the auto-clone uses SSH (`git@github.com:...`). In CI or environments without SSH keys, use `--clone-protocol https`:

```bash
python examples/scripts/run_example.py -k examples/a2a3/host_build_graph/vector_example/kernels \
-g examples/a2a3/host_build_graph/vector_example/golden.py \
-p a2a3sim --clone-protocol https
```

**Manual Setup** (if auto-setup fails or you prefer manual control):

```bash
mkdir -p examples/scripts/_deps
git clone --branch main git@github.com:PTO-ISA/pto-isa.git examples/scripts/_deps/pto-isa
@@ -46,11 +50,13 @@ export PTO_ISA_ROOT=$(pwd)/examples/scripts/_deps/pto-isa
```

**Using a Different Location:**

```bash
export PTO_ISA_ROOT=/path/to/your/pto-isa
```

**Troubleshooting:**

- If git is not available: Clone pto-isa manually and set `PTO_ISA_ROOT`
- If clone fails due to network: Try again or clone manually
- If SSH clone fails (e.g., in CI): Use `--clone-protocol https` or clone manually with HTTPS
@@ -93,6 +99,7 @@ host_binary = compiler.compile("host", include_dirs, source_dirs) # → .
```

**Toolchains used:**

- **AICore**: Bisheng CCE (`ccec` compiler) → `.o` object file (a2a3 only)
- **AICPU**: aarch64 cross-compiler → `.so` shared object (a2a3 only)
- **Host**: Standard gcc/g++ → `.so` shared library
@@ -117,7 +124,8 @@ python examples/scripts/run_example.py \
```

Expected output:
```

```text
=== Building Runtime: host_build_graph (platform: a2a3sim) ===
...
=== Comparing Results ===
@@ -132,42 +140,40 @@ TEST PASSED
### Python API Example

```python
from bindings import bind_host_binary
from runtime_compiler import RuntimeCompiler
from task_interface import ChipWorker, CallConfig
from runtime_builder import RuntimeBuilder

# Build or locate pre-built runtime binaries
builder = RuntimeBuilder(platform="a2a3sim")
binaries = builder.get_binaries("tensormap_and_ringbuffer")

# Create worker and initialize with platform binaries
worker = ChipWorker()
worker.init(device_id=0, host_path=str(binaries.host_path),
aicpu_binary=binaries.aicpu_path.read_bytes(),
aicore_binary=binaries.aicore_path.read_bytes())

# Execute callable on device
worker.run(chip_callable, orch_args, CallConfig(block_dim=24))

# Compile all binaries
compiler = RuntimeCompiler()
aicore_bin = compiler.compile("aicore", [...include_dirs...], [...source_dirs...])
aicpu_bin = compiler.compile("aicpu", [...include_dirs...], [...source_dirs...])
host_bin = compiler.compile("host", [...include_dirs...], [...source_dirs...])

# Load and initialize runtime
Runtime = bind_host_binary(host_bin)
runtime = Runtime()
runtime.initialize()

# Execute runtime on device
launch_runtime(runtime,
aicpu_thread_num=1,
block_dim=1,
device_id=9,
aicpu_binary=aicpu_bin,
aicore_binary=aicore_bin)

runtime.finalize()
# Cleanup
worker.reset()
```

## Configuration

### Compile-time Configuration (Runtime Limits)

In `src/{arch}/runtime/host_build_graph/runtime/runtime.h`:

```cpp
#define RUNTIME_MAX_TASKS 131072 // Maximum number of tasks
#define RUNTIME_MAX_ARGS 16 // Maximum arguments per task
#define RUNTIME_MAX_FANOUT 512 // Maximum successors per task
```
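The three limits are fixed at compile time, so a task graph that exceeds any of them cannot be launched. As an illustration (not the runtime's actual validation code, and `within_limits` is a hypothetical name), a host-side pre-check could mirror them:

```python
# Compile-time limits mirrored from runtime.h, for illustration only.
RUNTIME_MAX_TASKS = 131072
RUNTIME_MAX_ARGS = 16
RUNTIME_MAX_FANOUT = 512

def within_limits(tasks):
    """True when a task-graph description fits the compile-time runtime limits."""
    if len(tasks) > RUNTIME_MAX_TASKS:
        return False
    return all(
        len(t["args"]) <= RUNTIME_MAX_ARGS            # arguments per task
        and len(t["successors"]) <= RUNTIME_MAX_FANOUT  # fan-out per task
        for t in tasks
    )
```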

### Runtime Configuration

```python
runner.init(
device_id=0, # Device ID (0-15)
@@ -191,6 +197,7 @@ runner.init(
Device logs are written to `~/ascend/log/debug/device-<id>/`

Kernel uses macros:

- `DEV_INFO`: Informational messages
- `DEV_DEBUG`: Debug messages
- `DEV_WARN`: Warnings
61 changes: 24 additions & 37 deletions examples/scripts/code_runner.py
@@ -69,8 +69,10 @@ def compute_golden(tensors: dict, params: dict) -> None:
# Argument construction — uses nanobind bindings from task_interface
# =============================================================================
from task_interface import ( # type: ignore[import-not-found]
CallConfig, # pyright: ignore[reportAttributeAccessIssue]
ChipCallable, # pyright: ignore[reportAttributeAccessIssue]
ChipStorageTaskArgs, # pyright: ignore[reportAttributeAccessIssue]
ChipWorker, # pyright: ignore[reportAttributeAccessIssue]
CoreCallable, # pyright: ignore[reportAttributeAccessIssue]
make_tensor_arg,
scalar_to_uint64,
@@ -701,17 +703,14 @@ def run(self) -> None: # noqa: PLR0912, PLR0915
"""
Execute the full test flow:
1. Check environment
2. Build runtime
3. Load runtime and set device
4. Compile orchestration
5. Compile and register kernels
6. For each params in params_list:
2. Build runtime, orchestration, and kernels in parallel
3. Create ChipWorker
4. For each params in params_list:
- Generate inputs using golden.py
- Initialize and launch runtime
- Finalize and compare with golden
- Run via ChipWorker
- Compare with golden
"""
# Import runtime modules (deferred import to avoid top-level dependency)
from bindings import bind_host_binary, launch_runtime, set_device # noqa: PLC0415
from elf_parser import extract_text_section # noqa: PLC0415
from kernel_compiler import KernelCompiler # noqa: PLC0415
from runtime_builder import RuntimeBuilder # noqa: PLC0415
@@ -826,17 +825,18 @@ def _compile_one_kernel(kernel):
children=kernel_binaries,
)

# Step 2: Load runtime and set device
# Step 2: Create ChipWorker
binaries = runtime_result
logger.info(f"=== Loading Runtime ({binaries.host_path}) ===")
Runtime = bind_host_binary(binaries.host_path)
aicpu_binary = binaries.aicpu_path.read_bytes()
aicore_binary = binaries.aicore_path.read_bytes()

logger.info(f"=== Setting Device {self.device_id} ===")
set_device(self.device_id)
logger.info(f"=== Creating ChipWorker (host: {binaries.host_path}, device: {self.device_id}) ===")
worker = ChipWorker()
worker.init(
self.device_id,
str(binaries.host_path),
binaries.aicpu_path.read_bytes(),
binaries.aicore_path.read_bytes(),
)

# Step 5: Run each parameter set
# Step 3: Run each parameter set
total_cases = len(self.params_list)
for case_idx, params in enumerate(self.params_list):
logger.info("=" * 60)
Expand Down Expand Up @@ -864,10 +864,6 @@ def _compile_one_kernel(kernel):
logger.debug(f"Tensor order: {list(tensors.keys())}")
logger.debug(f"orch_args count: {len(orch_args)}")

# Create and initialize runtime (including kernel registration)
logger.info("=== Initializing Runtime ===")
runtime = Runtime()

# Build environment for runtime initialization
run_env = _kernel_config_runtime_env(self._kernel_config, self.kernels_dir)
if run_env:
@@ -891,32 +887,23 @@ def _compile_one_kernel(kernel):
for k, v in initial_outputs.items():
outputs[k].copy_(v)

runtime = Runtime()

# Enable profiling if requested (only first round)
config = CallConfig()
config.block_dim = self.block_dim
config.aicpu_thread_num = self.aicpu_thread_num
config.orch_thread_num = self.orch_thread_num
if self.enable_profiling and round_idx == 0:
runtime.enable_profiling(True)
config.enable_profiling = True
logger.info("Profiling enabled")

with _temporary_env(run_env):
runtime.initialize(chip_callable, orch_args)

launch_runtime(
runtime,
aicpu_thread_num=self.aicpu_thread_num,
block_dim=self.block_dim,
device_id=self.device_id,
aicpu_binary=aicpu_binary,
aicore_binary=aicore_binary,
orch_thread_num=self.orch_thread_num,
)
worker.run(chip_callable, orch_args, config)

runtime.finalize()
if not self.skip_golden:
self._compare_with_golden(outputs, golden)

logger.info(f"=== Case {case_idx + 1}/{total_cases} Passed ===")

worker.reset()
logger.info("=" * 60)
logger.info(f"=== All {total_cases} cases passed ===")
logger.info("=" * 60)