Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions .github/workflows/sanitizers.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
name: Sanitizers

# Nightly sanitizer sweep of main. Kept in its OWN workflow (not ci.yml) so the
# schedule trigger fires ONLY these jobs — adding `schedule` to ci.yml would run
# every unguarded job (pre-commit, ut, packaging, self-hosted hardware) on cron.
#
# ASAN and TSAN are separate, mutually-exclusive builds; both instrument only
# host-compiled code (sim runtime + kernels + orchestration), and sim unifies on
# g++-15 so the preloaded runtime matches the kernels' ABI. Not a PR gate: too
# slow (TSAN ~5-15x) and subject to the pre-existing sim-oversubscription flake,
# so a generous per-session timeout + manual rerun is expected. detect_leaks=0
# until LSan suppressions exist for the device custom arenas.
on:
schedule:
- cron: "0 18 * * *" # 18:00 UTC = 02:00 Beijing

# At most one sweep per ref: a newly started run cancels the one still in progress.
concurrency:
group: sanitizers-${{ github.ref }}
cancel-in-progress: true

# pto-isa revision handed to pytest through --pto-isa-commit in the test step.
env:
PTO_ISA_COMMIT: ddafa8da9c760ecd13fe9fe2833d6ee55fb20bd8

jobs:
sanitizer-sim:
runs-on: ubuntu-latest
timeout-minutes: 90
# 2 sanitizers x 2 sim platforms = 4 jobs. fail-fast is off so one failing
# combination does not cancel the remaining ones.
strategy:
fail-fast: false
matrix:
sanitizer: [asan, tsan]
platform: [a2a3sim, a5sim]

steps:
- name: Checkout repository
uses: actions/checkout@v5

# Toolchain: try g++-15 from the ubuntu-toolchain-r PPA; if that package is
# unavailable, install the distro g++ and expose it under the g++-15 name.
- name: Set up C++ compiler
run: |
sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
sudo apt-get update
sudo apt-get install -y ninja-build graphviz
sudo apt-get install -y g++-15 || sudo apt-get install -y g++
if ! command -v g++-15; then sudo ln -s "$(which g++)" /usr/local/bin/g++-15; fi

- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: '3.10'

# Install-time half of the flag: the runtime is built with the matrix sanitizer.
- name: Install with sanitizer
run: |
pip install torch --index-url https://download.pytorch.org/whl/cpu
# --no-cache-dir so pip rebuilds rather than reusing a non-sanitizer wheel.
pip install --no-cache-dir \
--config-settings=cmake.define.SIMPLER_SANITIZER=${{ matrix.sanitizer }} '.[test]'

- name: Run sanitized scene tests (${{ matrix.sanitizer }}, ${{ matrix.platform }})
run: |
# Sim unifies host compilation on g++-15, so preload g++-15's runtime.
LIB=$(g++-15 -print-file-name=lib${{ matrix.sanitizer }}.so)
LD_PRELOAD="$LIB" \
ASAN_OPTIONS=detect_leaks=0:abort_on_error=1:halt_on_error=1 \
UBSAN_OPTIONS=halt_on_error=1:print_stacktrace=1 \
TSAN_OPTIONS=halt_on_error=1 \
pytest examples tests/st --platform ${{ matrix.platform }} --device 0-15 \
--sanitizer ${{ matrix.sanitizer }} -v --pto-session-timeout 1200 \
--pto-isa-commit ${{ env.PTO_ISA_COMMIT }} --clone-protocol https --require-pto-isa
8 changes: 8 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,21 @@ add_subdirectory(python/bindings)
set(SIMPLER_PTO_CLONE_PROTOCOL "ssh" CACHE STRING
    "Protocol for cloning pto-isa during install (ssh or https)")

# Compiler sanitizer for host-compiled targets (sim runtime/kernels and the
# onboard host runtime). Default `none`. Accepts a preset (asan/ubsan/tsan) or
# a raw -fsanitize token list. Enable at `pip install` time via
#   pip install --no-build-isolation --config-settings=cmake.define.SIMPLER_SANITIZER=asan .
set(SIMPLER_SANITIZER "none" CACHE STRING
    "Sanitizer for host targets during install (none/asan/ubsan/tsan)")

# Pre-build runtime binaries (persistent build dirs for incremental compilation).
# Both cache values above are forwarded as command-line options to
# build_runtimes.py. VERBATIM makes CMake escape every argument for the build
# tool itself, so the command line is the same on every generator/platform
# (for example when the source tree path contains spaces).
add_custom_target(build_runtimes ALL
    COMMAND ${Python_EXECUTABLE}
        ${CMAKE_SOURCE_DIR}/simpler_setup/build_runtimes.py
        --lib-dir ${CMAKE_SOURCE_DIR}/build/lib
        --cache-dir ${CMAKE_SOURCE_DIR}/build/cache
        --clone-protocol ${SIMPLER_PTO_CLONE_PROTOCOL}
        --sanitizer ${SIMPLER_SANITIZER}
    COMMENT "Building runtime binaries (incremental)..."
    VERBATIM
)

Expand Down
42 changes: 42 additions & 0 deletions cmake/sanitizers.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Copyright (c) PyPTO Contributors.
# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
# CANN Open Software License Agreement Version 2.0 (the "License").
# Please refer to the License for details. You may not use this file except in compliance with the License.
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.
# -----------------------------------------------------------------------------------------------------------
#
# Shared compiler-sanitizer helper.
#
# `SIMPLER_SANITIZERS` is a comma-separated `-fsanitize` token list (e.g.
# "address,undefined" or "thread"), passed straight through to the compiler.
# Empty (the default) makes every call below a no-op, so ordinary builds are
# byte-for-byte unchanged.
#
# Apply ONLY to host-compiled targets — the sim runtime/kernels/orchestration
# and the onboard *host* runtime. NEVER call it for device toolchains (ccec for
# AICore, aarch64 cross for the AICPU): they run on the NPU and cannot carry a
# host sanitizer runtime.
#
# `-O1` (the last `-O` wins, overriding an earlier `-O3`) plus frame pointers
# keep sanitizer stack traces from being inlined away — the standard
# good-report settings.

# simpler_apply_sanitizers(<tgt>)
#
# Adds the sanitizer compile and link options selected by SIMPLER_SANITIZERS
# to the existing target <tgt>. Every option is PRIVATE to <tgt>.
#
#   tgt  Name of an already-defined, host-compiled target.
#
# Does nothing when SIMPLER_SANITIZERS is empty/false or the literal "none".
function(simpler_apply_sanitizers tgt)
  # "none" is how the top-level SIMPLER_SANITIZER option spells "off". It is a
  # truthy string for if(), so reject it explicitly; otherwise it would reach
  # the compiler as -fsanitize=none.
  if(NOT SIMPLER_SANITIZERS OR "${SIMPLER_SANITIZERS}" STREQUAL "none")
    return()
  endif()

  # -O1 comes last so it overrides an earlier -O level; together with frame
  # pointers it keeps sanitizer stack traces readable.
  target_compile_options(${tgt} PRIVATE
    -fsanitize=${SIMPLER_SANITIZERS}
    -fno-omit-frame-pointer
    -O1)
  # The sanitizer runtime has to be requested at link time as well.
  target_link_options(${tgt} PRIVATE -fsanitize=${SIMPLER_SANITIZERS})

  # TSAN can't model standalone std::atomic_thread_fence and warns about it;
  # the AICPU target compiles with -Werror, which would make that warning
  # fatal. Keep the warning visible but non-fatal — the limitation is known
  # and acceptable (fence-ordered accesses just aren't TSAN-tracked there).
  # -Wtsan is a GCC warning group, so only pass the flag to GCC: a compiler
  # that does not know the group may report the flag itself as an unknown
  # warning option, which the same -Werror would turn into a failure.
  if(SIMPLER_SANITIZERS MATCHES "thread"
     AND (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "GNU"))
    target_compile_options(${tgt} PRIVATE -Wno-error=tsan)
  endif()
endfunction()
44 changes: 44 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,17 @@ def pytest_addoption(parser):
choices=["ssh", "https"],
help="Protocol for cloning pto-isa when --pto-isa-commit is set",
)
parser.addoption(
"--sanitizer",
action="store",
default="none",
help=(
"Run against sanitizer-built binaries. Preset (asan/ubsan/tsan) or raw "
"-fsanitize tokens. Must match the SIMPLER_SANITIZER the runtime was "
"pip-installed with, and needs the matching runtime preloaded "
"(e.g. LD_PRELOAD=$(g++ -print-file-name=libasan.so))."
),
)
parser.addoption(
"--require-pto-isa",
action="store_true",
Expand Down Expand Up @@ -354,6 +365,37 @@ def _install_child_faulthandler() -> None:
pass


def _configure_sanitizer(config):
    """Apply the `--sanitizer` pytest option for this session.

    The runtime `.so` files are already sanitizer-built at install time
    (`pip install --config-settings=cmake.define.SIMPLER_SANITIZER=...`), so
    two things remain to be done here:

    1. record the resolved `-fsanitize` tokens on `KernelCompiler` so the
       per-test kernels/orchestration are compiled to match, and
    2. abort early when the sanitizer runtime is not preloaded.

    Args:
        config: The pytest config object; `--sanitizer` and `--platform` are
            read from it.

    Raises:
        pytest.UsageError: If the resolved tokens fail validation, or if a
            preload library is required but not loaded in this process.
    """
    from simpler_setup import sanitizers as san  # noqa: PLC0415
    from simpler_setup.kernel_compiler import KernelCompiler  # noqa: PLC0415

    requested = config.getoption("--sanitizer", default="none")
    fsanitize_tokens = san.resolve(requested)
    if not fsanitize_tokens:
        # Nothing selected: leave KernelCompiler untouched.
        return

    try:
        san.validate(fsanitize_tokens)
    except ValueError as err:
        raise pytest.UsageError(f"--sanitizer={requested}: {err}") from err

    KernelCompiler._sanitizers = fsanitize_tokens

    runtime_lib = san.preload_lib(fsanitize_tokens)
    if not runtime_lib or san.is_runtime_loaded(runtime_lib):
        # Either no preload is needed or it is already in place.
        return

    target_platform = config.getoption("--platform", default="") or ""
    raise pytest.UsageError(
        f"--sanitizer={requested} needs the {runtime_lib} runtime preloaded "
        f"(the instrumented .so are dlopen'd into this Python). Re-run with:\n"
        f" {san.preload_command(fsanitize_tokens, target_platform)} pytest --sanitizer {requested} ..."
    )


def pytest_configure(config):
"""Register custom markers and apply global config."""
config.addinivalue_line("markers", "platforms(list): supported platforms for standalone ST functions")
Expand All @@ -365,6 +407,8 @@ def pytest_configure(config):
"filtering so non-@scene_test tests only run under their matching runtime",
)

_configure_sanitizer(config)

log_level = config.getoption("--log-level", default=None)
if log_level:
configure_logging(log_level)
Expand Down
10 changes: 10 additions & 0 deletions docs/ci.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,16 @@ PullRequest
| `ut-a5` | a5 self-hosted | `pytest tests/ut --platform a5` + `ctest -L "^requires_hardware(_a5)?$"` |
| `st-onboard-a5` | a5 self-hosted | `pytest examples tests/st --platform a5 --device ...` |

### Nightly sanitizer sweep

A **separate** workflow, [`sanitizers.yml`](../.github/workflows/sanitizers.yml),
runs on a nightly `schedule` — kept out of `ci.yml` so the cron fires only the
sanitizer jobs, never the PR/self-hosted pipeline. Its
`sanitizer-sim` job builds the sim runtime + kernels with ASAN or TSAN
(`pip install --config-settings=cmake.define.SIMPLER_SANITIZER=...`) and runs
`pytest examples tests/st` under the matching `LD_PRELOAD` (a2a3sim/a5sim,
ubuntu-only). Not a PR gate; see [testing.md](testing.md#sanitizer-builds-asan--tsan).

### Parallel ST runs on hardware

For self-hosted jobs with multiple NPUs, pass a `--device` range (and
Expand Down
42 changes: 42 additions & 0 deletions docs/testing.md
Original file line number Diff line number Diff line change
Expand Up @@ -651,6 +651,48 @@ Key fields:

If similar coverage exists in both `examples/` and `tests/st/`, collapse it into a single `test_*.py`: small cases get `platforms: ["a2a3sim", "a2a3"]`; large benchmark cases get `platforms: ["a2a3"], "manual": True`.

## Sanitizer builds (ASAN / TSAN)

Sanitizers instrument **host-compiled** code only — on sim that is the runtime
(`host`/`aicpu`/`aicore`), the per-test kernels, and orchestration; on onboard
only the host runtime. Device code (ccec AICore `.o`, aarch64 AICPU) cannot
carry a host sanitizer, and device custom arenas (`DeviceArena`/`HeapRing`)
bypass ASAN redzones, so device-side heap bugs are not caught.

It is a two-part flag — **install-time** (which sanitizer the binaries are
built with) and **run-time** (preload + match):

```bash
# 1. Build the runtime with the sanitizer (the `asan` preset also enables UBSan).
pip install --no-build-isolation --config-settings=cmake.define.SIMPLER_SANITIZER=asan .

# 2. Run, preloading the matching runtime (the instrumented .so are dlopen'd
# into a vanilla Python, so the sanitizer runtime must come first). Sim
# unifies on g++-15, so preload g++-15's runtime — using plain g++'s would
# mismatch the kernels' ABI and fail at load.
LD_PRELOAD=$(g++-15 -print-file-name=libasan.so) \
ASAN_OPTIONS=detect_leaks=0:abort_on_error=1:halt_on_error=1 \
UBSAN_OPTIONS=halt_on_error=1:print_stacktrace=1 \
pytest examples tests/st --platform a2a3sim --sanitizer asan -v
```

`--sanitizer` must match the `SIMPLER_SANITIZER` the runtime was installed with;
it compiles the per-test kernels/orchestration to match and fails fast (with the
exact `LD_PRELOAD` command) if the runtime isn't preloaded. Presets: `asan`
(`address,undefined`), `ubsan`, `tsan`; a raw `-fsanitize` token list also works.

**TSAN is a separate, mutually-exclusive build** (cannot coexist with ASAN) and
is **Linux-only** (no macOS `libtsan`):

```bash
pip install --no-build-isolation --config-settings=cmake.define.SIMPLER_SANITIZER=tsan .
LD_PRELOAD=$(g++-15 -print-file-name=libtsan.so) TSAN_OPTIONS=halt_on_error=1 \
pytest examples tests/st --platform a2a3sim --sanitizer tsan -v
```

`detect_leaks=0` is recommended initially — LSan false-positives on the device
custom arenas until suppressions are added.

## CI Pipeline

See [ci.md](ci.md) for the full CI pipeline documentation, including the job matrix, runner constraints, and marker scheme.
Expand Down
25 changes: 25 additions & 0 deletions simpler_setup/build_runtimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@

from simpler_setup.platform_info import PROJECT_ROOT, discover_runtimes, parse_platform # noqa: E402
from simpler_setup.runtime_builder import RuntimeBuilder # noqa: E402
from simpler_setup.sanitizers import SANITIZER_PRESETS, resolve, validate # noqa: E402

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -75,6 +76,7 @@ def build_all(
cache_dir: Path,
platforms: Optional[list] = None,
clone_protocol: str = "ssh",
sanitizer: str = "none",
) -> None:
"""Build all runtime variants for the given platforms.

Expand All @@ -85,11 +87,24 @@ def build_all(
clone_protocol: Protocol used by ensure_pto_isa_root() when an
onboard platform needs the pto-isa headers and PTO_ISA_ROOT is
not pre-set. Mirrors conftest's --clone-protocol flag.
sanitizer: Sanitizer preset (asan/ubsan/tsan/none) or raw `-fsanitize`
token list. Only host-compiled targets honor it; see
BuildTarget.gen_cmake_args.
"""
# Override default paths to respect CLI args
RuntimeBuilder._LIB_DIR = lib_dir
RuntimeBuilder._CACHE_DIR = cache_dir

# Resolve the preset to `-fsanitize` tokens and stash on RuntimeCompiler so
# every host cmake configure below picks it up (default "" = no sanitizer).
from simpler_setup.runtime_compiler import RuntimeCompiler # noqa: PLC0415

tokens = resolve(sanitizer)
validate(tokens)
RuntimeCompiler._sanitizers = tokens
if tokens:
logger.info(f"Building with sanitizers: {tokens} (host targets only)")

if platforms is None:
platforms = detect_buildable_platforms()

Expand Down Expand Up @@ -205,6 +220,15 @@ def main():
"and PTO_ISA_ROOT is not pre-set (default: ssh, matching conftest)"
),
)
parser.add_argument(
"--sanitizer",
default="none",
help=(
f"Compiler sanitizer for host-compiled targets. Preset "
f"({'/'.join(SANITIZER_PRESETS)}) or a raw -fsanitize token list. "
"Default: none. asan/tsan are mutually exclusive (separate builds)."
),
)
args = parser.parse_args()

logging.basicConfig(
Expand All @@ -229,6 +253,7 @@ def main():
cache_dir=args.cache_dir,
platforms=args.platforms,
clone_protocol=args.clone_protocol,
sanitizer=args.sanitizer,
)


Expand Down
23 changes: 22 additions & 1 deletion simpler_setup/kernel_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@ class KernelCompiler:
- AARCH64_GXX: aarch64 cross-compiler for device orchestration
"""

# Comma-separated `-fsanitize` tokens, set once by conftest from the pytest
# `--sanitizer` option (default "" = off). Only host toolchains (Gxx15 sim
# incore, Gxx sim orchestration) honor it; ccec/aarch64 device builds never
# do. Must match the runtime's install-time SIMPLER_SANITIZER.
_sanitizers = ""

def __init__(self, platform: str = "a2a3"):
"""
Initialize KernelCompiler.
Expand Down Expand Up @@ -80,10 +86,23 @@ def __init__(self, platform: str = "a2a3"):
else:
self.ccec = None
self.aarch64 = None
self.host_gxx = GxxToolchain()
# Sim orchestration must match the sim kernels' g++-15 under a
# sanitizer (one runtime per process); see GxxToolchain prefer_g15.
self.host_gxx = GxxToolchain(prefer_g15=bool(self._sanitizers))

self.gxx15 = Gxx15Toolchain()

def _sanitizer_flags(self, toolchain) -> list[str]:
Comment thread
ChaoWao marked this conversation as resolved.
"""Sanitizer flags for a host-compiled kernel / orchestration .so.

No-op for device toolchains (ccec/aarch64) and when no sanitizer is
selected. `-O1` + frame pointers mirror cmake/sanitizers.cmake so the
sim kernel/orchestration match the sanitized runtime.
"""
if not self._sanitizers or not toolchain.is_host:
return []
return [f"-fsanitize={self._sanitizers}", "-fno-omit-frame-pointer", "-O1"]

def get_platform_include_dirs(self) -> list[str]:
"""
Get platform-specific include directories for orchestration compilation.
Expand Down Expand Up @@ -452,6 +471,7 @@ def _compile_orchestration_shared_lib(
)

cmd = [toolchain.cxx_path] + toolchain.get_compile_flags()
cmd += self._sanitizer_flags(toolchain)

# Force a deterministic ELF GNU Build-ID into every orchestration .so.
# The host-side DeviceRunner reads `.note.gnu.build-id` to detect when
Expand Down Expand Up @@ -531,6 +551,7 @@ def _compile_incore_sim(

# Build command from toolchain
cmd = [self.gxx15.cxx_path] + self.gxx15.get_compile_flags(core_type=core_type)
cmd += self._sanitizer_flags(self.gxx15)

# Add PTO ISA header paths if provided
if pto_isa_root:
Expand Down
Loading
Loading