Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -536,6 +536,7 @@ Before implementing:
- No "flexibility" or "configurability" that wasn't requested.
- No error handling for impossible scenarios.
- If you write 200 lines and it could be 50, rewrite it.
- Try to keep docstrings short to medium length.

Ask yourself: "Would a senior engineer say this is overcomplicated?" If yes, simplify.

Expand Down
135 changes: 60 additions & 75 deletions src/mldebug/backend/core_dump_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import struct
from pathlib import Path
from mldebug.utils import print_tile_grid
from mldebug.arch import AIE_DEV_PHX, AIE_DEV_STX, AIE_DEV_TEL, AIE_DEV_NPU3, load_aie_arch
from mldebug.arch import AIE_DEV_PHX, AIE_DEV_STX, AIE_DEV_TEL, AIE_DEV_NPU3
from .backend_interface import BackendInterface

try:
Expand Down Expand Up @@ -62,28 +62,23 @@ class CoreDumpFallbackReader:
Pure Python fallback implementation for reading core dump files.
Replicates the C++ CoreDumpDataAccessBackend logic.
"""
def __init__(self, core_dump_file, dev_name, no_header=False, args=None):
def __init__(self, core_dump_file, dev_name, no_header=False):
"""
Initialize the fallback reader

Args:
core_dump_file (str): Path to the binary core dump file
dev_name (str): Device name (phx, stx, telluride, npu3)
no_header (bool): If True, skip header parsing and treat data as starting at offset 0
args: Used to update device and aie_iface.
"""
self.filename = core_dump_file
self.dev_name = dev_name.lower()
self.args = args
self.file_handle = None

# Without a header to parse, we have no way to recover from an unknown device name.
# With a header, _parse_header() will detect/override dev_name and metadata.
if no_header and self.dev_name not in DEVICE_CONFIGS:
if self.dev_name not in DEVICE_CONFIGS:
raise ValueError(f"Unknown device: {dev_name}. Supported: {list(DEVICE_CONFIGS.keys())}")

# Provisional metadata; may be replaced by _parse_header() based on header hwGen.
self.metadata = DEVICE_CONFIGS.get(self.dev_name)
self.metadata = DEVICE_CONFIGS[self.dev_name]
self.header_size = 256 # Default header size

# Open the binary dump file
Expand Down Expand Up @@ -119,9 +114,11 @@ def __del__(self):
if self.file_handle:
self.file_handle.close()

def _parse_header(self):
@staticmethod
def peek_device(filename):
    """
    Read the core dump header, print its contents, and return the device name.

    The first 18 bytes of the file are decoded as little-endian, unpadded
    fields (layout taken from the C++ coreDumpHeader):
      - char magicNumber[4]: "NPU" (4 bytes)
      - uint32_t versionNum: Version number (4 bytes)
      - uint32_t headerSize: Header size (4 bytes)
      - uint8_t hwGen: Value matched against DEVICE_CONFIGS[...]["hwGen"] (1 byte)
      - uint8_t coreRowStart: Core row start (1 byte)
      - uint8_t memRowStart: Mem row start (1 byte)
      - uint8_t memTileRows: Number of memory tile rows (1 byte)
      - uint8_t totalNumRows: Total number of rows (1 byte)
      - uint8_t totalNumCols: Total number of columns (1 byte)

    Args:
        filename: Path to the core dump file. May be None or empty.

    Returns:
        The first key of DEVICE_CONFIGS whose "hwGen" equals the header's
        hwGen, or None if the file is missing/unreadable, lacks the "NPU"
        magic, is shorter than 18 bytes, or has an unknown hwGen. In the
        unknown-hwGen case the header is still printed before returning None.
    """
    if not filename:
        return None
    if not Path(filename).exists():
        return None

    # Magic (4 bytes) plus the 14 bytes of fixed fields, fetched in one read.
    try:
        with open(filename, "rb") as dump:
            raw = dump.read(18)
    except OSError:
        return None

    magic, fields = raw[:4], raw[4:]
    # Only the first three magic bytes are compared; the fourth is not checked.
    if magic[:3] != b"NPU" or len(fields) != 14:
        return None

    (version_num, header_size, hw_gen, core_row_start, mem_row_start,
     mem_tile_rows, total_rows, total_cols) = struct.unpack("<IIBBBBBB", fields)

    # First configured device whose hwGen matches the header wins.
    detected = next(
        (name for name, cfg in DEVICE_CONFIGS.items() if cfg["hwGen"] == hw_gen),
        None,
    )

    print("[INFO] Core dump header:")
    print(f" Magic: {magic.decode('ascii', errors='ignore').rstrip(chr(0))}")
    print(f" Version: {version_num}")
    print(f" Header size: {header_size} bytes")
    print(f" Core row start: {core_row_start}")
    print(f" Mem row start: {mem_row_start}")
    print(f" Mem tile rows: {mem_tile_rows}")
    print(f" Total rows: {total_rows}")
    print(f" Total cols: {total_cols}")

    return detected

def _parse_header(self):
"""
Parse the core dump file header to learn header_size.

Device detection is handled earlier (see ``peek_device`` and
``set_device``); this only validates the magic number and reads the
header size so we know where the tile data starts.
"""
assert self.file_handle is not None
try:
self.file_handle.seek(0)

# Read magic number (4 bytes)
magic = self.file_handle.read(4)
if len(magic) != 4:
raise RuntimeError("Core dump file is too small or corrupted: cannot read header magic number")
Expand All @@ -146,75 +188,19 @@ def _parse_header(self):
if magic_str != "NPU":
raise ValueError(f"Invalid core dump file format: expected magic number 'NPU', got '{magic_str}'")

# Read version number (4 bytes, little-endian uint32)
version_data = self.file_handle.read(4)
if len(version_data) != 4:
# Skip versionNum (4 bytes); read headerSize (4 bytes, little-endian uint32).
if len(self.file_handle.read(4)) != 4:
raise RuntimeError("Core dump file is corrupted: cannot read version number")
version_num = struct.unpack("<I", version_data)[0]

# Read header size (4 bytes, little-endian uint32)
header_size_data = self.file_handle.read(4)
if len(header_size_data) != 4:
raise RuntimeError("Core dump file is corrupted: cannot read header size")
self.header_size = struct.unpack("<I", header_size_data)[0]

# Validate header size is reasonable
if self.header_size < 18 or self.header_size > 1024 * 1024: # Between 18 bytes and 1MB
if self.header_size < 18 or self.header_size > 1024 * 1024:
raise ValueError(f"Invalid header size in core dump: {self.header_size} bytes (expected 18-1048576)")

# Read device metadata (6 bytes)
metadata_data = self.file_handle.read(6)
if len(metadata_data) != 6:
raise RuntimeError("Core dump file is corrupted: cannot read device metadata")

hw_gen, core_row_start, mem_row_start, mem_tile_rows, total_rows, total_cols = (
struct.unpack("<BBBBBB", metadata_data))

# Detect device from header hwGen and override dev_name/metadata
detected_dev = None
for name, cfg in DEVICE_CONFIGS.items():
if cfg["hwGen"] == hw_gen:
detected_dev = name
break

if detected_dev is None:
raise ValueError(f"Unknown hwGen {hw_gen} in core dump header; supported: "
f"{[(n, c['hwGen']) for n, c in DEVICE_CONFIGS.items()]}")

if detected_dev != self.dev_name:
print(f"[INFO] Device detected from core dump header: \'{detected_dev}\'; overriding '{self.dev_name}'")
else:
print(f"[INFO] Device detected from core dump header: '{detected_dev}'")

self.dev_name = detected_dev
self.metadata = DEVICE_CONFIGS[detected_dev]

# Refresh args.aie_iface so the rest of the tool uses the architecture
# that matches the device baked into the core dump.
if self.args:
self.args.device = detected_dev
self.args.aie_iface = load_aie_arch(detected_dev)
self.args.aie_iface.init(detected_dev == AIE_DEV_PHX)

expected_core_row_start = self.metadata["core_row_start"]
expected_mem_row_start = self.metadata["mem_row_start"]
expected_mem_tile_rows = self.metadata["memtile_rows"]
expected_rows = self.metadata["numrows"]
expected_cols = self.metadata["numcols"]

print( "[INFO] Core dump header:")
print(f" Magic: {magic_str}")
print(f" Version: {version_num}")
print(f" Header size: {self.header_size} bytes")
#print(f" HW Gen: {hw_gen} (device: {self.dev_name})")
print(f" Core row start: {expected_core_row_start}")
print(f" Mem row start: {expected_mem_row_start}")
print(f" Mem tile rows: {expected_mem_tile_rows}")
print(f" Total rows: {expected_rows}")
print(f" Total cols: {expected_cols}")

except (ValueError, RuntimeError) as e:
# Re-raise critical errors that indicate file corruption or format issues
raise ValueError("I/O error while reading core dump header") from e
except OSError as e:
raise RuntimeError("I/O error while reading core dump header") from e
Expand Down Expand Up @@ -346,7 +332,7 @@ class CoreDumpImpl(BackendInterface):
"""

is_offline = True
def __init__(self, aie_overlay_tiles, ctx_id, pid, dev_name, core_dump_file=None, no_header=False, args=None) -> None:
def __init__(self, aie_overlay_tiles, ctx_id, pid, dev_name, core_dump_file=None, no_header=False) -> None:
"""
Initialize the Core Dump backend

Expand All @@ -358,7 +344,6 @@ def __init__(self, aie_overlay_tiles, ctx_id, pid, dev_name, core_dump_file=None
core_dump_file: Path to core dump file (required)
no_header: If True, parse core dump assuming no header (data starts at offset 0).
Forces use of the Python fallback reader.
args: Used for device management
"""
self.overlay_aie_core_tiles = aie_overlay_tiles
self.pc_brkpts = [0, 0]
Expand All @@ -379,11 +364,11 @@ def __init__(self, aie_overlay_tiles, ctx_id, pid, dev_name, core_dump_file=None
try:
self.binding = MlDebug(list(self.overlay_aie_core_tiles), ctx_id, pid, dev_name, "debuglibrary", core_dump_file)
print("[INFO] Core Dump backend initialized with C++ DebugLibrary")
except ImportError:
except (ImportError, TypeError):
self.use_fallback = True

if self.use_fallback:
self.fallback_reader = CoreDumpFallbackReader(core_dump_file, dev_name, no_header=no_header, args=args)
self.fallback_reader = CoreDumpFallbackReader(core_dump_file, dev_name, no_header=no_header)

print("[INFO] Core Dump backend is read-only. Write/control operations will be ignored.")

Expand Down
1 change: 0 additions & 1 deletion src/mldebug/backend/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,5 +70,4 @@ def create_backend(backend_type, config):
return core_dump_mod.CoreDumpImpl(
config.tiles, config.ctx_id, config.pid, config.device,
core_dump_file=config.core_dump_file, no_header=config.no_header,
args=config.args,
)
10 changes: 10 additions & 0 deletions src/mldebug/input_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import re

from mldebug.arch import load_aie_arch, AIE_DEV_PHX, AIE_DEV_STX, AIE_DEV_TEL
from mldebug.backend.core_dump_impl import CoreDumpFallbackReader
from mldebug.utils import LOGGER, is_aarch64, is_windows

@dataclass
Expand Down Expand Up @@ -200,6 +201,15 @@ def set_device(args) -> None:
endmsg = "\n"
if not args.device:
endmsg = " Use -d to specify a diferent device.\n"
# For core dumps, the device is baked into the file header. Detect it now
# so the overlay (built before the backend) uses the correct aie_iface.
if getattr(args, "core_dump", None) and not getattr(args, "no_header", False):
cd_dev = CoreDumpFallbackReader.peek_device(args.core_dump)
if cd_dev:
args.device = cd_dev
print(f"[INFO] Using AIE Device: {args.device} (detected from core dump header).")
return

# if on ARM, default is telluride else STX
args.device = AIE_DEV_TEL if is_aarch64() else AIE_DEV_STX
genstr = "XAIE_DEV_GEN_AIE2P"
Expand Down
Loading