Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 18 additions & 7 deletions skills/detection/yolo-detection-2026/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,15 @@ parameters:
description: "Auto-convert model to optimized format for faster inference"
group: Performance

- name: compute_units
label: "Apple Compute Units"
type: select
options: ["auto", "cpu_and_ne", "all", "cpu_only", "cpu_and_gpu"]
default: "auto"
description: "CoreML compute target — 'auto' routes to Neural Engine (NPU), leaving GPU free for LLM/VLM"
group: Performance
platform: macos

capabilities:
live_detection:
script: scripts/detect.py
Expand All @@ -89,13 +98,15 @@ Real-time object detection using the latest YOLO 2026 models. Detects 80+ COCO o

The skill uses [`env_config.py`](../../lib/env_config.py) to **automatically detect hardware** and convert the model to the fastest format for your platform. Conversion happens once during deployment and is cached.

| Platform | Backend | Optimized Format | Expected Speedup |
|----------|---------|------------------|:----------------:|
| NVIDIA GPU | CUDA | TensorRT `.engine` | ~3-5x |
| Apple Silicon (M1+) | MPS | CoreML `.mlpackage` | ~2x |
| Intel CPU/GPU/NPU | OpenVINO | OpenVINO IR `.xml` | ~2-3x |
| AMD GPU | ROCm | ONNX Runtime | ~1.5-2x |
| CPU (any) | CPU | ONNX Runtime | ~1.5x |
| Platform | Backend | Optimized Format | Compute Units | Expected Speedup |
|----------|---------|------------------|:-------------:|:----------------:|
| NVIDIA GPU | CUDA | TensorRT `.engine` | GPU | ~3-5x |
| Apple Silicon (M1+) | MPS | CoreML `.mlpackage` | **Neural Engine** (NPU) | ~2x |
| Intel CPU/GPU/NPU | OpenVINO | OpenVINO IR `.xml` | CPU/GPU/NPU | ~2-3x |
| AMD GPU | ROCm | ONNX Runtime | GPU | ~1.5-2x |
| CPU (any) | CPU | ONNX Runtime | CPU | ~1.5x |

> **Apple Silicon Note**: Detection defaults to `cpu_and_ne` (CPU + Neural Engine), keeping the GPU free for LLM/VLM inference. Set `compute_units: all` to include GPU if not running local LLM.

### How It Works

Expand Down
7 changes: 5 additions & 2 deletions skills/detection/yolo-detection-2026/scripts/detect.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ def main():
perf.model_load_ms = env.load_ms
perf.export_ms = env.export_ms

emit({
ready_event = {
"event": "ready",
"model": f"yolo2026{model_size[0]}",
"model_size": model_size,
Expand All @@ -260,7 +260,10 @@ def main():
"fps": fps,
"model_load_ms": round(env.load_ms, 1),
"available_sizes": list(MODEL_SIZE_MAP.keys()),
})
}
if hasattr(env, 'compute_units') and env.backend == "mps":
ready_event["compute_units"] = env.compute_units
emit(ready_event)
except Exception as e:
emit({"event": "error", "message": f"Failed to load model: {e}", "retriable": False})
sys.exit(1)
Expand Down
78 changes: 74 additions & 4 deletions skills/detection/yolo-detection-2026/scripts/env_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ class BackendSpec:
model_suffix: str # file extension/dir to look for cached model
half: bool = True # use FP16
extra_export_args: dict = field(default_factory=dict)
compute_units: Optional[str] = None # CoreML compute units: "cpu_and_ne", "all", etc.


BACKEND_SPECS = {
Expand All @@ -61,6 +62,7 @@ class BackendSpec:
model_suffix=".mlpackage",
half=True,
extra_export_args={"nms": False},
compute_units="cpu_and_ne", # Route to Neural Engine, leave GPU free for LLM/VLM
),
"intel": BackendSpec(
name="intel",
Expand All @@ -86,6 +88,7 @@ class HardwareEnv:
backend: str = "cpu" # "cuda" | "rocm" | "mps" | "intel" | "cpu"
device: str = "cpu" # torch device string
export_format: str = "onnx" # optimal export format
compute_units: str = "all" # CoreML compute units (Apple only)
gpu_name: str = "" # human-readable GPU name
gpu_memory_mb: int = 0 # GPU memory in MB
driver_version: str = "" # GPU driver version
Expand Down Expand Up @@ -113,9 +116,11 @@ def detect() -> "HardwareEnv":
else:
env._fallback_cpu()

# Set export format from backend spec
# Set export format and compute units from backend spec
spec = BACKEND_SPECS.get(env.backend, BACKEND_SPECS["cpu"])
env.export_format = spec.export_format
if spec.compute_units:
env.compute_units = spec.compute_units

# Check if optimized runtime is available
env.framework_ok = env._check_framework()
Expand Down Expand Up @@ -439,6 +444,58 @@ def export_model(self, model, model_name: str) -> Optional[Path]:

return None

def _load_coreml_with_compute_units(self, model_path: str):
"""
Load a CoreML model via YOLO with specific compute_units.

Monkey-patches coremltools.MLModel to inject compute_units
(e.g. CPU_AND_NE for Neural Engine) since ultralytics doesn't
expose this parameter. Patch is scoped and immediately restored.
"""
from ultralytics import YOLO

# Map string config → coremltools enum
_COMPUTE_UNIT_MAP = {
"all": "ALL",
"cpu_only": "CPU_ONLY",
"cpu_and_gpu": "CPU_AND_GPU",
"cpu_and_ne": "CPU_AND_NE",
}

ct_enum_name = _COMPUTE_UNIT_MAP.get(self.compute_units)
if not ct_enum_name:
_log(f"Unknown compute_units '{self.compute_units}', using default")
return YOLO(model_path)

try:
import coremltools as ct
target_units = getattr(ct.ComputeUnit, ct_enum_name, None)
if target_units is None:
_log(f"coremltools.ComputeUnit.{ct_enum_name} not available")
return YOLO(model_path)

# Temporarily patch MLModel to inject compute_units
_OrigMLModel = ct.models.MLModel

class _PatchedMLModel(_OrigMLModel):
def __init__(self, *args, **kwargs):
kwargs.setdefault('compute_units', target_units)
super().__init__(*args, **kwargs)

ct.models.MLModel = _PatchedMLModel
try:
model = YOLO(model_path)
finally:
ct.models.MLModel = _OrigMLModel # Always restore

_log(f"CoreML model loaded with compute_units={ct_enum_name} "
f"(Neural Engine preferred)")
return model

except ImportError:
_log("coremltools not available, loading without compute_units")
return YOLO(model_path)

def load_optimized(self, model_name: str, use_optimized: bool = True):
"""
Load the best available model for this hardware.
Expand All @@ -455,7 +512,12 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
optimized_path = self.get_optimized_path(model_name)
if optimized_path.exists():
try:
model = YOLO(str(optimized_path))
# On Apple Silicon: route CoreML to Neural Engine
if self.backend == "mps" and self.compute_units != "all":
model = self._load_coreml_with_compute_units(
str(optimized_path))
else:
model = YOLO(str(optimized_path))
self.load_ms = (time.perf_counter() - t0) * 1000
_log(f"Loaded {self.export_format} model ({self.load_ms:.0f}ms)")
return model, self.export_format
Expand All @@ -467,7 +529,12 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
exported = self.export_model(pt_model, model_name)
if exported:
try:
model = YOLO(str(exported))
# On Apple Silicon: route CoreML to Neural Engine
if self.backend == "mps" and self.compute_units != "all":
model = self._load_coreml_with_compute_units(
str(exported))
else:
model = YOLO(str(exported))
self.load_ms = (time.perf_counter() - t0) * 1000
_log(f"Loaded freshly exported {self.export_format} model ({self.load_ms:.0f}ms)")
return model, self.export_format
Expand Down Expand Up @@ -508,7 +575,7 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):

def to_dict(self) -> dict:
"""Serialize environment info for JSON output."""
return {
d = {
"backend": self.backend,
"device": self.device,
"export_format": self.export_format,
Expand All @@ -519,6 +586,9 @@ def to_dict(self) -> dict:
"export_ms": round(self.export_ms, 1),
"load_ms": round(self.load_ms, 1),
}
if self.backend == "mps":
d["compute_units"] = self.compute_units
return d


# ─── CLI: run standalone for diagnostics ─────────────────────────────────────
Expand Down
78 changes: 74 additions & 4 deletions skills/lib/env_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ class BackendSpec:
model_suffix: str # file extension/dir to look for cached model
half: bool = True # use FP16
extra_export_args: dict = field(default_factory=dict)
compute_units: Optional[str] = None # CoreML compute units: "cpu_and_ne", "all", etc.


BACKEND_SPECS = {
Expand All @@ -61,6 +62,7 @@ class BackendSpec:
model_suffix=".mlpackage",
half=True,
extra_export_args={"nms": False},
compute_units="cpu_and_ne", # Route to Neural Engine, leave GPU free for LLM/VLM
),
"intel": BackendSpec(
name="intel",
Expand All @@ -86,6 +88,7 @@ class HardwareEnv:
backend: str = "cpu" # "cuda" | "rocm" | "mps" | "intel" | "cpu"
device: str = "cpu" # torch device string
export_format: str = "onnx" # optimal export format
compute_units: str = "all" # CoreML compute units (Apple only)
gpu_name: str = "" # human-readable GPU name
gpu_memory_mb: int = 0 # GPU memory in MB
driver_version: str = "" # GPU driver version
Expand Down Expand Up @@ -113,9 +116,11 @@ def detect() -> "HardwareEnv":
else:
env._fallback_cpu()

# Set export format from backend spec
# Set export format and compute units from backend spec
spec = BACKEND_SPECS.get(env.backend, BACKEND_SPECS["cpu"])
env.export_format = spec.export_format
if spec.compute_units:
env.compute_units = spec.compute_units

# Check if optimized runtime is available
env.framework_ok = env._check_framework()
Expand Down Expand Up @@ -439,6 +444,58 @@ def export_model(self, model, model_name: str) -> Optional[Path]:

return None

def _load_coreml_with_compute_units(self, model_path: str):
"""
Load a CoreML model via YOLO with specific compute_units.

Monkey-patches coremltools.MLModel to inject compute_units
(e.g. CPU_AND_NE for Neural Engine) since ultralytics doesn't
expose this parameter. Patch is scoped and immediately restored.
"""
from ultralytics import YOLO

# Map string config → coremltools enum
_COMPUTE_UNIT_MAP = {
"all": "ALL",
"cpu_only": "CPU_ONLY",
"cpu_and_gpu": "CPU_AND_GPU",
"cpu_and_ne": "CPU_AND_NE",
}

ct_enum_name = _COMPUTE_UNIT_MAP.get(self.compute_units)
if not ct_enum_name:
_log(f"Unknown compute_units '{self.compute_units}', using default")
return YOLO(model_path)

try:
import coremltools as ct
target_units = getattr(ct.ComputeUnit, ct_enum_name, None)
if target_units is None:
_log(f"coremltools.ComputeUnit.{ct_enum_name} not available")
return YOLO(model_path)

# Temporarily patch MLModel to inject compute_units
_OrigMLModel = ct.models.MLModel

class _PatchedMLModel(_OrigMLModel):
def __init__(self, *args, **kwargs):
kwargs.setdefault('compute_units', target_units)
super().__init__(*args, **kwargs)

ct.models.MLModel = _PatchedMLModel
try:
model = YOLO(model_path)
finally:
ct.models.MLModel = _OrigMLModel # Always restore

_log(f"CoreML model loaded with compute_units={ct_enum_name} "
f"(Neural Engine preferred)")
return model

except ImportError:
_log("coremltools not available, loading without compute_units")
return YOLO(model_path)

def load_optimized(self, model_name: str, use_optimized: bool = True):
"""
Load the best available model for this hardware.
Expand All @@ -455,7 +512,12 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
optimized_path = self.get_optimized_path(model_name)
if optimized_path.exists():
try:
model = YOLO(str(optimized_path))
# On Apple Silicon: route CoreML to Neural Engine
if self.backend == "mps" and self.compute_units != "all":
model = self._load_coreml_with_compute_units(
str(optimized_path))
else:
model = YOLO(str(optimized_path))
self.load_ms = (time.perf_counter() - t0) * 1000
_log(f"Loaded {self.export_format} model ({self.load_ms:.0f}ms)")
return model, self.export_format
Expand All @@ -467,7 +529,12 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
exported = self.export_model(pt_model, model_name)
if exported:
try:
model = YOLO(str(exported))
# On Apple Silicon: route CoreML to Neural Engine
if self.backend == "mps" and self.compute_units != "all":
model = self._load_coreml_with_compute_units(
str(exported))
else:
model = YOLO(str(exported))
self.load_ms = (time.perf_counter() - t0) * 1000
_log(f"Loaded freshly exported {self.export_format} model ({self.load_ms:.0f}ms)")
return model, self.export_format
Expand Down Expand Up @@ -508,7 +575,7 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):

def to_dict(self) -> dict:
"""Serialize environment info for JSON output."""
return {
d = {
"backend": self.backend,
"device": self.device,
"export_format": self.export_format,
Expand All @@ -519,6 +586,9 @@ def to_dict(self) -> dict:
"export_ms": round(self.export_ms, 1),
"load_ms": round(self.load_ms, 1),
}
if self.backend == "mps":
d["compute_units"] = self.compute_units
return d


# ─── CLI: run standalone for diagnostics ─────────────────────────────────────
Expand Down
Loading
Loading