linebender · Shnatsel · May 19, 2026 · May 18, 2026 · May 18, 2026 · May 18, 2026
diff --git a/fearless_simd/examples/play.rs b/fearless_simd/examples/play.rs
@@ -30,26 +30,10 @@ fn foo<S: Simd>(simd: S, x: f32) -> f32 {
     simd.splat_f32x4(x).sqrt()[0]
 }
 
-// currently requires `safe_wrappers` feature
-fn do_something_on_neon(_level: Level) -> f32 {
-    #[cfg(all(feature = "safe_wrappers", target_arch = "aarch64"))]
-    if let Some(neon) = _level.as_neon() {
-        return neon.vectorize(
-            #[inline(always)]
-            || {
-                let v = neon.neon.vdupq_n_f32(42.0);
-                neon.neon.vgetq_lane_f32::<0>(v)
-            },
-        );
-    }
-    0.0
-}
-
 fn main() {
     let level = Level::new();
     let x = level.dispatch(Foo);
     let y = dispatch!(level, simd => foo(simd, 42.0));
-    let z = do_something_on_neon(level);
 
-    println!("level = {level:?}, x = {x}, y = {y}, z = {z}");
+    println!("level = {level:?}, x = {x}, y = {y}");
 }
diff --git a/fearless_simd/examples/srgb.rs b/fearless_simd/examples/srgb.rs
@@ -1,53 +1,68 @@
 // Copyright 2024 the Fearless_SIMD Authors
 // SPDX-License-Identifier: Apache-2.0 OR MIT
 
-#![expect(
-    missing_docs,
-    reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
-)]
+//! Converts a single RGBA pixel from linear RGB to sRGB.
+//!
+//! This example demonstrates the usual Fearless SIMD structure:
+//!
+//! - write the main computation as an `#[inline(always)]` function generic over
+//!   [`Simd`];
+//! - use [`dispatch!`] at the non-SIMD boundary to run it with the best
+//!   available target features;
+//! - drop down to [`kernel!`](fearless_simd::kernel) when a small part of the
+//!   computation needs a target-specific intrinsic.
+//!
+//! The RGB channels are converted with portable SIMD operations. The alpha
+//! channel is copied unchanged, using an architecture-specific lane-copy or
+//! blend intrinsic if one is available and a scalar fallback otherwise.
 
 use fearless_simd::{Level, dispatch, f32x4, prelude::*};
 
-// This block shows how to use safe wrappers for compile-time enforcement
-// of using valid SIMD intrinsics.
-#[cfg(feature = "safe_wrappers")]
-#[inline(always)]
-fn copy_alpha<S: Simd>(a: f32x4<S>, b: f32x4<S>) -> f32x4<S> {
-    // #[cfg(target_arch = "x86_64")]
-    // if let Some(avx2) = a.simd.level().as_avx2() {
-    //     return avx2
-    //         .sse4_1
-    //         ._mm_blend_ps::<8>(a.into(), b.into())
-    //         .simd_into(a.simd);
-    // }
-    #[cfg(target_arch = "aarch64")]
-    if let Some(neon) = a.simd.level().as_neon() {
-        return neon
-            .neon
-            .vcopyq_laneq_f32::<3, 3>(a.into(), b.into())
-            .simd_into(a.simd);
+#[cfg(target_arch = "aarch64")]
+use core::arch::aarch64::{float32x4_t, vcopyq_laneq_f32};
+#[cfg(target_arch = "x86")]
+use core::arch::x86::{__m128, _mm_blend_ps};
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::{__m128, _mm_blend_ps};
+
+fearless_simd::kernel! {
+    /// Copy the alpha lane on AArch64 using a NEON lane-copy intrinsic.
+    #[inline]
+    fn copy_alpha_neon(neon: Neon, a: float32x4_t, b: float32x4_t) -> float32x4_t {
+        vcopyq_laneq_f32::<3, 3>(a, b)
     }
-    let mut result = a;
-    result[3] = b[3];
-    result
 }
 
-// This block lets the example compile without safe wrappers.
-#[cfg(not(feature = "safe_wrappers"))]
+fearless_simd::kernel! {
+    /// Copy the alpha lane on x86 using the SSE4.2 token to enable SSE4.1 blend instructions.
+    #[inline]
+    fn copy_alpha_sse4_2(sse4_2: Sse4_2, a: __m128, b: __m128) -> __m128 {
+        _mm_blend_ps::<8>(a, b)
+    }
+}
+
+/// Return `a` with its alpha channel replaced by `b`'s alpha channel.
+///
+/// This helper shows how portable SIMD code can opportunistically call
+/// target-specific kernels while still providing a fallback for every backend.
 #[inline(always)]
 fn copy_alpha<S: Simd>(a: f32x4<S>, b: f32x4<S>) -> f32x4<S> {
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    if let Some(sse4_2) = a.simd.level().as_sse4_2() {
+        return copy_alpha_sse4_2(sse4_2, a.into(), b.into()).simd_into(a.simd);
+    }
+
     #[cfg(target_arch = "aarch64")]
-    if let Some(_neon) = a.simd.level().as_neon() {
-        unsafe {
-            return core::arch::aarch64::vcopyq_laneq_f32::<3, 3>(a.into(), b.into())
-                .simd_into(a.simd);
-        }
+    if let Some(neon) = a.simd.level().as_neon() {
+        return copy_alpha_neon(neon, a.into(), b.into()).simd_into(a.simd);
     }
+
     let mut result = a;
     result[3] = b[3];
     result
 }
 
+/// Approximate the linear-RGB to sRGB transfer curve for RGB, preserving alpha.
 #[inline(always)]
 fn to_srgb<S: Simd>(simd: S, rgba: [f32; 4]) -> [f32; 4] {
     let v: f32x4<S> = rgba.simd_into(simd);