Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .envrc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
use flake
45 changes: 24 additions & 21 deletions .github/workflows/container_images.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,16 +31,19 @@ jobs:
arch: arm64
variance:
- name: Ubuntu-24.04/CUDA-12.8.1
image: "rust-gpu/rust-cuda-ubuntu24-cuda12"
image: "rust-cuda-ubuntu24-cuda12"
dockerfile: ./container/ubuntu24-cuda12/Dockerfile
- name: Ubuntu-24.04/CUDA-13.0.2
image: "rust-gpu/rust-cuda-ubuntu24-cuda13"
image: "rust-cuda-ubuntu24-cuda13"
dockerfile: ./container/ubuntu24-cuda13/Dockerfile
- name: Ubuntu-24.04/CUDA-13.2.1/LLVM-19.1.7
image: "rust-cuda-ubuntu24-cuda13-llvm19"
dockerfile: ./container/ubuntu24-cuda13-llvm19/Dockerfile
- name: RockyLinux-9/CUDA-12.8.1
image: "rust-gpu/rust-cuda-rockylinux9-cuda12"
image: "rust-cuda-rockylinux9-cuda12"
dockerfile: ./container/rockylinux9-cuda12/Dockerfile
- name: RockyLinux-9/CUDA-13.0.2
image: "rust-gpu/rust-cuda-rockylinux9-cuda13"
image: "rust-cuda-rockylinux9-cuda13"
dockerfile: ./container/rockylinux9-cuda13/Dockerfile
steps:
- name: Free up space
Expand Down Expand Up @@ -86,6 +89,8 @@ jobs:
df -h
- name: Checkout repository
uses: actions/checkout@v4
- name: Set lowercase repo owner
run: echo "REPO_OWNER=$(echo ${{ github.repository_owner }} | tr A-Z a-z)" >> $GITHUB_ENV
- name: Validate platform
run: |
ARCH=$(uname -m)
Expand All @@ -108,7 +113,7 @@ jobs:
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ matrix.variance.image }}
images: ${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ matrix.variance.image }}
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Build and push by digest
Expand All @@ -119,15 +124,12 @@ jobs:
file: ${{ matrix.variance.dockerfile }}
platforms: linux/${{ matrix.platform.arch }}
labels: ${{ steps.meta.outputs.labels }}
outputs: type=image,name=${{ env.REGISTRY }}/${{ matrix.variance.image }},push-by-digest=true,name-canonical=true,push=${{ github.event_name != 'pull_request' }}
outputs: type=image,name=${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ matrix.variance.image }},push-by-digest=true,name-canonical=true,push=${{ github.event_name != 'pull_request' }}
cache-from: type=gha
cache-to: type=gha,mode=max
- name: Set artifact name
if: github.event_name != 'pull_request'
run: |
ARTIFACT_NAME="${{ matrix.variance.image }}"
ARTIFACT_NAME="${ARTIFACT_NAME#*/}" # Remove everything before and including the slash
echo "ARTIFACT_NAME=$ARTIFACT_NAME" >> $GITHUB_ENV
run: echo "ARTIFACT_NAME=${{ matrix.variance.image }}" >> $GITHUB_ENV
- name: Export digest
if: github.event_name != 'pull_request'
run: |
Expand Down Expand Up @@ -158,19 +160,20 @@ jobs:
matrix:
variance:
- name: Ubuntu-24.04/CUDA-12.8.1
image: "rust-gpu/rust-cuda-ubuntu24-cuda12"
image: "rust-cuda-ubuntu24-cuda12"
- name: Ubuntu-24.04/CUDA-13.0.2
image: "rust-gpu/rust-cuda-ubuntu24-cuda13"
image: "rust-cuda-ubuntu24-cuda13"
- name: Ubuntu-24.04/CUDA-13.2.1/LLVM-19.1.7
image: "rust-cuda-ubuntu24-cuda13-llvm19"
- name: RockyLinux-9/CUDA-12.8.1
image: "rust-gpu/rust-cuda-rockylinux9-cuda12"
image: "rust-cuda-rockylinux9-cuda12"
- name: RockyLinux-9/CUDA-13.0.2
image: "rust-gpu/rust-cuda-rockylinux9-cuda13"
image: "rust-cuda-rockylinux9-cuda13"
steps:
- name: Set lowercase repo owner
run: echo "REPO_OWNER=$(echo ${{ github.repository_owner }} | tr A-Z a-z)" >> $GITHUB_ENV
- name: Set artifact name
run: |
ARTIFACT_NAME="${{ matrix.variance.image }}"
ARTIFACT_NAME="${ARTIFACT_NAME#*/}" # Remove everything before and including the slash
echo "ARTIFACT_NAME=$ARTIFACT_NAME" >> $GITHUB_ENV
run: echo "ARTIFACT_NAME=${{ matrix.variance.image }}" >> $GITHUB_ENV
- name: Download digests
uses: actions/download-artifact@v4
with:
Expand All @@ -183,7 +186,7 @@ jobs:
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ matrix.variance.image }}
images: ${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ matrix.variance.image }}
tags: |
type=ref,event=branch
type=ref,event=pr
Expand All @@ -202,7 +205,7 @@ jobs:
working-directory: /tmp/digests
run: |
docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
$(printf '${{ env.REGISTRY }}/${{ matrix.variance.image }}@sha256:%s ' *)
$(printf '${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ matrix.variance.image }}@sha256:%s ' *)
- name: Inspect image
run: |
docker buildx imagetools inspect ${{ env.REGISTRY }}/${{ matrix.variance.image }}:${{ steps.meta.outputs.version }}
docker buildx imagetools inspect ${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ matrix.variance.image }}:${{ steps.meta.outputs.version }}
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ book
rustc-ice-*.txt
.nix-driver-libs
.claude
.direnv
99 changes: 99 additions & 0 deletions container/ubuntu24-cuda13-llvm19/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Stage 1 (llvm-builder): compile LLVM 19.1.7 from source and install it under /opt/llvm-19.
FROM nvcr.io/nvidia/cuda:13.2.1-cudnn-devel-ubuntu24.04 AS llvm-builder

# Install the compiler toolchain, CMake/Ninja and the development libraries used by the
# LLVM build. The apt package lists are removed in the same RUN so they do not persist
# in the resulting layer.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -qq -y install \
build-essential \
clang \
curl \
libffi-dev \
libedit-dev \
libncurses5-dev \
libssl-dev \
libtinfo-dev \
libxml2-dev \
cmake \
ninja-build \
pkg-config \
python3 \
xz-utils \
zlib1g-dev && \
rm -rf /var/lib/apt/lists/*

# Scratch directory for the download and build tree.
WORKDIR /data/llvm19

# Download and build LLVM 19.1.7 (the active LLVM 19 pin used by `rustc_codegen_nvvm`).
# LLVM 8+ ships as a monorepo tarball; cmake source root is the `llvm/` subdir.
#
# Build configuration, as set by the flags below:
#   - Targets: NVPTX plus the host backend, chosen from `dpkg --print-architecture`
#     (X86 when it reports amd64, AArch64 otherwise).
#   - Release build with assertions off; bindings, examples, tests and benchmarks excluded.
#   - A shared libLLVM is built and the LLVM tools link against it
#     (LLVM_BUILD_LLVM_DYLIB / LLVM_LINK_LLVM_DYLIB).
#   - Installed to /opt/llvm-19; the tarball and source tree are deleted at the end of
#     this same RUN.
# NOTE(review): LLVM_ENABLE_TERMINFO may no longer be a recognised option in LLVM 19's
# CMake; confirm against the LLVM 19 release notes before relying on it.
RUN curl -sSf -L -O https://github.com/llvm/llvm-project/releases/download/llvmorg-19.1.7/llvm-project-19.1.7.src.tar.xz && \
tar -xf llvm-project-19.1.7.src.tar.xz && \
cd llvm-project-19.1.7.src && \
mkdir build && cd build && \
ARCH=$(dpkg --print-architecture) && \
if [ "$ARCH" = "amd64" ]; then \
TARGETS="X86;NVPTX"; \
else \
TARGETS="AArch64;NVPTX"; \
fi && \
cmake -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DLLVM_TARGETS_TO_BUILD="$TARGETS" \
-DLLVM_BUILD_LLVM_DYLIB=ON \
-DLLVM_LINK_LLVM_DYLIB=ON \
-DLLVM_ENABLE_ASSERTIONS=OFF \
-DLLVM_ENABLE_BINDINGS=OFF \
-DLLVM_INCLUDE_EXAMPLES=OFF \
-DLLVM_INCLUDE_TESTS=OFF \
-DLLVM_INCLUDE_BENCHMARKS=OFF \
-DLLVM_ENABLE_ZLIB=ON \
-DLLVM_ENABLE_TERMINFO=ON \
-DCMAKE_INSTALL_PREFIX=/opt/llvm-19 \
../llvm && \
ninja -j$(nproc) && \
ninja install && \
cd ../.. && \
rm -rf llvm-project-19.1.7.src*

# Final image: CUDA 13.2.1 devel base, the LLVM 19 install copied from the `llvm-builder`
# stage, and a rustup-managed Rust toolchain.
FROM nvcr.io/nvidia/cuda:13.2.1-cudnn-devel-ubuntu24.04

# Build tools plus fontconfig and X11 development headers.
# NOTE(review): the fontconfig/X11 packages are presumably for windowed example crates;
# confirm they are still required in this image.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -qq -y install \
build-essential \
clang \
curl \
libssl-dev \
libtinfo-dev \
pkg-config \
xz-utils \
zlib1g-dev \
cmake \
libfontconfig-dev \
libx11-xcb-dev \
libxcursor-dev \
libxi-dev \
libxinerama-dev \
libxrandr-dev && \
rm -rf /var/lib/apt/lists/*

# Bring in the LLVM 19 install tree and expose `llvm-config` (unversioned and `-19`)
# and `llvm-as-19` under /usr/bin via symlinks.
COPY --from=llvm-builder /opt/llvm-19 /opt/llvm-19
RUN ln -s /opt/llvm-19/bin/llvm-config /usr/bin/llvm-config && \
ln -s /opt/llvm-19/bin/llvm-config /usr/bin/llvm-config-19 && \
ln -s /opt/llvm-19/bin/llvm-as /usr/bin/llvm-as-19

# Get Rust (install rustup; toolchain installed from rust-toolchain.toml below)
RUN curl -sSf -L https://sh.rustup.rs | bash -s -- -y --profile minimal --default-toolchain none
# rustup runs as root here, so cargo/rustc shims live under /root/.cargo/bin.
ENV PATH="/root/.cargo/bin:${PATH}"

# Setup the workspace
WORKDIR /data/rust-cuda
# The bind mount makes rust-toolchain.toml from the build context visible for this step
# only (requires BuildKit); `rustup show` is run in that directory so the pinned
# toolchain gets installed.
RUN --mount=type=bind,source=rust-toolchain.toml,target=/data/rust-cuda/rust-toolchain.toml \
rustup show

# Add nvvm + LLVM 19 dylib to the runtime linker path.
ENV LD_LIBRARY_PATH="/opt/llvm-19/lib:/usr/local/cuda/nvvm/lib64:${LD_LIBRARY_PATH}"

# `rustc_codegen_nvvm`'s build.rs probes `LLVM_CONFIG_19` to locate the LLVM 19
# toolchain when the `llvm19` cargo feature is on. The feature itself is gated;
# downstream crates that depend on `cuda_builder` must build with
# `--features llvm19` for this to take effect — that propagates through to
# `nvvm/llvm19` (default `NvvmArch` = Blackwell) and `rustc_codegen_nvvm/llvm19`
# (LLVM 19 codegen path) per crates/cuda_builder/Cargo.toml.
ENV LLVM_CONFIG_19=/opt/llvm-19/bin/llvm-config
# NOTE(review): LLVM_LINK_STATIC=1 is set although the builder stage configures a shared
# libLLVM (LLVM_BUILD_LLVM_DYLIB=ON); confirm how the consuming build script interprets it.
ENV LLVM_LINK_STATIC=1
# Default log filter for tools that read RUST_LOG.
ENV RUST_LOG=info
35 changes: 18 additions & 17 deletions crates/cuda_std/src/warp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -313,20 +313,20 @@ unsafe fn match_any_64(mask: u32, value: u64) -> u32 {
#[inline(always)]
unsafe fn match_all_32(mask: u32, value: u32) -> (u32, bool) {
unsafe extern "C" {
#[allow(improper_ctypes)]
fn __nvvm_warp_match_all_32(mask: u32, value: u32) -> (u32, bool);
// see libintrinsics.ll — packs (value, predicate) into i64
fn __nvvm_warp_match_all_32(mask: u32, value: u32) -> u64;
}
unsafe { __nvvm_warp_match_all_32(mask, value) }
unpack_warp_result(unsafe { __nvvm_warp_match_all_32(mask, value) })
}

#[gpu_only]
#[inline(always)]
unsafe fn match_all_64(mask: u32, value: u64) -> (u32, bool) {
unsafe extern "C" {
#[allow(improper_ctypes)]
fn __nvvm_warp_match_all_64(mask: u32, value: u64) -> (u32, bool);
// see libintrinsics.ll — packs (value, predicate) into i64
fn __nvvm_warp_match_all_64(mask: u32, value: u64) -> u64;
}
unsafe { __nvvm_warp_match_all_64(mask, value) }
unpack_warp_result(unsafe { __nvvm_warp_match_all_64(mask, value) })
}

/// Synchronizes a subset of threads in a warp then performs a reduce-and-broadcast
Expand Down Expand Up @@ -741,14 +741,16 @@ pub enum WarpShuffleMode {
Xor = 3,
}

// C-compatible struct to match LLVM IR's {i32, i8} return type
// This fixes an ABI mismatch where Rust would represent (u32, bool) as [2 x i32]
// but the LLVM intrinsic returns {i32, i8} (a struct, not an array)
#[doc(hidden)]
#[repr(C)]
pub struct WarpShuffleResult {
value: u32,
predicate: u8,
// Splits the packed `i64` returned by the libintrinsics.ll wrappers back into
// `(value, predicate)`.
//
// Layout of `packed`: bits 0..=31 hold the value, bit 32 holds the predicate;
// any higher bits are ignored. The wrappers return a plain integer instead of
// a small aggregate because rustc attaches `align N` to an aggregate call's
// return value, and LLVM 19's verifier rejects that attribute on non-pointer
// returns.
//
// `dead_code` is allowed because every caller is `#[gpu_only]`, which leaves
// this helper unused on host targets.
#[allow(dead_code)]
#[inline(always)]
fn unpack_warp_result(packed: u64) -> (u32, bool) {
    // Truncation is intentional: only the low 32 bits carry the value.
    let value = packed as u32;
    let predicate = (packed >> 32) & 1 == 1;
    (value, predicate)
}

#[gpu_only]
Expand All @@ -761,8 +763,7 @@ unsafe fn warp_shuffle_32(
) -> (u32, bool) {
unsafe extern "C" {
// see libintrinsics.ll
// Returns {i32, i8} in LLVM IR, which maps to our WarpShuffleResult struct
fn __nvvm_warp_shuffle(mask: u32, mode: u32, a: u32, b: u32, c: u32) -> WarpShuffleResult;
fn __nvvm_warp_shuffle(mask: u32, mode: u32, a: u32, b: u32, c: u32) -> u64;
}

assert!(
Expand All @@ -776,7 +777,7 @@ unsafe fn warp_shuffle_32(
c |= (32 - width) << 8;

let result = unsafe { __nvvm_warp_shuffle(mask, mode as u32, value, b, c) };
(result.value, result.predicate != 0)
unpack_warp_result(result)
}

unsafe fn warp_shuffle_128(
Expand Down
Loading
Loading