5 changes: 4 additions & 1 deletion .github/scripts/build-cuda.sh
@@ -12,13 +12,16 @@ elif [ "${build_arch}" = "aarch64" ]; then
build_capability="75;80;90"

# CUDA 12.8+: Add sm100/sm120
[[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* ]] && build_capability="75;80;90;100;120"
[[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* || "${cuda_version}" == 13.*.* ]] && build_capability="75;80;90;100;120"
else
# By default, target Pascal through Hopper.
build_capability="60;70;75;80;86;89;90"

# CUDA 12.8+: Add sm100 and sm120; remove < sm70 to align with PyTorch 2.8+cu128 minimum
[[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* ]] && build_capability="70;75;80;86;89;90;100;120"

# CUDA 13.0+: Remove < sm75 to align with PyTorch 2.9+cu130 minimum
[[ "${cuda_version}" == 13.*.* ]] && build_capability="75;80;86;89;90;100;120"
fi

[[ "${build_os}" = windows-* ]] && python3 -m pip install ninja
15 changes: 8 additions & 7 deletions .github/workflows/python-package.yml
@@ -72,21 +72,22 @@ jobs:
- os: windows-latest
arch: x86_64
cuda_version:
["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1", "12.9.1"]
["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1", "12.9.1", "13.0.1"]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
# Windows: We install CUDA on the agent (slow)
- uses: Jimver/cuda-toolkit@c35baa1a18fd1fc9dcf47c5bd839bf30559c0bc3 # v0.2.24
#- uses: Jimver/cuda-toolkit@433d453c1fa37d10a3254452fa8e284441c9192d # v0.2.27
- uses: N-Storm/cuda-toolkit@d68ba29a800229200a2c3f572f9e816d7f67cdb4 # v0.2.24m
if: startsWith(matrix.os, 'windows')
id: cuda-toolkit
with:
# Temporary: Use CUDA 12.9.0 for Windows until 12.9.1 is supported with this action.
cuda: ${{ matrix.cuda_version == '12.9.1' && '12.9.0' || matrix.cuda_version }}
method: "network"
sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]'
linux-local-args: '["--toolkit"]'
# Temporary: Use CUDA 13.0.0 for Windows until 13.0.1 is supported with this action.
cuda: ${{ matrix.cuda_version == '13.0.1' && '13.0.0' || matrix.cuda_version }}
method: "local"
use-github-cache: false
use-local-cache: false
log-file-suffix: ${{matrix.os}}-${{matrix.cuda_version}}.txt
- name: Setup MSVC
if: startsWith(matrix.os, 'windows')
uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl
29 changes: 19 additions & 10 deletions .github/workflows/tests.yml
@@ -1,4 +1,4 @@
name: Unit tests
name: Nightly Tests

on:
workflow_dispatch:
@@ -49,6 +49,7 @@ jobs:
build-cuda:
strategy:
matrix:
# TODO: Add 13.0.1 when we have runners with new enough drivers.
cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"]
os: [ubuntu-22.04, ubuntu-22.04-arm]
include:
@@ -111,7 +112,7 @@ jobs:
arch: aarch64
- os: ubuntu-22.04-arm
arch: aarch64
torch_version: "2.5.1"
torch_version: "2.5.1" # Higher minimum requirement for aarch64
- os: windows-2025
arch: x86_64
- os: macos-15
@@ -136,7 +137,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: 3.9
python-version: '3.10'

- name: Setup MSVC
if: startsWith(matrix.os, 'windows')
@@ -182,7 +183,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: 3.9
python-version: '3.10'

- name: Install dependencies
run: |
@@ -313,7 +314,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: 3.9
python-version: '3.10'

- name: Install PyTorch
run: pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/xpu
@@ -343,20 +344,26 @@ jobs:
os: [ubuntu-22.04, windows-2025]
arch: [x86_64]
gpu: [T4, L40S]
cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"]
cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"] #, "13.0.1"]
include:
- cuda_version: "11.8.0"
torch_version: "2.3.1"
pypi_index: "https://download.pytorch.org/whl/cu118"
- cuda_version: "12.6.3"
torch_version: "2.6.0"
pypi_index: "https://download.pytorch.org/whl/cu126"
- cuda_version: "12.8.1"
torch_version: "2.7.1"
pypi_index: "https://download.pytorch.org/whl/cu128"
- cuda_version: "12.9.1"
torch_version: "2.8.0"
pypi_index: "https://download.pytorch.org/whl/cu129"
- cuda_version: "12.8.1"
torch_version: "2.9.0"
pypi_index: "https://download.pytorch.org/whl/test/cu128"

# Note: Currently our runners do not have new enough drivers for CUDA 13.
# Add this when supported.
# - cuda_version: "13.0.1"
# torch_version: "2.9.0"
# pypi_index: "https://download.pytorch.org/whl/test/cu130"
# Linux L40S runners
@@ -395,6 +402,8 @@ jobs:
exclude:
# Our current T4 Windows runner has a driver too old (471.11)
# and cannot support CUDA 12+. Skip for now.
- os: windows-2025
cuda_version: "13.0.1"
- os: windows-2025
cuda_version: "12.9.1"
- os: windows-2025
@@ -424,7 +433,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: 3.9
python-version: '3.10'

- name: Install dependencies
run: |
44 changes: 25 additions & 19 deletions CMakeLists.txt
@@ -113,30 +113,36 @@ if(BUILD_CUDA)
)
endif()

if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.4")
message(FATAL_ERROR "CUDA Version < 11.4 is not supported")
elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
message(FATAL_ERROR "CUDA Version > 12 is not supported")
if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.8")
message(FATAL_ERROR "CUDA Version < 11.8 is not supported")
elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "14.0")
message(FATAL_ERROR "CUDA Version > 13 is not supported")
endif()

# CMake < 3.23.0 does not define CMAKE_CUDA_ARCHITECTURES_ALL.
if(CMAKE_VERSION VERSION_LESS "3.23.0")
message(STATUS "CMake < 3.23.0; determining CUDA architectures supported...")

# 11.4+ supports these at a minimum.
set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80 86 87)
set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80)

# CUDA 11.8 adds support for Ada and Hopper.
if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8")
list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 89 90)
list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 90)
endif()

# CUDA 12.8 adds support for Blackwell.
if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.8")
list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 100 101 120)
list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 100 120)
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
# Starting in CUDA 13.0, Thor Blackwell is renamed to SM110.
# Support for architectures older than Turing (SM75) is removed.
list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 75 80 86 87 88 89 90 100 103 110 120 121)
list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 80 90 100 110 120)
else()
# 11.8-12.9 supports these at a minimum.
set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80 86 87 89 90)
set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80 90)

# CUDA 12.8 adds support for Blackwell.
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.8")
list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 100 101 120 121)
list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 100 120)
endif()

# CUDA 12.9 adds SM103 (Blackwell B300).
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.9")
list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 103)
endif()
endif()
endif()

@@ -252,7 +258,7 @@ endif()

set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX)
add_library(bitsandbytes SHARED ${SRC_FILES})
target_compile_features(bitsandbytes PUBLIC cxx_std_14)
target_compile_features(bitsandbytes PUBLIC cxx_std_17)
target_include_directories(bitsandbytes PUBLIC csrc include)


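For context on the architecture lists above: the right entries only matter if they cover the GPU you actually run on. A minimal sketch (not part of this PR; names are illustrative) that prints each visible device's compute capability for comparison against the `sm` lists:

```cpp
#include <cstdio>
#include <cuda_runtime_api.h>

// Print each visible GPU's compute capability (e.g. 9.0 -> sm90).
int main() {
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess)
        return 1;
    for (int i = 0; i < count; ++i) {
        cudaDeviceProp prop{};
        cudaGetDeviceProperties(&prop, i);
        std::printf("GPU %d: %s -> sm%d%d\n", i, prop.name, prop.major, prop.minor);
    }
    return 0;
}
```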
32 changes: 20 additions & 12 deletions csrc/kernels.cu
@@ -16,6 +16,14 @@
#include <math_constants.h>
#include <mma.h>

#if CCCL_VERSION >= 2008002
#include <cuda/std/functional>
#define CUB_REDUCTIONOP_MAX \
cuda::maximum<> {}
#else
#define CUB_REDUCTIONOP_MAX cub::Max()
#endif

#define HLF_MAX 65504
#define TH 1024
#define NUM 4
@@ -365,7 +373,7 @@ __global__ void kQuantizeBlockwise(
for (int j = 0; j < NUM_PER_TH; j++)
local_abs_max = fmaxf(local_abs_max, fabsf((float)vals[j]));

local_abs_max = BlockReduce(reduce).Reduce(local_abs_max, cub::Max(), valid_items);
local_abs_max = BlockReduce(reduce).Reduce(local_abs_max, CUB_REDUCTIONOP_MAX, valid_items);

if (threadIdx.x == 0) {
smem_absmax_value[0] = 1.0f / local_abs_max;
@@ -951,12 +959,12 @@ __global__ void __launch_bounds__(NUM_THREADS, 2) kPreconditionOptimizerStatic8b
}

__syncthreads();
local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, cub::Max(), valid_items);
local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, CUB_REDUCTIONOP_MAX, valid_items);
__syncthreads();
local_max_s2 = BlockReduce(temp_storage.reduce).Reduce(local_max_s2, cub::Max(), valid_items);
local_max_s2 = BlockReduce(temp_storage.reduce).Reduce(local_max_s2, CUB_REDUCTIONOP_MAX, valid_items);
if (unorm != NULL) {
__syncthreads();
local_unorm = BlockReduce(temp_storage.reduce).Reduce(local_unorm, cub::Sum(), valid_items);
local_unorm = BlockReduce(temp_storage.reduce).Sum(local_unorm, valid_items);
}

if (threadIdx.x == 0) {
@@ -1162,13 +1170,13 @@ __global__ void __launch_bounds__(NUM_THREADS, 2) kPreconditionOptimizerStatic8b
}

__syncthreads();
local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, cub::Max(), valid_items);
local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, CUB_REDUCTIONOP_MAX, valid_items);
if (threadIdx.x == 0) {
atomicMax(&new_max1[0], local_max_s1);
}
if (unorm != NULL) {
__syncthreads();
local_unorm = BlockReduce(temp_storage.reduce).Reduce(local_unorm, cub::Sum(), valid_items);
local_unorm = BlockReduce(temp_storage.reduce).Sum(local_unorm, valid_items);
if (threadIdx.x == 0) {
atomicAdd(&unorm[0], local_unorm);
}
@@ -1473,11 +1481,11 @@ __launch_bounds__(256, 3) __global__ void kOptimizerStatic8bit2StateBlockwise(
}

// reduce: 2.51/1.60 -> 2.67/1.69
new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, cub::Max());
new_local_abs_max2 = BlockReduce2(reduce2).Reduce(new_local_abs_max2, cub::Max());
new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, CUB_REDUCTIONOP_MAX);
new_local_abs_max2 = BlockReduce2(reduce2).Reduce(new_local_abs_max2, CUB_REDUCTIONOP_MAX);

if (OPTIMIZER == ADEMAMIX) {
new_local_abs_max3 = BlockReduce3(reduce3).Reduce(new_local_abs_max3, cub::Max());
new_local_abs_max3 = BlockReduce3(reduce3).Reduce(new_local_abs_max3, CUB_REDUCTIONOP_MAX);
}

if (threadIdx.x == 0) {
@@ -1686,7 +1694,7 @@ __launch_bounds__(256, 3) __global__ void kOptimizerStatic8bit1StateBlockwise(
}

// reduce: 2.51/1.60 -> 2.67/1.69
new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, cub::Max());
new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, CUB_REDUCTIONOP_MAX);

if (threadIdx.x == 0)
smem_exchange1[0] = new_local_abs_max1;
@@ -1792,7 +1800,7 @@ __launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024) __global__
}

// Reduce thread-local absmax across the block.
const TReduction row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, cub::Max(), cols);
const TReduction row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, CUB_REDUCTIONOP_MAX, cols);
if (threadIdx.x == 0) {
// Save our block's absmax to shared memory for the quantization step.
rowStats[row_id] = smem_row_absmax = row_absmax;
@@ -1847,7 +1855,7 @@

// Reduce thread-local absmax across the block.
// TODO: Consider algorithm BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY
const float row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, cub::Max(), cols);
const float row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, CUB_REDUCTIONOP_MAX, cols);
if (threadIdx.x == 0) {
// Save our block's absmax to shared memory for the quantization step.
rowStats[row_id] = row_absmax;
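For context on the `CUB_REDUCTIONOP_MAX` changes throughout this file: the macro selects `cuda::maximum<> {}` (from `<cuda/std/functional>`) when CCCL is new enough (≥ 2.8.2) and falls back to `cub::Max()` otherwise, presumably because the old `cub::Max()`/`cub::Sum()` functors are gone in the CCCL that ships with CUDA 13; the sum reductions instead switch to `BlockReduce`'s built-in `.Sum()` member. A minimal self-contained sketch of the pattern (the kernel and block size are hypothetical, not from this PR):

```cpp
#include <cub/block/block_reduce.cuh>

#if CCCL_VERSION >= 2008002
#include <cuda/std/functional>
#define CUB_REDUCTIONOP_MAX cuda::maximum<> {}
#else
#define CUB_REDUCTIONOP_MAX cub::Max()
#endif

// Hypothetical kernel: each of 256 threads contributes one value;
// thread 0 of each block receives the block-wide maximum.
__global__ void block_max_example(const float* in, float* out) {
    using BlockReduce = cub::BlockReduce<float, 256>;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    float thread_val = in[blockIdx.x * blockDim.x + threadIdx.x];
    float block_max = BlockReduce(temp_storage).Reduce(thread_val, CUB_REDUCTIONOP_MAX);

    if (threadIdx.x == 0)
        out[blockIdx.x] = block_max;
}
```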
9 changes: 9 additions & 0 deletions csrc/pythonInterface.cpp
@@ -4,6 +4,7 @@
// LICENSE file in the root directory of this source tree.

#if BUILD_CUDA
#include <cuda_runtime_api.h>
#include <ops.cuh>
#endif
#if BUILD_HIP
@@ -710,7 +711,15 @@ void cprefetch(void* ptr, size_t bytes, int device) {
if (hasPrefetch == 0)
return;

#if CUDART_VERSION >= 13000
cudaMemLocation loc{};
loc.type = cudaMemLocationTypeDevice;
loc.id = device;
CUDA_CHECK_RETURN(cudaMemPrefetchAsync(ptr, bytes, loc, 0u, 0));
#else
CUDA_CHECK_RETURN(cudaMemPrefetchAsync(ptr, bytes, device, 0));
#endif

CUDA_CHECK_RETURN(cudaPeekAtLastError());
}

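For context on the `cprefetch` change: CUDA 13 drops the legacy `cudaMemPrefetchAsync(ptr, count, dstDevice, stream)` overload; the destination is now described by a `cudaMemLocation` struct plus a flags word, as the guarded call above shows. A minimal standalone sketch of the same pattern (helper name is illustrative, not from this PR):

```cpp
#include <cuda_runtime_api.h>

// Prefetch a managed allocation to `device`, handling both API generations.
cudaError_t prefetch_to_device(void* ptr, size_t bytes, int device) {
#if CUDART_VERSION >= 13000
    cudaMemLocation loc{};                  // zero-initialize the location descriptor
    loc.type = cudaMemLocationTypeDevice;   // target is a device, not a host NUMA node
    loc.id = device;
    return cudaMemPrefetchAsync(ptr, bytes, loc, /*flags=*/0u, /*stream=*/0);
#else
    return cudaMemPrefetchAsync(ptr, bytes, device, /*stream=*/0);
#endif
}
```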
4 changes: 3 additions & 1 deletion docs/source/installation.mdx
@@ -40,10 +40,12 @@ The currently distributed `bitsandbytes` packages are built with the following c
|--------------------|------------------|----------------------|--------------
| **Linux x86-64** | 11.8 - 12.6 | GCC 11.2 | sm60, sm70, sm75, sm80, sm86, sm89, sm90
| **Linux x86-64** | 12.8 - 12.9 | GCC 11.2 | sm70, sm75, sm80, sm86, sm89, sm90, sm100, sm120
| **Linux x86-64** | 13.0 | GCC 11.2 | sm75, sm80, sm86, sm89, sm90, sm100, sm120
| **Linux aarch64** | 11.8 - 12.6 | GCC 11.2 | sm75, sm80, sm90
| **Linux aarch64** | 12.8 - 12.9 | GCC 11.2 | sm75, sm80, sm90, sm100, sm120
| **Linux aarch64** | 12.8 - 13.0 | GCC 11.2 | sm75, sm80, sm90, sm100, sm120
| **Windows x86-64** | 11.8 - 12.6 | MSVC 19.43+ (VS2022) | sm50, sm60, sm75, sm80, sm86, sm89, sm90
| **Windows x86-64** | 12.8 - 12.9 | MSVC 19.43+ (VS2022) | sm70, sm75, sm80, sm86, sm89, sm90, sm100, sm120
| **Windows x86-64** | 13.0 | MSVC 19.43+ (VS2022) | sm75, sm80, sm86, sm89, sm90, sm100, sm120

Use `pip` or `uv` to install:
