2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
@@ -111,7 +111,7 @@ jobs:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
# The specific Python version is irrelevant in this context as we are only packaging non-C extension
# code. This ensures compatibility across Python versions, including Python 3.8, as compatibility is
# code. This ensures compatibility across Python versions, including Python 3.9, as compatibility is
# dictated by the packaged code itself, not the Python version used for packaging.
python-version: ["3.10"]
arch: [x86_64, aarch64]
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.9
rev: v0.11.2
hooks:
- id: ruff
args:
2 changes: 1 addition & 1 deletion benchmarking/int8/int8_benchmark.py
@@ -65,4 +65,4 @@
print("=" * 40)
print(f"Example:\n{tokenizer.decode(generated_ids[0])}")
print("=" * 40)
print(f"Speed: {num/(time.time() - time_1)}token/s")
print(f"Speed: {num / (time.time() - time_1)}token/s")
18 changes: 10 additions & 8 deletions benchmarking/matmul_benchmark.py
@@ -66,7 +66,7 @@ def test_bench_matmul(batch, seq, model, hidden):
torch.matmul(A, B.t())
torch.cuda.synchronize()
print(
f"pytorch fp16: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s",
f"pytorch fp16: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s",
)

# torch.cuda.synchronize()
@@ -88,22 +88,24 @@ def test_bench_matmul(batch, seq, model, hidden):
for i in range(iters):
bnb.matmul_4bit(A, B_nf4.t(), quant_state=state_nf4)
torch.cuda.synchronize()
print(f"bnb nf4: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
print(f"bnb nf4: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s")

torch.cuda.synchronize()
t0 = time.time()
for i in range(iters):
bnb.matmul_4bit(A, B_nf4_c.t(), quant_state=state_nf4_c)
torch.cuda.synchronize()
print(f"bnb nf4+DQ: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
print(
f"bnb nf4+DQ: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
)

torch.cuda.synchronize()
t0 = time.time()
for i in range(iters):
bnb.matmul(A, B)
torch.cuda.synchronize()
print(
f"B -> CB (each iteration): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
f"B -> CB (each iteration): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
)

torch.cuda.synchronize()
@@ -112,7 +114,7 @@ def test_bench_matmul(batch, seq, model, hidden):
bnb.matmul(A, B, threshold=6.0)
torch.cuda.synchronize()
print(
f"B -> CB + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
f"B -> CB + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
)

CA, SCA, _ = F.int8_vectorwise_quant(A, threshold=0.0)
@@ -124,7 +126,7 @@ def test_bench_matmul(batch, seq, model, hidden):
out32 = F.int8_linear_matmul(CA, CB)
torch.cuda.synchronize()
print(
f"no overhead int8 [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
f"no overhead int8 [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
)

# C32A, SA = F.transform(CA, "col32")
@@ -183,7 +185,7 @@ def test_bench_matmul(batch, seq, model, hidden):
linear8bit(A)
torch.cuda.synchronize()
print(
f"bnb linear8bitlt (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
f"bnb linear8bitlt (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
)

linearMixedBit(A)
@@ -193,7 +195,7 @@ def test_bench_matmul(batch, seq, model, hidden):
linearMixedBit(A)
torch.cuda.synchronize()
print(
f"bnb linear8bitlt with threshold (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
f"bnb linear8bitlt with threshold (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time() - t0:.4f}s"
)

# linear8bit_train(A)
9 changes: 5 additions & 4 deletions bitsandbytes/_ops.py
@@ -1,5 +1,6 @@
from collections.abc import Sequence
from math import prod
from typing import Optional, Sequence, Tuple
from typing import Optional

import torch

@@ -131,7 +132,7 @@ def _(
def _(
A: torch.Tensor,
threshold=0.0,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
out_row = torch.empty_like(A, dtype=torch.int8)
out_col = torch.empty_like(A, dtype=torch.int8)
row_stats = torch.empty(prod(A.shape[:-1]), device=A.device, dtype=torch.float32)
@@ -191,7 +192,7 @@ def _(
@register_fake("bitsandbytes::quantize_4bit")
def _(
A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
) -> Tuple[torch.Tensor, torch.Tensor]:
) -> tuple[torch.Tensor, torch.Tensor]:
torch._check_is_size(blocksize)

n = A.numel()
@@ -235,7 +236,7 @@ def _(


@register_fake("bitsandbytes::quantize_blockwise")
def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> Tuple[torch.Tensor, torch.Tensor]:
def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
torch._check_is_size(blocksize)
n = A.numel()
blocks = -(n // -blocksize)
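Context, not part of the diff: the typing edits in this file and in the files below are the PEP 585 cleanup that dropping Python 3.8 allows. On Python 3.9+ the builtin tuple, list, and dict are subscriptable, and abstract types such as Sequence come from collections.abc rather than typing. A minimal sketch with a hypothetical function, not an API from this repository:

from collections.abc import Sequence
from typing import Optional

def absmax_per_block(values: Sequence[float], blocksize: int) -> tuple[list[float], Optional[dict[str, int]]]:
    # Builtin generics (PEP 585) replace typing.Tuple, typing.List, and typing.Dict.
    absmax = [max(abs(v) for v in values[i : i + blocksize]) for i in range(0, len(values), blocksize)]
    meta = {"blocks": len(absmax)} if absmax else None
    return absmax, meta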
4 changes: 2 additions & 2 deletions bitsandbytes/autograd/_functions.py
@@ -1,6 +1,6 @@
from dataclasses import dataclass
from math import prod
from typing import Callable, Optional, Tuple
from typing import Callable, Optional
import warnings
from warnings import warn

@@ -55,7 +55,7 @@ def get_current_outlier_idx(self):
)
def get_inverse_transform_indices(
transform_tile: Callable[[torch.Tensor], torch.Tensor],
tile_size: Tuple[int, int],
tile_size: tuple[int, int],
):
"""
Compute a permutation of indices that invert the specified (tiled) matrix transformation
6 changes: 3 additions & 3 deletions bitsandbytes/backends/cpu/ops.py
@@ -1,5 +1,5 @@
import ctypes as ct
from typing import Optional, Tuple
from typing import Optional

import torch

@@ -47,7 +47,7 @@ def _(


@register_kernel("bitsandbytes::quantize_blockwise", "cpu")
def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> Tuple[torch.Tensor, torch.Tensor]:
def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
torch._check_is_size(blocksize)
torch._check(A.dtype == torch.float32, lambda: f"A must be float32 on cpu, got {A.dtype}")

@@ -116,7 +116,7 @@ def _(A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int,
@register_kernel("bitsandbytes::quantize_4bit", "cpu")
def _(
A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
) -> Tuple[torch.Tensor, torch.Tensor]:
) -> tuple[torch.Tensor, torch.Tensor]:
torch._check_is_size(blocksize)
torch._check(quant_type == "nf4", lambda: f"quant_type must be nf4 on CPU, got {quant_type}")

16 changes: 7 additions & 9 deletions bitsandbytes/backends/cuda/ops.py
@@ -1,6 +1,7 @@
from collections.abc import Sequence
import ctypes as ct
from math import prod
from typing import Optional, Sequence, Tuple
from typing import Optional

import torch

@@ -78,10 +79,7 @@ def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor
raise NotImplementedError("int8_linear_matmul not implemented!")
else:
raise RuntimeError(
f"cublasLt ran into an error!\n"
f"\t{shapeA=}, {shapeB=}, {shapeC=}\n"
f"\t{(lda, ldb, ldc)=}\n"
f"\t{(m, n, k)=}"
f"cublasLt ran into an error!\n\t{shapeA=}, {shapeB=}, {shapeC=}\n\t{(lda, ldb, ldc)=}\n\t{(m, n, k)=}"
)

return out
@@ -169,7 +167,7 @@ def _(A: torch.Tensor, threshold=0.0):
def _(
A: torch.Tensor,
threshold=0.0,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
# Use CUDA kernel for rowwise and COO tensor
quant_row, row_stats, outlier_cols = torch.ops.bitsandbytes.int8_vectorwise_quant.default(
A,
@@ -188,7 +186,7 @@ def _(
def _get_col_absmax(
A: torch.Tensor,
threshold=0.0,
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
torch._check(A.is_floating_point())

outlier_mask = None
@@ -207,7 +205,7 @@ def _get_col_absmax(


@register_kernel("bitsandbytes::quantize_blockwise", "cuda")
def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> Tuple[torch.Tensor, torch.Tensor]:
def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
torch._check_is_size(blocksize)
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
@@ -292,7 +290,7 @@ def _dequantize_blockwise_impl(
@register_kernel("bitsandbytes::quantize_4bit", "cuda")
def _(
A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
) -> Tuple[torch.Tensor, torch.Tensor]:
) -> tuple[torch.Tensor, torch.Tensor]:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
torch._check(quant_type in ["fp4", "nf4"])
torch._check(
10 changes: 5 additions & 5 deletions bitsandbytes/cuda_specs.py
@@ -1,27 +1,27 @@
import dataclasses
from functools import lru_cache
from typing import List, Optional, Tuple
from typing import Optional

import torch


@dataclasses.dataclass(frozen=True)
class CUDASpecs:
highest_compute_capability: Tuple[int, int]
highest_compute_capability: tuple[int, int]
cuda_version_string: str
cuda_version_tuple: Tuple[int, int]
cuda_version_tuple: tuple[int, int]

@property
def has_imma(self) -> bool:
return torch.version.hip or self.highest_compute_capability >= (7, 5)


def get_compute_capabilities() -> List[Tuple[int, int]]:
def get_compute_capabilities() -> list[tuple[int, int]]:
return sorted(torch.cuda.get_device_capability(torch.cuda.device(i)) for i in range(torch.cuda.device_count()))


@lru_cache(None)
def get_cuda_version_tuple() -> Tuple[int, int]:
def get_cuda_version_tuple() -> tuple[int, int]:
if torch.version.cuda:
return map(int, torch.version.cuda.split(".")[0:2])
elif torch.version.hip:
4 changes: 2 additions & 2 deletions bitsandbytes/diagnostics/cuda.py
@@ -1,7 +1,7 @@
from collections.abc import Iterable, Iterator
import logging
import os
from pathlib import Path
from typing import Dict, Iterable, Iterator

import torch

@@ -76,7 +76,7 @@ def is_relevant_candidate_env_var(env_var: str, value: str) -> bool:
)


def get_potentially_lib_path_containing_env_vars() -> Dict[str, str]:
def get_potentially_lib_path_containing_env_vars() -> dict[str, str]:
return {env_var: value for env_var, value in os.environ.items() if is_relevant_candidate_env_var(env_var, value)}


21 changes: 11 additions & 10 deletions bitsandbytes/functional.py
@@ -2,10 +2,11 @@
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from collections.abc import Iterable
import ctypes as ct
import itertools
from math import prod
from typing import Any, Dict, Iterable, Optional, Tuple, Union
from typing import Any, Optional, Union

import numpy as np
import torch
@@ -619,7 +620,7 @@ def __get_item__(self, idx):
return list_repr[idx]

@classmethod
def from_dict(cls, qs_dict: Dict[str, Any], device: torch.device) -> "QuantState":
def from_dict(cls, qs_dict: dict[str, Any], device: torch.device) -> "QuantState":
"""
unpacks components of state_dict into QuantState
where necessary, convert into strings, torch.dtype, ints, etc.
@@ -741,7 +742,7 @@ def quantize_blockwise(
out: Optional[torch.Tensor] = None,
blocksize=4096,
nested=False,
) -> Tuple[torch.Tensor, QuantState]:
) -> tuple[torch.Tensor, QuantState]:
"""Quantize a tensor in blocks of values.

The input tensor is quantized by dividing it into blocks of `blocksize` values.
@@ -994,7 +995,7 @@ def quantize_4bit(
compress_statistics=False,
quant_type="fp4",
quant_storage=torch.uint8,
) -> Tuple[torch.Tensor, QuantState]:
) -> tuple[torch.Tensor, QuantState]:
"""Quantize tensor A in blocks of 4-bit values.

Quantizes tensor A by dividing it into blocks which are independently quantized.
@@ -1161,7 +1162,7 @@ def quantize(
A: Tensor,
code: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
) -> tuple[Tensor, tuple[Tensor, Tensor]]:
if code is None:
if "dynamic" not in name2qmap:
name2qmap["dynamic"] = create_dynamic_map().to(A.device)
@@ -1179,7 +1180,7 @@ def quantize(
@deprecated("This function is deprecated and will be removed in a future release.", category=FutureWarning)
def dequantize(
A: Tensor,
state: Optional[Tuple[Tensor, Tensor]] = None,
state: Optional[tuple[Tensor, Tensor]] = None,
absmax: Optional[torch.Tensor] = None,
code: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
@@ -2006,7 +2007,7 @@ def get_colrow_absmax(
col_stats: Optional[torch.Tensor] = None,
nnz_block_ptr: Optional[torch.Tensor] = None,
threshold=0.0,
) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
""" "Determine the quantization statistics for input matrix `A` in accordance to the `LLM.int8()` algorithm.

The row-wise and column-wise absmax values are determined.
@@ -2268,9 +2269,9 @@ def spmm_coo(
out: Optional[torch.Tensor] = None,
):
if not isinstance(cooA, COOSparseTensor):
assert (
cooA.is_sparse and cooA.layout == torch.sparse_coo
), "Tensor must be `COOSparseTensor or a PyTorch COO tensor."
assert cooA.is_sparse and cooA.layout == torch.sparse_coo, (
"Tensor must be `COOSparseTensor or a PyTorch COO tensor."
)

# Convert to custom COOSparseTensor
cooA = COOSparseTensor(
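Context, not part of the diff: the assert rewrite in the spmm_coo hunk above reflects newer ruff-format behavior for long assert statements, which keeps the condition on one line and parenthesizes the message rather than the condition. A standalone sketch of the two styles, assuming only that PyTorch is installed:

import torch

cooA = torch.tensor([[0.0, 1.0], [2.0, 0.0]]).to_sparse()

# Older wrapping: the condition is split across lines.
assert (
    cooA.is_sparse and cooA.layout == torch.sparse_coo
), "Tensor must be a PyTorch COO tensor."

# Newer ruff-format output: the condition stays intact and the message is wrapped.
assert cooA.is_sparse and cooA.layout == torch.sparse_coo, (
    "Tensor must be a PyTorch COO tensor."
)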
4 changes: 2 additions & 2 deletions bitsandbytes/nn/modules.py
@@ -3,7 +3,7 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import copy
from typing import Any, Dict, Optional, TypeVar, Union, overload
from typing import Any, Optional, TypeVar, Union, overload
import warnings

import torch
@@ -268,7 +268,7 @@ def __copy__(self):
def from_prequantized(
cls,
data: torch.Tensor,
quantized_stats: Dict[str, Any],
quantized_stats: dict[str, Any],
requires_grad: bool = False,
device="cuda",
module: Optional["Linear4bit"] = None,