Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,16 @@ jobs:
build-cpu:
strategy:
matrix:
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025]
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
include:
- os: ubuntu-22.04
arch: x86_64
- os: ubuntu-22.04-arm
arch: aarch64
- os: windows-2025
arch: x86_64
- os: macos-15
arch: arm64
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
Expand Down Expand Up @@ -97,7 +99,7 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025]
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
torch_version: ["2.7.0"]
include:
- os: ubuntu-22.04
Expand All @@ -106,6 +108,8 @@ jobs:
arch: aarch64
- os: windows-2025
arch: x86_64
- os: macos-15
arch: arm64
runs-on: ${{ matrix.os }}
env:
BNB_TEST_DEVICE: cpu
Expand Down
46 changes: 24 additions & 22 deletions tests/test_functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,11 @@ class Test8BitBlockwiseQuantizeFunctional:
@pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128, 64])
@pytest.mark.parametrize("signed", TRUE_FALSE, ids=id_formatter("signed"))
def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize, signed):
iters = 100

if device == "cpu":
iters = 10

# This test is slow on CPU, so avoid atypical use cases.
if nested:
pytest.skip("Not a typical use case.")
Expand All @@ -106,7 +110,7 @@ def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize,

diffs = []
reldiffs = []
for i in range(100):
for i in range(iters):
A1 = torch.randn(1024, 1024, device=device, dtype=dtype)
C, S = F.quantize_blockwise(A1, blocksize=blocksize, nested=nested)
A2 = F.dequantize_blockwise(C, S)
Expand All @@ -116,15 +120,13 @@ def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize,
reldiffs.append(reldiff.mean().item())
abserr = sum(diffs) / len(diffs)
relerr = sum(reldiffs) / len(reldiffs)
# print('nested=', nested, 'randn', blocksize, 'dtype', dtype, sum(diffs)/len(diffs))
# print('nested=', nested, 'randn', blocksize, 'dtype', dtype, sum(reldiffs)/len(reldiffs))
assert abserr < 0.011
assert relerr < 0.018
assert A2.dtype == dtype

diffs = []
code = F.create_dynamic_map(signed=signed)
for i in range(100):
for i in range(iters):
A1 = torch.rand(1024, 1024, device=device, dtype=dtype)
C, S = F.quantize_blockwise(A1, blocksize=blocksize, nested=nested, code=code)
A2 = F.dequantize_blockwise(C, S)
Expand All @@ -142,29 +144,29 @@ def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize,
assert abserr < 0.00175
assert relerr < 0.012
assert A2.dtype == dtype
# print('signed=', signed, 'nested=', nested, 'rand', blocksize, sum(diffs)/len(diffs))
# print('signed=', signed, 'nested=', nested, 'rand', blocksize, sum(reldiffs)/len(reldiffs))

def test_blockwise_cpu_large(self):
@pytest.mark.skipif("cpu" not in get_available_devices(), reason="CPU is required")
@pytest.mark.parametrize("hidden", [128])
@pytest.mark.parametrize("blocksize", [4096, 16384])
def test_blockwise_cpu_large(self, hidden, blocksize):
diffs = []
reldiffs = []
batch = 128
seq = 128
for hidden in [128]: # , 14336]:
for blocksize in [4096, 16384]:
for i in range(2):
A1 = torch.randn(batch, seq, hidden, device="cpu")
t0 = time.time()
C, S = F.quantize_blockwise(A1, blocksize=blocksize)
A2 = F.dequantize_blockwise(C, S, blocksize=blocksize)
print(time.time() - t0)
diff = torch.abs(A1 - A2)
reldiff = diff / torch.abs(A1 + 1e-8)
diffs.append(diff.mean().item())
reldiffs.append(reldiff.mean().item())
assert diffs[-1] < 0.011
# print(sum(diffs)/len(diffs))
# print(sum(reldiffs)/len(reldiffs))

for i in range(2):
A1 = torch.randn(batch, seq, hidden, device="cpu")
t0 = time.time()
C, S = F.quantize_blockwise(A1, blocksize=blocksize)
A2 = F.dequantize_blockwise(C, S, blocksize=blocksize)
print(time.time() - t0)
diff = torch.abs(A1 - A2)
reldiff = diff / torch.abs(A1 + 1e-8)
diffs.append(diff.mean().item())
reldiffs.append(reldiff.mean().item())
assert diffs[-1] < 0.011
# print(sum(diffs)/len(diffs))
# print(sum(reldiffs)/len(reldiffs))

@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("bits", range(2, 9), ids=id_formatter("bits"))
Expand Down
8 changes: 6 additions & 2 deletions tests/test_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,12 @@ class TestInt8BlockwiseQuantOps:
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
@pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
def test_quantize_blockwise(self, device, dtype, blocksize):
if device == "cpu" and dtype != torch.float32:
pytest.skip("CPU implementation is only available for float32")
if device == "cpu":
if dtype != torch.float32:
pytest.skip("CPU implementation is only available for float32")

if blocksize != 256:
pytest.skip("CPU implementation is slow; only test blocksize=256")

code = bitsandbytes.functional.create_dynamic_map().to(device)
A = torch.randn(1024, 1024, dtype=dtype, device=device)
Expand Down