-
-
Notifications
You must be signed in to change notification settings - Fork 816
Closed
Labels
Description
System Info
OS: Windows 10
Python: 3.10
Torch: 2.1.2
GPU: 4060 TI 16GB
Cuda: 11.8
bitsandbytes: latest snapshot
Reproduction
This is just a status report on the current state of Windows support.
Expected behavior
These are the test results:
tests\test_functional.py: 31 failed, 592 passed, 9 skipped in 767.86s (0:12:47)
tests\test_autograd.py: 2240 passed, 704 warnings in 119.18s (0:01:59)
tests\test_linear4bit.py: 32 passed in 2.90s
tests\test_linear8bitlt.py: 18 passed in 14.60s
tests\test_optim.py: the system crashed after the tests finished (about 19 errors; 177 items collected)
Details
===================================================================================== FAILURES ======================================================================================
_________________________________________ test_nvidia_transform[dims=2-transpose=F-orderOut=col32-orderA=row-int8-dim3=3-dim2=224-dim1=152] _________________________________________
dim1 = 152, dim2 = 224, dim3 = 3, dims = 2, dtype = torch.int8, orderA = 'row', orderOut = 'col32', transpose = False
@pytest.mark.parametrize("dim1", get_test_dims(2, 256, n=2), ids=id_formatter("dim1"))
@pytest.mark.parametrize("dim2", get_test_dims(2, 256, n=2), ids=id_formatter("dim2"))
@pytest.mark.parametrize("dim3", get_test_dims(2, 256, n=2), ids=id_formatter("dim3"))
@pytest.mark.parametrize("dtype", [torch.int8, torch.int32], ids=describe_dtype)
@pytest.mark.parametrize("orderA", ["row"], ids=id_formatter("orderA"))
@pytest.mark.parametrize("orderOut", ["col", "row", "col32"], ids=id_formatter("orderOut"))
@pytest.mark.parametrize("transpose", [False], ids=id_formatter("transpose"))
@pytest.mark.parametrize("dims", [2, 3], ids=id_formatter("dims"))
def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose):
if dims == 3 and orderOut != "col32":
return
if dtype == torch.int32 and orderOut != "col32":
return
try:
func = F.get_transform_func(dtype, orderA, orderOut, transpose)
except ValueError as ve:
pytest.skip(str(ve)) # skip if not supported
if dims == 2:
A = torch.randint(-128, 127, size=(dim1, dim2), device="cuda").to(dtype)
elif dims == 3:
A = torch.randint(-128, 127, size=(dim1, dim2, dim3), device="cuda").to(
dtype
)
out, S = F.nvidia_transform(A, to_order=orderOut)
if orderOut == "row":
torch.testing.assert_close(A.flatten(), out.flatten())
elif orderOut == "col":
torch.testing.assert_close(A.t().flatten(), out.flatten())
elif orderOut == "col32":
if dims == 2:
n = A.shape[0] * (A.shape[1] + (32 - (A.shape[1] % 32)))
elif dims == 3:
n = (
A.shape[0]
* A.shape[1]
* (A.shape[2] + (32 - (A.shape[2] % 32)))
)
> assert out.numel() == n
E AssertionError: assert 34048 == 38912
E + where 34048 = <built-in method numel of Tensor object at 0x0000000000A48F40>()
E + where <built-in method numel of Tensor object at 0x0000000000A48F40> = tensor([[ -32, -70, 98, 119, -80, 66, -11, 30, -63, -51, 47, -22, 100, -78, 32, 39, -71, 17,\n -20, 94, ..., 64, 115, 73, 109, 95, -71, 100, 40, 113, 75, -54, 58, -86, 20, -37,\n -120, -79, 88, -12, -68],\n [ 68, 75, -66, 23, 7, -45, 86, -8, -50, -24, 48, 55, 18, -97, -61, 64, -43, 70,\n -67, -4, ..., -116, 67, 27, -97, 36, 124, -122, -93, 22, -3, -13, 7, -38, 70, 104,\n 120, -64, 90, 83, 49],\n [ -5, 97, 31, 123, -101, -69, -36, -13, 15, 96, -11, -12, 107, -57, 48, -122, -64, -94,\n 43, 106, ..., -5, 41, 124, 41, 27, 104, 84, 89, 21, 118, -64, -112, 83, -12, -87,\n 19, -13, -47, 11, -17],\n [ 123, -126, 29, -88, 45, 1, 125, -124, 111, -40, -60, -83, -49, -44, -18, -111, 84, 63,\n 59, -76, ..., -60, 63, -120, -48, -52, -78, -99, 17, 59, -17, 75, -37, -60, -37, 52,\n 68, -107, -39, -54, 88],\n [ -69, -64, -86, -60, 88, -84, 44, 30, 6, 35, 12, 21, 104, 39, 122, 10...32, 74, 84, 102, -61,\n 105, -39, 18, -53, 39],\n [ -91, -124, 108, 77, -19, -14, 115, 75, -90, 123, -68, -117, -91, -71, -59, -59, -121, 97,\n 97, -34, ..., 62, 47, -78, 40, 97, 41, 83, 84, 25, 121, -52, 11, 104, -78, 92,\n -23, 10, 81, -34, 39],\n [-128, -36, -72, -57, -41, 36, -69, 76, 103, -87, -39, -84, 87, -91, 103, 30, 114, 122,\n -19, 51, ..., 95, 65, 83, 71, -11, 111, 4, 74, 50, -82, 89, -79, -67, 104, -71,\n 71, -69, 1, -84, 72],\n [ 79, 29, 74, 96, 19, -30, 2, -73, 78, 75, -58, -24, -25, 63, 28, 63, -14, -128,\n -15, -68, ..., 113, -114, -48, 75, 107, 3, 30, 61, -50, -103, -25, 61, -4, -18, -27,\n -26, -47, -2, -19, -7],\n [ -19, -34, 119, 39, 12, 1, -31, 88, -9, -18, -115, -13, -62, 44, 117, -7, 10, 53,\n -64, -97, ..., -104, -82, 111, -44, 40, -100, -114, 30, -72, 45, 117, 16, 29, -122, 2,\n 22, -102, -46, 9, 48]], device='cuda:0', dtype=torch.int8).numel
tests\test_functional.py:587: AssertionError
_________________________________________ test_nvidia_transform[dims=2-transpose=F-orderOut=col32-orderA=row-int8-dim3=3-dim2=224-dim1=73] __________________________________________
dim1 = 73, dim2 = 224, dim3 = 3, dims = 2, dtype = torch.int8, orderA = 'row', orderOut = 'col32', transpose = False
@pytest.mark.parametrize("dim1", get_test_dims(2, 256, n=2), ids=id_formatter("dim1"))
@pytest.mark.parametrize("dim2", get_test_dims(2, 256, n=2), ids=id_formatter("dim2"))
@pytest.mark.parametrize("dim3", get_test_dims(2, 256, n=2), ids=id_formatter("dim3"))
@pytest.mark.parametrize("dtype", [torch.int8, torch.int32], ids=describe_dtype)
@pytest.mark.parametrize("orderA", ["row"], ids=id_formatter("orderA"))
@pytest.mark.parametrize("orderOut", ["col", "row", "col32"], ids=id_formatter("orderOut"))
@pytest.mark.parametrize("transpose", [False], ids=id_formatter("transpose"))
@pytest.mark.parametrize("dims", [2, 3], ids=id_formatter("dims"))
def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose):
if dims == 3 and orderOut != "col32":
return
if dtype == torch.int32 and orderOut != "col32":
return
try:
func = F.get_transform_func(dtype, orderA, orderOut, transpose)
except ValueError as ve:
pytest.skip(str(ve)) # skip if not supported
if dims == 2:
A = torch.randint(-128, 127, size=(dim1, dim2), device="cuda").to(dtype)
elif dims == 3:
A = torch.randint(-128, 127, size=(dim1, dim2, dim3), device="cuda").to(
dtype
)
out, S = F.nvidia_transform(A, to_order=orderOut)
if orderOut == "row":
torch.testing.assert_close(A.flatten(), out.flatten())
elif orderOut == "col":
torch.testing.assert_close(A.t().flatten(), out.flatten())
elif orderOut == "col32":
if dims == 2:
n = A.shape[0] * (A.shape[1] + (32 - (A.shape[1] % 32)))
elif dims == 3:
n = (
A.shape[0]
* A.shape[1]
* (A.shape[2] + (32 - (A.shape[2] % 32)))
)
> assert out.numel() == n
E AssertionError: assert 16352 == 18688
E + where 16352 = <built-in method numel of Tensor object at 0x000000001E9A7A60>()
E + where <built-in method numel of Tensor object at 0x000000001E9A7A60> = tensor([[-109, -60, 80, -56, -43, 70, 41, 58, -45, 120, 39, -127, 34, -111, -84, 16, 126, -54,\n -36, 107, ..., 48, 110, -95, -62, 75, 99, -69, -12, -57, -97, 75, -47, 117, 67, -65,\n 39, 8, 108, 76, -49],\n [ 57, -67, -127, -64, 18, 46, 117, 107, 71, -44, 35, -82, 115, -93, -29, -58, 67, -33,\n -87, -57, ..., 45, 30, -58, -46, -11, 13, 96, -100, 124, 122, 12, 107, 27, -23, -126,\n -65, 29, -92, 106, 43],\n [ 124, 46, -65, -74, -92, 12, 71, 51, 75, -19, 70, -41, -49, -108, -28, 37, -117, 66,\n -42, -128, ..., 70, -73, -71, 122, -110, -22, 68, -114, -105, 116, -34, -28, -40, 21, 9,\n 8, -94, 75, -12, -33],\n [ 60, -18, -127, -57, -36, 38, 98, -102, 12, 18, 84, -128, -77, -34, -36, 109, 1, 19,\n -104, -114, ..., 89, 123, 20, -84, -89, 104, 15, 34, 47, -95, -46, 116, -32, -109, -43,\n -15, 21, -79, 57, -24],\n [-124, -59, 78, -41, -37, 75, 61, 77, -123, 59, 88, -53, -44, 4, -75, 4...29, 80, -12, 67, 111,\n 34, 75, -122, -103, -8],\n [-114, 66, 102, -111, 46, 98, -32, 53, -122, 47, -117, 31, 10, -110, 55, -119, -16, 4,\n -35, -82, ..., 115, 97, 9, -67, -9, -41, -105, 103, -18, -20, 44, 8, 8, -64, -25,\n 22, -7, -11, -40, -46],\n [ 111, -64, 46, -106, 65, -123, -32, -61, -40, 111, -99, -70, 20, 54, -125, -89, -44, 123,\n 79, 125, ..., -74, -106, 29, 68, -19, -15, 28, -103, 10, -56, 119, -93, 111, -94, 17,\n -66, 37, 23, 79, 126],\n [-124, -21, -8, 75, -79, -90, -56, -102, -118, 112, 123, -28, -71, 88, 3, 11, -36, -48,\n 108, 28, ..., 50, -32, 5, 73, -92, -55, 60, -128, 63, 113, -18, -40, 15, 35, 5,\n -71, 1, -62, 115, -20],\n [ -74, -81, 120, -51, 48, -124, 89, -96, 2, 80, -45, -94, -36, -90, -25, 69, 93, 116,\n 100, -111, ..., 61, 81, -4, 39, -31, -125, -122, 14, 70, 36, 2, -31, -113, 78, 65,\n -31, -25, 110, 41, 39]], device='cuda:0', dtype=torch.int8).numel
tests\test_functional.py:587: AssertionError
(snip)...
____________________________________________________________________ test_gemv_4bit[uint8-bf16-fc2-nf4-DQ_True] _____________________________________________________________________
dtype = torch.bfloat16, storage_type = 'nf4', quant_storage = torch.uint8, double_quant = True, kind = 'fc2'
@pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}")
@pytest.mark.parametrize("storage_type", ['nf4', 'fp4'])
@pytest.mark.parametrize("kind", ['fc1', 'fc2', 'attn', 'attn_packed'])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
@pytest.mark.parametrize("quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
def test_gemv_4bit(dtype, storage_type, quant_storage, double_quant, kind):
for dim in [128, 256, 512, 1024]:
#for dim in [4*1024]:
#for dim in [1*16]:
errs1 = []
errs2 = []
errs3 = []
relerrs1 = []
relerrs2 = []
relerrs3 = []
max_errs1 = []
max_errs2 = []
max_errs3 = []
for i in range(100):
if kind == 'fc1':
A = torch.randn(1, dim, dtype=dtype, device='cuda')
B = torch.randn(dim*4, dim, dtype=dtype, device='cuda')/math.sqrt(dim)
elif kind == 'fc2':
A = torch.randn(1, 4*dim, dtype=dtype, device='cuda')
B = torch.randn(dim, 4*dim, dtype=dtype, device='cuda')/math.sqrt(dim)
elif kind == 'attn':
A = torch.randn(1, dim, dtype=dtype, device='cuda')
B = torch.randn(dim, dim, dtype=dtype, device='cuda')/math.sqrt(dim)
elif kind == 'attn_packed':
A = torch.randn(1, dim, dtype=dtype, device='cuda')
B = torch.randn(dim*3, dim, dtype=dtype, device='cuda')/math.sqrt(dim)
qB, state = F.quantize_4bit(B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage)
C3 = torch.matmul(A, B.t())
C2 = F.gemv_4bit(A, qB.t(), state=state)
A.requires_grad = True
C1 = bnb.matmul_4bit(A, qB.t(), state)
err1 = (C1-C2).abs().float()
err2 = (C3-C2).abs().float()
err3 = (C3-C1).abs().float()
mag1 = torch.abs(C1).float()+1e-5
mag2 = torch.abs(C3).float()+1e-5
mag3 = torch.abs(C3).float()+1e-5
relerr1 = err1/mag1
relerr2 = err2/mag2
relerr3 = err3/mag3
max_err1 = err1.max()
max_err2 = err2.max()
max_err3 = err3.max()
errs1.append(err1.mean().item())
errs2.append(err2.mean().item())
errs3.append(err3.mean().item())
relerrs1.append(relerr1.mean().item())
relerrs2.append(relerr2.mean().item())
relerrs3.append(relerr3.mean().item())
max_errs1.append(max_err1.item())
max_errs2.append(max_err2.item())
max_errs3.append(max_err3.item())
c = int(C1.numel()*0.0014*(dim/256))+1
c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=c, throw=False)
err1 = sum(errs1)/len(errs1)/math.sqrt(dim)
err2 = sum(errs2)/len(errs2)/math.sqrt(dim)
err3 = sum(errs3)/len(errs3)/math.sqrt(dim)
relerr1 = sum(relerrs1)/len(relerrs1)/math.sqrt(dim)
relerr2 = sum(relerrs2)/len(relerrs2)/math.sqrt(dim)
relerr3 = sum(relerrs3)/len(relerrs3)/math.sqrt(dim)
maxerr1 = sum(max_errs1)/len(max_errs1)/math.sqrt(dim)
maxerr2 = sum(max_errs2)/len(max_errs2)/math.sqrt(dim)
maxerr3 = sum(max_errs3)/len(max_errs3)/math.sqrt(dim)
absratio = err2/err3
relratio = relerr2/relerr3
maxratio = relerr2/relerr3
# for debugging if the tests fails
#
#print('='*80)
#print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:')
#print(C1.flatten()[-20:])
#print(C2.flatten()[-20:])
#print(f'inference vs training abs: {err1}')
#print(f'inference vs training rel: {relerr1}')
#print(f'inference vs training max: {maxerr1}')
#print(f'inference vs training vs torch err ratio abs: {absratio}')
#print(f'inference vs training vs torch err ratio rel: {relratio}')
#print(f'inference vs training vs torch err ratio max: {maxratio}')
if dtype == torch.float16:
if dim <= 512:
assert err1 < 7e-5
assert relerr1 < 0.0008
else:
assert err1 < 6e-5
assert relerr1 < 2e-4
assert absratio < 1.005 and absratio > 0.995
assert relratio < 1.005 and relratio > 0.995
assert maxratio < 1.005 and maxratio > 0.995
elif dtype == torch.float32:
if dim <= 512:
assert err1 < 5e-8
assert relerr1 < 1e-6
assert maxerr1 < 1e-7
else:
assert err1 < 5e-8
assert relerr1 < 8e-6
assert maxerr1 < 1e-7
assert absratio < 1.005 and absratio > 0.995
assert relratio < 1.005 and relratio > 0.995
assert maxratio < 1.005 and maxratio > 0.995
elif dtype == torch.bfloat16:
if dim <= 512:
assert err1 < 6e-4
assert relerr1 < 0.007
assert maxerr1 < 0.015
else:
assert err1 < 2e-4
> assert relerr1 < 0.002
E assert 0.005869260463805403 < 0.002
tests\test_functional.py:2280: AssertionError
(snip)...
====================================================================================== PASSES =======================================================================================
_____________________________________________________________________________ test_dynamic_quantization _____________________________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
0.01197060595266521
0.018862400725483893
0.011960445903241634
0.018855047821998597
_________________________________________________________ test_bench_8bit_training[batch=2, seq=512, model=4k, hidden=12k] __________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
0.054001569747924805
0.045006752014160156
_________________________________________________________ test_bench_8bit_training[batch=2, seq=512, model=5k, hidden=15k] __________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
0.07099795341491699
0.07000350952148438
_________________________________________________________ test_bench_8bit_training[batch=2, seq=512, model=12k, hidden=48k] _________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
0.5150082111358643
0.5169980525970459
___________________________________________________________________________________ test_overflow ___________________________________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
col_ampere
col_ampere
__________________________________________________________________________________ test_spmm_bench __________________________________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
6.008148193359375e-05
0.002994537353515625 0.02800440788269043
0.1069309290901506
6.961822509765625e-05
0.014997482299804688 0.027995824813842773
0.5357042487417286
___________________________________________________________________________________ test_matmuls ____________________________________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
0.1888427734375 0.1895751953125
0.189697265625 0.189453125
________________________________________________________ test_spmm_coo_very_sparse[out_func=zeros-fp16-dim2=12288-dim1=2048] ________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
________________________________________________________ test_spmm_coo_very_sparse[out_func=ones-fp16-dim2=12288-dim1=2048] _________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
______________________________________________________________________ test_spmm_coo_dequant[dtype0-2048-2048] ______________________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
tensor(15., device='cuda:0')
cusparse fp16 0.04900002479553223
int8 0.17600059509277344
int8+dequant 0.20199847221374512
matmul 0.15299677848815918
sparse+ matmul 0.5020043849945068
partial matmul 0.3229959011077881
partial matmul 0.18599939346313477
tensor(15., device='cuda:0')
cusparse fp16 0.04900026321411133
int8 0.14599609375
int8+dequant 0.18900036811828613
matmul 0.15299654006958008
sparse+ matmul 0.5290052890777588
partial matmul 0.3379969596862793
partial matmul 0.1929950714111328
_____________________________________________________________ test_bench_matmul[batch=1, seq=1, model=6656, hidden=26k] _____________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
pytorch fp16: [1,1,6656], [6656,26624]->[1,1,26624]: 1.3130s
bnb nf4: [1,1,6656], [6656,26624]->[1,1,26624]: 0.3770s
bnb nf4+DQ: [1,1,6656], [6656,26624]->[1,1,26624]: 0.4420s
pytorch fp16: [1,1,6656], [6656,26624]->[1,1,26624]: 1.3120s
bnb nf4: [1,1,6656], [6656,26624]->[1,1,26624]: 0.3770s
bnb nf4+DQ: [1,1,6656], [6656,26624]->[1,1,26624]: 0.4410s
__________________________________________________________________________________ test_zeropoint ___________________________________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
5.278311252593994 0.002829474862664938 1.2293792224227218e-06 6.859274890302913e-07 8.60375803313218e-05 0.000324648164678365
5.207507133483887 0.0028353077359497547 1.2703461607088684e-06 6.862092050141655e-07 8.6198553617578e-05 0.0003371547209098935
_____________________________________________________________________________ test_blockwise_cpu_large ______________________________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
0.4819974899291992
0.3310065269470215
0.07500696182250977
0.09300756454467773
0.219010591506958
0.2360093593597412
0.07599949836730957
0.1819922924041748
_____________________________________________________________________________ test_bench_dequantization _____________________________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
tensor(255, device='cuda:0', dtype=torch.uint8)
tensor(255, device='cuda:0', dtype=torch.uint8)
=============================================================== 31 failed, 592 passed, 9 skipped in 767.86s (0:12:47) ===============================================================
test_nvidia_transform: 8 failed, 88 passed, 536 deselected in 11.29s
test_gemv_4bit: 23 failed, 169 passed, 440 deselected in 615.68s (0:10:15)