TensorRT produces different outputs for identical samples in the same batch
Summary
A strongly-typed TensorRT network produces different outputs for two identical samples placed in the same batch.
The input slices enc_hidden[0] and enc_hidden[1] are bitwise identical, and all weights are constants. Therefore the corresponding output slices should also be identical. However, TensorRT returns different values for both an intermediate projection output and the final attention output.
Environment
Observed with:
Python: 3.11.9
TensorRT: 10.7.0.post1 bindings + 10.6.0.23 / 10.16.0.72 libs
PyTorch: 2.5.1+cu124
GPU: NVIDIA L20
Reproduction
Save the following script as repro.py and run it with `python repro.py`.
The script is self-contained. It only uses deterministic random FP16 constants and a duplicated FP16 input batch.
# coding=utf-8
import numpy as np
import tensorrt as trt
import torch
# Shared TensorRT logger at INFO severity; handed to both the Builder and the Runtime.
LOGGER = trt.Logger(trt.Logger.INFO)
# Shape of the encoder input tensor: (BATCH, ENC_LEN, HIDDEN).
BATCH = 2
ENC_LEN = 128
HIDDEN = 1024

# Per-head layout the K/V projections are reshaped into: (NUM_HEADS, HEAD_DIM).
NUM_HEADS = 16
HEAD_DIM = 64
def constant(network, array, name):
    """Add `array` to `network` as a float16 constant and return its output tensor.

    Args:
        network: object exposing ``add_constant(shape, values)``.
        array: NumPy array; it is cast to float16 and made C-contiguous.
        name: label assigned to the created layer.

    Returns:
        Output 0 of the newly created constant layer.
    """
    fp16_values = np.ascontiguousarray(array.astype(np.float16))
    const_layer = network.add_constant(fp16_values.shape, fp16_values)
    const_layer.name = name
    return const_layer.get_output(0)
def shuffle(network, x, name, reshape=None, transpose=None):
    """Add a shuffle layer over `x` and return its output tensor.

    Args:
        network: object exposing ``add_shuffle(tensor)``.
        x: input tensor for the shuffle layer.
        name: label assigned to the created layer.
        reshape: optional target dimensions, stored in ``reshape_dims``.
        transpose: optional axis order, wrapped in ``trt.Permutation`` and
            stored in ``first_transpose``.

    Returns:
        Output 0 of the shuffle layer. When neither option is given the layer
        is created with its settings untouched.
    """
    node = network.add_shuffle(x)
    node.name = name

    wants_reshape = reshape is not None
    wants_transpose = transpose is not None

    if wants_reshape:
        node.reshape_dims = reshape
    if wants_transpose:
        node.first_transpose = trt.Permutation(transpose)

    return node.get_output(0)
def linear(network, x, weight, bias, name):
    """Append ``x @ weight^T + bias`` to `network` and return the result tensor.

    Args:
        network: network definition the layers are added to.
        x: input tensor.
        weight: array reshaped to (1, HIDDEN, HIDDEN) before being added as a
            constant named ``<name>.weight``.
        bias: array reshaped to (1, 1, HIDDEN) before being added as a
            constant named ``<name>.bias``.
        name: prefix for the two constant layers; the matmul and add layers
            themselves are left unnamed.

    Returns:
        Output 0 of the elementwise SUM layer.
    """
    weight_tensor = constant(network, weight.reshape(1, HIDDEN, HIDDEN), name + ".weight")
    bias_tensor = constant(network, bias.reshape(1, 1, HIDDEN), name + ".bias")

    # The weight operand is transposed by the matmul layer itself.
    matmul_layer = network.add_matrix_multiply(
        x,
        trt.MatrixOperation.NONE,
        weight_tensor,
        trt.MatrixOperation.TRANSPOSE,
    )
    bias_add_layer = network.add_elementwise(
        matmul_layer.get_output(0),
        bias_tensor,
        trt.ElementWiseOperation.SUM,
    )
    return bias_add_layer.get_output(0)
def build_engine(weights):
    """Build the strongly-typed attention repro network and return a deserialized engine.

    The graph projects ``enc_hidden`` into K and V, views both as
    (BATCH, ENC_LEN, NUM_HEADS, HEAD_DIM), and attends with an all-zero query.
    Two tensors are marked as outputs: ``v_proj_out`` (the reshaped V
    projection) and ``attn_out`` (the attention result flattened to
    (BATCH, HIDDEN)).

    Args:
        weights: mapping providing the arrays ``"k_w"``, ``"k_b"``, ``"v_w"``
            and ``"v_b"``.

    Returns:
        The engine returned by ``deserialize_cuda_engine``.

    Raises:
        RuntimeError: if building or deserializing returns ``None``.
    """
    input_shape = (BATCH, ENC_LEN, HIDDEN)
    heads_view = (BATCH, ENC_LEN, NUM_HEADS, HEAD_DIM)
    plain = trt.MatrixOperation.NONE

    builder = trt.Builder(LOGGER)
    strongly_typed_flag = 1 << int(trt.NetworkDefinitionCreationFlag.STRONGLY_TYPED)
    network = builder.create_network(strongly_typed_flag)
    config = builder.create_builder_config()

    # The input declares a dynamic batch dimension (-1); the profile pins
    # min/opt/max to the same fixed shape.
    profile = builder.create_optimization_profile()
    profile.set_shape("enc_hidden", input_shape, input_shape, input_shape)
    config.add_optimization_profile(profile)
    enc_hidden = network.add_input("enc_hidden", trt.DataType.HALF, (-1, ENC_LEN, HIDDEN))

    # K and V projections, then split the hidden axis into heads.
    k_proj = linear(network, enc_hidden, weights["k_w"], weights["k_b"], "k_proj")
    v_proj = linear(network, enc_hidden, weights["v_w"], weights["v_b"], "v_proj")
    k_heads = shuffle(network, k_proj, "k_view", reshape=heads_view)
    v_heads = shuffle(network, v_proj, "v_view", reshape=heads_view)

    # First marked output: the reshaped V projection, before any attention math.
    v_heads.name = "v_proj_out"
    network.mark_output(v_heads)

    q_const = constant(
        network,
        np.zeros((BATCH, 1, NUM_HEADS, HEAD_DIM), dtype=np.float16),
        "q",
    )
    q_t = shuffle(network, q_const, "q_transpose", transpose=(0, 2, 1, 3))
    k_t = shuffle(network, k_heads, "k_transpose", transpose=(0, 2, 3, 1))
    v_t = shuffle(network, v_heads, "v_transpose", transpose=(0, 2, 1, 3))

    # Scaled scores: (q @ k) * 1/sqrt(HEAD_DIM).
    scores = network.add_matrix_multiply(q_t, plain, k_t, plain).get_output(0)
    inv_sqrt_dim = constant(
        network,
        np.full((1, 1, 1, 1), 1.0 / np.sqrt(HEAD_DIM), dtype=np.float16),
        "scale",
    )
    scaled_scores = network.add_elementwise(
        scores, inv_sqrt_dim, trt.ElementWiseOperation.PROD
    ).get_output(0)

    # Axes is a bitmask; bit 3 selects the last axis of the 4-D scores.
    softmax_layer = network.add_softmax(scaled_scores)
    softmax_layer.axes = 1 << 3
    attn_probs = softmax_layer.get_output(0)

    context_heads = network.add_matrix_multiply(attn_probs, plain, v_t, plain).get_output(0)
    merged = shuffle(network, context_heads, "out_transpose", transpose=(0, 2, 1, 3))
    attn_out = shuffle(network, merged, "out_view", reshape=(BATCH, HIDDEN))

    # Second marked output: the final attention result.
    attn_out.name = "attn_out"
    network.mark_output(attn_out)

    plan = builder.build_serialized_network(network, config)
    if plan is None:
        raise RuntimeError("TensorRT failed to build the engine")

    engine = trt.Runtime(LOGGER).deserialize_cuda_engine(plan)
    if engine is None:
        raise RuntimeError("TensorRT failed to deserialize the engine")
    return engine
def run(engine, enc_hidden):
    """Execute `engine` once on `enc_hidden` and print how batch slice 0 differs from slice 1.

    For each of the outputs ``v_proj_out`` and ``attn_out`` one line is
    printed with the output shape, whether slices 0 and 1 are exactly equal,
    and the max/mean absolute difference between them.

    Args:
        engine: TensorRT engine exposing the tensors ``enc_hidden``,
            ``v_proj_out`` and ``attn_out``.
        enc_hidden: NumPy input array; copied to the GPU before execution.

    Raises:
        RuntimeError: if the execution context cannot be created, or if
            setting the input shape, binding a tensor address, or enqueueing
            the inference reports failure.
    """
    context = engine.create_execution_context()
    if context is None:
        raise RuntimeError("create_execution_context failed")
    if not context.set_input_shape("enc_hidden", enc_hidden.shape):
        raise RuntimeError("set_input_shape failed")

    inp = torch.from_numpy(enc_hidden).cuda()
    outputs = {
        "v_proj_out": torch.empty((BATCH, ENC_LEN, NUM_HEADS, HEAD_DIM), dtype=torch.float16, device="cuda"),
        "attn_out": torch.empty((BATCH, HIDDEN), dtype=torch.float16, device="cuda"),
    }

    # Bind input and outputs, checking every status so that a misnamed or
    # rejected binding fails loudly instead of producing garbage results.
    bindings = {"enc_hidden": inp, **outputs}
    for name, tensor in bindings.items():
        if not context.set_tensor_address(name, tensor.data_ptr()):
            raise RuntimeError(f"set_tensor_address failed for {name}")

    stream = torch.cuda.Stream()
    # Defensive ordering: make the side stream wait for anything already
    # queued on the current stream (the input upload and allocations above),
    # so a script-side race cannot explain a mismatch.
    stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(stream):
        if not context.execute_async_v3(stream.cuda_stream):
            raise RuntimeError("execute_async_v3 failed")
    stream.synchronize()

    for name, tensor in outputs.items():
        # Compare in float32 so the difference itself is not rounded to FP16.
        diff = (tensor[0].float() - tensor[1].float()).abs()
        print(f"{name:10s} shape={tuple(tensor.shape)} same={torch.equal(tensor[0], tensor[1])} "
              f"maxdiff={diff.max().item():.8g} meandiff={diff.mean().item():.8g}")
def main():
    """Generate seeded FP16 weights and a duplicated input batch, then build and run the engine.

    Raises:
        RuntimeError: if CUDA is not available.
    """
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is required")

    rng = np.random.default_rng(1234)

    def scaled_normal_fp16():
        # (HIDDEN, HIDDEN) standard-normal draw, cast to FP16 and scaled by 0.02.
        draw = rng.standard_normal((HIDDEN, HIDDEN)).astype(np.float16)
        return (draw * np.float16(0.02)).astype(np.float16)

    # Draw order matters for reproducibility: k_w first, then v_w, then the input.
    k_weight = scaled_normal_fp16()
    v_weight = scaled_normal_fp16()
    weights = {
        "k_w": k_weight,
        "k_b": np.zeros(HIDDEN, dtype=np.float16),
        "v_w": v_weight,
        "v_b": np.zeros(HIDDEN, dtype=np.float16),
    }

    # One sample, stacked twice along the batch axis so both slices are identical.
    single = rng.standard_normal((1, ENC_LEN, HIDDEN)).astype(np.float16)
    enc_hidden = np.ascontiguousarray(np.concatenate([single, single], axis=0))
    assert np.array_equal(enc_hidden[0], enc_hidden[1])

    engine = build_engine(weights)
    run(engine, enc_hidden)


if __name__ == "__main__":
    main()
Actual output
v_proj_out shape=(2, 128, 16, 64) same=False maxdiff=3.9804688 meandiff=0.71506613
attn_out shape=(2, 1024) same=False maxdiff=0.41235352 meandiff=0.0860909
Expected output
Because enc_hidden[0] and enc_hidden[1] are bitwise identical and all weights/constants are shared across the batch dimension, the two output slices should be identical:
v_proj_out same=True maxdiff=0
attn_out same=True maxdiff=0
Graph structure
The minimized graph is equivalent to:
k = linear(enc_hidden).reshape(2, 128, 16, 64)
v = linear(enc_hidden).reshape(2, 128, 16, 64)
q = zeros((2, 1, 16, 64), fp16)
q = transpose(q, [0, 2, 1, 3]) # (2, 16, 1, 64)
k = transpose(k, [0, 2, 3, 1]) # (2, 16, 64, 128)
v = transpose(v, [0, 2, 1, 3]) # (2, 16, 128, 64)
attn = softmax((q @ k) / sqrt(64), axis=-1)
out = (attn @ v).transpose(0, 2, 1, 3).reshape(2, 1024)
The first marked output (v_proj_out) already differs between the two identical batch samples, so the issue appears before the final attention output.
TensorRT produces different outputs for identical samples in the same batch
Summary
A strongly-typed TensorRT network produces different outputs for two identical samples placed in the same batch.
The input slices enc_hidden[0] and enc_hidden[1] are bitwise identical, and all weights are constants. Therefore the corresponding output slices should also be identical. However, TensorRT returns different values for both an intermediate projection output and the final attention output.
Environment
Observed with:
Reproduction
Save the following script as repro.py and run it with `python repro.py`.
The script is self-contained. It only uses deterministic random FP16 constants and a duplicated FP16 input batch.
Actual output
Expected output
Because enc_hidden[0] and enc_hidden[1] are bitwise identical and all weights/constants are shared across the batch dimension, the two output slices should be identical:
Graph structure
The minimized graph is equivalent to:
The first marked output (v_proj_out) already differs between the two identical batch samples, so the issue appears before the final attention output.