TensorRT produces different outputs for identical samples in the same batch on GPU L20

# TensorRT produces different outputs for identical samples in the same batch

## Summary

A strongly-typed TensorRT network produces different outputs for two identical samples placed in the same batch.

The input slices `enc_hidden[0]` and `enc_hidden[1]` are bitwise identical, and all weights are constants shared across the batch. Therefore the corresponding output slices should also be identical. However, TensorRT returns different values for the two slices, both in an intermediate projection output and in the final attention output.

## Environment

Observed with:

```text
Python: 3.11.9
TensorRT: 10.7.0.post1 bindings + 10.6.0.23 / 10.16.0.72 libs
PyTorch: 2.5.1+cu124
GPU: NVIDIA L20
```

## Reproduction

Save the following script as `repro.py` and run:

```bash
python repro.py
```

The script is self-contained. It uses only seeded (deterministic) pseudo-random FP16 weights and an FP16 input batch consisting of two copies of the same sample.

```python
# coding=utf-8
import numpy as np
import tensorrt as trt
import torch


# Logger handed to both the TensorRT builder and the TensorRT runtime.
LOGGER = trt.Logger(trt.Logger.INFO)

# Problem size of the repro. HIDDEN equals NUM_HEADS * HEAD_DIM (16 * 64 = 1024),
# which the per-head reshapes in build_engine rely on.
BATCH = 2
ENC_LEN = 128
HIDDEN = 1024
NUM_HEADS = 16
HEAD_DIM = 64


def constant(network, array, name):
    """Add ``array`` to ``network`` as a named FP16 constant; return its tensor.

    The values are cast to float16 and made C-contiguous before being passed
    to ``network.add_constant``. The created layer is labelled ``name`` and
    its first output is returned.
    """
    fp16_values = array.astype(np.float16)
    fp16_values = np.ascontiguousarray(fp16_values)

    const_layer = network.add_constant(fp16_values.shape, fp16_values)
    const_layer.name = name
    return const_layer.get_output(0)


def shuffle(network, x, name, reshape=None, transpose=None):
    """Add a named shuffle layer on ``x`` and return its output tensor.

    ``reshape`` (if given) is assigned to the layer's ``reshape_dims`` and
    ``transpose`` (if given) is wrapped in ``trt.Permutation`` and assigned to
    ``first_transpose``. With neither argument the layer is left at its
    defaults.

    NOTE(review): per the TensorRT IShuffleLayer documentation the first
    transpose is applied before the reshape; every call in this script sets
    only one of the two, so the ordering does not come into play here.
    """
    shuffle_layer = network.add_shuffle(x)
    shuffle_layer.name = name

    if reshape is not None:
        shuffle_layer.reshape_dims = reshape
    if transpose is not None:
        permutation = trt.Permutation(transpose)
        shuffle_layer.first_transpose = permutation

    return shuffle_layer.get_output(0)


def linear(network, x, weight, bias, name):
    """Append ``x @ weight^T + bias`` to ``network`` and return the result tensor.

    ``weight`` is reshaped to (1, HIDDEN, HIDDEN) and ``bias`` to
    (1, 1, HIDDEN); both are added as FP16 constants named ``<name>.weight``
    and ``<name>.bias``. The matmul and add layers themselves are not named.
    The leading size-1 dimensions are presumably there so the constants
    broadcast across the batch dimension of ``x`` - confirm against the
    TensorRT broadcasting rules.
    """
    weight_tensor = constant(network, weight.reshape(1, HIDDEN, HIDDEN), name + ".weight")
    bias_tensor = constant(network, bias.reshape(1, 1, HIDDEN), name + ".bias")

    # The weight operand is transposed by the matmul itself, giving x @ W^T.
    matmul_layer = network.add_matrix_multiply(
        x,
        trt.MatrixOperation.NONE,
        weight_tensor,
        trt.MatrixOperation.TRANSPOSE,
    )
    projected = matmul_layer.get_output(0)

    bias_add_layer = network.add_elementwise(
        projected, bias_tensor, trt.ElementWiseOperation.SUM
    )
    return bias_add_layer.get_output(0)


def build_engine(weights):
    """Build and deserialize the strongly-typed FP16 attention engine.

    The network has a single input ``enc_hidden`` of shape
    (-1, ENC_LEN, HIDDEN) whose optimization profile pins the batch to BATCH.
    Two tensors are marked as outputs: ``v_proj_out`` (the V projection viewed
    as (BATCH, ENC_LEN, NUM_HEADS, HEAD_DIM)) and ``attn_out`` (the attention
    result flattened to (BATCH, HIDDEN)).

    Args:
        weights: mapping providing the ``k_w``, ``k_b``, ``v_w`` and ``v_b``
            arrays for the two projections.

    Returns:
        The deserialized TensorRT engine.

    Raises:
        RuntimeError: if the build or the deserialization returns ``None``.
    """
    builder = trt.Builder(LOGGER)
    strongly_typed = 1 << int(trt.NetworkDefinitionCreationFlag.STRONGLY_TYPED)
    network = builder.create_network(strongly_typed)
    config = builder.create_builder_config()

    # min, opt and max are the same shape, so the dynamic batch is fixed to BATCH.
    pinned_shape = (BATCH, ENC_LEN, HIDDEN)
    profile = builder.create_optimization_profile()
    profile.set_shape("enc_hidden", pinned_shape, pinned_shape, pinned_shape)
    config.add_optimization_profile(profile)

    enc_hidden = network.add_input("enc_hidden", trt.DataType.HALF, (-1, ENC_LEN, HIDDEN))

    # K and V projections of the same input, each with its own weights.
    key_proj = linear(network, enc_hidden, weights["k_w"], weights["k_b"], "k_proj")
    value_proj = linear(network, enc_hidden, weights["v_w"], weights["v_b"], "v_proj")

    # Split the hidden dimension into heads.
    per_head_shape = (BATCH, ENC_LEN, NUM_HEADS, HEAD_DIM)
    key_heads = shuffle(network, key_proj, "k_view", reshape=per_head_shape)
    value_heads = shuffle(network, value_proj, "v_view", reshape=per_head_shape)

    # First output: the reshaped V projection, taken before any attention math.
    value_heads.name = "v_proj_out"
    network.mark_output(value_heads)

    # The query is an all-zero constant with a single query position per head.
    query = constant(network, np.zeros((BATCH, 1, NUM_HEADS, HEAD_DIM), dtype=np.float16), "q")
    query = shuffle(network, query, "q_transpose", transpose=(0, 2, 1, 3))
    key_t = shuffle(network, key_heads, "k_transpose", transpose=(0, 2, 3, 1))
    value_t = shuffle(network, value_heads, "v_transpose", transpose=(0, 2, 1, 3))

    as_is = trt.MatrixOperation.NONE
    scores = network.add_matrix_multiply(query, as_is, key_t, as_is).get_output(0)
    inv_sqrt_head_dim = np.full((1, 1, 1, 1), 1.0 / np.sqrt(HEAD_DIM), dtype=np.float16)
    scale = constant(network, inv_sqrt_head_dim, "scale")
    scores = network.add_elementwise(scores, scale, trt.ElementWiseOperation.PROD).get_output(0)

    softmax_layer = network.add_softmax(scores)
    # Bit 3 of the axes bitmask: softmax over the last axis of the 4-D scores.
    softmax_layer.axes = 1 << 3
    attn_weights = softmax_layer.get_output(0)

    attended = network.add_matrix_multiply(attn_weights, as_is, value_t, as_is).get_output(0)
    attended = shuffle(network, attended, "out_transpose", transpose=(0, 2, 1, 3))
    attended = shuffle(network, attended, "out_view", reshape=(BATCH, HIDDEN))

    # Second output: the flattened attention result.
    attended.name = "attn_out"
    network.mark_output(attended)

    serialized = builder.build_serialized_network(network, config)
    if serialized is None:
        raise RuntimeError("TensorRT failed to build the engine")

    engine = trt.Runtime(LOGGER).deserialize_cuda_engine(serialized)
    if engine is None:
        raise RuntimeError("TensorRT failed to deserialize the engine")
    return engine


def run(engine, enc_hidden):
    """Execute ``engine`` on ``enc_hidden`` and compare batch entries 0 and 1.

    For each engine output (``v_proj_out`` and ``attn_out``) one line is
    printed stating whether slice 0 and slice 1 are exactly equal, together
    with the maximum and mean absolute difference between them.

    Args:
        engine: TensorRT engine as returned by ``build_engine``.
        enc_hidden: host NumPy array; its shape is set as the runtime input
            shape and its contents are copied to the GPU.

    Raises:
        RuntimeError: if the execution context cannot be created, or if
            setting the input shape, binding a tensor address, or launching
            inference fails.
    """
    context = engine.create_execution_context()
    if context is None:
        raise RuntimeError("create_execution_context failed")
    if not context.set_input_shape("enc_hidden", enc_hidden.shape):
        raise RuntimeError("set_input_shape failed")

    inp = torch.from_numpy(enc_hidden).cuda()
    outputs = {
        "v_proj_out": torch.empty((BATCH, ENC_LEN, NUM_HEADS, HEAD_DIM), dtype=torch.float16, device="cuda"),
        "attn_out": torch.empty((BATCH, HIDDEN), dtype=torch.float16, device="cuda"),
    }

    # Bind the input and every output, and fail loudly on a rejected binding,
    # in the same way the shape and launch calls are checked.
    bindings = {"enc_hidden": inp, **outputs}
    for name, tensor in bindings.items():
        if not context.set_tensor_address(name, tensor.data_ptr()):
            raise RuntimeError(f"set_tensor_address failed for {name!r}")

    stream = torch.cuda.Stream()
    # The buffers above were produced on the current stream. Make the side
    # stream wait for it explicitly so the result does not depend on the
    # host-to-device copy happening to be blocking; this rules out stream
    # ordering as an explanation for any mismatch.
    stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(stream):
        if not context.execute_async_v3(stream.cuda_stream):
            raise RuntimeError("execute_async_v3 failed")
    stream.synchronize()

    for name, tensor in outputs.items():
        # Compare in float32 so the difference itself is not rounded to FP16.
        diff = (tensor[0].float() - tensor[1].float()).abs()
        print(f"{name:10s} shape={tuple(tensor.shape)} same={torch.equal(tensor[0], tensor[1])} "
              f"maxdiff={diff.max().item():.8g} meandiff={diff.mean().item():.8g}")


def main():
    """Build the engine from seeded random weights and run it on a duplicated sample.

    The K and V weight matrices are standard-normal samples cast to FP16 and
    scaled by 0.02; both biases are zero. The input batch is one
    standard-normal sample repeated twice, so both batch entries are
    identical.

    Raises:
        RuntimeError: if CUDA is not available.
    """
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is required")

    rng = np.random.default_rng(1234)
    weight_scale = np.float16(0.02)

    def draw_projection():
        # N(0, 1) samples cast to FP16, then scaled; the result is FP16.
        sampled = rng.standard_normal((HIDDEN, HIDDEN)).astype(np.float16)
        return (sampled * weight_scale).astype(np.float16)

    # The draw order is part of the repro: k_w first, then v_w, then the input.
    k_weight = draw_projection()
    v_weight = draw_projection()
    weights = {
        "k_w": k_weight,
        "k_b": np.zeros(HIDDEN, dtype=np.float16),
        "v_w": v_weight,
        "v_b": np.zeros(HIDDEN, dtype=np.float16),
    }

    sample = rng.standard_normal((1, ENC_LEN, HIDDEN)).astype(np.float16)
    enc_hidden = np.ascontiguousarray(np.concatenate((sample, sample), axis=0))
    # Sanity check: both batch entries must be identical before inference.
    assert np.array_equal(enc_hidden[0], enc_hidden[1])

    engine = build_engine(weights)
    run(engine, enc_hidden)


# Run the reproduction only when this file is executed as a script.
if __name__ == "__main__":
    main()
```

## Actual output

```text
v_proj_out shape=(2, 128, 16, 64) same=False maxdiff=3.9804688 meandiff=0.71506613
attn_out   shape=(2, 1024) same=False maxdiff=0.41235352 meandiff=0.0860909
```

## Expected output

Because `enc_hidden[0]` and `enc_hidden[1]` are bitwise identical and all weights/constants are shared across the batch dimension, the two output slices should be identical:

```text
v_proj_out same=True maxdiff=0
attn_out   same=True maxdiff=0
```

## Graph structure

The minimized graph is equivalent to:

```python
k = linear(enc_hidden, k_w, k_b).reshape(2, 128, 16, 64)
v = linear(enc_hidden, v_w, v_b).reshape(2, 128, 16, 64)
q = zeros((2, 1, 16, 64), fp16)

q = transpose(q, [0, 2, 1, 3])       # (2, 16, 1, 64)
k = transpose(k, [0, 2, 3, 1])       # (2, 16, 64, 128)
v = transpose(v, [0, 2, 1, 3])       # (2, 16, 128, 64)

attn = softmax((q @ k) / sqrt(64), axis=-1)
out = (attn @ v).transpose(0, 2, 1, 3).reshape(2, 1024)
```

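The first marked output (`v_proj_out`), which is the V projection reshaped to per-head form, already differs between the two identical batch samples. The divergence is therefore introduced before any attention computation, not only in the final attention output.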

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

TensorRT produces different outputs for identical samples in the same batch on GPU L20 #4796

TensorRT produces different outputs for identical samples in the same batch

Summary

Environment

Reproduction

Actual output

Expected output

Graph structure

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

TensorRT produces different outputs for identical samples in the same batch on GPU L20 #4796

Description

TensorRT produces different outputs for identical samples in the same batch

Summary

Environment

Reproduction

Actual output

Expected output

Graph structure

Metadata

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Issue actions