Skip to content

TensorRT produces different outputs for identical samples in the same batch on GPU L20 #4796

@kuashesa

Description

@kuashesa

TensorRT produces different outputs for identical samples in the same batch

Summary

A strongly-typed TensorRT network produces different outputs for two identical samples placed in the same batch.

The input slices enc_hidden[0] and enc_hidden[1] are bitwise identical, and all weights are constants. Therefore, the corresponding output slices should also be identical. However, TensorRT returns different values for both an intermediate projection output and the final attention output.

Environment

Observed with:

Python: 3.11.9
TensorRT: 10.7.0.post1 bindings + 10.6.0.23 / 10.16.0.72 libs
PyTorch: 2.5.1+cu124
GPU: NVIDIA L20

Reproduction

Save the following script as repro.py and run:

python repro.py

The script is self-contained. It uses only random FP16 constants drawn from a fixed-seed generator and a duplicated FP16 input batch.

# coding=utf-8
import numpy as np
import tensorrt as trt
import torch


# TensorRT logger shared by the builder and the runtime in build_engine.
LOGGER = trt.Logger(trt.Logger.INFO)

# Problem dimensions. build_engine reshapes the HIDDEN axis into
# (NUM_HEADS, HEAD_DIM), so HIDDEN must equal NUM_HEADS * HEAD_DIM.
BATCH = 2
ENC_LEN = 128
HIDDEN = 1024
NUM_HEADS = 16
HEAD_DIM = 64


def constant(network, array, name):
    """Add *array* to *network* as a named FP16 constant and return its output tensor.

    The data is first cast to ``np.float16`` and made C-contiguous; the
    constant layer is created with the shape of that converted array.
    """
    half = array.astype(np.float16)
    packed = np.ascontiguousarray(half)
    node = network.add_constant(packed.shape, packed)
    node.name = name
    return node.get_output(0)


def shuffle(network, x, name, reshape=None, transpose=None):
    """Add a named shuffle layer on *x* and return its output tensor.

    ``reshape``, when given, is assigned to the layer's ``reshape_dims``.
    ``transpose``, when given, is wrapped in ``trt.Permutation`` and assigned
    to the layer's ``first_transpose``. Either option may be omitted; callers
    in this script pass at most one of them per layer.
    """
    node = network.add_shuffle(x)
    node.name = name

    wants_reshape = reshape is not None
    wants_transpose = transpose is not None

    if wants_reshape:
        node.reshape_dims = reshape
    if wants_transpose:
        node.first_transpose = trt.Permutation(transpose)

    result = node.get_output(0)
    return result


def linear(network, x, weight, bias, name):
    """Add ``x @ weight^T + bias`` to *network* and return the result tensor.

    Every layer created here is named with the *name* prefix
    (``.weight``, ``.bias``, ``.matmul``, ``.add``) so the projection can be
    identified in TensorRT build logs, matching how ``constant`` and
    ``shuffle`` name their layers.

    Args:
        network: TensorRT network definition that receives the layers.
        x: Input tensor; its last dimension is multiplied against the weight.
        weight: NumPy array reshaped to ``(1, HIDDEN, HIDDEN)``.
        bias: NumPy array reshaped to ``(1, 1, HIDDEN)``.
        name: Prefix used for the names of all layers added by this call.

    Returns:
        The output tensor of the bias-add layer.
    """
    # Leading size-1 dimensions let the same weight/bias be used for every
    # batch item instead of storing one copy per item.
    w = constant(network, weight.reshape(1, HIDDEN, HIDDEN), name + ".weight")
    b = constant(network, bias.reshape(1, 1, HIDDEN), name + ".bias")

    # The second operand is transposed by the matmul layer itself.
    matmul = network.add_matrix_multiply(
        x, trt.MatrixOperation.NONE, w, trt.MatrixOperation.TRANSPOSE
    )
    matmul.name = name + ".matmul"

    add = network.add_elementwise(matmul.get_output(0), b, trt.ElementWiseOperation.SUM)
    add.name = name + ".add"
    return add.get_output(0)


def build_engine(weights):
    """Build and deserialize the TensorRT engine for the K/V projection + attention graph.

    The network takes one FP16 input, "enc_hidden", and marks two outputs:
    "v_proj_out" (the reshaped V projection) and "attn_out" (the attention
    result flattened to (BATCH, HIDDEN)).

    Args:
        weights: Mapping that provides the arrays "k_w", "k_b", "v_w" and "v_b"
            passed to ``linear``.

    Returns:
        The deserialized engine.

    Raises:
        RuntimeError: If building the serialized network or deserializing it
            returns ``None``.
    """
    builder = trt.Builder(LOGGER)
    # Network is created with only the STRONGLY_TYPED creation flag set.
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.STRONGLY_TYPED))
    config = builder.create_builder_config()

    # min, opt and max shapes are identical, so the profile covers exactly one
    # shape even though the input's batch dimension is declared dynamic below.
    profile = builder.create_optimization_profile()
    profile.set_shape("enc_hidden", (BATCH, ENC_LEN, HIDDEN), (BATCH, ENC_LEN, HIDDEN), (BATCH, ENC_LEN, HIDDEN))
    config.add_optimization_profile(profile)

    # FP16 input with a dynamic (-1) batch dimension.
    enc_hidden = network.add_input("enc_hidden", trt.DataType.HALF, (-1, ENC_LEN, HIDDEN))

    # Both projections read the same input tensor.
    k = linear(network, enc_hidden, weights["k_w"], weights["k_b"], "k_proj")
    v = linear(network, enc_hidden, weights["v_w"], weights["v_b"], "v_proj")

    # Split the HIDDEN axis into heads: (BATCH, ENC_LEN, NUM_HEADS, HEAD_DIM).
    k = shuffle(network, k, "k_view", reshape=(BATCH, ENC_LEN, NUM_HEADS, HEAD_DIM))
    v = shuffle(network, v, "v_view", reshape=(BATCH, ENC_LEN, NUM_HEADS, HEAD_DIM))
    # First marked output: the reshaped V projection, taken before any transpose.
    v.name = "v_proj_out"
    network.mark_output(v)

    # All-zero query with a single query position per batch item.
    q = constant(network, np.zeros((BATCH, 1, NUM_HEADS, HEAD_DIM), dtype=np.float16), "q")
    q = shuffle(network, q, "q_transpose", transpose=(0, 2, 1, 3))  # (BATCH, NUM_HEADS, 1, HEAD_DIM)
    k = shuffle(network, k, "k_transpose", transpose=(0, 2, 3, 1))  # (BATCH, NUM_HEADS, HEAD_DIM, ENC_LEN)
    v = shuffle(network, v, "v_transpose", transpose=(0, 2, 1, 3))  # (BATCH, NUM_HEADS, ENC_LEN, HEAD_DIM)

    # Attention scores q @ k, multiplied by a (1, 1, 1, 1) constant holding 1 / sqrt(HEAD_DIM).
    qk = network.add_matrix_multiply(q, trt.MatrixOperation.NONE, k, trt.MatrixOperation.NONE).get_output(0)
    scale = constant(network, np.array([[[[1.0 / np.sqrt(HEAD_DIM)]]]], dtype=np.float16), "scale")
    qk = network.add_elementwise(qk, scale, trt.ElementWiseOperation.PROD).get_output(0)

    # axes is a bitmask; bit 3 selects the last axis of the 4-D scores.
    softmax = network.add_softmax(qk)
    softmax.axes = 1 << 3
    attn = softmax.get_output(0)

    # Weighted sum over V, then merge the heads back into (BATCH, HIDDEN).
    out = network.add_matrix_multiply(attn, trt.MatrixOperation.NONE, v, trt.MatrixOperation.NONE).get_output(0)
    out = shuffle(network, out, "out_transpose", transpose=(0, 2, 1, 3))
    out = shuffle(network, out, "out_view", reshape=(BATCH, HIDDEN))
    # Second marked output: the final attention result.
    out.name = "attn_out"
    network.mark_output(out)

    # Both steps are checked for None and turned into RuntimeError.
    serialized = builder.build_serialized_network(network, config)
    if serialized is None:
        raise RuntimeError("TensorRT failed to build the engine")
    engine = trt.Runtime(LOGGER).deserialize_cuda_engine(serialized)
    if engine is None:
        raise RuntimeError("TensorRT failed to deserialize the engine")
    return engine


def run(engine, enc_hidden):
    """Run *engine* once on *enc_hidden* and print how batch items 0 and 1 differ.

    For each output ("v_proj_out", "attn_out") one line is printed with the
    output shape, whether slice 0 equals slice 1 exactly, and the max/mean
    absolute difference between the two slices (computed in float32).

    Args:
        engine: Deserialized TensorRT engine with the input "enc_hidden" and
            the outputs "v_proj_out" and "attn_out".
        enc_hidden: Host NumPy array used as the input; its shape is set as
            the runtime input shape and its data is copied to the GPU.

    Raises:
        RuntimeError: If setting the input shape, binding any tensor address,
            or enqueueing the inference fails.
    """
    context = engine.create_execution_context()
    if not context.set_input_shape("enc_hidden", enc_hidden.shape):
        raise RuntimeError("set_input_shape failed")

    inp = torch.from_numpy(enc_hidden).cuda()
    outputs = {
        "v_proj_out": torch.empty((BATCH, ENC_LEN, NUM_HEADS, HEAD_DIM), dtype=torch.float16, device="cuda"),
        "attn_out": torch.empty((BATCH, HIDDEN), dtype=torch.float16, device="cuda"),
    }

    # Bind the input first, then the outputs. A rejected binding is reported
    # immediately instead of surfacing later as a failed or bogus inference.
    bindings = {"enc_hidden": inp, **outputs}
    for name, tensor in bindings.items():
        if not context.set_tensor_address(name, tensor.data_ptr()):
            raise RuntimeError(f"set_tensor_address failed for {name!r}")

    stream = torch.cuda.Stream()
    with torch.cuda.stream(stream):
        if not context.execute_async_v3(stream.cuda_stream):
            raise RuntimeError("execute_async_v3 failed")
    # Wait for the inference to finish before reading the output buffers.
    stream.synchronize()

    for name, tensor in outputs.items():
        diff = (tensor[0].float() - tensor[1].float()).abs()
        print(f"{name:10s} shape={tuple(tensor.shape)} same={torch.equal(tensor[0], tensor[1])} "
              f"maxdiff={diff.max().item():.8g} meandiff={diff.mean().item():.8g}")

def main():
    """Create seeded FP16 weights and a duplicated input batch, then build and run the engine.

    Raises:
        RuntimeError: If CUDA is not available.
    """
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is required")

    rng = np.random.default_rng(1234)

    def scaled_normal():
        # Standard-normal samples cast to FP16, then scaled by 0.02 in FP16.
        draw = rng.standard_normal((HIDDEN, HIDDEN)).astype(np.float16)
        return (draw * np.float16(0.02)).astype(np.float16)

    # Draw order from the shared generator is k_w, v_w, then the input sample.
    k_w = scaled_normal()
    v_w = scaled_normal()
    weights = {
        "k_w": k_w,
        "k_b": np.zeros(HIDDEN, dtype=np.float16),
        "v_w": v_w,
        "v_b": np.zeros(HIDDEN, dtype=np.float16),
    }

    # One random sample, stacked twice along the batch axis.
    single = rng.standard_normal((1, ENC_LEN, HIDDEN)).astype(np.float16)
    stacked = np.concatenate((single, single), axis=0)
    enc_hidden = np.ascontiguousarray(stacked)
    assert np.array_equal(enc_hidden[0], enc_hidden[1])

    engine = build_engine(weights)
    run(engine, enc_hidden)


# Run the reproduction only when this file is executed as a script.
if __name__ == "__main__":
    main()

Actual output

v_proj_out shape=(2, 128, 16, 64) same=False maxdiff=3.9804688 meandiff=0.71506613
attn_out   shape=(2, 1024) same=False maxdiff=0.41235352 meandiff=0.0860909

Expected output

Because enc_hidden[0] and enc_hidden[1] are bitwise identical and all weights/constants are shared across the batch dimension, the two output slices should be identical:

v_proj_out same=True maxdiff=0
attn_out   same=True maxdiff=0

Graph structure

The minimized graph is equivalent to:

k = linear(enc_hidden).reshape(2, 128, 16, 64)
v = linear(enc_hidden).reshape(2, 128, 16, 64)
q = zeros((2, 1, 16, 64), fp16)

q = transpose(q, [0, 2, 1, 3])       # (2, 16, 1, 64)
k = transpose(k, [0, 2, 3, 1])       # (2, 16, 64, 128)
v = transpose(v, [0, 2, 1, 3])       # (2, 16, 128, 64)

attn = softmax((q @ k) / sqrt(64), axis=-1)
out = (attn @ v).transpose(0, 2, 1, 3).reshape(2, 1024)

The first marked output (v_proj_out) already differs between the two identical batch samples, so the issue appears before the final attention output.

Metadata

Metadata

Assignees

No one assigned

    Labels

    Module:Accuracy — Output mismatch between TensorRT and other frameworks

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions