NVIDIA-NeMo · MahmoudAshraf97 · Jan 27, 2026 · Feb 7, 2026 · Feb 7, 2026
diff --git a/nemo/collections/asr/modules/conformer_encoder.py b/nemo/collections/asr/modules/conformer_encoder.py
@@ -1026,8 +1026,10 @@ def setup_streaming_params(
             if hasattr(m, "_max_cache_len"):
                 if isinstance(m, MultiHeadAttention):
                     m.cache_drop_size = streaming_cfg.cache_drop_size
+                    m.valid_query_length = streaming_cfg.valid_out_len
                 if isinstance(m, CausalConv1D):
                     m.cache_drop_size = streaming_cfg.cache_drop_size
+                    m.valid_query_length = streaming_cfg.valid_out_len
 
         self.streaming_cfg = streaming_cfg
 

diff --git a/nemo/collections/asr/parts/submodules/causal_convs.py b/nemo/collections/asr/parts/submodules/causal_convs.py
@@ -62,7 +62,8 @@ def __init__(
         )
 
     def forward(
-        self, x,
+        self,
+        x,
     ):
         x = F.pad(x, pad=(self._left_padding, self._right_padding, self._left_padding, self._right_padding))
         x = super().forward(x)
@@ -96,6 +97,7 @@ def __init__(
         dtype=None,
     ) -> None:
         self.cache_drop_size = None
+        self.valid_query_length = None
         if padding is None:
             self._left_padding = kernel_size - 1
             self._right_padding = stride - 1
@@ -134,11 +136,8 @@ def update_cache(self, x, cache=None):
         else:
             new_x = F.pad(x, pad=(0, self._right_padding))
             new_x = torch.cat([cache, new_x], dim=-1)
-            if self.cache_drop_size > 0:
-                next_cache = new_x[:, :, : -self.cache_drop_size]
-            else:
-                next_cache = new_x
-            next_cache = next_cache[:, :, -cache.size(-1) :]
+            cache_keep_size = self.valid_query_length - self.cache_drop_size
+            next_cache = torch.cat([cache, x[:, :, :cache_keep_size]], dim=-1)[:, :, -cache.size(-1) :]
         return new_x, next_cache
 
     def forward(self, x, cache=None):

diff --git a/nemo/collections/asr/parts/submodules/multi_head_attention.py b/nemo/collections/asr/parts/submodules/multi_head_attention.py
@@ -86,6 +86,7 @@ def __init__(
         self.use_pytorch_sdpa_backends = use_pytorch_sdpa_backends
 
         self.cache_drop_size = None
+        self.valid_query_length = None
         self.use_bias = use_bias
         self.dropout_rate = dropout_rate
         assert n_feat % n_head == 0
@@ -204,7 +205,7 @@ def forward(self, query, key, value, mask, pos_emb=None, cache=None):
     def update_cache(self, key, value, query, cache):
         if cache is not None:
             key = value = torch.cat([cache, key], dim=1)
-            q_keep_size = query.shape[1] - self.cache_drop_size
+            q_keep_size = self.valid_query_length - self.cache_drop_size
             cache = torch.cat([cache[:, q_keep_size:, :], query[:, :q_keep_size, :]], dim=1)
         return key, value, query, cache