NVIDIA-NeMo · artbataev · Jan 21, 2026 · Jan 21, 2026 · Jan 21, 2026 · Jan 21, 2026
diff --git a/nemo/collections/asr/parts/submodules/rnnt_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_decoding.py
@@ -446,6 +446,7 @@ def __init__(self, decoding_cfg, decoder, joint, blank_id: int, supported_punctu
                     ),
                     preserve_alignments=self.preserve_alignments,
                     preserve_frame_confidence=self.preserve_frame_confidence,
+                    exclude_blank_from_confidence=self.exclude_blank_from_confidence,
                     confidence_method_cfg=self.confidence_method_cfg,
                     loop_labels=self.cfg.greedy.get('loop_labels', True),
                     use_cuda_graph_decoder=self.cfg.greedy.get('use_cuda_graph_decoder', True),
@@ -464,6 +465,7 @@ def __init__(self, decoding_cfg, decoder, joint, blank_id: int, supported_punctu
                     ),
                     preserve_alignments=self.preserve_alignments,
                     preserve_frame_confidence=self.preserve_frame_confidence,
+                    exclude_blank_from_confidence=self.exclude_blank_from_confidence,
                     include_duration=self.tdt_include_token_duration,
                     include_duration_confidence=self.tdt_include_duration_confidence,
                     confidence_method_cfg=self.confidence_method_cfg,
@@ -815,18 +817,40 @@ def compute_confidence(self, hypotheses_list: List[Hypothesis]) -> List[Hypothes
         """
         if self._is_tdt:
             # if self.tdt_include_duration_confidence is True then frame_confidence elements consist of two numbers
-            maybe_pre_aggregate = (
-                (lambda x: self._aggregate_confidence(x)) if self.tdt_include_duration_confidence else (lambda x: x)
-            )
-            for hyp in hypotheses_list:
-                token_confidence = []
-                # trying to recover frame_confidence according to alignments
-                subsequent_blank_confidence = []
-                # going backwards since <blank> tokens are considered belonging to the last non-blank token.
-                for fc, fa in zip(hyp.frame_confidence[::-1], hyp.alignments[::-1]):
-                    # there is only one score per frame most of the time
-                    if len(fa) > 1:
-                        for i, a in reversed(list(enumerate(fa))):
+            if self.exclude_blank_from_confidence and all(
+                hyp.non_blank_step_confidence_precomputed is not None for hyp in hypotheses_list
+            ):
+                for hyp in hypotheses_list:
+                    hyp.token_confidence = hyp.non_blank_step_confidence_precomputed
+            else:
+                maybe_pre_aggregate = (
+                    (lambda x: self._aggregate_confidence(x))
+                    if self.tdt_include_duration_confidence
+                    else (lambda x: x)
+                )
+                for hyp in hypotheses_list:
+                    token_confidence = []
+                    # trying to recover frame_confidence according to alignments
+                    subsequent_blank_confidence = []
+                    # going backwards since <blank> tokens are considered belonging to the last non-blank token.
+                    for fc, fa in zip(hyp.frame_confidence[::-1], hyp.alignments[::-1]):
+                        # there is only one score per frame most of the time
+                        if len(fa) > 1:
+                            for i, a in reversed(list(enumerate(fa))):
+                                if a[-1] == self.blank_id:
+                                    if not self.exclude_blank_from_confidence:
+                                        subsequent_blank_confidence.append(maybe_pre_aggregate(fc[i]))
+                                elif not subsequent_blank_confidence:
+                                    token_confidence.append(maybe_pre_aggregate(fc[i]))
+                                else:
+                                    token_confidence.append(
+                                        self._aggregate_confidence(
+                                            [maybe_pre_aggregate(fc[i])] + subsequent_blank_confidence
+                                        )
+                                    )
+                                    subsequent_blank_confidence = []
+                        else:
+                            i, a = 0, fa[0]
                             if a[-1] == self.blank_id:
                                 if not self.exclude_blank_from_confidence:
                                     subsequent_blank_confidence.append(maybe_pre_aggregate(fc[i]))
@@ -839,20 +863,8 @@ def compute_confidence(self, hypotheses_list: List[Hypothesis]) -> List[Hypothes
                                     )
                                 )
                                 subsequent_blank_confidence = []
-                    else:
-                        i, a = 0, fa[0]
-                        if a[-1] == self.blank_id:
-                            if not self.exclude_blank_from_confidence:
-                                subsequent_blank_confidence.append(maybe_pre_aggregate(fc[i]))
-                        elif not subsequent_blank_confidence:
-                            token_confidence.append(maybe_pre_aggregate(fc[i]))
-                        else:
-                            token_confidence.append(
-                                self._aggregate_confidence([maybe_pre_aggregate(fc[i])] + subsequent_blank_confidence)
-                            )
-                            subsequent_blank_confidence = []
-                token_confidence = token_confidence[::-1]
-                hyp.token_confidence = token_confidence
+                    token_confidence = token_confidence[::-1]
+                    hyp.token_confidence = token_confidence
         else:
             if self.exclude_blank_from_confidence:
                 for hyp in hypotheses_list:

diff --git a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py
@@ -610,6 +610,7 @@ def __init__(
         max_symbols_per_step: Optional[int] = None,
         preserve_alignments: bool = False,
         preserve_frame_confidence: bool = False,
+        exclude_blank_from_confidence: bool = False,
         confidence_method_cfg: Optional[DictConfig] = None,
         loop_labels: bool = True,
         use_cuda_graph_decoder: bool = True,
@@ -629,6 +630,7 @@ def __init__(
 
         self.use_cuda_graph_decoder = use_cuda_graph_decoder
         self.loop_labels = loop_labels
+        self.exclude_blank_from_confidence = exclude_blank_from_confidence
 
         # Depending on availability of `blank_as_pad` support
         # switch between more efficient batch decoding technique
@@ -643,7 +645,8 @@ def __init__(
                     blank_index=self._blank_index,
                     max_symbols_per_step=self.max_symbols,
                     preserve_alignments=preserve_alignments,
-                    preserve_frame_confidence=preserve_frame_confidence,
+                    preserve_step_confidence=preserve_frame_confidence,
+                    exclude_blank_from_confidence=self.exclude_blank_from_confidence,
                     confidence_method_cfg=confidence_method_cfg,
                     allow_cuda_graphs=self.use_cuda_graph_decoder,
                     fusion_models=fusion_models,
@@ -2839,6 +2842,7 @@ def __init__(
         max_symbols_per_step: Optional[int] = None,
         preserve_alignments: bool = False,
         preserve_frame_confidence: bool = False,
+        exclude_blank_from_confidence: bool = False,
         include_duration: bool = False,
         include_duration_confidence: bool = False,
         confidence_method_cfg: Optional[DictConfig] = None,
@@ -2859,6 +2863,7 @@ def __init__(
         self.durations = durations
         self.include_duration = include_duration
         self.include_duration_confidence = include_duration_confidence
+        self.exclude_blank_from_confidence = exclude_blank_from_confidence
 
         # Depending on availability of `blank_as_pad` support
         # switch between more efficient batch decoding technique
@@ -2873,7 +2878,8 @@ def __init__(
                 durations=self.durations,
                 max_symbols_per_step=self.max_symbols,
                 preserve_alignments=preserve_alignments,
-                preserve_frame_confidence=preserve_frame_confidence,
+                preserve_step_confidence=preserve_frame_confidence,
+                exclude_blank_from_confidence=self.exclude_blank_from_confidence,
                 include_duration=include_duration,
                 include_duration_confidence=include_duration_confidence,
                 confidence_method_cfg=confidence_method_cfg,