
Commit ea51821

Merge branch 'flash_attn_pad_bw_seqs' of github.com:sudhakarsingh27/TransformerEngine into flash_attn_pad_bw_seqs
2 parents: c34f6a8 + 791bca7

2 files changed: 15 additions & 5 deletions


tests/pytorch/attention/test_attention.py

Lines changed: 6 additions & 2 deletions
@@ -1176,8 +1176,12 @@ def get_dummy_cuda_rng_tracker() -> CudaRNGStatesTracker:
             max_seqlen_kv=config.max_seqlen_kv,
             cu_seqlens_q=cu_seqlens_q,
             cu_seqlens_kv=cu_seqlens_kv,
-            cu_seqlens_q_padded=cu_seqlens_q_after_pad if backend in ["FusedAttention", "FlashAttention"] else None,
-            cu_seqlens_kv_padded=cu_seqlens_kv_after_pad if backend in ["FusedAttention", "FlashAttention"] else None,
+            cu_seqlens_q_padded=(
+                cu_seqlens_q_after_pad if backend in ["FusedAttention", "FlashAttention"] else None
+            ),
+            cu_seqlens_kv_padded=(
+                cu_seqlens_kv_after_pad if backend in ["FusedAttention", "FlashAttention"] else None
+            ),
             attn_mask_type=config.attn_mask_type,
             checkpoint_core_attention=ckpt_attn,
             core_attention_bias_type=config.attn_bias_type,
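
For context on what the test passes here: cu_seqlens_q/cu_seqlens_kv hold cumulative counts of actual tokens, while cu_seqlens_q_padded/cu_seqlens_kv_padded hold cumulative offsets that include the padding inserted between sequences. A minimal sketch of the relationship, using a hypothetical helper (build_cu_seqlens is illustrative, not part of TransformerEngine):

# Hypothetical illustration, not from this commit: cumulative sequence
# lengths with and without padding between sequences (THD layout).
import torch

def build_cu_seqlens(seqlens, pad_between=0):
    """Prefix sums of per-sequence lengths; optionally add padding after each."""
    padded = [s + pad_between for s in seqlens]
    cu = torch.cumsum(torch.tensor([0] + seqlens), dim=0)        # actual tokens
    cu_padded = torch.cumsum(torch.tensor([0] + padded), dim=0)  # incl. padding
    return cu, cu_padded

cu_seqlens, cu_seqlens_padded = build_cu_seqlens([3, 5, 2], pad_between=1)
# cu_seqlens:        tensor([ 0,  3,  8, 10])
# cu_seqlens_padded: tensor([ 0,  4, 10, 13])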

transformer_engine/pytorch/attention/dot_product_attention/backends.py

Lines changed: 9 additions & 3 deletions
@@ -931,7 +931,9 @@ def forward(
                 cu_seqlens_q_padded if cu_seqlens_q_padded is not None else cu_seqlens_q
             )
             fa_optional_forward_args_thd.append(
-                cu_seqlens_kv_padded if cu_seqlens_kv_padded is not None else cu_seqlens_kv
+                cu_seqlens_kv_padded
+                if cu_seqlens_kv_padded is not None
+                else cu_seqlens_kv
             )
             fa_optional_forward_args_thd.append(max_seqlen_q)
             fa_optional_forward_args_thd.append(max_seqlen_kv)
@@ -973,8 +975,12 @@ def forward(
             # in addition to `cu_seqlens_q_padded` and `cu_seqlens_kv_padded` to avoid affecting the
             # padding positions.
             if pad_between_seqs:
-                fa_3_optional_forward_kwargs["seqused_q"] = cu_seqlens_q[1:] - cu_seqlens_q[:-1]
-                fa_3_optional_forward_kwargs["seqused_k"] = cu_seqlens_kv[1:] - cu_seqlens_kv[:-1]
+                fa_3_optional_forward_kwargs["seqused_q"] = (
+                    cu_seqlens_q[1:] - cu_seqlens_q[:-1]
+                )
+                fa_3_optional_forward_kwargs["seqused_k"] = (
+                    cu_seqlens_kv[1:] - cu_seqlens_kv[:-1]
+                )
             else:
                 fa_3_optional_forward_kwargs["cu_seqlens_q"] = cu_seqlens_q
                 fa_3_optional_forward_kwargs["max_seqlen_q"] = max_seqlen_q
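
The pad_between_seqs branch works because adjacent differences of the cumulative lengths recover each sequence's true token count, which is what seqused_q/seqused_k carry so the padded positions between sequences are never touched. A worked example with made-up values:

# Illustration only (values are made up): per-sequence lengths from
# cumulative sequence lengths via adjacent differences.
import torch

cu_seqlens_q = torch.tensor([0, 3, 8, 10])        # actual-token prefix sums
seqused_q = cu_seqlens_q[1:] - cu_seqlens_q[:-1]  # -> tensor([3, 5, 2])

# With padding between sequences, cu_seqlens_q_padded positions each sequence
# in the padded buffer, while seqused_q limits the kernel to the real tokens,
# leaving the padding positions untouched.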
