TP refactor for FSDP + TP integration (#45028)
Conversation
The docs for this PR live here. All of your documentation changes will be reflected on that endpoint. The docs are available until 30 days after the last update.
Force-pushed fcea5ce to f98e208
- DtensorShardOperation for range-math shard-on-read
- spawn_materialize() enhancements
- from_pretrained wiring for distributed config
- Shard operation helpers in tensor_parallel
- Shard-on-read and LoadStateDictConfig tests
Force-pushed 607cc11 to 739332c
- Replace hook-based TP with DTensor-based TPStyle API
- TPStyle dataclass with dense kinds: colwise, rowwise, vocab
- apply_tensor_parallel() using PyTorch parallelize_module
- verify_tp_plan() for plan validation
- Update dense model configs (llama, mistral, qwen2, phi, glm) to TPStyle
- DTensor apply_rotary_pos_emb guard for llama, mistral, qwen3
- Extended DistributedConfig with tp/fsdp size and plan fields
- DistributedConfig serialization in configuration_utils
- MXFP4 NotImplementedError for DTensor TP
- Dense TP tests
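To make the commit message above concrete, here is a minimal sketch of what a TPStyle-based plan might look like. This is purely illustrative: the field names (`kind`, `comm`) and the stand-in dataclass are assumptions, not the actual transformers API.

```python
from dataclasses import dataclass


# Hypothetical stand-in for the TPStyle dataclass described in the
# commit message; field names are assumptions, not the real API.
@dataclass(frozen=True)
class TPStyle:
    kind: str          # e.g. "colwise", "rowwise", "vocab"
    comm: str = "none" # e.g. "allgather", "reduce_scatter", "loss_parallel"


# A dense-model TP plan maps module-path patterns to styles.
tp_plan = {
    "layers.*.self_attn.q_proj": TPStyle("colwise"),
    "layers.*.self_attn.o_proj": TPStyle("rowwise"),
    "embed_tokens": TPStyle("vocab"),
    "lm_head": TPStyle("colwise", "allgather"),
}

print(tp_plan["lm_head"].comm)  # allgather
```

A plan like this would then be handed to something like `apply_tensor_parallel()`, which dispatches on `kind` and `comm` per module.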
Force-pushed 1aa7f5f to 11b55a2
Force-pushed dbc9619 to c567240
Force-pushed 34a5085 to eb428cc
Force-pushed c567240 to c1dab9e
Force-pushed eb428cc to e0c4e06
* MoE expert parallelism + sequence parallelism
  - Add PackedColwiseParallel for fused gate_up_proj weights
  - Add MoEExpertsParallel with per-expert DTensor sharding
  - Add PrepareModuleInputOutput for SP allgather/split hooks
  - Add _AllReduceBackward for MoE routing weight gradients
  - Extend TPStyle with moe_experts, packed_colwise, activation, module kinds
  - _StridedShard handling in core_model_loading for interleaved weights
  - MoE model configs: mixtral, deepseek_v3, qwen3 with SP plans
  - DTensor rotary_pos_emb guard for mixtral
* Fix ruff linting and formatting
* Fix ruff formatting in core_model_loading.py
* Restore _IdentityOp accidentally removed in 25a1f48
  The _IdentityOp class (added by PR #44983) was accidentally deleted during the MoE expert parallelism work. It is needed by finegrained_fp8.py and metal_quantization.py as a pass-through reverse_op for dequantize operations.
  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
* Backport new TP/FSDP API + fix DTensor imports in Copied-from models
* from_pretrained orchestration + distributed save/load (#45409)
  * from_pretrained orchestration + save/load
    - Add gather_full_state_dict() for DTensor→full tensor saving
    - Add convert_strided_to_shard() / restore_strided_from_shard() for DCP
    - Add _redistribute_dtensor() helper
    - Full distributed_config integration in from_pretrained/save_pretrained
    - Rename apply_fsdp2 → apply_fully_shard_data_parallel
    - save_optimizer() / load_optimizer() in distributed/utils
    - Trainer integration with distributed_config
    - Updated FSDP and TP tests for new orchestration API
    - DTensor shard-on-read test updates
  * revert distributed utils
  * eaaea
  * all tests for core modeling are passing
  * populate import from init for tp
  * ruff
  * ruff
  ---------
  Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
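The commits above mention `gather_full_state_dict()` for DTensor→full tensor saving. A torch-free sketch of the idea follows: DTensor exposes a `full_tensor()` method that all-gathers the shards into a full tensor, so the gather step can duck-type on it. The `FakeDTensor` stub below is purely illustrative and only imitates that surface.

```python
# Torch-free sketch of gathering a state dict of (possibly sharded)
# tensors into full tensors before saving. Real DTensors expose
# full_tensor(); FakeDTensor below only imitates that method.
def gather_full_state_dict(state_dict):
    out = {}
    for name, value in state_dict.items():
        if hasattr(value, "full_tensor"):  # DTensor-like: materialize it
            out[name] = value.full_tensor()
        else:                              # already a plain tensor
            out[name] = value
    return out


class FakeDTensor:
    """Illustrative stand-in: pretends a 2-way shard exists."""

    def __init__(self, local_shard):
        self.local_shard = local_shard

    def full_tensor(self):
        # Pretend the all-gather concatenates two identical shards.
        return self.local_shard + self.local_shard


sd = {"w": FakeDTensor([1, 2]), "b": [0, 0]}
full = gather_full_state_dict(sd)
print(full)  # {'w': [1, 2, 1, 2], 'b': [0, 0]}
```

On a real mesh this runs a collective per parameter, so the actual implementation presumably streams or frees shards as it goes.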
```python
    return


# Filter out module-level comm hooks — they don't shard weights
_NON_WEIGHT_KINDS = {"activation", "module"}
```
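The `_NON_WEIGHT_KINDS` filter above suggests that plan entries of kind `"activation"` or `"module"` only install communication hooks and never shard parameters. A small sketch of how such a filter might be applied when walking a plan; the `TPStyle` dataclass here is an illustrative stand-in, not the real class.

```python
from dataclasses import dataclass


@dataclass(frozen=True)
class TPStyle:  # illustrative stand-in only
    kind: str
    comm: str = "none"


# Module-level comm hooks don't shard weights, so skip them when
# deciding which parameters to distribute.
_NON_WEIGHT_KINDS = {"activation", "module"}

plan = {
    "layers.*.input_layernorm": TPStyle("activation"),
    "layers.*.self_attn": TPStyle("module", "allgather"),
    "layers.*.self_attn.q_proj": TPStyle("colwise"),
}

weight_entries = {k: v for k, v in plan.items() if v.kind not in _NON_WEIGHT_KINDS}
print(sorted(weight_entries))  # ['layers.*.self_attn.q_proj']
```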
maybe separate TP and SP styles?
Restores modeling files to their base branch versions so the PR diff only shows the distributed/patches.py monkey-patch approach instead of noisy function moves in modeling files. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
- Convert string plan values ("colwise", "rowwise", etc.) to TPStyle
objects across 66+ model configs and modular files
- Consolidate MoE expert sub-entries into TPStyle("moe_experts", ...)
with shard_plan
- Remove "replicated_with_grad_allreduce" entries (not needed for
DTensor TP)
- Migrate _tp_plan class attributes in modeling files from
"colwise_gather_output" to TPStyle("colwise", "allgather")
- Add TypeError in apply_tensor_parallel for unsupported plan values
- Remove old TensorParallelLayer tests (API removed in DTensor refactor)
- Regenerate auto-generated files via modular converter
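The migration above converts legacy string plan values into TPStyle objects and raises a TypeError for anything unsupported. A sketch of that normalization step; the mapping, the stand-in `TPStyle` dataclass, and the helper name are assumptions for illustration.

```python
from dataclasses import dataclass


@dataclass(frozen=True)
class TPStyle:  # illustrative stand-in only
    kind: str
    comm: str = "none"


# How legacy string plan values might map onto TPStyle objects;
# "colwise_gather_output" becomes an explicit allgather comm.
_LEGACY_STRINGS = {
    "colwise": TPStyle("colwise"),
    "rowwise": TPStyle("rowwise"),
    "vocab": TPStyle("vocab"),
    "colwise_gather_output": TPStyle("colwise", "allgather"),
}


def normalize_plan_value(value):
    if isinstance(value, TPStyle):
        return value
    if isinstance(value, str) and value in _LEGACY_STRINGS:
        return _LEGACY_STRINGS[value]
    # Mirrors the TypeError described in the commit message.
    raise TypeError(f"Unsupported tp_plan value: {value!r}")


print(normalize_plan_value("colwise_gather_output"))  # TPStyle(kind='colwise', comm='allgather')
```

Entries like `"replicated_with_grad_allreduce"` have no TPStyle equivalent, which is why the commit removes them rather than mapping them.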
```python
    return model


if isinstance(fsdp_plan, str):
    if fsdp_plan == "auto":
```
Define fsdp_plan in every model and remove the auto code path (will be done in the next PR).
These string plan values have no TPStyle equivalent in the DTensor system. Remove them to avoid TypeError at apply_tensor_parallel time. Affected models: deepseek_v2, glm4_moe_lite, glm_moe_dsa, longcat_flash.
```python
# loss_parallel patches F.cross_entropy to work with Shard(-1) logits.
# It must be active during both forward and backward, so we enable it
# once rather than as a context manager.
has_loss_parallel = any(isinstance(v, TPStyle) and v.comm == "loss_parallel" for v in tp_plan.values())
```
```python
def _inject_sp_metadata(mod, args, kwargs):
    input_ids = kwargs.get("input_ids", args[0] if args else None)
    if input_ids is None:
        return args, kwargs
    if "position_ids" not in kwargs or kwargs["position_ids"] is None:
        seq_len = input_ids.shape[1]
        kwargs["position_ids"] = torch.arange(seq_len, device=input_ids.device).unsqueeze(0)
    return args, kwargs
```
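The hook above fills in default position_ids when the caller didn't pass any. A torch-free analogue of the same control flow, with nested lists standing in for `(batch, seq)` tensors, so the default-injection logic can be exercised without a GPU or torch install:

```python
# Torch-free analogue of _inject_sp_metadata: if the caller didn't
# pass position_ids, synthesize the default [0, 1, ..., seq_len - 1].
def inject_sp_metadata(args, kwargs):
    input_ids = kwargs.get("input_ids", args[0] if args else None)
    if input_ids is None:
        return args, kwargs
    if kwargs.get("position_ids") is None:
        seq_len = len(input_ids[0])  # input_ids is (batch, seq)
        kwargs["position_ids"] = [list(range(seq_len))]
    return args, kwargs


_, kw = inject_sp_metadata((), {"input_ids": [[5, 6, 7]]})
print(kw["position_ids"])  # [[0, 1, 2]]
```

Caller-supplied position_ids pass through untouched; only the `None`/missing case is filled in.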
ParallelStyle with just prepare inputs
SequenceParallelInput(Parallel) -> I need to make this explicit, and it already exists
Sure, but that would mean specifying an empty-string key in the base_model_sp_plan, since it's not at module level and more at the base model:

```python
base_model_sp_plan = {
    "": SequenceParallelInput(),
    "embed_tokens": TPStyle("vocab", "reduce_scatter"),
    "layers.*.input_layernorm": TPStyle("activation", "none"),
    "layers.*.self_attn": TPStyle("module", "allgather", input_key="hidden_states"),
    "layers.*.self_attn.q_proj": TPStyle("colwise", "none"),
    # ...
}
```
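Plan keys like `"layers.*.self_attn.q_proj"` above are glob-style patterns over module paths. A sketch of how such keys might be resolved against a concrete module path; that the real matcher uses `fnmatch` semantics is an assumption for illustration.

```python
from fnmatch import fnmatch

# Sketch: resolve which plan pattern applies to a concrete module path.
# fnmatch's "*" matches any characters, including dots, which is enough
# for layer-index wildcards like "layers.*.self_attn.q_proj".
plan_keys = [
    "embed_tokens",
    "layers.*.input_layernorm",
    "layers.*.self_attn.q_proj",
]


def find_plan_key(module_path):
    for pattern in plan_keys:
        if fnmatch(module_path, pattern):
            return pattern
    return None


print(find_plan_key("layers.3.self_attn.q_proj"))  # layers.*.self_attn.q_proj
```

An empty-string key like the `""` entry above would need special-casing (it names the base model itself, not a glob).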
```python
import sys
from functools import wraps

from torch.distributed.tensor import DTensor, Replicate
```
let's discuss: https://github.com/huggingface/transformers/blob/7f49ecc51cacc6b9b60151ebb6a32e66eb71d163/src/transformers/models/llama/modeling_llama.py#L135
```diff
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
+    def __init__(self, output_layouts=None):
```
Check, for TP (on the expert dim), that FSDP when slicing expert ids does gather hidden_dim, since that's what it's expected to do.
```python
def is_dtensor_like(value: Any) -> bool:
    return all(hasattr(value, attr) for attr in ("device_mesh", "placements", "to_local"))


class DtensorShardOperation:
```
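`is_dtensor_like()` above checks for DTensor's surface via duck typing rather than importing the class. A quick demonstration with a stub, no torch required; `StubDTensor` is purely illustrative:

```python
from typing import Any


# Duck-typed check for DTensor-ness, as in the snippet above: a DTensor
# exposes device_mesh, placements, and to_local().
def is_dtensor_like(value: Any) -> bool:
    return all(hasattr(value, attr) for attr in ("device_mesh", "placements", "to_local"))


class StubDTensor:  # illustrative stand-in only
    device_mesh = None
    placements = ()

    def to_local(self):
        return "local shard"


print(is_dtensor_like(StubDTensor()))  # True
print(is_dtensor_like([1, 2, 3]))      # False
```

Duck typing keeps the helper importable when torch.distributed isn't available, at the cost of accepting anything with those three attributes.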
Drop the shard-on-read approach. We should rely on the DTensor distribute_tensor API for loading; that will avoid us doing any slicing because of shard-on-read.
…rsions (e.g. _StridedShard↔Shard). We force replicate beforehand.
[For maintainers] Suggested jobs to run (before merge):
run-slow: afmoe, apertus, arcee, aria, audioflamingo3, bamba, bitnet, cohere
View the CircleCI Test Summary for this PR: https://huggingface.co/spaces/transformers-community/circle-ci-viz?pr=45028&sha=f710f0
verify_all_loss -> training with saving + loading back (for generate?)
Verify loading
Verify training