Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
274 commits
Select commit Hold shift + click to select a range
2727104
Wire HF parity into validation workflow
FurtherAI Apr 13, 2026
e835237
Stabilize megatron HF parity runtime
FurtherAI Apr 13, 2026
84d59e0
Drop HF parity delta checks
FurtherAI Apr 13, 2026
362160a
Wire lora coverage and correctness into workflow
FurtherAI Apr 13, 2026
8e43cdd
Wire merged vllm serving into workflow
FurtherAI Apr 13, 2026
3580730
Isolate workflow stages in subprocesses
FurtherAI Apr 14, 2026
95b07e6
Add model support trainability workflow stages
FurtherAI Apr 14, 2026
592d99e
Add realistic packed-position validation and runtime cleanup
FurtherAI Apr 15, 2026
0cf988b
Use real preprocess in packed position validation
FurtherAI Apr 15, 2026
1db721a
Move megatron preprocess patching into model handlers
FurtherAI Apr 15, 2026
9b4c2ac
Replace chat template rollout with conformance suite
FurtherAI Apr 16, 2026
d0a3198
Wait for dedicated vLLM health before serving
FurtherAI Apr 16, 2026
8dd17f6
Fix Qwen3.5 trainability and packed position handling
FurtherAI Apr 16, 2026
faeca8a
Log correctness runs and narrow DeepEP gating
FurtherAI Apr 16, 2026
5ac1f0c
WIP snapshot current megatron bridge/model support state
FurtherAI Apr 21, 2026
c15075f
Split Megatron runtime trainable modes for HF parity
FurtherAI Apr 21, 2026
0f96868
Restore Qwen3.5 text-only SP embedding scatter
FurtherAI Apr 21, 2026
aa708cc
Restore oracle flex attention eager path
FurtherAI Apr 21, 2026
cad8003
Fix Qwen3.5 GDN LoRA TP shard ordering
FurtherAI Apr 22, 2026
383f0aa
Gate DeepEP to supported runtime dtypes
FurtherAI Apr 22, 2026
1144295
Revert invalid flex attention compile toggle
FurtherAI Apr 22, 2026
1cd848e
Restore oracle-only DeepEP fp32 override
FurtherAI Apr 22, 2026
df39090
Generalize LoRA shard manifests and pin block mask compile backend
FurtherAI Apr 22, 2026
5a9388f
Fix sensitivity harness for Qwen3.5 workflow
FurtherAI Apr 22, 2026
6eb6d91
Validate packed position ids with oracle metric
FurtherAI Apr 24, 2026
c307576
Add vllm separation integration test harness
FurtherAI Apr 27, 2026
cb9fa84
Cut over ART core to external vLLM runtime
FurtherAI Apr 27, 2026
740c79e
Add vLLM separation integration checks
FurtherAI Apr 27, 2026
c29563f
Update lockfile for vLLM separation
FurtherAI Apr 27, 2026
31e430d
Fix vLLM separation test package imports
FurtherAI Apr 27, 2026
ae73761
Resolve vLLM separation test repo root via git
FurtherAI Apr 27, 2026
74f3c44
Fix runtime project root resolution in worktrees
FurtherAI Apr 27, 2026
f0888ec
Add service import smoke for vLLM-free ART env
FurtherAI Apr 27, 2026
c7ac04a
Fix service import smoke command
FurtherAI Apr 27, 2026
686285b
Implement multi-rank Megatron merged sync orchestration
FurtherAI Apr 27, 2026
9785444
Fix concurrent init assertion in merged sync tests
FurtherAI Apr 27, 2026
983a2d0
Add runtime boundary service checks
FurtherAI Apr 27, 2026
84ae38b
Add opt-in live local backend runtime smoke
FurtherAI Apr 27, 2026
db39cec
Add direct runtime live smoke
FurtherAI Apr 27, 2026
5c1f4bb
Fix runtime sleep route pause mode import
FurtherAI Apr 27, 2026
6f9d2d7
Add live Megatron separation smokes
FurtherAI Apr 27, 2026
8262767
Fix merged NCCL bootstrap across split runtimes
FurtherAI Apr 27, 2026
1e8f6a2
Normalize raw NCCL ids in runtime wrapper
FurtherAI Apr 27, 2026
42c9237
Fix runtime normalization regression test
FurtherAI Apr 27, 2026
b2006a8
Load full runtime patches in vLLM worker plugins
FurtherAI Apr 27, 2026
8ebb936
Fail fast when Megatron job worker exits
FurtherAI Apr 27, 2026
a7fa7ac
Keep NCCL bootstrap store alive during sync
FurtherAI Apr 27, 2026
f4747fa
Add workflow-style trainability validation matrix
FurtherAI Apr 27, 2026
3e8c61f
Add EP LoRA localization in runtime
FurtherAI Apr 27, 2026
42cecd5
Fix EP MoE LoRA alignment in runtime
FurtherAI Apr 27, 2026
54a8217
Fix runtime EP alignment test harness
FurtherAI Apr 27, 2026
d27afb8
Fix runtime EP LoRA align expert map handling
FurtherAI Apr 27, 2026
f72fff1
Add Qwen3 MoE DeepEP compile workaround
FurtherAI Apr 27, 2026
03506c8
Fix unsloth yes-no trainability config
FurtherAI Apr 27, 2026
b748494
Import unsloth during art startup
FurtherAI Apr 27, 2026
824943d
Tune unsloth yes-no validation defaults
FurtherAI Apr 27, 2026
579cc27
Stabilize unsloth yes-no validation
FurtherAI Apr 27, 2026
f0f772c
Handle unsloth banner in import tests
FurtherAI Apr 27, 2026
670d120
Use default trainability logprob settings
FurtherAI Apr 27, 2026
09fe7eb
Release GPU state between trainability tests
FurtherAI Apr 27, 2026
e831345
Use 1024 packed sequence validation defaults
FurtherAI Apr 27, 2026
513ff43
Stabilize live yes-no validation defaults
FurtherAI Apr 27, 2026
cda94a5
Retry GPU memory recovery in live validation
FurtherAI Apr 27, 2026
69d540a
Add longer Megatron separation live smokes
FurtherAI Apr 28, 2026
9456acb
Remove Megatron auto-setup fallback
FurtherAI Apr 28, 2026
663c6d8
Launch Megatron worker in active env
FurtherAI Apr 28, 2026
9fb5650
Launch vLLM runtime from dedicated env
FurtherAI Apr 28, 2026
b63af40
Fix runtime launcher regression test
FurtherAI Apr 28, 2026
70bd723
Add GDN shared-prefix packed sequence support
FurtherAI Apr 30, 2026
4d17742
Handle sparse Qwen3 MoE expert parity grads
FurtherAI Apr 30, 2026
1fdda3b
Fix GDN sequence-parallel output shapes
FurtherAI Apr 30, 2026
26ae3b8
Respect rollout mode in yes-no trainability
FurtherAI Apr 30, 2026
a5a0446
Cast GDN bucket outputs before scatter
FurtherAI Apr 30, 2026
2ffcb65
Add GDN layout planning support
FurtherAI May 1, 2026
96cdf53
Package vLLM runtime as managed bundle
FurtherAI May 1, 2026
b4a570e
Add ART service lifecycle cleanup
FurtherAI May 2, 2026
e251187
Fix lifecycle cleanup edge cases
FurtherAI May 2, 2026
8f0fcb3
Run Megatron trainability tests out of process
FurtherAI May 2, 2026
a72638d
Allow slow actor startup imports
FurtherAI May 2, 2026
3824036
Fix merged trainability model list assertion
FurtherAI May 2, 2026
133adba
Avoid managed process signal wait deadlock
FurtherAI May 2, 2026
1161211
Stop managed children when wrapper dies
FurtherAI May 2, 2026
77fecd1
Restore dedicated Unsloth SFT guard
FurtherAI May 2, 2026
068c9ce
Address remaining vLLM separation review findings
FurtherAI May 2, 2026
243ef8c
Add Qwen3.5/3.6 native vLLM LoRA support path
FurtherAI May 2, 2026
20cc5ea
Update vLLM runtime to official 0.19.1
FurtherAI May 2, 2026
3dd13ad
Wire native LoRA support through handlers
FurtherAI May 2, 2026
986cb6e
Adapt runtime routes to vLLM 0.19 app API
FurtherAI May 2, 2026
3fc3120
Fix dense Qwen35 text-only validation path
FurtherAI May 3, 2026
5c6a8d9
Add env gate for workflow sensitivity stage
FurtherAI May 3, 2026
44f88d5
Prepare native vLLM MoE LoRA checkpoints
FurtherAI May 3, 2026
6ee8f27
Relax packed position id MoE tolerance
FurtherAI May 3, 2026
ea8bf50
Mark Qwen3.5 MoE native LoRA as validated
FurtherAI May 3, 2026
423224f
Enable Qwen3.5/3.6 LoRA rollout defaults
FurtherAI May 3, 2026
ec9fcb3
Lazy-load tinker server export
FurtherAI May 3, 2026
4485f45
Stub tinker in renderer unit tests
FurtherAI May 3, 2026
aa4b825
Lazy-load tinker native backend export
FurtherAI May 3, 2026
4f8781b
Gate shared expert parallel by model family
FurtherAI May 3, 2026
9c95945
Split dense and MoE shared config expectations
FurtherAI May 3, 2026
c4f46ce
Revert "Lazy-load tinker server export"
FurtherAI May 3, 2026
9dc95d3
Revert "Lazy-load tinker native backend export"
FurtherAI May 3, 2026
293758e
Remove shared FC1 LoRA shape fallback
FurtherAI May 3, 2026
0825421
Revert runtime LoRA checkpoint rewriting
FurtherAI May 3, 2026
61755c4
Add native vLLM LoRA layout probe
FurtherAI May 3, 2026
58508ca
Expand native vLLM LoRA layout probe
FurtherAI May 3, 2026
84b9861
Make Megatron LoRA disk checkpoints vLLM canonical
FurtherAI May 3, 2026
f445bb3
Keep Megatron LoRA shards native
FurtherAI May 3, 2026
133da5e
Avoid redundant identity LoRA config save
FurtherAI May 3, 2026
4c7ef23
Split Megatron dense and MoE model support
FurtherAI May 3, 2026
ee53c05
Gate Megatron model support registry
FurtherAI May 3, 2026
15f70c3
Filter oracle variants by visible GPUs
FurtherAI May 3, 2026
16ccb57
Add Qwen3 dense probe handler
FurtherAI May 3, 2026
9c77732
Use registry for Megatron model support gating
FurtherAI May 3, 2026
40b1391
Remove qwen bridge fakes from provider tests
FurtherAI May 3, 2026
c1cc9d9
Canonicalize dense TP gate-up traces
FurtherAI May 3, 2026
24ca82c
Allow tiny absolute oracle loss drift
FurtherAI May 3, 2026
2ded12d
Rename unsupported_arch to unvalidated_arch. And remove loss threshol…
FurtherAI May 4, 2026
72ae53f
Fold oracle extended topologies into defaults
FurtherAI May 4, 2026
b03f70d
Use real CP size for shared-prefix GDN
FurtherAI May 4, 2026
64030f9
Allow full GDN specs with sequence parallel shards
FurtherAI May 4, 2026
75d5e86
Trace GDN modules in oracle forward reports
FurtherAI May 4, 2026
d222600
Canonicalize componentwise LoRA trace outputs
FurtherAI May 4, 2026
a968ab6
Slightly bump oracle correctness threshold for loss
FurtherAI May 4, 2026
cb85c5e
Validate Qwen3 native vLLM LoRA mode
FurtherAI May 4, 2026
c178ac5
Remove unsourced Qwen3.6 pricing
FurtherAI May 4, 2026
eda42b1
Remove Megatron optional fallback paths
FurtherAI May 4, 2026
38f4faf
Make selected Megatron paths strict
FurtherAI May 4, 2026
d6c129d
Update provider recompute test fixture
FurtherAI May 4, 2026
4bcf909
Fix provider recompute test model
FurtherAI May 4, 2026
a5e1915
Correct provider support fixture models
FurtherAI May 4, 2026
c56d89d
Fix model support stage worker arch flag
FurtherAI May 4, 2026
8df90dd
Parallelize yes-no eval prompts
FurtherAI May 4, 2026
c785024
Make native vLLM LoRA a quick serving gate
FurtherAI May 4, 2026
1ff559f
Use fresh native LoRA serving artifacts
FurtherAI May 4, 2026
57eddc1
Propagate unvalidated model validation flag
FurtherAI May 4, 2026
8fd8aa4
Delegate GDN projections to Megatron modules
FurtherAI May 4, 2026
7948e6a
Canonicalize GDN forward traces
FurtherAI May 4, 2026
05c6164
Keep GDN trace metadata in test harness
FurtherAI May 4, 2026
1225b08
Use dense topology for dense trainability
FurtherAI May 5, 2026
3c5cd55
Disable Qwen35 DeepEP permute compile
FurtherAI May 5, 2026
7a9917b
Test Qwen35 DeepEP compile workaround
FurtherAI May 5, 2026
674c256
Lower yes-no trainability reward gate
FurtherAI May 5, 2026
5b520e3
Validate native vLLM LoRA for Qwen3 dense
FurtherAI May 5, 2026
d70ab2c
Promote dense Qwen models to validated support
FurtherAI May 5, 2026
3d77ba3
Avoid eager model support workflow imports
FurtherAI May 5, 2026
3663266
Use compact packed GDN kernels for local buckets
FurtherAI May 5, 2026
5d32ac0
Use chunked FLA GDN kernel
FurtherAI May 6, 2026
697f392
Use fused Megatron cross entropy
FurtherAI May 6, 2026
632eefb
Remove legacy GDN executor path
FurtherAI May 6, 2026
4d60c94
Add harness CE fusion override worker
FurtherAI May 6, 2026
d57b48e
Add GDN timing hooks to harness wrapper
FurtherAI May 6, 2026
02f221b
Organize Megatron modules and integration tests
FurtherAI May 7, 2026
06814b0
Fix HF parity invariant handler call
FurtherAI May 7, 2026
df52d07
Port main dependency and lifecycle updates
FurtherAI May 8, 2026
4c1fde1
Update Qwen handler for newer bridge mappings
FurtherAI May 8, 2026
6c66d67
Validate Qwen3.5 vLLM LoRA layout
FurtherAI May 8, 2026
470f966
Remove flex attention compile tuning options
FurtherAI May 8, 2026
6b43ef0
Ignore train inference mismatch artifacts
FurtherAI May 8, 2026
5fe1f1b
Avoid assert bytecode in flex attention forward
FurtherAI May 8, 2026
70e9db4
Report flex attention bias type mismatches
FurtherAI May 8, 2026
f79e63e
Propagate Qwen3.5 MTP shared-prefix attention
FurtherAI May 8, 2026
1506236
Forward Qwen3.5 MTP attention bias to layers
FurtherAI May 8, 2026
dd16e0a
Avoid checkpointing Qwen3.5 MTP attention state
FurtherAI May 8, 2026
5bf2c87
Disable Qwen3.5 MTP in ART Megatron
FurtherAI May 8, 2026
e9b869d
Drop MTP diagnostic flex attention changes
FurtherAI May 8, 2026
d26ecb7
Assert Qwen3.5 ART training has no MTP
FurtherAI May 8, 2026
6b40e71
Clean PR artifacts and fix type checks
FurtherAI May 8, 2026
aafedae
Merge remote-tracking branch 'origin/main' into austin/vllm_separation
FurtherAI May 8, 2026
7edba06
Unify runtime process supervision
FurtherAI May 9, 2026
a31a581
Model asyncio subprocess contract in runtime tests
FurtherAI May 9, 2026
815d577
Defer supervised wait coroutine creation
FurtherAI May 9, 2026
f662370
Prune oracle topology artifacts by default
FurtherAI May 9, 2026
7434fdf
Handle vLLM EP dummy LoRA warmup
FurtherAI May 9, 2026
e84cc4c
Keep vLLM MoE LoRA stacking idempotent
FurtherAI May 9, 2026
ef2c7b9
Add train inference mismatch workflow stage
FurtherAI May 10, 2026
a0c071b
Update workflow test oracle artifact mocks
FurtherAI May 10, 2026
0608762
Preserve recent Unsloth training fixes
FurtherAI May 11, 2026
595fe7b
Add train inference output parity probe
FurtherAI May 12, 2026
4f5f468
Patch vLLM LoRA duplicate aliases
FurtherAI May 12, 2026
efeccf4
Fix EP MoE native LoRA TP slicing
FurtherAI May 12, 2026
f1df667
Convert Qwen3.5 q-gate LoRA layout
FurtherAI May 12, 2026
12457e1
Fix EP MoE LoRA align expert count
FurtherAI May 12, 2026
d542aab
Fix EP MoE dummy LoRA warmup
FurtherAI May 12, 2026
6a61e12
Add train-inf no shared expert LoRA ablation
FurtherAI May 12, 2026
f1f10fb
Align qwen35 moe lora with vllm 3d layout
FurtherAI May 12, 2026
28ce863
Add train-inf LoRA target override
FurtherAI May 12, 2026
a98794b
Avoid base grad buffers in parity worker
FurtherAI May 13, 2026
6263741
Pin NCCL and update merged weight sync
FurtherAI May 13, 2026
46b2b33
Update train inf mismatch metric gates
FurtherAI May 13, 2026
ceeec62
Use smaller train inf metric epsilon
FurtherAI May 13, 2026
03fbcdf
Run live train inf parity in workflow
FurtherAI May 13, 2026
c872195
Use native Megatron MoE routing replay
FurtherAI May 18, 2026
f6a369f
Add production MoE routing replay plumbing
FurtherAI May 18, 2026
a5d6a26
Expose trajectory routing replay train flag
FurtherAI May 18, 2026
211b7e2
Make expert replay a backend setting
FurtherAI May 18, 2026
f3f619c
Add real-path train inf mismatch test
FurtherAI May 18, 2026
9ab0308
Disable async scheduling for expert replay
FurtherAI May 18, 2026
f5f1714
Forward false vLLM runtime flags
FurtherAI May 18, 2026
3b84202
Use nonzero advantages in real mismatch test
FurtherAI May 18, 2026
45627c8
Align real mismatch rollout chat template
FurtherAI May 18, 2026
200494c
Allow replay to omit terminal generated route
FurtherAI May 18, 2026
cde0316
Replay known routes and live-route terminal gaps
FurtherAI May 18, 2026
2d043df
Gather TP logits in mismatch extractor
FurtherAI May 18, 2026
cb815e4
Run real mismatch test without opt-in env
FurtherAI May 18, 2026
3f3cc5f
Make routing replay native and cp2 by default
FurtherAI May 18, 2026
3470a2b
Fix mismatch test topology world size
FurtherAI May 18, 2026
b72a01a
Restore tp2 ep2 mismatch defaults
FurtherAI May 18, 2026
f61d43c
Document mismatch threshold diagnostics
FurtherAI May 18, 2026
a6e1749
Raise train-inf mismatch bf16 gate
FurtherAI May 20, 2026
22aa60f
Fix oracle routing replay capture
FurtherAI May 20, 2026
0e688f8
Fix MoE replay topology parity
FurtherAI May 21, 2026
050d6cb
Spread synthetic replay routes
FurtherAI May 21, 2026
9dba103
Merge branch 'main' into austin/train_inf_mismatch
FurtherAI May 21, 2026
2bef373
Clean up routing replay merge state
FurtherAI May 23, 2026
1a0adcd
Drop stale megatron core build config
FurtherAI May 23, 2026
2566b64
Clean up train inf mismatch real path gate
FurtherAI May 23, 2026
7b9a0c6
Restore explicit NCCL weight transfer contract
FurtherAI May 23, 2026
a8f07ea
Lower train-inf mismatch rollout temperature
FurtherAI May 23, 2026
bf99ef8
Seed train-inf mismatch rollouts
FurtherAI May 23, 2026
04ac948
Use lower train-inf rollout temperature without seeds
FurtherAI May 23, 2026
2d6de24
Restore train-inf rollout temperature
FurtherAI May 23, 2026
1ce63a7
Use compact non-CP oracle topology matrix
FurtherAI May 23, 2026
98b1cd7
Add durable model support workflow CLI
FurtherAI May 23, 2026
850ce28
Remove native LoRA exclusion from workflow CLI
FurtherAI May 23, 2026
082d0aa
Add vLLM routed expert prefix sidecar
FurtherAI May 23, 2026
53cd24c
Fix routed expert prefix cache sidecar dependencies
FurtherAI May 24, 2026
003b433
Tune train-inf mismatch gates
FurtherAI May 24, 2026
09937e0
Relax qwen3 train-inf gates
FurtherAI May 24, 2026
54855ec
Recognize fused moe lora coverage
FurtherAI May 25, 2026
0f70173
Enable managed MoE routing replay
FurtherAI May 25, 2026
fdeb42b
Release routing replay before job cleanup
FurtherAI May 25, 2026
456ee60
Update Qwen3.5 train-inf invariant gate
FurtherAI May 25, 2026
bdd6c0e
Support dense real-path train-inf topology
FurtherAI May 25, 2026
491ef59
Ignore token-only MoE routing metadata
FurtherAI May 25, 2026
7822790
Treat null route fields as absent
FurtherAI May 25, 2026
7192d07
Fix dense real-path score matching
FurtherAI May 25, 2026
6593840
Add real-path base mismatch diagnostics
FurtherAI May 26, 2026
d7a381c
Fix real-path base diagnostic scoring
FurtherAI May 26, 2026
db3cffb
Freeze base diagnostic Megatron worker
FurtherAI May 26, 2026
3084544
Add real-path base mismatch diagnostic
FurtherAI May 26, 2026
4ab349d
Add train-inf forward trace diagnostic
FurtherAI May 26, 2026
5e940a1
Keep forward trace on default vLLM path
FurtherAI May 26, 2026
fd3c3d4
Limit vLLM forward trace tensor dumps
FurtherAI May 26, 2026
f6e07d9
Capture Megatron final hidden in trace
FurtherAI May 26, 2026
87cd3a4
Save Megatron logits in forward trace
FurtherAI May 26, 2026
19297a9
Capture Megatron trace submodules for train-inf diagnostics
FurtherAI May 26, 2026
0286d1e
Trace vLLM projection submodules for diagnostics
FurtherAI May 26, 2026
9b4e340
Add all-architectures model support workflow
FurtherAI May 26, 2026
c97dbd8
Clean train-inf adapter artifacts on pass
FurtherAI May 26, 2026
76177d6
Merge remote-tracking branch 'origin/main' into austin/train_inf_mism…
FurtherAI May 27, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .python-version
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.11
3.12
9 changes: 5 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name = "openpipe-art"
version = "0.5.18"
description = "The OpenPipe Agent Reinforcement Training (ART) library"
readme = "README.md"
requires-python = ">=3.11"
requires-python = ">=3.12"
dependencies = [
"openai>=2.14.0",
"typer>=0.15.2",
Expand Down Expand Up @@ -49,13 +49,14 @@ megatron = [
"transformer-engine==2.11.0",
"transformer-engine-cu12==2.11.0",
"transformer-engine-torch==2.11.0",
"megatron-core==0.16.0rc0",
"megatron-core==0.17.0",
"pybind11>=2.13.6",
"megatron-bridge==0.4.0rc0",
"deep-ep==1.2.1 ; sys_platform == 'linux'",
"causal-conv1d==1.6.1 ; sys_platform == 'linux' and platform_machine == 'x86_64' and python_full_version < '3.12'",
"mamba-ssm==2.3.1 ; sys_platform == 'linux' and platform_machine == 'x86_64' and python_full_version < '3.12'",
"nvidia-ml-py==13.580.82",
"nvidia-modelopt>=0.42.0a0 ; sys_platform != 'darwin'",
"nvidia-resiliency-ext<0.5 ; sys_platform == 'linux'",
"ml-dtypes>=0.5.0 ; python_full_version < '3.13'",
]
Expand Down Expand Up @@ -147,18 +148,18 @@ markers = [
required-version = ">=0.11.7"
override-dependencies = [
"flashinfer-python==0.6.1",
"megatron-core==0.17.0",
"numpy<2",
"nvidia-resiliency-ext<0.5",
"quack-kernels==0.2.5",
"transformer-engine==2.11.0",
]
exclude-dependencies = ["pynvml", "emerging-optimizers"]
no-build-isolation-package = ["apex", "transformer-engine", "transformer-engine-cu12", "transformer-engine-torch", "megatron-core", "megatron-bridge", "deep-ep", "nv-grouped-gemm"]
no-build-isolation-package = ["apex", "transformer-engine", "transformer-engine-cu12", "transformer-engine-torch", "megatron-bridge", "deep-ep", "nv-grouped-gemm"]

[tool.uv.extra-build-dependencies]
apex = ["torch>=2.8.0"]
deep-ep = ["torch>=2.8.0"]
megatron-core = ["pybind11"]
nv-grouped-gemm = ["torch>=2.8.0"]
transformer-engine-torch = ["torch>=2.8.0"]

Expand Down
9 changes: 8 additions & 1 deletion src/art/auto_trajectory.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from openai.types.chat.chat_completion_chunk import ChatCompletionChunk

from .openai import init_chat_completion, update_chat_completion
from .preprocessing.moe_routing import attach_moe_routing_metadata_to_choice
from .trajectories import History, Trajectory

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -105,7 +106,13 @@ def handle_httpx_response(self, response: httpx._models.Response) -> None:
chat_completion = parse_sse_to_chat_completion(content)
choice = chat_completion.choices[0]
else:
choice = Choice(**json.loads(content)["choices"][0])
response_payload = json.loads(content)
choice = Choice(**response_payload["choices"][0])
attach_moe_routing_metadata_to_choice(
choice=choice,
response_payload=response_payload,
choice_index=0,
)
except (json.JSONDecodeError, KeyError, ValueError) as e:
logger.debug(f"Failed to parse response content: {e}")
return
Expand Down
1 change: 1 addition & 0 deletions src/art/dev/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ class EngineArgs(TypedDict, total=False):
override_generation_config: dict[str, Any] | None
enable_sleep_mode: bool
enable_expert_parallel: bool
enable_return_routed_experts: bool
model_impl: str

calculate_kv_scales: bool | None
Expand Down
76 changes: 70 additions & 6 deletions src/art/local/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import socket
import time
from types import TracebackType
from typing import AsyncIterator, Iterable, Literal, cast
from typing import Any, AsyncIterator, Iterable, Literal, cast
import warnings

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -147,6 +147,7 @@ def __init__(
in_process: bool = False,
path: str | None = None,
gpu_cost_per_hour_usd: float | None = None,
enable_expert_replay: bool = True,
) -> None:
"""
Initializes a local, directory-based Backend interface at the given path.
Expand All @@ -162,12 +163,15 @@ def __init__(
automatic `costs/gpu` accounting on train steps. When unset,
ART auto-detects supported GPU types (H200 at $3/hr today) and
skips GPU cost logging for unknown devices instead of guessing.
enable_expert_replay: For supported MoE Megatron training, capture
vLLM routed experts and replay them in Megatron. Defaults to True.
"""
self._in_process = in_process
self._path = path or get_default_art_path()
self._gpu_cost_per_hour_usd = (
float(gpu_cost_per_hour_usd) if gpu_cost_per_hour_usd is not None else None
)
self._enable_expert_replay = enable_expert_replay
os.makedirs(self._path, exist_ok=True)

# Other initialization
Expand All @@ -182,6 +186,27 @@ def __init__(
"default"
)

def _model_uses_expert_replay(self, model: AnyTrainableModel) -> bool:
if not self._enable_expert_replay or not self._supports_result_packing:
return False
from ..megatron.model_support.registry import (
UnsupportedModelArchitectureError,
model_uses_expert_parallel,
)

allow_unvalidated_arch = bool(
(model._internal_config or dev.InternalModelConfig()).get(
"allow_unvalidated_arch", False
)
)
try:
return model_uses_expert_parallel(
model.base_model,
allow_unvalidated_arch=allow_unvalidated_arch,
)
except UnsupportedModelArchitectureError:
return False

def supports_automatic_train_step_metrics(self) -> bool:
return True

Expand Down Expand Up @@ -455,6 +480,7 @@ def _get_packed_tensors(
plot_tensors: bool,
packed_sequence_length: int | None,
logprob_calculation_chunk_size: int,
include_moe_routing: bool = False,
) -> PackedTensors | None:
internal_config = cast(dev.InternalModelConfig, model._internal_config or {})
tokenizer_key = _tokenizer_cache_key(model.base_model, internal_config)
Expand Down Expand Up @@ -547,6 +573,7 @@ def _get_packed_tensors(
truncate_long_results=False,
advantage_balance=advantage_balance,
pack_results=self._supports_result_packing,
include_moe_routing=include_moe_routing,
)
if (
not allow_training_without_logprobs
Expand Down Expand Up @@ -603,6 +630,11 @@ async def _prepare_backend_for_training(
config_dict: dict = dict(config or {})
internal_config = cast(dev.InternalModelConfig, model._internal_config or {})
_apply_configured_chat_template_server_args(config_dict, internal_config)
if self._model_uses_expert_replay(model):
engine_args = dict(config_dict.get("engine_args", {}))
engine_args["enable_return_routed_experts"] = True
engine_args["async_scheduling"] = False
config_dict["engine_args"] = engine_args
server_args = dict(config_dict.get("server_args", {}))

# Avoid binding collisions on busy hosts when no explicit port is provided.
Expand Down Expand Up @@ -850,7 +882,7 @@ async def _train_model(
summary,
include_trainable_groups=True,
)

include_moe_routing = self._model_uses_expert_replay(model)
packed_tensors = self._get_packed_tensors(
model,
trajectory_groups,
Expand All @@ -864,6 +896,7 @@ async def _train_model(
logprob_calculation_chunk_size=dev_config.get(
"logprob_calculation_chunk_size", 1024
),
include_moe_routing=include_moe_routing,
)
if packed_tensors is None:
print(
Expand Down Expand Up @@ -927,17 +960,34 @@ async def _train_model(
disk_packed_tensors = packed_tensors_to_dir(
packed_tensors, f"{get_model_dir(model=model, art_path=self._path)}/tensors"
)
# Note: scale_learning_rate_by_reward_std_dev is now handled by the frontend (Model.train())
grad_accumulation_sequences = max(
1, int(config.grad_accumulation_sequences or 1)
service_dev_config = cast(dev.TrainConfig, {**dev_config})
grad_accumulation_sequences = await self._resolve_grad_accumulation_sequences(
service,
config,
)
if include_moe_routing:
from ..megatron.routing_replay import (
build_moe_routing_replay_bundle_from_packed_tensors,
)

routing_replay_dir = (
f"{get_model_dir(model=model, art_path=self._path)}/tensors/"
"moe_routing_replay"
)
build_moe_routing_replay_bundle_from_packed_tensors(
packed_tensors=packed_tensors,
global_grad_accumulation_sequences=grad_accumulation_sequences,
).to_dir(routing_replay_dir)
service_dev_config["moe_routing_replay_path"] = routing_replay_dir
service_dev_config["moe_routing_replay_strict"] = True
# Note: scale_learning_rate_by_reward_std_dev is now handled by the frontend (Model.train())
fallback_gradient_steps = math.ceil(
disk_packed_tensors["num_sequences"] / grad_accumulation_sequences
)
pbar = tqdm.tqdm(total=fallback_gradient_steps, desc="train")
reported_gradient_steps: int | None = None
async for result in service.train(
disk_packed_tensors, config, dev_config, verbose
disk_packed_tensors, config, service_dev_config, verbose
):
raw_num_gradient_steps = result.pop(TRAIN_GRADIENT_STEPS_KEY, None)
if raw_num_gradient_steps is not None:
Expand Down Expand Up @@ -965,6 +1015,20 @@ async def _train_model(
if verbose:
print("_train_model complete")

async def _resolve_grad_accumulation_sequences(
self,
service: ModelService,
config: TrainConfig,
) -> int:
resolver = getattr(
cast(Any, service),
"resolve_global_grad_accumulation_sequences",
None,
)
if callable(resolver):
return max(1, int(await resolver(config)))
return max(1, int(config.grad_accumulation_sequences or 1))

# Note: _get_reward_std_dev_learning_rate_multiplier and _log_metrics
# have been moved to the Model class (frontend)

Expand Down
8 changes: 8 additions & 0 deletions src/art/megatron/lora.py
Original file line number Diff line number Diff line change
Expand Up @@ -962,6 +962,14 @@ def __init__(
b_parallel_spec=b_parallel_spec,
allreduce=False,
)
component_size = (
linear_fc1.out_features * _get_shard_world_size("expert_tp")
) // 2
_set_lora_shard_strategy_metadata(
self.lora.B_T,
strategy="componentwise",
component_sizes=(component_size, component_size),
)

def forward(
self, x: torch.Tensor, tokens_per_expert: list[int] | torch.Tensor
Expand Down
9 changes: 1 addition & 8 deletions src/art/megatron/provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,13 +103,6 @@ def _apply_default_parallel_topology(provider: GPTModelProvider) -> None:
provider.expert_tensor_parallel_size = 1


def _etp_ep_parallel_domain_size(provider: GPTModelProvider) -> int:
return (
cast(int, provider.expert_tensor_parallel_size)
* provider.expert_model_parallel_size
)


def _apply_art_training_runtime_prepare_defaults(provider: GPTModelProvider) -> None:
provider.recompute_granularity = "full"
provider.recompute_method = "uniform"
Expand All @@ -119,7 +112,7 @@ def _apply_art_training_runtime_prepare_defaults(provider: GPTModelProvider) ->


def _apply_art_training_runtime_finalize_defaults(provider: GPTModelProvider) -> None:
if _etp_ep_parallel_domain_size(provider) <= 1:
if provider.expert_model_parallel_size <= 1:
return
# use DeepEP for MoE expert comm. comm can be the same amount of time as actual MLP
# compute, so these are very beneficial
Expand Down
Loading
Loading