AI-Hypercomputer · ecnal-cienet · May 7, 2026 · May 7, 2026 · Apr 28, 2026 · Apr 29, 2026
@@ -87,11 +87,12 @@ def convert(paxml_ckpt_path, maxtext_model_name, base_output_directory, run_name
   devices_array = maxtext_utils.create_device_mesh(cfg)
   mesh = Mesh(devices_array, cfg.mesh_axes)
 
+  # This conversion script reads paxml-format weights and emits a Linen-format
+  # MaxText checkpoint (downstream uses `.params['params']`, `.opt_state.mu['params']`,
+  # `.opt_state.nu['params']` keystr paths; the keystr_map below targets the Linen
+  # tree shape). Use the Linen path regardless of pure_nnx.
   quant = quantizations.configure_quantization(cfg)
-  if cfg.pure_nnx:
-    raise NotImplementedError("Pure NNX support has not been implemented yet.")
-  else:
-    model = transformer_as_linen(cfg, mesh, quant=quant, model_mode=MODEL_MODE_TRAIN)
+  model = transformer_as_linen(cfg, mesh, quant=quant, model_mode=MODEL_MODE_TRAIN)
   learning_rate_schedule = maxtext_utils.create_learning_rate_schedule(cfg)
   tx = optimizers.get_optimizer(cfg, learning_rate_schedule)
 
@@ -102,11 +103,7 @@ def convert(paxml_ckpt_path, maxtext_model_name, base_output_directory, run_name
       cfg.checkpoint_period,
   )
 
-  if cfg.pure_nnx:
-    # NNX has a different function to init the training state.
-    raise NotImplementedError("Pure NNX support has not been implemented yet.")
-  else:
-    init_state_fn = functools.partial(maxtext_utils.init_initial_state, model, tx, cfg, True, init_rng)
+  init_state_fn = functools.partial(maxtext_utils.init_initial_state, model, tx, cfg, True, init_rng)
   state, _, _, _ = maxtext_utils.setup_training_state(None, cfg, mesh, checkpoint_manager, init_state_fn)
   max_logging.log("start")
   max_utils.print_mem_stats("After params initialized")

@@ -560,6 +560,13 @@ logical_axis_rules: [
                       ['tokens_per_page', []],
                       ['paged_kv_head_dim_size', []],
                       # ==========================================
+                      # Pipeline Parallelism
+                      # ==========================================
+                      ['layers_outside_pipeline', []],
+                      ['layers_per_stage', []],
+                      ['num_activations', []],
+                      ['circular_repeats', []],
+                      # ==========================================
                       # Deprecated / Scheduled for Removal
                       # ==========================================
                       ['mlp_no_fsdp', ['tensor', 'tensor_sequence', 'autoregressive']], 

@@ -195,10 +195,9 @@ def validate_expert_shard_attention_option(expert_shard_attention_option: str) -
 
 
 def validate_vocab_tiling(num_vocab_tiling: int, per_device_batch_size: int, max_target_length: int, enable_nnx: bool):
+  del enable_nnx  # NNX vocab tiling supported via vocab_tiling_nnx_loss in vocabulary_tiling.py
   if (per_device_batch_size * max_target_length) % num_vocab_tiling != 0:
     raise ValueError("Per device batch size times sequence length should be divisible by the number of vocab tiles.")
-  if num_vocab_tiling > 1 and enable_nnx:  # TODO (chengnuojin) enable vocab tiling on NNX after NNX migration
-    raise ValueError("We currently don't support vocab tiling on NNX module.")
 
 
 def validate_rampup_batch_size(batch_size_start, batch_size_end, batch_size_increment, global_rampup_samples):

@@ -2841,8 +2841,7 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
         and (self.per_device_batch_size * self.max_target_length) % self.num_vocab_tiling != 0
     ):
       raise ValueError("Per device batch size times sequence length should be divisible by the number of vocab tiles.")
-    if self.num_vocab_tiling > 1 and self.enable_nnx:
-      raise ValueError("We currently don't support vocab tiling on NNX module.")
+    # Vocab tiling on NNX is now supported via vocab_tiling_nnx_loss in vocabulary_tiling.py.
     if self.context_parallel_size > 1 and self.context_parallel_strategy.lower() == "ring":
       if "gpu" not in self.hardware:
         raise ValueError(