
Commit 9241b8a

Merge pull request #5 from OpenMLRL/new
Feature: reconstruct and design new reward system
2 parents 52fbe8b + 710fedc commit 9241b8a

20 files changed: 2326 additions, 1092 deletions

README.md

Lines changed: 11 additions & 14 deletions
@@ -28,8 +28,9 @@ conda install -c conda-forge comlrl
   `dataset.train_split` and `dataset.eval_split` (e.g., `test[:50]` and `test[50:]`).
 - **Subsetting**: if a split name is missing (e.g., ClassEval only has `test`),
   the loader falls back to the first available split before slicing.
-- **Prompting**: prompts include the sanitized class skeleton, explicit method names per
-  agent, and any collaboration instructions.
+- **Prompting**: prompts include the sanitized class skeleton plus per-agent method
+  assignments. The default strategy assigns 1-parameter methods to agent 0 and all other
+  methods to agent 1.
 - **Testing**: reward code merges agent completions back into the skeleton and runs the
   provided unit tests inside a temporary directory to isolate state.

@@ -41,25 +42,21 @@ Key sections in `configs/magrpo_classeval_config.yaml`:
   kwargs, and device mapping.
 - `dataset`: dataset name and split strings (`train_split`, `eval_split`) for
   ClassEval sub-slices or local mirrors.
-- `external`: determines the feedback mode. `token_report` summarizes syntax/tests at each
-  turn; other modes replicate the options documented in the code-generation README
-  (`plain`, `level_feedback`, `group_feedback`, `personal_feedback`, `personal_detailed_feedback`,
-  `passed`, `level_passed`).
+- `external`: feedback configuration (use `code_feedback` for syntax/test diagnostics).
 - `magrpo`: forwarded to `comlrl.trainers.magrpo.MAGRPOTrainer`. Includes collaboration
-  (`num_agents`, TAKE_JOB self-select), sampling settings (`num_generations`, `num_turns`,
+  (`num_agents`, param-count assignment), sampling settings (`num_generations`, `num_turns`,
   temperature/top_p), rollout buffering (`rollout_buffer_size`), optimization
   hyperparameters, and IO controls.
-- `output`: persistence knobs (save final model, keep tmp dirs); environment variables such
-  as `CLASSEVAL_TMP_BASE` are derived from this section to colocate temp files per job.
+- `reward_processor`: optional post-processing for rewards (scale, shift).
+- `output`: persistence knobs (save final model, output paths, verbose debug prints).

 ## Rewards, Logging, and Evaluation

 - `rewards/CE_reward.py` computes structured rewards:
-  - `lv1`: coverage of unique methods completed.
-  - `lv2`: penalizes under/over-allocation of total method picks.
-  - `lv3`: balance term encouraging an even workload across agents.
-  - `lv4`/`lv5`: syntax + unit-test bonuses (reported for analysis; syntax/test failures
-    short-circuit the run where applicable).
+  - `lv1`: syntax score proportional to valid method outputs (range [0, 2]).
+  - `lv2`: unit-test bonus based on pass rate (passed/total), scaled to [0, 4].
+  - `lv3`: overlap penalty normalized by total methods (range [-1, 0]).
+  - reward shift: optional post-processing shift via `reward_processor.shift`.
 - Tests execute inside per-sample temporary directories to avoid polluted state and are
   automatically truncated on timeout.
 - Loggers are inherited from CoMLRL. Enable Weights & Biases by filling `wandb.entity`
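
The `lv1`–`lv3` scheme in the README diff above can be sketched in Python. This is a minimal illustration, not the actual `rewards/CE_reward.py`: the helper name and its count-based arguments are hypothetical, but the ranges follow the README text ([0, 2] syntax, [0, 4] tests, [-1, 0] overlap), and `shift` mirrors `reward_processor.shift`.

```python
# Minimal sketch of the three-level reward described in the README diff.
# Hypothetical helper; the real rewards/CE_reward.py computes these
# quantities from agent completions and test runs.

def classeval_reward(n_valid: int, n_methods: int,
                     n_passed: int, n_tests: int,
                     n_overlap: int, shift: float = 0.0) -> float:
    """Combine syntax, unit-test, and overlap terms into one scalar."""
    # lv1: syntax score proportional to valid method outputs, range [0, 2]
    lv1 = 2.0 * n_valid / n_methods if n_methods else 0.0
    # lv2: unit-test bonus based on pass rate (passed/total), scaled to [0, 4]
    lv2 = 4.0 * n_passed / n_tests if n_tests else 0.0
    # lv3: overlap penalty normalized by total methods, range [-1, 0]
    lv3 = -n_overlap / n_methods if n_methods else 0.0
    # optional post-processing shift (reward_processor.shift)
    return lv1 + lv2 + lv3 + shift
```

Under this reading the raw reward peaks at 6.0 (all methods valid, all tests passing, no overlap), which may explain why the configs in this commit pair it with `shift: -6.0` to center the maximum near zero.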

configs/iac_classeval_config.yaml

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
+model:
+  name: Qwen/Qwen3-4B-Instruct-2507
+  type: qwen
+  temperature: 0.6
+  top_p: 0.6
+  max_length: 2048
+  tokenizer_kwargs: {}
+  model_kwargs:
+    trust_remote_code: true
+    device_map: auto
+    torch_dtype: bfloat16
+
+dataset:
+  name: FudanSELab/ClassEval
+  type: classeval
+  train_split: test[:66]
+  eval_split: test[66:82]
+
+output:
+  base_dir: output
+  save_final_model: false
+  save_path: output/final_model
+  verbose: false
+
+external:
+  mode: code_feedback
+  original_prompt: true
+  previous_response: true
+
+patches:
+  generation_memory: true
+  single_agent_returns: true
+
+iac:
+  num_turns: 2
+  num_train_epochs: 40
+  per_device_train_batch_size: 1
+  rollout_buffer_size: 2
+  actor_learning_rate: 5e-6
+  critic_learning_rate: 5e-6
+  value_loss_coef: 0.6
+  value_clip_range: 0.2
+  max_new_tokens: 600
+  temperature: 0.6
+  top_p: 0.6
+  top_k: null
+  num_agents: 2
+  num_return_sequences: 1
+  use_separate_critic: true
+  critic_model: null
+  discount: 0.9
+  early_termination_threshold: -0.2
+  eval_interval: 20
+  eval_num_samples: 4
+  logging_steps: 1
+
+reward_processor:
+  enabled: true
+  scale_factor: 1.0
+  shift: -6.0
+
+wandb:
+  project: classeval_dev
+  entity: null
+  name: codecompletion_classeval_iac
+  dir: output
+  tags: ["iac", "classeval", "code-completion", "turns_2"]
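
The `reward_processor` block in this config appears to apply a linear transform to the raw reward. A hedged sketch of what `scale_factor` and `shift` would do (the function name is hypothetical, and the assumed order — scale, then shift — is an assumption, not confirmed by the diff):

```python
def process_reward(raw: float, scale_factor: float = 1.0,
                   shift: float = 0.0, enabled: bool = True) -> float:
    """Hypothetical reward_processor: linear post-processing of a raw
    reward. Assumed order: scale first, then shift."""
    if not enabled:
        return raw
    return raw * scale_factor + shift
```

With the values above (`scale_factor: 1.0`, `shift: -6.0`), a raw reward of 6.0 maps to 0.0.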

configs/maac_classeval_config.yaml

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
+model:
+  name: Qwen/Qwen3-4B-Instruct-2507
+  type: qwen
+  temperature: 0.6
+  top_p: 0.6
+  max_length: 2048
+  tokenizer_kwargs: {}
+  model_kwargs:
+    trust_remote_code: true
+    device_map: auto
+    torch_dtype: bfloat16
+
+dataset:
+  name: FudanSELab/ClassEval
+  type: classeval
+  train_split: test[:66]
+  eval_split: test[66:82]
+
+output:
+  base_dir: output
+  save_final_model: false
+  save_path: output/final_model
+  verbose: false
+
+external:
+  mode: code_feedback
+  original_prompt: true
+  previous_response: true
+
+patches:
+  generation_memory: true
+  single_agent_returns: true
+
+maac:
+  num_turns: 2
+  critic_type: v
+  num_train_epochs: 40
+  per_device_train_batch_size: 1
+  rollout_buffer_size: 2
+  actor_learning_rate: 5e-6
+  critic_learning_rate: 5e-6
+  value_loss_coef: 0.6
+  max_new_tokens: 600
+  temperature: 0.6
+  top_p: 0.6
+  top_k: null
+  num_agents: 2
+  num_return_sequences: 1
+  critic_model: null
+  discount: 0.9
+  early_termination_threshold: -0.2
+  eval_interval: 20
+  eval_num_samples: 2
+  logging_steps: 1
+
+reward_processor:
+  enabled: true
+  scale_factor: 1.0
+  shift: -6.0
+
+wandb:
+  project: classeval_dev
+  entity: null
+  name: codecompletion_classeval_maac
+  dir: output
+  tags: ["maac", "classeval", "code-completion", "turns_2"]
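
Both configs set `num_agents: 2`, which pairs with the default assignment strategy from the README diff: 1-parameter methods go to agent 0, all other methods to agent 1. A hypothetical sketch using `inspect` (assuming the parameter count includes `self`; `assign_methods` and `Example` are illustrative, not CoMLRL API):

```python
import inspect

def assign_methods(cls, num_agents: int = 2) -> dict[int, list[str]]:
    """Sketch of the default param-count strategy: methods whose signature
    has exactly one parameter (just `self`, by assumption) go to agent 0,
    everything else to agent 1."""
    assignment = {i: [] for i in range(num_agents)}
    for name, fn in inspect.getmembers(cls, inspect.isfunction):
        n_params = len(inspect.signature(fn).parameters)
        agent = 0 if n_params == 1 else 1
        assignment[agent].append(name)
    return assignment

class Example:
    def reset(self):       # 1 parameter (self) -> agent 0
        pass
    def add(self, x, y):   # 3 parameters -> agent 1
        return x + y
```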

configs/magrpo_classeval_config.yaml

Lines changed: 17 additions & 11 deletions
@@ -1,7 +1,7 @@
 model:
-  name: Qwen/Qwen2.5-Coder-3B-Instruct
+  name: Qwen/Qwen3-4B-Instruct-2507
   type: qwen
-  temperature: 0.25
+  temperature: 0.6
   top_p: 0.6
   max_length: 2048
   tokenizer_kwargs: {}

@@ -21,37 +21,43 @@ output:
   save_final_model: false
   save_path: output/final_model
   verbose: false
-  keep_tmp: true
-  tmp_base_dir: output/tmp

 external:
   mode: code_feedback
-  sandbox_slice: null
   original_prompt: true
   previous_response: true

+patches:
+  generation_memory: true
+  single_agent_returns: true
+
 magrpo:
   num_turns: 2
-  num_train_epochs: 8
+  num_train_epochs: 13
   per_device_train_batch_size: 1
   rollout_buffer_size: 1
   learning_rate: 1e-5
-  eval_interval: 4
+  eval_interval: 10
   eval_num_samples: 4
-  num_generations: 3
+  num_generations: 2
   max_new_tokens: 600
-  temperature: 0.25
+  temperature: 0.6
   top_p: 0.6
   top_k: null
   joint_mode: aligned
   num_agents: 2
   discount: 0.9
-  termination_threshold: 7.8
-  logging_steps: 50
+  termination_threshold: -0.2
+  logging_steps: 1
   save_steps: 200
   normalize_advantage: false
   epsilon_clip: null

+reward_processor:
+  enabled: true
+  scale_factor: 1.0
+  shift: -6.0
+
 wandb:
   project: classeval_dev
   entity: null
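
Across the three trainer configs, `discount: 0.9` and a termination threshold of -0.2 (`termination_threshold` / `early_termination_threshold`) shape multi-turn credit assignment. A hypothetical sketch of a discounted return with early termination; this is one plausible reading of the knobs, and the actual trainer logic may differ:

```python
def discounted_return(turn_rewards, discount=0.9,
                      termination_threshold=-0.2):
    """Sum per-turn rewards with geometric discounting, stopping early
    once a turn's reward falls below the threshold (assumption: the
    threshold cuts off subsequent turns)."""
    total, factor = 0.0, 1.0
    for r in turn_rewards:
        total += factor * r
        if r < termination_threshold:
            break  # early termination: later turns contribute nothing
        factor *= discount
    return total
```

With the shifted reward centered near zero (see `reward_processor.shift: -6.0`), a turn scoring below -0.2 would end the rollout under this reading.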
