Skip to content

Commit 91d75b5

Browse files
author
henrique
committed
add new barrier to avoid race condition with ckpt
1 parent f9f0f6b commit 91d75b5

2 files changed

Lines changed: 9 additions & 3 deletions

File tree

src/codeevolve/evolution.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -880,8 +880,14 @@ def _do_checkpoint(epoch_num: int) -> None:
880880
logger.info("Waiting for other islands to arrive at barrier...")
881881
global_data.barrier.wait()
882882
logger.info("All islands arrived. Proceeding to save ckpt.")
883+
883884
_do_checkpoint(epoch)
884885

886+
logger.info("Waiting for other islands to finish ckpt saving...")
887+
global_data.barrier.wait()
888+
logger.info("All islands synced.")
889+
890+
885891
# EARLY STOPPING
886892
logger.info("=== GLOBAL EARLY STOPPING CHECK STEP ===")
887893
if improved_local_fitness and child_sol:

src/codeevolve/utils/ckpt.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ def save_run_metadata(
162162
"iteration_found": global_best_sol.iteration_found.value,
163163
"island_found": global_best_sol.island_found.value,
164164
"depth": global_best_sol.depth.value,
165-
"eval_metrics": dict(global_best_sol.eval_metrics),
165+
"eval_metrics": global_best_sol.eval_metrics.copy(),
166166
},
167167
"early_stop_counter": early_stop_counter,
168168
}
@@ -179,7 +179,7 @@ def load_run_metadata(out_dir: str | Path, epoch: int) -> Optional[Dict[str, Any
179179
epoch: Epoch number to load metadata for.
180180
181181
Returns:
182-
Dictionary with run metadata.
182+
Dictionary with run metadata. None if epoch data not found.
183183
"""
184184
if isinstance(out_dir, str):
185185
out_dir = Path(out_dir)
@@ -192,5 +192,5 @@ def load_run_metadata(out_dir: str | Path, epoch: int) -> Optional[Dict[str, Any
192192
with open(metadata_file, "r") as f:
193193
data: Dict[str, Any] = json.load(f)
194194

195-
epoch_data = data.get(str(epoch), {})
195+
epoch_data = data.get(str(epoch), None)
196196
return epoch_data

0 commit comments

Comments
 (0)