Skip to content

Commit 54172c4

Browse files
committed
Backfill remaining official run directories
1 parent 001ebdf commit 54172c4

File tree

16,718 files changed

+3734186
-40
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

16,718 files changed

+3734186
-40
lines changed

runs/official/MANIFEST.json

Lines changed: 72 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"description": "Canonical run manifest for CodeContextBench evaluation",
3-
"generated": "2026-02-27T03:17:04.753322+00:00",
3+
"generated": "2026-02-27T13:45:04.194332+00:00",
44
"total_tasks": 765,
55
"total_runs": 82,
66
"runs": {
@@ -262,7 +262,7 @@
262262
"passed": 20,
263263
"failed": 5,
264264
"errored": 0,
265-
"mean_reward": 0.494,
265+
"mean_reward": 0.513,
266266
"tasks": {
267267
"bustub-hyperloglog-impl-001": {
268268
"status": "passed",
@@ -276,15 +276,13 @@
276276
},
277277
"camel-fix-protocol-feat-001": {
278278
"status": "passed",
279-
"reward": 0.41,
280-
"has_trajectory": false,
281-
"has_cost": false,
279+
"reward": 0.22,
280+
"has_trajectory": true,
281+
"has_cost": true,
282282
"judge_score": null,
283283
"judge_model": null,
284284
"judge_dimensions": null,
285-
"judge_confidence": null,
286-
"timed_out": true,
287-
"timeout_seconds": 6000.0
285+
"judge_confidence": null
288286
},
289287
"cgen-deps-install-001": {
290288
"status": "failed",
@@ -348,15 +346,13 @@
348346
},
349347
"flink-pricing-window-feat-001": {
350348
"status": "passed",
351-
"reward": 0.12,
352-
"has_trajectory": false,
353-
"has_cost": false,
349+
"reward": 0.48,
350+
"has_trajectory": true,
351+
"has_cost": true,
354352
"judge_score": null,
355353
"judge_model": null,
356354
"judge_dimensions": null,
357-
"judge_confidence": null,
358-
"timed_out": true,
359-
"timeout_seconds": 6000.0
355+
"judge_confidence": null
360356
},
361357
"flipt-dep-refactor-001": {
362358
"status": "passed",
@@ -391,14 +387,12 @@
391387
"k8s-noschedule-taint-feat-001": {
392388
"status": "passed",
393389
"reward": 0.7,
394-
"has_trajectory": false,
395-
"has_cost": false,
390+
"has_trajectory": true,
391+
"has_cost": true,
396392
"judge_score": null,
397393
"judge_model": null,
398394
"judge_dimensions": null,
399-
"judge_confidence": null,
400-
"timed_out": true,
401-
"timeout_seconds": 6000.0
395+
"judge_confidence": null
402396
},
403397
"k8s-runtime-object-impl-001": {
404398
"status": "passed",
@@ -412,15 +406,13 @@
412406
},
413407
"k8s-score-normalizer-refac-001": {
414408
"status": "passed",
415-
"reward": 0.58,
416-
"has_trajectory": false,
417-
"has_cost": false,
409+
"reward": 0.88,
410+
"has_trajectory": true,
411+
"has_cost": true,
418412
"judge_score": null,
419413
"judge_model": null,
420414
"judge_dimensions": null,
421-
"judge_confidence": null,
422-
"timed_out": true,
423-
"timeout_seconds": 6000.0
415+
"judge_confidence": null
424416
},
425417
"kafka-batch-accumulator-refac-001": {
426418
"status": "passed",
@@ -465,14 +457,12 @@
465457
"servo-scrollend-event-feat-001": {
466458
"status": "failed",
467459
"reward": 0.0,
468-
"has_trajectory": false,
469-
"has_cost": false,
460+
"has_trajectory": true,
461+
"has_cost": true,
470462
"judge_score": null,
471463
"judge_model": null,
472464
"judge_dimensions": null,
473-
"judge_confidence": null,
474-
"timed_out": true,
475-
"timeout_seconds": 6000.0
465+
"judge_confidence": null
476466
},
477467
"similar-asserts-deps-install-001": {
478468
"status": "passed",
@@ -8998,16 +8988,23 @@
89988988
]
89998989
},
90008990
"camel-fix-protocol-feat-001": {
9001-
"n_runs": 1,
9002-
"mean_reward": 0.41,
9003-
"std_reward": 0.0,
8991+
"n_runs": 2,
8992+
"mean_reward": 0.315,
8993+
"std_reward": 0.1344,
90048994
"runs": [
90058995
{
90068996
"started_at": "2026-02-23T12:49:09.942621",
90078997
"reward": 0.41,
90088998
"status": "passed",
90098999
"is_paired": false,
90109000
"run_dir": "build_haiku_20260223_124805"
9001+
},
9002+
{
9003+
"started_at": "2026-02-27T12:48:00.755688",
9004+
"reward": 0.22,
9005+
"status": "passed",
9006+
"is_paired": false,
9007+
"run_dir": "ccb_build_haiku_20260227_baseline_gapfill"
90119008
}
90129009
]
90139010
},
@@ -9096,16 +9093,23 @@
90969093
]
90979094
},
90989095
"flink-pricing-window-feat-001": {
9099-
"n_runs": 1,
9100-
"mean_reward": 0.12,
9101-
"std_reward": 0.0,
9096+
"n_runs": 2,
9097+
"mean_reward": 0.3,
9098+
"std_reward": 0.2546,
91029099
"runs": [
91039100
{
91049101
"started_at": "2026-02-23T12:49:14.726784",
91059102
"reward": 0.12,
91069103
"status": "passed",
91079104
"is_paired": false,
91089105
"run_dir": "build_haiku_20260223_124805"
9106+
},
9107+
{
9108+
"started_at": "2026-02-27T12:48:00.755864",
9109+
"reward": 0.48,
9110+
"status": "passed",
9111+
"is_paired": false,
9112+
"run_dir": "ccb_build_haiku_20260227_baseline_gapfill"
91099113
}
91109114
]
91119115
},
@@ -9152,7 +9156,7 @@
91529156
]
91539157
},
91549158
"k8s-noschedule-taint-feat-001": {
9155-
"n_runs": 1,
9159+
"n_runs": 2,
91569160
"mean_reward": 0.7,
91579161
"std_reward": 0.0,
91589162
"runs": [
@@ -9162,6 +9166,13 @@
91629166
"status": "passed",
91639167
"is_paired": false,
91649168
"run_dir": "build_haiku_20260223_124805"
9169+
},
9170+
{
9171+
"started_at": "2026-02-27T02:57:33.488748",
9172+
"reward": 0.7,
9173+
"status": "passed",
9174+
"is_paired": false,
9175+
"run_dir": "ccb_build_haiku_20260227_baseline_gapfill"
91659176
}
91669177
]
91679178
},
@@ -9180,16 +9191,23 @@
91809191
]
91819192
},
91829193
"k8s-score-normalizer-refac-001": {
9183-
"n_runs": 1,
9184-
"mean_reward": 0.58,
9185-
"std_reward": 0.0,
9194+
"n_runs": 2,
9195+
"mean_reward": 0.73,
9196+
"std_reward": 0.2121,
91869197
"runs": [
91879198
{
91889199
"started_at": "2026-02-23T13:03:34.388382",
91899200
"reward": 0.58,
91909201
"status": "passed",
91919202
"is_paired": false,
91929203
"run_dir": "build_haiku_20260223_124805"
9204+
},
9205+
{
9206+
"started_at": "2026-02-27T12:48:00.755826",
9207+
"reward": 0.88,
9208+
"status": "passed",
9209+
"is_paired": false,
9210+
"run_dir": "ccb_build_haiku_20260227_baseline_gapfill"
91939211
}
91949212
]
91959213
},
@@ -9256,6 +9274,20 @@
92569274
}
92579275
]
92589276
},
9277+
"servo-scrollend-event-feat-001": {
9278+
"n_runs": 1,
9279+
"mean_reward": 0.0,
9280+
"std_reward": 0.0,
9281+
"runs": [
9282+
{
9283+
"started_at": "2026-02-27T03:47:58.529355",
9284+
"reward": 0.0,
9285+
"status": "failed",
9286+
"is_paired": false,
9287+
"run_dir": "ccb_build_haiku_20260227_baseline_gapfill"
9288+
}
9289+
]
9290+
},
92599291
"similar-asserts-deps-install-001": {
92609292
"n_runs": 1,
92619293
"mean_reward": 1.0,

0 commit comments

Comments
 (0)