Skip to content

Commit ecdeb5a

Browse files
committed
evals: remove efficiency metric
1 parent d643b50 commit ecdeb5a

File tree

7 files changed: 0 additions & 36 deletions

evals/git-evals/email-eval-results.ts

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -33,11 +33,6 @@ function formatEvalSummaryForEmail(
3333
(sum, result) => sum + result.overall_metrics.average_completion,
3434
0,
3535
) / evalResults.length
36-
const avgEfficiency =
37-
evalResults.reduce(
38-
(sum, result) => sum + result.overall_metrics.average_efficiency,
39-
0,
40-
) / evalResults.length
4136
const avgCodeQuality =
4237
evalResults.reduce(
4338
(sum, result) => sum + result.overall_metrics.average_code_quality,
@@ -70,14 +65,12 @@ function formatEvalSummaryForEmail(
7065
• Success Rate: ${successfulRuns}/${totalRuns} (${((successfulRuns / totalRuns) * 100).toFixed(1)}%)
7166
• Overall Score: ${avgOverallScore.toFixed(2)}/10
7267
• Completion: ${avgCompletion.toFixed(2)}/10
73-
• Efficiency: ${avgEfficiency.toFixed(2)}/10
7468
• Code Quality: ${avgCodeQuality.toFixed(2)}/10
7569
7670
💰 COST & PERFORMANCE METRICS
7771
• Average Cost per Run: ${avgCostUsd.toFixed(4)}
7872
• Total Cost: ${totalCostUsd.toFixed(2)}
7973
• Average Runtime: ${avgRuntimeSec.toFixed(1)} seconds
80-
• Cost per Point (Overall Score): ${(avgCostUsd / avgOverallScore).toFixed(4)}
8174
8275
📈 BY EVAL SET:
8376
${evalResults

evals/git-evals/judge-git-eval.ts

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -55,14 +55,6 @@ function buildAnalysisPrompt(
5555
${evalRun.eval_commit.spec}
5656
[/SPEC]
5757
58-
[TIMING_INFORMATION]
59-
Task Duration: ${durationSeconds} seconds (${evalRun.durationMs}ms)
60-
[/TIMING_INFORMATION]
61-
62-
[COST_INFORMATION]
63-
Total Cost: ${evalRun.costUsd.toFixed(2)} USD
64-
[/COST_INFORMATION]
65-
6658
[GROUND_TRUTH_CHANGES]
6759
${groundTruthChanges}
6860
[/GROUND_TRUTH_CHANGES]
@@ -71,10 +63,6 @@ ${groundTruthChanges}
7163
${codebuffChanges}
7264
[/CHANGES_BY_CODEBUFF]
7365
74-
[TRACE]
75-
${traceContent}
76-
[/TRACE]
77-
7866
[ERROR]
7967
${evalRun.error ? evalRun.error : 'None'}
8068
[/ERROR]
@@ -84,15 +72,13 @@ Please analyze the trace of the implementation attempt and provide:
8472
2. Key strengths and weaknesses of the implementation
8573
3. Numerical scores (0-10):
8674
- Completion: How completely and correctly was the spec implemented compared to the ground truth changes?
87-
- Efficiency: How efficiently did Codebuff respond to the Agent's prompts without taking unnecessary steps? Speed is important! Consider the task duration of ${durationSeconds} seconds.
8875
- Code Quality: How well-structured, maintainable and idiomatic is the code?
8976
- Overall: Combined assessment of the implementation quality
9077
9178
Focus on:
9279
- Correctness and completeness compared to the ground truth changes
9380
- Quality of the code produced
9481
- Minimal changes: it's better to change as little code as possible to accomplish what the agent prompted
95-
- Speed and efficiency: did Codebuff make unnecessary changes or take unnecessary steps? The task took ${durationSeconds} seconds - was this reasonable for the complexity?
9682
- Error: If there was an error encountered, you should give a very low score.
9783
9884
Provide your response in a structured format with analysis, lists of strengths and weaknesses, and metrics.`

evals/git-evals/post-eval-analysis.ts

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@ function buildAnalysisPrompt(evalResult: FullEvalLog): string {
5252
const metricsSection = `
5353
Overall Performance Metrics:
5454
- Average Completion Score: ${metrics.average_completion.toFixed(2)}/10
55-
- Average Efficiency Score: ${metrics.average_efficiency.toFixed(2)}/10
5655
- Average Code Quality Score: ${metrics.average_code_quality.toFixed(2)}/10
5756
- Average Overall Score: ${metrics.average_overall.toFixed(2)}/10
5857
- Average Duration: ${(metrics.average_duration_ms / 1000).toFixed(1)} seconds
@@ -73,7 +72,6 @@ Error: ${run.error || 'None'}
7372
7473
Scores:
7574
- Completion: ${judging.metrics.completionScore}/10
76-
- Efficiency: ${judging.metrics.efficiencyScore}/10
7775
- Code Quality: ${judging.metrics.codeQualityScore}/10
7876
- Overall: ${judging.metrics.overallScore}/10
7977

evals/git-evals/run-eval-set.ts

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -278,9 +278,6 @@ async function runEvalSet(options: {
278278
console.log(
279279
` Completion: ${metrics.average_completion.toFixed(2)}/10`,
280280
)
281-
console.log(
282-
` Efficiency: ${metrics.average_efficiency.toFixed(2)}/10`,
283-
)
284281
console.log(
285282
` Code Quality: ${metrics.average_code_quality.toFixed(2)}/10`,
286283
)
@@ -368,7 +365,6 @@ async function runEvalSet(options: {
368365
numCases: evalResult?.overall_metrics?.total_runs,
369366
avgScore: evalResult?.overall_metrics?.average_overall,
370367
avgCompletion: evalResult?.overall_metrics?.average_completion,
371-
avgEfficiency: evalResult?.overall_metrics?.average_efficiency,
372368
avgCodeQuality: evalResult?.overall_metrics?.average_code_quality,
373369
avgDuration: evalResult?.overall_metrics?.average_duration_ms,
374370
suite: resultWrapper.name,

evals/git-evals/run-git-evals.ts

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,6 @@ Explain your reasoning in detail.`,
254254
weaknesses: ['Judging process encountered an error'],
255255
metrics: {
256256
completionScore: 0,
257-
efficiencyScore: 0,
258257
codeQualityScore: 0,
259258
overallScore: 0,
260259
},
@@ -620,11 +619,6 @@ function calculateOverallMetrics(evalRuns: EvalRunJudged[]) {
620619
(sum, run) => sum + (run.judging_results.metrics.completionScore || 0),
621620
0,
622621
) / evalRuns.length,
623-
average_efficiency:
624-
evalRuns.reduce(
625-
(sum, run) => sum + (run.judging_results.metrics.efficiencyScore || 0),
626-
0,
627-
) / evalRuns.length,
628622
average_code_quality:
629623
evalRuns.reduce(
630624
(sum, run) => sum + (run.judging_results.metrics.codeQualityScore || 0),

evals/git-evals/run-single-eval.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,6 @@ async function runSingleEvalTask(options: {
212212
const metrics = result.judging_results.metrics
213213
console.log(` Overall Score: ${metrics.overallScore.toFixed(2)}/10`)
214214
console.log(` Completion: ${metrics.completionScore.toFixed(2)}/10`)
215-
console.log(` Efficiency: ${metrics.efficiencyScore.toFixed(2)}/10`)
216215
console.log(` Code Quality: ${metrics.codeQualityScore.toFixed(2)}/10`)
217216

218217
if (result.judging_results.strengths.length > 0) {

evals/git-evals/types.ts

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,6 @@ export interface FullEvalLog {
6464
average_runtime_sec: number
6565
average_cost_usd: number
6666
average_completion: number
67-
average_efficiency: number
6867
average_code_quality: number
6968
average_overall: number
7069
average_duration_ms: number
@@ -95,7 +94,6 @@ export const JudgingAnalysisSchema = z.object({
9594
weaknesses: z.array(z.string()),
9695
metrics: z.object({
9796
completionScore: z.number().min(0).max(10),
98-
efficiencyScore: z.number().min(0).max(10),
9997
codeQualityScore: z.number().min(0).max(10),
10098
overallScore: z.number().min(0).max(10),
10199
}),

0 commit comments

Comments
 (0)