evalops
diff --git a/‎TODO.md‎
Lines changed: 1 addition & 1 deletion b/‎TODO.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/commands/eval/command.rs‎
Lines changed: 15 additions & 1 deletion b/‎src/commands/eval/command.rs‎
Lines changed: 15 additions & 1 deletion
diff --git a/‎src/commands/eval/command/batch.rs‎
Lines changed: 2 additions & 1 deletion b/‎src/commands/eval/command/batch.rs‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎src/commands/eval/command/options.rs‎
Lines changed: 6 additions & 1 deletion b/‎src/commands/eval/command/options.rs‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎src/commands/eval/command/report.rs‎
Lines changed: 1 addition & 1 deletion b/‎src/commands/eval/command/report.rs‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/commands/eval/report/trend.rs‎
Lines changed: 43 additions & 3 deletions b/‎src/commands/eval/report/trend.rs‎
Lines changed: 43 additions & 3 deletions
diff --git a/‎src/commands/eval/runner.rs‎
Lines changed: 1 addition & 1 deletion b/‎src/commands/eval/runner.rs‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/commands/eval/runner/execute.rs‎
Lines changed: 1 addition & 1 deletion b/‎src/commands/eval/runner/execute.rs‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/commands/eval/runner/execute/artifact.rs‎
Lines changed: 112 additions & 1 deletion b/‎src/commands/eval/runner/execute/artifact.rs‎
Lines changed: 112 additions & 1 deletion
diff --git a/‎src/commands/feedback_eval/command.rs‎
Lines changed: 2 additions & 0 deletions b/‎src/commands/feedback_eval/command.rs‎
Lines changed: 2 additions & 0 deletions
@@ -129,7 +129,7 @@ This roadmap is derived from deep research into Greptile's public docs, blog, MC
 81. [x] Split `src/server/api.rs` by domain so the growing platform API stays maintainable.
 82. [x] Split `src/server/state.rs` into session lifecycle, persistence, progress, and GitHub coordination modules.
 83. [x] Add queue depth and worker saturation metrics for long-running review and eval jobs.
-84. [ ] Add retention policies for review artifacts, eval artifacts, and trend histories.
+84. [x] Add retention policies for review artifacts, eval artifacts, and trend histories.
 85. [x] Add storage migrations for richer comment lifecycle and reinforcement schemas.
 86. [ ] Add deployment docs for self-hosted review + analytics + trend retention setups.
 87. [ ] Add secret-management guidance and validation for multi-provider enterprise installs.
 
@@ -10,9 +10,11 @@ mod report;
 use anyhow::Result;
 use std::collections::HashSet;
 use std::path::{Path, PathBuf};
+use tracing::info;
 
 use crate::config;
 
+use super::runner::prune_eval_artifacts;
 use super::{EvalRunFilters, EvalRunMetadata, EvalRunOptions};
 use batch::run_eval_batch;
 use fixtures::run_eval_fixtures;
@@ -29,13 +31,25 @@ pub async fn eval_command(
     if options.trend_file.is_none() {
         options.trend_file = Some(config.eval_trend_path.clone());
     }
+    if let Some(artifact_dir) = options.artifact_dir.as_deref() {
+        let pruned =
+            prune_eval_artifacts(artifact_dir, config.retention.eval_artifact_max_age_days).await?;
+        if pruned > 0 {
+            info!(
+                artifact_dir = %artifact_dir.display(),
+                pruned,
+                "Pruned stale eval artifacts"
+            );
+        }
+    }
     ensure_frontier_eval_models(&config, &options)?;
     if options.repeat > 1 || !options.matrix_models.is_empty() {
         return run_eval_batch(config, &fixtures_dir, output_path.as_deref(), &options).await;
     }
 
     let execution = run_eval_fixtures(&config, &fixtures_dir, &options).await?;
-    let prepared_options = prepare_eval_options(&options)?;
+    let prepared_options =
+        prepare_eval_options(&options, config.retention.trend_history_max_entries)?;
     let report_output_path = output_path.clone().or_else(|| {
         options
             .artifact_dir
 
@@ -39,7 +39,8 @@ pub(super) async fn run_eval_batch(
     options: &EvalRunOptions,
 ) -> Result<()> {
     config.verification.fail_open = true;
-    let prepared_options = prepare_eval_options(options)?;
+    let prepared_options =
+        prepare_eval_options(options, config.retention.trend_history_max_entries)?;
     let models = matrix_models(&config, options);
     let repeat_total = options.repeat.max(1);
     let multi_model = models.len() > 1;
 
@@ -12,9 +12,13 @@ pub(super) struct PreparedEvalOptions {
     pub(super) baseline: Option<EvalReport>,
     pub(super) threshold_options: EvalThresholdOptions,
     pub(super) trend_path: Option<std::path::PathBuf>,
+    pub(super) trend_max_entries: usize,
 }
 
-pub(super) fn prepare_eval_options(options: &EvalRunOptions) -> Result<PreparedEvalOptions> {
+pub(super) fn prepare_eval_options(
+    options: &EvalRunOptions,
+    trend_max_entries: usize,
+) -> Result<PreparedEvalOptions> {
     let baseline = match options.baseline_report.as_deref() {
         Some(path) => Some(load_eval_report(path)?),
         None => None,
@@ -37,6 +41,7 @@ pub(super) fn prepare_eval_options(options: &EvalRunOptions) -> Result<PreparedE
             max_rule_f1_drop: max_rule_drop_thresholds,
         },
         trend_path: options.trend_file.clone(),
+        trend_max_entries,
     })
 }
 
 
@@ -45,7 +45,7 @@ pub(super) async fn materialize_eval_report(
         write_eval_report(&report, path).await?;
     }
     if let Some(path) = prepared_options.trend_path.as_deref() {
-        update_eval_quality_trend(&report, path).await?;
+        update_eval_quality_trend(&report, path, prepared_options.trend_max_entries).await?;
     }
 
     Ok(report)
 
@@ -8,6 +8,7 @@ use super::super::EvalReport;
 pub(in super::super) async fn update_eval_quality_trend(
     report: &EvalReport,
     path: &Path,
+    max_entries: usize,
 ) -> Result<()> {
     let Some(entry) = trend_entry_for_report(report) else {
         return Ok(());
@@ -23,6 +24,7 @@ pub(in super::super) async fn update_eval_quality_trend(
         QualityTrend::new()
     };
     trend.entries.push(entry);
+    trim_trend_entries(&mut trend.entries, max_entries);
 
     if let Some(parent) = path.parent() {
         tokio::fs::create_dir_all(parent)
@@ -35,6 +37,13 @@ pub(in super::super) async fn update_eval_quality_trend(
     Ok(())
 }
 
+/// Caps `entries` at `max_entries` by discarding items from the front.
+///
+/// The tail of the vector is left untouched, so when callers append in
+/// chronological order the most recently pushed entries survive. A vector
+/// already within the limit is not modified; a `max_entries` of zero empties it.
+fn trim_trend_entries(entries: &mut Vec<TrendEntry>, max_entries: usize) {
+    let overflow = entries.len().saturating_sub(max_entries);
+    // Draining an empty prefix removes nothing, so no separate length check is needed.
+    entries.drain(..overflow);
+}
+
 fn trend_entry_for_report(report: &EvalReport) -> Option<TrendEntry> {
     let result = benchmark_result_for_report(report)?;
     let verification_health = report.verification_health.as_ref();
@@ -213,12 +222,17 @@ mod tests {
         let dir = tempdir().unwrap();
         let path = dir.path().join("trend.json");
 
-        update_eval_quality_trend(&sample_report(Some("first"), "2026-03-13T00:00:00Z"), &path)
-            .await
-            .unwrap();
+        update_eval_quality_trend(
+            &sample_report(Some("first"), "2026-03-13T00:00:00Z"),
+            &path,
+            200,
+        )
+        .await
+        .unwrap();
         update_eval_quality_trend(
             &sample_report(Some("second"), "2026-03-13T00:10:00Z"),
             &path,
+            200,
         )
         .await
         .unwrap();
@@ -258,4 +272,30 @@ mod tests {
             0.8
         );
     }
+
+    /// With a cap of one entry, appending a second report must evict the first.
+    #[tokio::test]
+    async fn update_eval_quality_trend_trims_old_entries() {
+        let workspace = tempdir().unwrap();
+        let trend_file = workspace.path().join("trend.json");
+
+        // Two runs ten minutes apart, both written with a history cap of 1.
+        let runs = [
+            ("first", "2026-03-13T00:00:00Z"),
+            ("second", "2026-03-13T00:10:00Z"),
+        ];
+        for (label, timestamp) in runs {
+            let report = sample_report(Some(label), timestamp);
+            update_eval_quality_trend(&report, &trend_file, 1)
+                .await
+                .unwrap();
+        }
+
+        let raw = tokio::fs::read_to_string(&trend_file).await.unwrap();
+        let persisted = QualityTrend::from_json(&raw).unwrap();
+        assert_eq!(persisted.entries.len(), 1);
+        assert_eq!(persisted.entries[0].label.as_deref(), Some("second"));
+    }
 }
@@ -4,5 +4,5 @@ mod execute;
 mod matching;
 
 pub(super) use execute::{
-    describe_eval_fixture_graph, run_eval_fixture, EvalFixtureArtifactContext,
+    describe_eval_fixture_graph, prune_eval_artifacts, run_eval_fixture, EvalFixtureArtifactContext,
 };
@@ -18,7 +18,7 @@ use self::loading::prepare_fixture_execution;
 use self::result::build_fixture_result;
 use super::super::{EvalFixtureResult, LoadedEvalFixture};
 
-pub(in super::super) use self::artifact::EvalFixtureArtifactContext;
+pub(in super::super) use self::artifact::{prune_eval_artifacts, EvalFixtureArtifactContext};
 
 pub(crate) fn describe_eval_fixture_graph(
     repro_validate: bool,
 
@@ -1,4 +1,4 @@
-use anyhow::Result;
+use anyhow::{Context, Result};
 use serde::Serialize;
 use std::path::PathBuf;
 
@@ -100,6 +100,76 @@ pub(super) async fn maybe_write_fixture_artifact(
     Ok(Some(artifact_path.display().to_string()))
 }
 
+/// Prunes eval artifacts under `artifact_dir` without blocking the async runtime.
+///
+/// The synchronous filesystem walk is handed to `tokio::task::spawn_blocking`,
+/// operating on an owned copy of the path.
+///
+/// Returns the count reported by the blocking walk.
+///
+/// # Errors
+///
+/// Fails with the context "eval artifact retention task failed" when the
+/// spawned task cannot be joined, and forwards any error from the walk itself.
+pub(crate) async fn prune_eval_artifacts(
+    artifact_dir: &std::path::Path,
+    max_age_days: i64,
+) -> Result<usize> {
+    // The blocking closure must own its data, so it gets its own path buffer.
+    let owned_dir = artifact_dir.to_path_buf();
+    let walk = tokio::task::spawn_blocking(move || {
+        prune_eval_artifacts_blocking(&owned_dir, max_age_days)
+    });
+    walk.await.context("eval artifact retention task failed")?
+}
+
+/// Synchronous half of [`prune_eval_artifacts`]: derives the age cutoff and walks the tree.
+///
+/// Returns `Ok(0)` immediately when `artifact_dir` does not exist.
+/// `max_age_days` is clamped to at least one day, so zero or negative values
+/// behave like a one-day retention window.
+///
+/// # Errors
+///
+/// Forwards any error returned by the directory walk.
+fn prune_eval_artifacts_blocking(
+    artifact_dir: &std::path::Path,
+    max_age_days: i64,
+) -> Result<usize> {
+    const SECONDS_PER_DAY: u64 = 86_400;
+
+    if !artifact_dir.exists() {
+        return Ok(0);
+    }
+
+    // After the clamp the value is strictly positive, so `unsigned_abs` is a
+    // lossless conversion. The multiplication saturates rather than overflowing
+    // (debug panic / release wrap-around) for absurdly large retention windows.
+    let max_age_secs = max_age_days
+        .max(1)
+        .unsigned_abs()
+        .saturating_mul(SECONDS_PER_DAY);
+    // If the window reaches further back than `SystemTime` can represent, fall
+    // back to the Unix epoch as the cutoff.
+    let cutoff = std::time::SystemTime::now()
+        .checked_sub(std::time::Duration::from_secs(max_age_secs))
+        .unwrap_or(std::time::SystemTime::UNIX_EPOCH);
+    prune_eval_artifacts_before(artifact_dir, cutoff)
+}
+
+/// Prunes everything under `artifact_dir` that was last modified before `cutoff`.
+///
+/// `artifact_dir` is handed to the recursive walk as the preserved root.
+/// Returns the total the walk accumulated in its removal counter.
+///
+/// # Errors
+///
+/// Forwards any error returned by the recursive walk.
+fn prune_eval_artifacts_before(
+    artifact_dir: &std::path::Path,
+    cutoff: std::time::SystemTime,
+) -> Result<usize> {
+    let mut removed_count = 0usize;
+    prune_eval_artifacts_tree(artifact_dir, cutoff, true, &mut removed_count)
+        .map(|()| removed_count)
+}
+
+/// Recursively prunes `path`, adding every deleted file and directory to `removed`.
+///
+/// Regular files whose modification time is strictly before `cutoff` are
+/// deleted. Entries that are neither directories nor regular files (for
+/// example symlinks, which `symlink_metadata` does not follow) are left alone,
+/// as are files whose modification time cannot be read. Subdirectories are
+/// pruned depth-first and removed once they are empty, regardless of their own
+/// age. When `preserve_root` is `true`, `path` itself is kept even if it ends
+/// up empty.
+///
+/// # Errors
+///
+/// Propagates the first I/O error hit while listing, inspecting, or deleting
+/// an entry; entries removed before the failure stay removed.
+fn prune_eval_artifacts_tree(
+    path: &std::path::Path,
+    cutoff: std::time::SystemTime,
+    preserve_root: bool,
+    removed: &mut usize,
+) -> Result<()> {
+    for entry in std::fs::read_dir(path)? {
+        let entry_path = entry?.path();
+        // Inspect the entry itself rather than whatever a symlink points at,
+        // so the walk never leaves the artifact tree.
+        let metadata = std::fs::symlink_metadata(&entry_path)?;
+
+        if metadata.is_dir() {
+            // The recursive call deletes the subdirectory itself if it ends up empty.
+            prune_eval_artifacts_tree(&entry_path, cutoff, false, removed)?;
+        } else if metadata.is_file()
+            && matches!(metadata.modified(), Ok(modified) if modified < cutoff)
+        {
+            std::fs::remove_file(&entry_path)?;
+            *removed += 1;
+        }
+    }
+
+    // Only non-root directories are cleaned up once nothing is left inside them.
+    if !preserve_root && std::fs::read_dir(path)?.next().is_none() {
+        std::fs::remove_dir(path)?;
+        *removed += 1;
+    }
+
+    Ok(())
+}
+
 fn sanitize_path_segment(value: &str) -> String {
     let mut sanitized = value
         .trim()
@@ -126,3 +196,44 @@ fn sanitize_path_segment(value: &str) -> String {
         sanitized
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tempfile::tempdir;
+
+    /// Creates `<root>/fixtures/<file_name>` containing `{}` and returns the
+    /// `(directory, file)` paths.
+    fn seed_artifact(
+        root: &std::path::Path,
+        file_name: &str,
+    ) -> (std::path::PathBuf, std::path::PathBuf) {
+        let fixtures_dir = root.join("fixtures");
+        std::fs::create_dir_all(&fixtures_dir).unwrap();
+        let file_path = fixtures_dir.join(file_name);
+        std::fs::write(&file_path, "{}").unwrap();
+        (fixtures_dir, file_path)
+    }
+
+    #[test]
+    fn prune_eval_artifacts_removes_stale_files_and_empty_dirs() {
+        let root = tempdir().unwrap();
+        let (fixtures_dir, stale_file) = seed_artifact(root.path(), "old.json");
+
+        // A cutoff one second in the future makes the freshly written file count as stale.
+        let cutoff = std::time::SystemTime::now() + std::time::Duration::from_secs(1);
+        let removed = prune_eval_artifacts_before(root.path(), cutoff).unwrap();
+
+        // One file plus the directory it left empty.
+        assert_eq!(removed, 2);
+        assert!(!stale_file.exists());
+        assert!(!fixtures_dir.exists());
+    }
+
+    #[test]
+    fn prune_eval_artifacts_keeps_recent_files() {
+        let root = tempdir().unwrap();
+        let (fixtures_dir, fresh_file) = seed_artifact(root.path(), "recent.json");
+
+        // Nothing written just now can predate the Unix epoch.
+        let cutoff = std::time::SystemTime::UNIX_EPOCH;
+        let removed = prune_eval_artifacts_before(root.path(), cutoff).unwrap();
+
+        assert_eq!(removed, 0);
+        assert!(fresh_file.exists());
+        assert!(fixtures_dir.exists());
+    }
+}
@@ -13,6 +13,7 @@ pub async fn feedback_eval_command(
     input_path: PathBuf,
     output_path: Option<PathBuf>,
     trend_path: Option<PathBuf>,
+    trend_max_entries: usize,
     confidence_threshold: f32,
     eval_report_path: Option<PathBuf>,
 ) -> Result<()> {
@@ -25,6 +26,7 @@ pub async fn feedback_eval_command(
         &loaded,
         output_path.as_deref(),
         trend_path.as_deref(),
+        trend_max_entries,
         confidence_threshold,
         eval_report.as_ref(),
     )
Original file line number	Diff line number	Diff line change
`@@ -45,7 +45,7 @@ pub(super) async fn materialize_eval_report(`
`45`	`45`	`write_eval_report(&report, path).await?;`
`46`	`46`	`}`
`47`	`47`	`if let Some(path) = prepared_options.trend_path.as_deref() {`
`48`		`- update_eval_quality_trend(&report, path).await?;`
	`48`	`+ update_eval_quality_trend(&report, path, prepared_options.trend_max_entries).await?;`
`49`	`49`	`}`
`50`	`50`
`51`	`51`	`Ok(report)`