sourcegraph
diff --git a/‎benchmarks/csb_org_crossorg/ccx-crossorg-280/environment/Dockerfile‎
Lines changed: 32 additions & 0 deletions b/‎benchmarks/csb_org_crossorg/ccx-crossorg-280/environment/Dockerfile‎
Lines changed: 32 additions & 0 deletions
diff --git a/‎benchmarks/csb_org_crossorg/ccx-crossorg-280/instruction.md‎
Lines changed: 43 additions & 0 deletions b/‎benchmarks/csb_org_crossorg/ccx-crossorg-280/instruction.md‎
Lines changed: 43 additions & 0 deletions
diff --git a/‎benchmarks/csb_org_crossorg/ccx-crossorg-280/task.toml‎
Lines changed: 29 additions & 0 deletions b/‎benchmarks/csb_org_crossorg/ccx-crossorg-280/task.toml‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎benchmarks/csb_org_crossorg/ccx-crossorg-280/tests/eval.sh‎
Lines changed: 68 additions & 0 deletions b/‎benchmarks/csb_org_crossorg/ccx-crossorg-280/tests/eval.sh‎
Lines changed: 68 additions & 0 deletions
diff --git a/‎benchmarks/csb_org_crossorg/ccx-crossorg-280/tests/oracle_answer.json‎
Lines changed: 31 additions & 0 deletions b/‎benchmarks/csb_org_crossorg/ccx-crossorg-280/tests/oracle_answer.json‎
Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,32 @@
+FROM ubuntu:22.04
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Base tools (alphabetical); apt lists are removed in the same layer so they never reach the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    curl \
+    git \
+    python3 \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /workspace
+
+# Clone local checkout repos (baseline config: agent has local access to these)
+RUN git clone --depth 1 https://github.com/sg-evals/kubernetes--v1.32.0 /workspace/kubernetes--v1.32.0
+RUN git clone --depth 1 https://github.com/sg-evals/grafana--v11.4.0 /workspace/grafana--v11.4.0
+
+# Git identity for agent commits; --system (/etc/gitconfig) covers both root and the claude user created below.
+RUN git config --system user.email "agent@example.com" && \
+    git config --system user.name "Agent" && \
+    git config --system safe.directory '*'
+
+# Create log directories
+RUN mkdir -p /logs/agent /logs/verifier
+
+# Pre-create claude user and set ownership at build time so Harbor's
+# runtime chown is a no-op (avoids 15-30 min delay on large repos).
+RUN (adduser --disabled-password --gecos '' claude 2>/dev/null || true) && \
+    for d in /workspace /app /testbed /logs; do [ -d "$d" ] && chown -R claude:claude "$d"; done || true
+
+ENTRYPOINT []
@@ -0,0 +1,43 @@
+# Prometheus Metrics Exposition Pattern Across Kubernetes and Grafana
+
+## Your Task
+
+Find the Go source files in both kubernetes/kubernetes and grafana/grafana that implement the Prometheus metrics exposition pattern: where metrics are registered using the Prometheus client_golang library, how metric collectors are organized, and where the `/metrics` HTTP endpoint is configured to expose them. Compare how both projects structure their metrics infrastructure.
+
+## Context
+
+You are working on a codebase task involving repos from the crossorg domain. Both Kubernetes and Grafana use the Prometheus client_golang library to expose internal metrics, but they organize their metrics registration and exposition differently. Your goal is to map these patterns across both codebases.
+
+## Available Resources
+
+The local `/workspace/` directory contains shallow clones of sg-evals/kubernetes--v1.32.0 (at `/workspace/kubernetes--v1.32.0`) and sg-evals/grafana--v11.4.0 (at `/workspace/grafana--v11.4.0`).
+
+**Note:** These repositories are also accessible via Sourcegraph MCP tools:
+- `sg-evals/kubernetes--v1.32.0` (kubernetes/kubernetes)
+- `sg-evals/grafana--v11.4.0` (grafana/grafana)
+
+## Output Format
+
+Create a file at `/workspace/answer.json` with your findings in the following structure:
+
+```json
+{
+  "files": [
+    {"repo": "org/repo-name", "path": "relative/path/to/file.go"}
+  ],
+  "symbols": [
+    {"repo": "org/repo-name", "path": "relative/path/to/file.go", "symbol": "SymbolName"}
+  ],
+  "chain": [],
+  "text": "Narrative explanation of your findings, citing repos and file paths."
+}
+```
+
+Include only the fields relevant to this task. Your answer is evaluated against a closed-world oracle — completeness matters.
+
+## Evaluation
+
+Your answer will be scored on:
+- **File recall and precision**: Did you find all relevant files?
+- **Symbol identification**: Correct symbol names and locations?
+- **Keyword presence**: Did you mention key concepts (metrics registry, collectors, exposition)?
@@ -0,0 +1,29 @@
+version = "1.0"
+
+[metadata]
+name = "CCX-crossorg-280"
+description = "Prometheus Metrics Exposition Pattern Across Kubernetes and Grafana"
+license = "Apache-2.0"
+
+[task]
+id = "CCX-crossorg-280"
+repo = "kubernetes/kubernetes"  # NOTE(review): the task also covers grafana/grafana; presumably this is the primary repo — confirm
+category = "cross-org-discovery"
+language = "go"
+difficulty = "hard"
+time_limit_sec = 900
+mcp_suite = "csb_org_crossorg"
+use_case_id = 280
+repo_set_id = "kubernetes-grafana-observability"
+mcp_unique = true
+verification_modes = ["artifact"]
+
+[verification]
+type = "test"
+command = "bash /tests/test.sh"  # NOTE(review): the evaluator shipped with this task is tests/eval.sh — confirm /tests/test.sh exists or is generated
+
+reward_type = "score"
+description = "Prometheus Metrics Exposition Pattern Across Kubernetes and Grafana"
+
+[environment]
+build_timeout_sec = 600.0
@@ -0,0 +1,68 @@
+#!/bin/bash
+# eval.sh — MCP-unique benchmark evaluator for CCX-crossorg-280
+# Exit-code-first (SWE-Factory pattern):
+#   exit 0 — agent produced useful output (composite score > 0)
+#   exit 1 — total failure (composite score == 0, invalid/out-of-range score, or missing answer)
+#
+# Writes /logs/verifier/reward.txt with the composite score [0.0, 1.0]
+
+set -euo pipefail
+
+TASK_ID="CCX-crossorg-280"
+ANSWER_PATH="/workspace/answer.json"
+TASK_SPEC_PATH="/tests/task_spec.json"
+ORACLE_CHECKS="/tests/oracle_checks.py"
+REWARD_PATH="/logs/verifier/reward.txt"
+
+mkdir -p "$(dirname "$REWARD_PATH")"
+
+echo "=== $TASK_ID evaluator ==="
+echo "Task spec: $TASK_SPEC_PATH"
+echo "Answer:    $ANSWER_PATH"
+echo ""
+
+# sg_only mode guard: restore full repo if verifier wrapper exists
+if [ -f /tmp/.sg_only_mode ] && [ -f /tests/sgonly_verifier_wrapper.sh ]; then
+    echo "sg_only mode: sourcing verifier wrapper..."
+    source /tests/sgonly_verifier_wrapper.sh
+fi
+
+# Verify answer file exists
+if [ ! -f "$ANSWER_PATH" ]; then
+    echo "ERROR: answer.json not found at $ANSWER_PATH"
+    echo "0.0" > "$REWARD_PATH"
+    exit 1
+fi
+
+# Validate answer is valid JSON (path is passed via argv, not spliced into the Python source)
+if ! python3 -c "import json, sys; json.load(open(sys.argv[1]))" "$ANSWER_PATH" 2>/dev/null; then
+    echo "ERROR: answer.json is not valid JSON"
+    echo "0.0" > "$REWARD_PATH"
+    exit 1
+fi
+
+echo "answer.json found and valid JSON"
+
+# Run oracle checks; the last line of the combined output is taken as the score
+if [ ! -f "$ORACLE_CHECKS" ]; then
+    echo "ERROR: oracle_checks.py not found at $ORACLE_CHECKS"
+    echo "0.0" > "$REWARD_PATH"
+    exit 1
+fi
+
+echo "Running oracle checks..."
+SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true
+
+# Validate score is a number within [0.0, 1.0]; the range test also rejects nan and inf
+if ! python3 -c "import sys; s = float(sys.argv[1]); sys.exit(0 if 0.0 <= s <= 1.0 else 1)" "$SCORE" 2>/dev/null; then
+    echo "ERROR: oracle_checks.py did not return a valid score: $SCORE"
+    echo "0.0" > "$REWARD_PATH"
+    exit 1
+fi
+
+echo ""
+echo "Composite score: $SCORE"
+echo "$SCORE" > "$REWARD_PATH"
+
+# Exit based on score (SWE-Factory exit-code-first pattern); this is the script's final status
+python3 -c "import sys; sys.exit(0 if float(sys.argv[1]) > 0 else 1)" "$SCORE"
@@ -0,0 +1,31 @@
+{
+  "files": [
+    {"repo": "sg-evals/kubernetes--v1.32.0", "path": "staging/src/k8s.io/component-base/metrics/registry.go"},
+    {"repo": "sg-evals/kubernetes--v1.32.0", "path": "staging/src/k8s.io/component-base/metrics/counter.go"},
+    {"repo": "sg-evals/kubernetes--v1.32.0", "path": "staging/src/k8s.io/component-base/metrics/gauge.go"},
+    {"repo": "sg-evals/kubernetes--v1.32.0", "path": "staging/src/k8s.io/component-base/metrics/histogram.go"},
+    {"repo": "sg-evals/kubernetes--v1.32.0", "path": "staging/src/k8s.io/component-base/metrics/opts.go"},
+    {"repo": "sg-evals/kubernetes--v1.32.0", "path": "staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go"},
+    {"repo": "sg-evals/kubernetes--v1.32.0", "path": "pkg/kubelet/metrics/metrics.go"},
+    {"repo": "sg-evals/kubernetes--v1.32.0", "path": "staging/src/k8s.io/component-base/metrics/legacyregistry/registry.go"},
+    {"repo": "sg-evals/grafana--v11.4.0", "path": "pkg/infra/metrics/metrics.go"},
+    {"repo": "sg-evals/grafana--v11.4.0", "path": "pkg/api/metrics.go"},
+    {"repo": "sg-evals/grafana--v11.4.0", "path": "pkg/infra/metrics/service.go"},
+    {"repo": "sg-evals/grafana--v11.4.0", "path": "pkg/services/ngalert/metrics/metrics.go"}
+  ],
+  "symbols": [
+    {"repo": "sg-evals/kubernetes--v1.32.0", "path": "staging/src/k8s.io/component-base/metrics/registry.go", "symbol": "KubeRegistry"},
+    {"repo": "sg-evals/kubernetes--v1.32.0", "path": "staging/src/k8s.io/component-base/metrics/counter.go", "symbol": "Counter"},
+    {"repo": "sg-evals/kubernetes--v1.32.0", "path": "staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go", "symbol": "requestCounter"},
+    {"repo": "sg-evals/kubernetes--v1.32.0", "path": "staging/src/k8s.io/component-base/metrics/legacyregistry/registry.go", "symbol": "Register"},
+    {"repo": "sg-evals/grafana--v11.4.0", "path": "pkg/infra/metrics/metrics.go", "symbol": "MStatTotalDashboards"},
+    {"repo": "sg-evals/grafana--v11.4.0", "path": "pkg/api/metrics.go", "symbol": "MApiStatus"},
+    {"repo": "sg-evals/grafana--v11.4.0", "path": "pkg/infra/metrics/service.go", "symbol": "InternalMetricsService"}
+  ],
+  "text": "Both Kubernetes and Grafana use the Prometheus client_golang library for metrics exposition but with distinct architectural patterns. Kubernetes wraps the Prometheus client in staging/src/k8s.io/component-base/metrics/ with a custom KubeRegistry that adds stability-level annotations and deprecation tracking. Individual subsystems (apiserver, kubelet, scheduler) register their own metrics using this wrapper. Grafana uses a more centralized approach with pkg/infra/metrics/ containing global metric definitions registered at startup, plus per-service metrics (ngalert, api). Both expose metrics via /metrics HTTP endpoints using promhttp handlers.",
+  "_metadata": {
+    "oracle_type": "file_set_match",
+    "discovery_method": "manual_code_review",
+    "verified_at": "2026-03-03"
+  }
+}