Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions challenge-benchmark-holdout-leakage-guard/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Challenge Benchmark Holdout Leakage Guard

This module adds a benchmark holdout leakage guard for SCIBASE issue
[#18](https://github.com/SCIBASE-AI/SCIBASE.AI/issues/18). It protects
scientific bounty scoring by checking whether private holdout data was exposed
before leaderboard or award decisions.

The slice is intentionally narrow. It does not duplicate rubric readiness,
milestone progress, evidence freeze, IP redaction, payout eligibility, reviewer
consensus, solver prequalification, data-room access, clarification freeze,
award transparency, deliverable acceptance, or reproducibility environment
work.

## What It Checks

- Holdout artifacts have stable hashes before scoring.
- Holdout release events use sealed scoring channels only.
- Holdouts are not released before the submission deadline.
- Solver workspaces are not granted sealed holdout paths before the submission deadline.
- Public baseline hashes do not match holdout hashes.
- Sponsor exceptions are approved outside the freeze window.
- Submission packages do not contain holdout artifact hashes.

## Local Usage

```bash
cd challenge-benchmark-holdout-leakage-guard
npm run check
npm test
npm run demo
```

`npm run demo` writes reviewer artifacts under `reports/`:

- `benchmark-leakage-packet.json`
- `benchmark-leakage-report.md`
- `summary.svg`
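- `demo.mp4` (written only when `ffmpeg` is available on `PATH`; otherwise the demo prints a warning and still writes the other artifacts)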

All examples use synthetic challenge metadata. No external services, accounts,
private challenge files, or payment information are required.
26 changes: 26 additions & 0 deletions challenge-benchmark-holdout-leakage-guard/acceptance-notes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Acceptance Notes

## Reviewer Checklist

- Self-contained under `challenge-benchmark-holdout-leakage-guard/`.
- Dependency-free Node.js implementation.
- Synthetic challenge metadata only.
- Tests cover ready-to-score, sponsor-review, and quarantine decisions.
- Demo artifacts include JSON, Markdown, SVG, and MP4 outputs.

## Commands Run

```bash
npm run check
npm test
npm run demo
ffprobe -v error -show_entries format=duration,size -show_entries stream=codec_name,width,height -of default=noprint_wrappers=1 reports/demo.mp4
git diff --check
```

## Limitations

- This is a deterministic pre-scoring guard, not a live challenge storage or
data-room integration.
- Production integration should replace sample policy values and synthetic
events with signed SCIBASE challenge audit logs.
80 changes: 80 additions & 0 deletions challenge-benchmark-holdout-leakage-guard/demo.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
const fs = require("node:fs")
const path = require("node:path")
const { spawnSync } = require("node:child_process")
const { evaluateChallengePortfolio } = require("./index")
const { leakagePolicy, challenges } = require("./sample-data")

// Demo driver: evaluates the synthetic challenge portfolio and writes the
// reviewer artifacts (JSON packet, Markdown report, SVG summary, MP4 clip)
// into the local reports/ directory.
const reportsDir = path.join(__dirname, "reports")
const reportPath = (fileName) => path.join(reportsDir, fileName)

fs.mkdirSync(reportsDir, { recursive: true })

const packet = evaluateChallengePortfolio({ challenges, leakagePolicy })
const summary = packet.summary

// 1. Machine-readable packet, pretty-printed with a trailing newline.
const packetJson = JSON.stringify(packet, null, 2)
fs.writeFileSync(reportPath("benchmark-leakage-packet.json"), `${packetJson}\n`)

// 2. Markdown report: portfolio totals followed by one section per decision.
const reportLines = [
  "# Challenge Benchmark Holdout Leakage Guard Report",
  "",
  `Generated challenges: ${summary.totalChallenges}`,
  `Ready to score: ${summary.score}`,
  `Needs review: ${summary.review}`,
  `Quarantined: ${summary.quarantine}`,
  `Scoring actions: ${summary.scoringActions}`,
  `Audit digest: \`${packet.audit.digest}\``,
  "",
  "## Challenge Decisions",
]

for (const decision of packet.decisions) {
  const findingCodes = decision.findings.map((item) => item.code).join(", ")
  const firstAction = decision.scoringActions[0]
  reportLines.push(
    "",
    `### ${decision.id}: ${decision.title}`,
    `- Status: ${decision.status}`,
    `- Holdout artifacts: ${decision.holdoutArtifacts}`,
    `- Release events: ${decision.releaseEvents}`,
    `- Solver grants: ${decision.solverWorkspaceGrants}`,
    `- Findings: ${findingCodes || "none"}`,
    `- First action: ${firstAction?.message || "none"}`,
  )
}

// The trailing empty entry makes the joined report end with a newline.
reportLines.push("")
fs.writeFileSync(reportPath("benchmark-leakage-report.md"), reportLines.join("\n"))

// 3. SVG summary card: one tile per decision status plus a digest preview.
const digestPreview = packet.audit.digest.slice(0, 24)
const summaryCard = `<svg xmlns="http://www.w3.org/2000/svg" width="960" height="540" viewBox="0 0 960 540">
<rect width="960" height="540" fill="#111827"/>
<text x="48" y="78" fill="#f9fafb" font-family="Arial" font-size="34" font-weight="700">Challenge Benchmark Holdout Leakage Guard</text>
<text x="48" y="124" fill="#cbd5e1" font-family="Arial" font-size="18">Synthetic scoring readiness packet for SCIBASE issue #18</text>
<rect x="48" y="170" width="250" height="150" rx="14" fill="#0f766e"/>
<text x="78" y="230" fill="#ecfeff" font-family="Arial" font-size="56" font-weight="700">${summary.score}</text>
<text x="78" y="270" fill="#ccfbf1" font-family="Arial" font-size="22">score</text>
<rect x="355" y="170" width="250" height="150" rx="14" fill="#b45309"/>
<text x="385" y="230" fill="#fff7ed" font-family="Arial" font-size="56" font-weight="700">${summary.review}</text>
<text x="385" y="270" fill="#ffedd5" font-family="Arial" font-size="22">review</text>
<rect x="662" y="170" width="250" height="150" rx="14" fill="#991b1b"/>
<text x="692" y="230" fill="#fef2f2" font-family="Arial" font-size="56" font-weight="700">${summary.quarantine}</text>
<text x="692" y="270" fill="#fee2e2" font-family="Arial" font-size="22">quarantine</text>
<text x="48" y="390" fill="#e5e7eb" font-family="Arial" font-size="20">Checks: holdout hashes, release channels, solver grants, public baselines, exceptions, submission hashes.</text>
<text x="48" y="430" fill="#9ca3af" font-family="Arial" font-size="16">Digest ${digestPreview}...</text>
</svg>
`
fs.writeFileSync(reportPath("summary.svg"), summaryCard)

// 4. MP4 clip rendered by an external ffmpeg binary: a solid background with
// three status-coloured boxes and an accent bar drawn by chained drawbox filters.
const videoFilter = [
  "drawbox=x=48:y=170:w=250:h=150:color=0x0f766e@1:t=fill",
  "drawbox=x=355:y=170:w=250:h=150:color=0xb45309@1:t=fill",
  "drawbox=x=662:y=170:w=250:h=150:color=0x991b1b@1:t=fill",
  "drawbox=x=48:y=370:w=864:h=18:color=0xa78bfa@1:t=fill",
].join(",")

const ffmpegArgs = [
  "-y",
  "-f",
  "lavfi",
  "-i",
  "color=c=0x111827:s=960x540:d=6:r=15",
  "-vf",
  videoFilter,
  "-pix_fmt",
  "yuv420p",
  reportPath("demo.mp4"),
]

const videoRun = spawnSync("ffmpeg", ffmpegArgs, { stdio: "ignore" })
const videoGenerated = videoRun.status === 0

// Video rendering is best-effort: any non-zero or missing exit status only warns.
if (!videoGenerated) {
  console.warn("ffmpeg video generation failed; summary.svg and JSON/Markdown reports were still generated.")
}

console.log(`Wrote benchmark leakage artifacts to ${reportsDir}`)
186 changes: 186 additions & 0 deletions challenge-benchmark-holdout-leakage-guard/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
const crypto = require("node:crypto")

/**
 * Serialize a value to JSON text with object keys sorted, so structurally
 * equal values always produce the same string (used as digest input).
 *
 * Apart from key ordering and whitespace, the output follows JSON.stringify:
 * - object properties whose value is undefined, a function or a symbol are omitted;
 * - such values inside arrays (and array holes) are written as null;
 * - objects exposing toJSON() (for example Date) are serialized through it.
 * This keeps a digest reproducible from a packet that was written with
 * JSON.stringify and parsed back.
 *
 * @param {*} value - Any JSON-compatible value.
 * @returns {string|undefined} Canonical JSON text, or undefined when the
 *   top-level value itself has no JSON representation (same as JSON.stringify).
 */
function stableJson(value) {
  const isObject = value !== null && typeof value === "object"

  if (isObject && typeof value.toJSON === "function") {
    return stableJson(value.toJSON())
  }

  if (Array.isArray(value)) {
    // Array.from visits holes too, so sparse arrays serialize like JSON.stringify.
    const items = Array.from(value, (item) => stableJson(item) ?? "null")
    return `[${items.join(",")}]`
  }

  if (isObject) {
    const members = []
    for (const key of Object.keys(value).sort()) {
      const encoded = stableJson(value[key])
      // undefined means "no JSON representation": drop the property entirely.
      if (encoded !== undefined) {
        members.push(`${JSON.stringify(key)}:${encoded}`)
      }
    }
    return `{${members.join(",")}}`
  }

  return JSON.stringify(value)
}

/**
 * Compute the hex-encoded SHA-256 digest of a value's key-sorted JSON form.
 *
 * @param {*} value - Value to fingerprint; serialized with stableJson.
 * @returns {string} Hex digest string.
 */
function digestFor(value) {
  const hasher = crypto.createHash("sha256")
  const canonicalText = stableJson(value)
  hasher.update(canonicalText)
  return hasher.digest("hex")
}

/**
 * Signed number of hours from instant `a` to instant `b`.
 *
 * @param {*} a - Start instant, anything accepted by the Date constructor.
 * @param {*} b - End instant, anything accepted by the Date constructor.
 * @returns {number} Positive when `b` is later than `a`, negative when earlier,
 *   NaN when either argument does not parse to a valid date.
 */
function hoursBetween(a, b) {
  const MILLISECONDS_PER_HOUR = 3600000
  const endMs = new Date(b).getTime()
  const startMs = new Date(a).getTime()
  const elapsedMs = endMs - startMs
  return elapsedMs / MILLISECONDS_PER_HOUR
}

/**
 * Build a finding record.
 *
 * @param {string} code - Finding identifier.
 * @param {string} severity - Severity label (callers in this file use "blocker" and "warning").
 * @param {string} message - Human-readable explanation.
 * @param {object} [detail={}] - Extra context attached to the finding.
 * @returns {{code: *, severity: *, message: *, detail: *}} The assembled record.
 */
function finding(code, severity, message, detail = {}) {
  const record = {}
  record.code = code
  record.severity = severity
  record.message = message
  record.detail = detail
  return record
}

/**
 * Build a scoring-action record.
 *
 * @param {string} code - Action identifier.
 * @param {string} owner - Role responsible for carrying out the action.
 * @param {string} message - Human-readable instruction.
 * @returns {{code: *, owner: *, message: *}} The assembled record.
 */
function action(code, owner, message) {
  const record = {}
  record.code = code
  record.owner = owner
  record.message = message
  return record
}

/**
 * Evaluate one challenge for benchmark holdout leakage and decide whether it
 * may proceed to scoring.
 *
 * Checks run in a fixed order: holdout hashes, release events, solver
 * workspace grants, public baselines, sponsor exceptions, submissions.
 * Any "blocker" finding yields status "quarantine"; otherwise any "warning"
 * yields "review"; otherwise the status is "score".
 *
 * Timestamp checks fail closed. When an event time or the submission deadline
 * cannot be parsed, the ordering cannot be proven, so the event is treated as
 * having happened before the deadline (or inside the freeze window) instead
 * of being silently skipped. Missing collections are treated as empty.
 *
 * @param {object} challenge - Challenge record. Reads `id`, `title`,
 *   `submissionDeadline` and the collections `holdoutArtifacts`,
 *   `releaseEvents`, `solverWorkspaceGrants`, `publicBaselines`,
 *   `sponsorExceptions` and `submissions`.
 * @param {object} policy - Leakage policy. Reads `minHoldoutHashLength`,
 *   `allowedReleaseChannels` and `freezeWindowHours`.
 * @returns {object} Decision with `id`, `title`, `status`, per-collection
 *   counts, `findings`, `scoringActions`, and an `auditDigest` computed over
 *   all of those fields.
 */
function evaluateChallenge(challenge, policy) {
  // Absent collections are evaluated as empty rather than crashing the guard.
  const holdoutArtifacts = challenge.holdoutArtifacts ?? []
  const releaseEvents = challenge.releaseEvents ?? []
  const solverWorkspaceGrants = challenge.solverWorkspaceGrants ?? []
  const publicBaselines = challenge.publicBaselines ?? []
  const sponsorExceptions = challenge.sponsorExceptions ?? []
  const submissions = challenge.submissions ?? []

  const findings = []
  const holdoutHashes = new Set()
  const holdoutPaths = new Set(holdoutArtifacts.map((artifact) => artifact.path))

  // Parse the deadline once; NaN marks an unparseable deadline.
  const deadlineMs = new Date(challenge.submissionDeadline).getTime()
  const isBeforeDeadline = (timestamp) => {
    const timestampMs = new Date(timestamp).getTime()
    // Comparisons against NaN are always false, which would let an
    // unparseable timestamp pass unnoticed; treat it as a violation instead.
    return Number.isNaN(timestampMs) || Number.isNaN(deadlineMs) || timestampMs < deadlineMs
  }

  for (const artifact of holdoutArtifacts) {
    if (!artifact.hash || artifact.hash.length < policy.minHoldoutHashLength) {
      findings.push(finding("HOLDOUT_HASH_MISSING", "blocker", "Holdout artifact needs a stable hash before scoring can start.", {
        artifactId: artifact.id,
        path: artifact.path,
      }))
      // Rejected hashes are not used for the overlap checks below.
      continue
    }
    holdoutHashes.add(artifact.hash)
  }

  for (const event of releaseEvents) {
    if (!policy.allowedReleaseChannels.includes(event.channel)) {
      findings.push(finding("HOLDOUT_RELEASED_TO_PUBLIC_CHANNEL", "blocker", "Holdout artifact was released through a non-scoring channel.", {
        artifactId: event.artifactId,
        channel: event.channel,
        at: event.at,
        actor: event.actor,
      }))
    }

    if (isBeforeDeadline(event.at)) {
      findings.push(finding("HOLDOUT_RELEASED_BEFORE_DEADLINE", "blocker", "Holdout release happened before the submission deadline.", {
        artifactId: event.artifactId,
        channel: event.channel,
        at: event.at,
        submissionDeadline: challenge.submissionDeadline,
      }))
    }
  }

  for (const grant of solverWorkspaceGrants) {
    // Only exact path matches against holdout artifact paths are detected.
    const leakedPaths = (grant.paths ?? []).filter((grantedPath) => holdoutPaths.has(grantedPath))
    if (leakedPaths.length > 0 && isBeforeDeadline(grant.grantedAt)) {
      findings.push(finding("SOLVER_WORKSPACE_HOLDOUT_GRANT", "blocker", "Solver workspace received holdout paths before the deadline.", {
        solverId: grant.solverId,
        leakedPaths,
        grantedAt: grant.grantedAt,
      }))
    }
  }

  for (const baseline of publicBaselines) {
    if (holdoutHashes.has(baseline.hash)) {
      findings.push(finding("PUBLIC_BASELINE_HASH_OVERLAP", "blocker", "A public baseline artifact matches the sealed holdout hash.", {
        baselineId: baseline.id,
        channel: baseline.channel,
        hash: baseline.hash,
      }))
    }
  }

  for (const exception of sponsorExceptions) {
    if (!exception.approved) {
      findings.push(finding("UNAPPROVED_SPONSOR_EXCEPTION", "warning", "Sponsor exception was logged but not approved before scoring.", {
        exceptionId: exception.id,
        artifactId: exception.artifactId,
        reason: exception.reason,
      }))
      continue
    }

    // A missing approval time falls back to the deadline itself (0 hours of lead time).
    const hoursToDeadline = hoursBetween(exception.approvedAt || challenge.submissionDeadline, challenge.submissionDeadline)
    // NaN means the approval time or deadline is unparseable: fail closed and request review.
    if (Number.isNaN(hoursToDeadline) || hoursToDeadline <= policy.freezeWindowHours) {
      findings.push(finding("LATE_SPONSOR_EXCEPTION_REVIEW", "warning", "Approved sponsor exception landed inside the benchmark freeze window.", {
        exceptionId: exception.id,
        artifactId: exception.artifactId,
        approvedAt: exception.approvedAt,
        freezeWindowHours: policy.freezeWindowHours,
      }))
    }
  }

  for (const submission of submissions) {
    const overlappingHashes = (submission.artifactHashes ?? []).filter((hash) => holdoutHashes.has(hash))
    if (overlappingHashes.length > 0) {
      findings.push(finding("SUBMISSION_CONTAINS_HOLDOUT_HASH", "blocker", "Submission package contains an artifact hash matching the sealed holdout.", {
        solverId: submission.solverId,
        submittedAt: submission.submittedAt,
        overlappingHashes,
      }))
    }
  }

  const hasBlocker = findings.some((item) => item.severity === "blocker")
  const hasWarning = findings.some((item) => item.severity === "warning")
  const status = hasBlocker ? "quarantine" : (hasWarning ? "review" : "score")

  const decision = {
    id: challenge.id,
    title: challenge.title,
    status,
    holdoutArtifacts: holdoutArtifacts.length,
    releaseEvents: releaseEvents.length,
    solverWorkspaceGrants: solverWorkspaceGrants.length,
    publicBaselines: publicBaselines.length,
    submissions: submissions.length,
    findings,
    scoringActions: buildScoringActions(status, findings),
  }

  // The digest covers every decision field above; it is appended afterwards
  // so that it does not hash itself.
  return {
    ...decision,
    auditDigest: digestFor(decision),
  }
}

/**
 * Translate a decision status and its findings into the list of follow-up actions.
 *
 * - "score": a single release-to-scoring action.
 * - otherwise: one action per finding (blockers go to challenge-ops, everything
 *   else to the sponsor reviewer), plus a leaderboard freeze when quarantined.
 * Actions sharing a code are collapsed: the first occurrence fixes the position
 * and the last occurrence supplies the record.
 *
 * @param {string} status - Decision status ("score", "review" or "quarantine").
 * @param {Array<object>} findings - Finding records with `code`, `severity` and `message`.
 * @returns {Array<object>} De-duplicated action records.
 */
function buildScoringActions(status, findings) {
  if (status === "score") {
    return [action("RELEASE_TO_SCORING", "scoring-admin", "Proceed with sealed evaluator scoring.")]
  }

  // A Map keyed by action code gives the de-duplication described above.
  const actionsByCode = new Map()
  const register = (entry) => actionsByCode.set(entry.code, entry)

  for (const item of findings) {
    const isBlocker = item.severity === "blocker"
    register(
      isBlocker
        ? action(`QUARANTINE_${item.code}`, "challenge-ops", item.message)
        : action(`REVIEW_${item.code}`, "sponsor-reviewer", item.message),
    )
  }

  if (status === "quarantine") {
    register(action("FREEZE_LEADERBOARD", "challenge-ops", "Freeze leaderboard publication until leakage review is resolved."))
  }

  return Array.from(actionsByCode.values())
}

/**
 * Evaluate every challenge in a portfolio and assemble the review packet.
 *
 * @param {object} input
 * @param {Array<object>} input.challenges - Challenge records, each passed to evaluateChallenge.
 * @param {object} input.leakagePolicy - Policy applied to every challenge and echoed in the packet.
 * @param {string} [input.generatedAt="2026-05-21T14:25:00.000Z"] - Timestamp
 *   recorded in the packet. The default is a fixed value, so the packet is
 *   identical across runs unless a caller supplies its own timestamp.
 * @returns {object} Packet with `generatedAt`, `policy`, `summary`,
 *   `decisions` and `audit`. `audit.digest` covers `summary` and `decisions`
 *   only, so it does not depend on `generatedAt`.
 */
function evaluateChallengePortfolio({ challenges, leakagePolicy, generatedAt = "2026-05-21T14:25:00.000Z" }) {
  const decisions = challenges.map((challenge) => evaluateChallenge(challenge, leakagePolicy))
  const countWithStatus = (status) => decisions.filter((decision) => decision.status === status).length

  const summary = {
    totalChallenges: decisions.length,
    score: countWithStatus("score"),
    review: countWithStatus("review"),
    quarantine: countWithStatus("quarantine"),
    scoringActions: decisions.reduce((sum, decision) => sum + decision.scoringActions.length, 0),
  }

  return {
    generatedAt,
    policy: leakagePolicy,
    summary,
    decisions,
    audit: {
      source: "synthetic-challenge-benchmark-leakage-review",
      digest: digestFor({ summary, decisions }),
    },
  }
}

// Public API: the single-challenge evaluator and the portfolio-level
// evaluator that aggregates its decisions into a review packet.
module.exports = {
evaluateChallenge,
evaluateChallengePortfolio,
}
12 changes: 12 additions & 0 deletions challenge-benchmark-holdout-leakage-guard/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"name": "challenge-benchmark-holdout-leakage-guard",
"version": "1.0.0",
"description": "Benchmark holdout leakage guard for SCIBASE scientific bounty challenges",
"private": true,
"scripts": {
"check": "node --check index.js && node --check sample-data.js && node --check test.js && node --check demo.js",
"test": "node test.js",
"demo": "node demo.js"
},
"license": "MIT"
}
Loading