Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
429 changes: 330 additions & 99 deletions reproducibility/site/scripts/build-data.ts

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions reproducibility/site/src/layouts/Default.astro
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ const navLinks = [
{ label: "Datasets", href: "/datasets/" },
{ label: "Methods", href: "/methods/" },
{ label: "Models", href: "/models/" },
{ label: "Retrievers", href: "/retrievers/" },
{ label: "Cite", href: "/cite/" },
{ label: "About", href: "/about" },
{ label: "Toolkit", href: "https://querygym.com", external: true, newTab: true },
Expand Down
8 changes: 3 additions & 5 deletions reproducibility/site/src/pages/datasets/[id].astro
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ const metricCols: string[] = view?.metric_columns ?? datasetMeta?.eval_metrics ?
<tr class="border-b border-qg-border text-left">
<th class="px-3 py-2">Method</th>
<th class="px-3 py-2">Model</th>
<th class="px-3 py-2">Params</th>
<th class="px-3 py-2">Retriever</th>
{metricCols.map((m) => (
<th class="px-3 py-2 text-right qg-mono text-xs">{m}</th>
))}
Expand All @@ -63,11 +63,9 @@ const metricCols: string[] = view?.metric_columns ?? datasetMeta?.eval_metrics ?
<tbody>
{runs.map((r: any) => (
<tr class="border-b border-qg-border/60 hover:bg-qg-bg-soft">
<td class="px-3 py-2 font-medium">{r.method_id}</td>
<td class="px-3 py-2 font-medium">{r.method_display ?? r.method_id}</td>
<td class="px-3 py-2 qg-mono text-xs">{r.model}</td>
<td class="px-3 py-2 qg-mono text-xs text-qg-fg-muted">
{r.method_params_json}
</td>
<td class="px-3 py-2 text-xs">{r.retriever_display ?? r.retriever_id}</td>
{metricCols.map((m) => (
<td class="px-3 py-2 text-right">
<MetricCell value={r.metrics?.[m]} best={r.best_for?.[m]} />
Expand Down
232 changes: 185 additions & 47 deletions reproducibility/site/src/pages/index.astro
Original file line number Diff line number Diff line change
@@ -1,68 +1,206 @@
---
import Default from "../layouts/Default.astro";
import EmptyState from "../components/EmptyState.astro";
import Stat from "../components/Stat.astro";
import overview from "../data/overview.json";
import datasets from "../data/datasets.json";
import matrix from "../data/matrix.json";
import retrievers from "../data/retrievers.json";
import models from "../data/models.json";

const populated = overview.run_count > 0;
const featured = datasets.filter((d: any) => d.run_count > 0).slice(0, 6);

// Short column-header labels for the matrix, keyed by dataset id.
// Lookups in the header fall back to the dataset's `name` when an id has no
// entry here, so a missing key degrades to a longer label rather than a blank.
const SHORT: Record<string, string> = {
"msmarco-v1-passage.trecdl2019": "DL 2019",
"msmarco-v1-passage.trecdl2020": "DL 2020",
"msmarco-v1-passage.dlhard": "DL-HARD",
"beir-v1.0.0-scifact": "SciFact",
"beir-v1.0.0-arguana": "ArguAna",
"beir-v1.0.0-trec-covid": "COVID",
"beir-v1.0.0-fiqa": "FiQA",
"beir-v1.0.0-dbpedia-entity": "DBPedia",
"beir-v1.0.0-trec-news": "News",
};

// Display labels for metric keys shown next to each dataset column header.
// Keys without an entry are rendered as the raw metric key by the header.
const METRIC_LABEL: Record<string, string> = {
ndcg_cut_10: "nDCG@10",
recall_1000: "R@1k",
recall_100: "R@100",
};

// Matrix rows in display order: grouped by method id, then model, then
// retriever id. A copy is sorted so the imported `matrix.rows` stays untouched.
const rows = Array.from(matrix.rows).sort((left: any, right: any) => {
  // The first key on which the two rows differ decides the order.
  for (const field of ["method_id", "model"]) {
    if (left[field] !== right[field]) {
      return left[field].localeCompare(right[field]);
    }
  }
  // Method and model are equal: retriever id is the final tie-breaker.
  return left.retriever_id.localeCompare(right.retriever_id);
});

const datasetCols = matrix.dataset_columns;
---

<Default
title="Home"
description="QueryGym leaderboard — query reformulation methods × LLMs across IR benchmarks."
title="Leaderboard"
description="QueryGym reproducibility leaderboard — query reformulation methods × LLMs × retrievers across IR benchmarks."
>
<section class="mb-10">
<section class="mb-8">
<h1 class="text-3xl font-bold md:text-4xl">QueryGym Leaderboard</h1>
<p class="mt-3 max-w-3xl text-qg-fg-muted">
Reproducible LLM-based query reformulation results across BEIR, MS MARCO,
and TREC DL benchmarks. Every row is backed by a committed JSON, a TREC
run file, and the reformulated queries — verifiable from a fresh clone.
Reproducible LLM-based query reformulation results across MS MARCO DL,
DL-HARD, and BEIR — for BM25, SPLADE++, and BGE retrievers. Click any
score to see how to reproduce that run.
</p>
<div class="mt-4 flex flex-wrap gap-3 text-sm">
<a
href="/datasets/"
class="rounded-md bg-qg-accent px-4 py-2 font-medium text-white hover:opacity-90"
>Browse datasets</a
>
<a
href="/about"
class="rounded-md border border-qg-border px-4 py-2 font-medium hover:bg-qg-bg-soft"
>How to submit</a
>
</div>
</section>

<section class="mb-10 grid grid-cols-2 gap-4 md:grid-cols-4">
<section class="mb-6 grid grid-cols-2 gap-4 md:grid-cols-5">
<Stat label="Runs" value={overview.run_count} />
<Stat label="Datasets" value={overview.dataset_count} />
<Stat label="Methods" value={overview.method_count} />
<Stat label="LLMs" value={overview.model_count} />
<Stat label="Retrievers" value={overview.retriever_count} />
<Stat label="Datasets" value={overview.dataset_count} />
</section>

<section>
<h2 class="mb-4 text-xl font-semibold">Datasets with results</h2>
{
populated ? (
<ul class="grid gap-3 md:grid-cols-2 lg:grid-cols-3">
{featured.map((d: any) => (
<li class="rounded-lg border border-qg-border bg-qg-bg-soft p-4 hover:border-qg-accent">
<a href={`/datasets/${d.id}`} class="block">
<div class="font-semibold">{d.name}</div>
<div class="mt-1 text-xs text-qg-fg-muted">
{d.run_count} run{d.run_count === 1 ? "" : "s"} · {d.eval_metrics.join(", ")}
</div>
</a>
</li>
))}
</ul>
) : (
<EmptyState
title="No SIGIR runs landed yet"
body="The schema is locked and the pipeline is live. Once the SIGIR backfill PR lands, results appear here automatically."
/>
)
}
</section>
{
populated && (
<section class="mb-4">
<div class="flex flex-wrap items-center gap-3 text-sm">
<span class="text-qg-fg-muted">Retriever:</span>
<div id="qg-filter-retriever" class="flex flex-wrap gap-1.5">
<button data-value="" class="qg-chip qg-chip-active">All</button>
{retrievers.map((r: any) => (
<button data-value={r.id} class="qg-chip">{r.display_name}</button>
))}
</div>
<span class="ml-4 text-qg-fg-muted">Model:</span>
<div id="qg-filter-model" class="flex flex-wrap gap-1.5">
<button data-value="" class="qg-chip qg-chip-active">All</button>
{models.map((m: any) => (
<button data-value={m.id} class="qg-chip">{m.id}</button>
))}
</div>
<span class="ml-4 text-qg-fg-muted">Metric:</span>
<div id="qg-filter-metric" class="flex flex-wrap gap-1.5">
<button data-value="primary" class="qg-chip qg-chip-active">nDCG@10</button>
<button data-value="secondary" class="qg-chip">Recall</button>
</div>
</div>
</section>
)
}

{
populated ? (
<section class="overflow-x-auto rounded border border-qg-border">
<table id="qg-matrix" class="w-full text-sm">
<thead class="bg-qg-bg-soft text-xs uppercase tracking-wide text-qg-fg-muted">
<tr>
<th class="px-3 py-2 text-left">Method</th>
<th class="px-3 py-2 text-left">Model</th>
<th class="px-3 py-2 text-left">Retriever</th>
{datasetCols.map((d: any) => (
<th
class="qg-mono px-3 py-2 text-right text-xs"
title={d.id}
>
<span class="qg-col-label-primary">
{SHORT[d.id] ?? d.name}
<span class="text-qg-fg-muted"> / {METRIC_LABEL[d.primary_metric] ?? d.primary_metric}</span>
</span>
<span class="qg-col-label-secondary hidden">
{SHORT[d.id] ?? d.name}
<span class="text-qg-fg-muted"> / {METRIC_LABEL[d.secondary_metric] ?? d.secondary_metric}</span>
</span>
</th>
))}
</tr>
</thead>
<tbody>
{rows.map((row: any) => (
<tr
class="border-t border-qg-border/60 hover:bg-qg-bg-soft qg-matrix-row"
data-method={row.method_id}
data-model={row.model}
data-retriever={row.retriever_id}
>
<td class="px-3 py-2 font-medium">{row.method_display ?? row.method_id}</td>
<td class="px-3 py-2 qg-mono text-xs">{row.model}</td>
<td class="px-3 py-2 text-xs">{row.retriever_display ?? row.retriever_id}</td>
{datasetCols.map((d: any) => {
const cell = row.values?.[d.id] ?? {};
const runId = row.run_ids?.[d.id];
const primary = cell[d.primary_metric];
const secondary = d.secondary_metric ? cell[d.secondary_metric] : null;
return (
<td class="qg-mono px-3 py-2 text-right tabular-nums">
{runId ? (
<a class="hover:text-qg-accent hover:underline" href={`/runs/${runId}`} title="View run + reproduce">
<span class={`qg-cell-primary ${primary?.best ? "font-bold text-qg-accent" : ""}`}>
{primary !== undefined ? primary.value.toFixed(3) : "—"}
</span>
{secondary && (
<span class={`qg-cell-secondary hidden ${secondary.best ? "font-bold text-qg-accent" : ""}`}>
{secondary.value.toFixed(3)}
</span>
)}
</a>
) : (
<span class="text-qg-fg-muted">—</span>
)}
</td>
);
})}
</tr>
))}
</tbody>
</table>
</section>
) : (
<div class="mt-8 rounded-lg border border-qg-border bg-qg-bg-soft p-6 text-qg-fg-muted">
No runs yet. The matrix will populate when results land.
</div>
)
}
</Default>

<style>
.qg-chip {
@apply rounded-full border border-qg-border bg-qg-bg-soft px-3 py-1 text-xs font-medium text-qg-fg-muted hover:border-qg-accent;
}
.qg-chip-active {
@apply border-qg-accent bg-qg-accent text-white hover:border-qg-accent;
}
</style>

<script>
const tbody = document.querySelector<HTMLTableSectionElement>("#qg-matrix tbody");
if (tbody) {
const filters = { retriever: "", model: "", metric: "primary" };

/**
 * Re-applies the current `filters` state to the matrix.
 *
 * - Rows whose `data-retriever` / `data-model` do not match the selected
 *   filter are hidden; an empty filter value matches every row.
 * - Header labels and cell values for the primary and secondary metric are
 *   swapped by toggling the `hidden` class, depending on `filters.metric`.
 */
function applyFilters() {
  // Function declarations are hoisted, so TypeScript does not carry the
  // enclosing `if (tbody)` narrowing into this body. Guard locally so the
  // access below type-checks under `strictNullChecks`.
  if (!tbody) return;

  const { retriever, model, metric } = filters;

  tbody.querySelectorAll<HTMLTableRowElement>(".qg-matrix-row").forEach((tr) => {
    const retrieverOk = !retriever || tr.dataset.retriever === retriever;
    const modelOk = !model || tr.dataset.model === model;
    // Resetting to "" restores the stylesheet's default display for the row.
    tr.style.display = retrieverOk && modelOk ? "" : "none";
  });

  // Adds or removes `hidden` on every element matching `selector`.
  const setHidden = (selector: string, hidden: boolean) => {
    document.querySelectorAll(selector).forEach((el) => el.classList.toggle("hidden", hidden));
  };

  const primaryShown = metric === "primary";
  setHidden(".qg-col-label-primary", !primaryShown);
  setHidden(".qg-col-label-secondary", primaryShown);
  setHidden(".qg-cell-primary", !primaryShown);
  setHidden(".qg-cell-secondary", primaryShown);
}

/**
 * Wires one chip group so that clicking a chip selects it, stores its
 * `data-value` under `filters[key]`, and re-applies the filters.
 *
 * The selected chip is marked with the `qg-chip-active` class and, so that
 * assistive technology can tell which filter is active, with
 * `aria-pressed="true"`; all other chips in the group get `aria-pressed="false"`.
 *
 * @param groupId - id of the element containing the chip `<button>`s.
 * @param key - which entry of `filters` this group controls.
 */
function wireGroup(groupId: string, key: "retriever" | "model" | "metric") {
  const group = document.getElementById(groupId);
  if (!group) return;

  const chips = Array.from(group.querySelectorAll<HTMLButtonElement>("button"));

  // Keeps the visual class and the ARIA state in sync for the whole group.
  const select = (active: HTMLButtonElement | null) => {
    chips.forEach((chip) => {
      const isActive = chip === active;
      chip.classList.toggle("qg-chip-active", isActive);
      chip.setAttribute("aria-pressed", String(isActive));
    });
  };

  // Expose the server-rendered initial selection to assistive technology
  // without changing which chip is visually active.
  chips.forEach((chip) => {
    chip.setAttribute("aria-pressed", String(chip.classList.contains("qg-chip-active")));
  });

  chips.forEach((chip) => {
    chip.addEventListener("click", () => {
      select(chip);
      // A missing data-value is treated as "no filter".
      filters[key] = chip.dataset.value ?? "";
      applyFilters();
    });
  });
}

wireGroup("qg-filter-retriever", "retriever");
wireGroup("qg-filter-model", "model");
wireGroup("qg-filter-metric", "metric");
}
</script>
101 changes: 101 additions & 0 deletions reproducibility/site/src/pages/methods/[id].astro
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
---
import Default from "../../layouts/Default.astro";
import EmptyState from "../../components/EmptyState.astro";
import methods from "../../data/methods.json";

// All per-method view shards (`method-*.json`), keyed by module path.
// `eager: true` makes the glob resolve to already-loaded modules, which is
// what lets `shardFor` read `.default` synchronously.
const shards = import.meta.glob<{ default: any }>(
"../../data/views/method-*.json",
{ eager: true },
);
/**
 * Looks up the view shard for a method.
 *
 * @param id - method id; matched against the file name `method-<id>.json`.
 * @returns the shard's default export, or `null` when no shard file matches.
 */
function shardFor(id: string): any | null {
  const suffix = `/method-${id}.json`;
  for (const path of Object.keys(shards)) {
    // The leading "/" in the suffix anchors the match to the whole file name.
    if (path.endsWith(suffix)) {
      return shards[path].default;
    }
  }
  return null;
}

/**
 * Enumerates the pages to generate: one `{ params: { id } }` entry per
 * method listed in `methods.json`.
 */
export async function getStaticPaths() {
  const { default: methodList } = await import("../../data/methods.json");
  return (methodList as any[]).map((method) => {
    return { params: { id: method.id } };
  });
}

const { id } = Astro.params;
const view = shardFor(id!);
const meta = methods.find((m: any) => m.id === id);
const rows = view?.rows ?? [];

// Short column-header labels keyed by dataset id. The table header falls
// back to the dataset's `name` for ids that have no entry here.
const SHORT: Record<string, string> = {
"msmarco-v1-passage.trecdl2019": "DL 2019",
"msmarco-v1-passage.trecdl2020": "DL 2020",
"msmarco-v1-passage.dlhard": "DL-HARD",
"beir-v1.0.0-scifact": "SciFact",
"beir-v1.0.0-arguana": "ArguAna",
"beir-v1.0.0-trec-covid": "COVID",
"beir-v1.0.0-fiqa": "FiQA",
"beir-v1.0.0-dbpedia-entity": "DBPedia",
"beir-v1.0.0-trec-news": "News",
};
// Display labels for metric keys; the header shows the raw key when a metric
// has no entry here.
const METRIC_LABEL: Record<string, string> = {
ndcg_cut_10: "nDCG@10", recall_1000: "R@1k", recall_100: "R@100",
};

const datasetCols = (await import("../../data/matrix.json")).default.dataset_columns;

const title = view?.method_display ?? meta?.display ?? id ?? "Method";
---

<Default title={title} description={`Per-method leaderboard for ${title}.`}>
<a href="/methods/" class="text-sm text-qg-fg-muted hover:text-qg-fg">← All methods</a>
<h1 class="mt-2 text-2xl font-bold md:text-3xl">{title}</h1>
<div class="mt-1 qg-mono text-sm text-qg-fg-muted">{id}</div>
<div class="mt-1 text-sm text-qg-fg-muted">{rows.length} model × retriever combinations</div>

{
rows.length === 0 ? (
<div class="mt-8">
<EmptyState title="No runs for this method yet" body="" />
</div>
) : (
<section class="mt-6 overflow-x-auto rounded border border-qg-border">
<table class="w-full text-sm">
<thead class="bg-qg-bg-soft text-xs uppercase tracking-wide text-qg-fg-muted">
<tr>
<th class="px-3 py-2 text-left">Model</th>
<th class="px-3 py-2 text-left">Retriever</th>
{datasetCols.map((d: any) => (
<th class="qg-mono px-3 py-2 text-right text-xs" title={d.id}>
{SHORT[d.id] ?? d.name}
<span class="text-qg-fg-muted"> / {METRIC_LABEL[d.primary_metric] ?? d.primary_metric}</span>
</th>
))}
</tr>
</thead>
<tbody>
{rows.map((row: any) => (
<tr class="border-t border-qg-border/60 hover:bg-qg-bg-soft">
<td class="px-3 py-2 qg-mono text-xs">{row.model}</td>
<td class="px-3 py-2 text-xs">{row.retriever_display ?? row.retriever_id}</td>
{datasetCols.map((d: any) => {
const cell = row.values?.[d.id] ?? {};
const runId = row.run_ids?.[d.id];
const primary = cell[d.primary_metric];
return (
<td class="qg-mono px-3 py-2 text-right tabular-nums">
{runId && primary !== undefined ? (
<a class="hover:text-qg-accent hover:underline" href={`/runs/${runId}`}>
<span class={primary.best ? "font-bold text-qg-accent" : ""}>
{primary.value.toFixed(3)}
</span>
</a>
) : (
<span class="text-qg-fg-muted">—</span>
)}
</td>
);
})}
</tr>
))}
</tbody>
</table>
</section>
)
}
</Default>
Loading
Loading