Skip to content

Commit 71b4393

Browse files
committed
Refine crossref check and improve test cases
1 parent d48675c commit 71b4393

14 files changed

Lines changed: 220 additions & 93 deletions

File tree

.claude-plugin/marketplace.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
"path": "./"
1717
},
1818
"description": "A bibliography toolkit for LaTeX",
19-
"version": "1.5.1",
19+
"version": "1.6.0",
2020
"keywords": ["bibtex", "bibliography", "latex", "overleaf", "academic", "reference", "citation"],
2121
"category": "academic",
2222
"license": "MIT"

.claude-plugin/plugin.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"name": "bibtools",
33
"description": "A bibliography toolkit for LaTeX",
4-
"version": "1.5.1",
4+
"version": "1.6.0",
55
"author": {
66
"name": "Yunguan Fu"
77
},

docs/build.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -409,7 +409,7 @@ def build_html(cards_html: str) -> str:
409409
font-size: 0.75rem;
410410
}}
411411
412-
.diff-line {{ padding: 0 1rem; white-space: pre; }}
412+
.diff-line {{ padding: 0 1rem; white-space: pre; display: block; min-width: fit-content; }}
413413
.diff-line.del {{ background: var(--del-bg); color: var(--del-line); }}
414414
.diff-line.add {{ background: var(--add-bg); color: var(--add-line); }}
415415
.diff-line.ctx {{ color: var(--text-muted); }}

docs/index.html

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,7 @@
222222
font-size: 0.75rem;
223223
}
224224

225-
.diff-line { padding: 0 1rem; white-space: pre; }
225+
.diff-line { padding: 0 1rem; white-space: pre; display: block; min-width: fit-content; }
226226
.diff-line.del { background: var(--del-bg); color: var(--del-line); }
227227
.diff-line.add { background: var(--add-bg); color: var(--add-line); }
228228
.diff-line.ctx { color: var(--text-muted); }
@@ -402,7 +402,7 @@ <h2 class="section-title">Examples</h2>
402402
<div class="diff-card">
403403
<div class="diff-header">
404404
<span class="diff-title">Duplicate pair (bioRxiv preprint + published version)</span>
405-
<span class="stats"><span class="add-count">+1</span></span>
405+
<span class="stats"><span class="add-count">+4</span> <span class="del-count">-2</span></span>
406406
<span class="diff-badge badge-duplicate">duplicate detected</span>
407407
</div>
408408
<div class="diff-body">
@@ -418,12 +418,15 @@ <h2 class="section-title">Examples</h2>
418418
<div class="diff-line ctx"> </div>
419419
<div class="diff-line ctx"> @article{watson2023novo,</div>
420420
<div class="diff-line ctx"> title={De novo design of protein structure and function with RFdiffusion},</div>
421-
<div class="diff-line ctx"> author={Watson, Joseph L and Juergens, David and Bennett, Nathaniel R and Trippe, Brian L and Yim, Jason and Eisenach, Helen E and Ahern, Woody and Borst, Andrew J and Ragotte, Robert J and Milles, Lukas F and others},</div>
421+
<div class="diff-line del">- author={Watson, Joseph L and Juergens, David and Bennett, Nathaniel R and Trippe, Brian L and Yim, Jason and Eisenach, Helen E and Ahern, Woody and Borst, Andrew J and Ragotte, Robert J and Milles, Lukas F and others},</div>
422+
<div class="diff-line add">+ author={Watson, Joseph L. and Juergens, David and Bennett, Nathaniel R. and Trippe, Brian L. and Yim, Jason and Eisenach, Helen E. and Ahern, Woody and Borst, Andrew J. and Ragotte, Robert J. and Milles, Lukas F. and Wicky, Basile I. M. and Hanikel, Nikita and Pellock, Samuel J. and Courbet, Alexis and Sheffler, William and Wang, Jue and Venkatesh, Preetham and Sappington, Isaac and Torres, Susana Vázquez and Lauko, Anna and De Bortoli, Valentin and Mathieu, Emile and Ovchinnikov, Sergey and Barzilay, Regina and Jaakkola, Tommi S. and DiMaio, Frank and Baek, Minkyung and Baker, David},</div>
422423
<div class="diff-line ctx"> journal={Nature},</div>
423424
<div class="diff-line ctx"> volume={620},</div>
424425
<div class="diff-line ctx"> pages={1089--1100},</div>
425426
<div class="diff-line ctx"> year={2023},</div>
426-
<div class="diff-line ctx"> publisher={Nature Publishing Group UK London}</div>
427+
<div class="diff-line del">- publisher={Nature Publishing Group UK London}</div>
428+
<div class="diff-line add">+ publisher={Nature Publishing Group UK London},</div>
429+
<div class="diff-line add">+ number={7976}</div>
427430
<div class="diff-line ctx"> }</div>
428431
</div>
429432
</div>

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "bibtools"
3-
version = "1.5.1"
3+
version = "1.6.0"
44
description = "A bibliography toolkit for LaTeX, built as agent skills"
55
requires-python = ">=3.10"
66
license = "MIT"

skills/bibtidy/SKILL.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ Assume standard brace-style BibTeX entries like `@article{...}`. Parenthesized B
2727
| **Field comparison** | `python3 $TOOLS_DIR/compare.py <file.bib> [--key KEY]` |
2828
| CrossRef DOI lookup | `python3 $TOOLS_DIR/crossref.py doi <DOI>` |
2929
| CrossRef title search | `python3 $TOOLS_DIR/crossref.py search "<title>"` |
30+
| CrossRef bibliographic search | `python3 $TOOLS_DIR/crossref.py bibliographic "<query>"` |
3031
| Duplicate detection | `python3 $TOOLS_DIR/duplicates.py <file.bib>` |
3132
| **Apply edits** | `python3 $TOOLS_DIR/edit.py <file.bib> <patches.json>` |
3233
| Web verification | web search (preferred) or CrossRef scripts (fallback) |
@@ -84,10 +85,10 @@ For unchanged entries, do NOT add any comments or URLs.
8485
2. Back up for format validation: `cp <file>.bib <file>.bib.orig`
8586
3. Preserve `@string`, `@preamble`, `@comment` blocks verbatim
8687
4. Run duplicate detection: `python3 $TOOLS_DIR/duplicates.py <file.bib>`
87-
5. **Run field comparison**: `python3 $TOOLS_DIR/compare.py <file.bib>` — this programmatically compares every entry against CrossRef and returns exact field-level mismatches. Do NOT skip this step or rely on visual comparison alone. The output is a JSON list; each element has `key`, `versions` (a list of CrossRef matches, each with `mismatches`, `url`, `doi`, etc.), and `error`. **Skip rule**: if an entry has zero mismatches across all versions and no error in the compare.py output, skip it entirely — do NOT investigate, modify, or add comments to it. Only proceed with entries that compare.py flagged (mismatches, errors, or duplicates from step 4).
88-
6. **Verify every planned modification with web search** — for entries that compare.py flagged with mismatches or errors, and for entries flagged as duplicates, verify the planned action via web search. For `fix` patches, gather one or more source URLs. Entries where `compare.py` returned an error (e.g. "No exact title match") still need full verification — the verification agent should search for the paper and check all fields. **Important: verification agents MUST NOT override `compare.py` field values.** CrossRef is the authoritative source for metadata (pages, volume, number, etc.) because it receives data directly from publishers via DOI registration. When web search finds a conflicting value (e.g. different page numbers on a conference website), always use the CrossRef value and add `% bibtidy: REVIEW` if desired — but do NOT keep the old value.
88+
5. **Run field comparison**: `python3 $TOOLS_DIR/compare.py <file.bib>` — this programmatically compares every entry against CrossRef and returns exact field-level mismatches. Do NOT skip this step or rely on visual comparison alone. The output is a JSON list; each element has `key`, `versions` (a list of alternative CrossRef candidate matches for the same entry, each with `mismatches`, `url`, `doi`, etc.), and `error`. When multiple versions are returned, choose the best matching candidate; do not combine fields from different versions. **Skip rule**: if an entry has zero mismatches across all versions and no error in the compare.py output, skip it entirely — do NOT investigate, modify, or add comments to it. Only proceed with entries that compare.py flagged (mismatches, errors, or duplicates from step 4).
89+
6. **Verify every planned modification with web search** — for entries that compare.py flagged with mismatches or errors, and for entries flagged as duplicates, verify the planned action via web search. For `fix` patches, gather one or more source URLs. Entries where `compare.py` returned an error (e.g. "No exact title match") still need full verification — the verification agent should search for the paper and check all fields. **Important: after selecting the best-matching version, verification agents MUST NOT override that selected version's `compare.py` field values.** CrossRef is the authoritative source for metadata (pages, volume, number, etc.) because it receives data directly from publishers via DOI registration. When web search finds a conflicting value (e.g. different page numbers on a conference website), always use the CrossRef value and add `% bibtidy: REVIEW` if desired — but do NOT keep the old value.
8990
7. **Flag hallucinated/non-existent references** — if compare.py returned an error (e.g. "No CrossRef results found" or "No exact title match in CrossRef results") AND web search also finds no matching paper, the reference likely does not exist. Add `% bibtidy: NOT FOUND — no matching paper on CrossRef or web search; verify this reference exists` above the entry, then comment out the entire entry (prefix every line with `% `). Do NOT add a URL line.
90-
8. Apply fixes **sequentially** using `edit.py` — do NOT edit the .bib file directly with agent editing tools (for example, Claude Code Edit or Codex `apply_patch`), and do NOT rewrite the entire file. Build a patches.json for each entry (or batch) and run `python3 $TOOLS_DIR/edit.py <file.bib> <patches.json>`. This ensures the commented original, source URLs, and explanation are always included. You MUST apply **every** mismatch reported by `compare.py` — do not skip any field (including `number`, `pages`, `volume`). Use the `crossref_value` exactly as given (do NOT rephrase, reformat, or partially apply it). For title mismatches on preprint→published upgrades, replace the entire title with the CrossRef title — do NOT try to edit parts of the old title. Never reject a CrossRef value because another source disagrees. Every patch MUST include `urls` (list of source URLs) and `explanation` (what changed and why). Include the CrossRef URL from compare.py's `url` field when available, plus any other authoritative source (DOI URL, venue page) found via web search.
91+
8. Apply fixes **sequentially** using `edit.py` — do NOT edit the .bib file directly with agent editing tools (for example, Claude Code Edit or Codex `apply_patch`), and do NOT rewrite the entire file. Build a patches.json for each entry (or batch) and run `python3 $TOOLS_DIR/edit.py <file.bib> <patches.json>`. This ensures the commented original, source URLs, and explanation are always included. After selecting the correct version, you MUST apply **every** mismatch from that selected version — do not skip any field (including `number`, `pages`, `volume`). Use the `crossref_value` exactly as given (do NOT rephrase, reformat, or partially apply it). For title mismatches on preprint→published upgrades, replace the entire title with the CrossRef title — do NOT try to edit parts of the old title. Never reject a CrossRef value because another source disagrees. Every patch MUST include `urls` (list of source URLs) and `explanation` (what changed and why). Include the CrossRef URL from compare.py's `url` field when available, plus any other authoritative source (DOI URL, venue page) found via web search.
9192
9. Run format validation; fix violations and re-run until clean
9293
10. Delete backup: `rm <file>.bib.orig`
9394
11. Print a Markdown summary table with headers `Metric | Count` and exactly these rows: total entries, verified, fixed, not found. Do NOT include a separate "needs manual review" row.
@@ -103,7 +104,7 @@ Use subagents, when available, to verify multiple entries concurrently. This dra
103104

104105
**When CrossRef fails**, find the paper's official venue page via web search. Many venues (JMLR, NeurIPS, CVPR, etc.) provide a downloadable `.bib` file — fetch it directly when possible. An official `.bib` is the most reliable source: it has exact title, authors, volume, number, and pages with no guessing.
105106

106-
Launch verification subagents in one batch so they run concurrently. Group into batches of ~10 if there are many entries.
107+
Launch verification subagents in one batch so they run concurrently. Cap at **6 subagents** and distribute entries evenly across them (e.g., 18 entries = 3 per subagent, 60 entries = 10 per subagent). For ≤6 entries, use one subagent per entry. If the user explicitly requests more parallelism, you may increase beyond 6.
107108

108109
**Step 2 — Collect results:** Read each agent's returned summary.
109110

@@ -145,7 +146,7 @@ For each `@article`, `@inproceedings`, `@book`, etc.:
145146

146147
**1. Verify existence** — Search for `"<title>" <first author last name>`. If not found: `% bibtidy: NOT FOUND — verify manually`
147148

148-
**2. Cross-check metadata** — If DOI exists, fetch via `crossref.py doi <DOI>`. Otherwise `crossref.py search "<title>"`. Compare title, year, authors, journal, volume, pages.
149+
**2. Cross-check metadata** — Always search via `crossref.py search "<title>"`. If DOI exists, also fetch via `crossref.py doi <DOI>`. If neither finds a match, fall back to `crossref.py bibliographic "<title>"`. Compare title, year, authors, journal, volume, number, pages.
149150

150151
**3. Check for published preprints** — If journal contains "arxiv"/"biorxiv"/"chemrxiv", search for published version. Update title, venue, year, volume, pages, entry type. Only update if confirmed via DOI or two independent sources.
151152

skills/bibtidy/tools/compare.py

Lines changed: 36 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
import re
1818
import sys
1919

20-
from crossref import fetch_doi, search_title
20+
from crossref import fetch_doi, search_bibliographic, search_title
2121
from duplicates import normalize_doi, normalize_title, parse_bib_entries, split_bibtex_authors
2222

2323

@@ -179,34 +179,44 @@ def lookup_and_compare(entry: dict, timeout: int = 10) -> dict:
179179
key = entry["key"]
180180
result = {"key": key, "versions": [], "error": None}
181181

182-
# Try DOI first, then title search
182+
title = entry.get("title", "")
183183
doi = entry.get("doi", "").strip()
184-
if doi:
185-
doi = normalize_doi(doi)
186-
cr = fetch_doi(doi, timeout=timeout)
187-
else:
188-
title = entry.get("title", "")
189-
if not title:
190-
result["error"] = "No DOI or title to search"
191-
return result
192-
cr = search_title(title, rows=3, timeout=timeout)
193-
194-
if "error" in cr:
195-
result["error"] = cr["error"]
184+
if not title and not doi:
185+
result["error"] = "No DOI or title to search"
196186
return result
197187

198-
if "results" in cr:
199-
items = cr["results"]
200-
if not items:
201-
result["error"] = "No CrossRef results found"
202-
return result
203-
bib_title_norm = normalize_title(entry.get("title", ""))
204-
matches = [item for item in items if normalize_title(item.get("title") or "") == bib_title_norm]
205-
if not matches:
206-
result["error"] = "No exact title match in CrossRef results"
207-
return result
208-
else:
209-
matches = [cr]
188+
# Collect CrossRef results from multiple strategies.
189+
matches = []
190+
last_error = None
191+
bib_title_norm = normalize_title(title)
192+
193+
def _search_and_filter(search_fn, query):
194+
nonlocal last_error
195+
cr = search_fn(query, rows=3, timeout=timeout)
196+
if "error" in cr:
197+
last_error = cr["error"]
198+
return []
199+
return [item for item in cr.get("results", [])
200+
if normalize_title(item.get("title") or "") == bib_title_norm]
201+
202+
if title:
203+
matches = _search_and_filter(search_title, title)
204+
205+
if doi:
206+
cr = fetch_doi(normalize_doi(doi), timeout=timeout)
207+
if "error" in cr:
208+
last_error = cr["error"]
209+
else:
210+
existing_dois = {m.get("doi") for m in matches}
211+
if cr.get("doi") not in existing_dois:
212+
matches.append(cr)
213+
214+
if not matches and title:
215+
matches = _search_and_filter(search_bibliographic, title)
216+
217+
if not matches:
218+
result["error"] = last_error or "No exact title match in CrossRef results"
219+
return result
210220

211221
result["versions"] = [
212222
{

skills/bibtidy/tools/crossref.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@
22
"""CrossRef API utilities for BibTeX validation.
33
44
Usage:
5-
python3 crossref.py doi <DOI> — fetch metadata for a specific DOI
6-
python3 crossref.py search "<title>" — search by title, return top 3 results
5+
python3 crossref.py doi <DOI> — fetch metadata for a specific DOI
6+
python3 crossref.py search "<title>" — search by title, return top 3 results
7+
python3 crossref.py bibliographic "<query>" — broad bibliographic search, return top 3 results
78
89
Options:
910
--timeout SECONDS HTTP timeout (default: 10)
@@ -132,9 +133,9 @@ def fetch_doi(doi: str, timeout: int = 10) -> dict:
132133
return {"error": f"Malformed response from CrossRef: {exc}"}
133134

134135

135-
def search_title(title: str, rows: int = 3, timeout: int = 10) -> dict:
136-
"""Search CrossRef by title, returning up to `rows` results."""
137-
params = urllib.parse.urlencode({"query.bibliographic": title, "rows": rows, "mailto": MAILTO})
136+
def _search(param_name: str, query: str, rows: int, timeout: int) -> dict:
137+
"""Search CrossRef works by a given query parameter."""
138+
params = urllib.parse.urlencode({param_name: query, "rows": rows, "mailto": MAILTO})
138139
url = f"{CROSSREF_API}/works?{params}"
139140
result = _safe_fetch(url, timeout)
140141
if "error" in result:
@@ -146,6 +147,16 @@ def search_title(title: str, rows: int = 3, timeout: int = 10) -> dict:
146147
return {"error": f"Malformed response from CrossRef: {exc}"}
147148

148149

150+
def search_title(title: str, rows: int = 3, timeout: int = 10) -> dict:
151+
"""Search CrossRef by title, returning up to `rows` results."""
152+
return _search("query.title", title, rows, timeout)
153+
154+
155+
def search_bibliographic(query: str, rows: int = 3, timeout: int = 10) -> dict:
156+
"""Search CrossRef by general bibliographic query, returning up to `rows` results."""
157+
return _search("query.bibliographic", query, rows, timeout)
158+
159+
149160
def main() -> None:
150161
parser = argparse.ArgumentParser(description="CrossRef API utilities for BibTeX validation")
151162
parser.add_argument("--timeout", type=int, default=10, help="HTTP request timeout in seconds")
@@ -157,12 +168,17 @@ def main() -> None:
157168
search_parser = subparsers.add_parser("search", help="Search by title")
158169
search_parser.add_argument("title", help="Title string to search for")
159170

171+
bib_parser = subparsers.add_parser("bibliographic", help="Search by bibliographic query")
172+
bib_parser.add_argument("query", help="Bibliographic query string")
173+
160174
args = parser.parse_args()
161175

162176
if args.command == "doi":
163177
result = fetch_doi(args.doi_value, timeout=args.timeout)
164178
elif args.command == "search":
165179
result = search_title(args.title, timeout=args.timeout)
180+
elif args.command == "bibliographic":
181+
result = search_bibliographic(args.query, timeout=args.timeout)
166182
else:
167183
parser.print_help()
168184
sys.exit(1)

0 commit comments

Comments
 (0)