Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apps/web/src/lib/github.ts
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ export function sanitizeExcerpt(html: string): string {
/**
 * Format a `pl-*` tag name into a human-readable label.
 *
 * Only a leading `pl-` prefix is rewritten (to `PL `). Any remaining hyphens
 * are kept as-is, and tags that do not start with `pl-` are returned unchanged.
 *
 * @param tag - Tag name such as `"pl-113-100"`.
 * @returns The display label, e.g. `"PL 113-100"`.
 */
export function formatTagName(tag: string): string {
  // "pl-113-100" → "PL 113-100"
  return tag.replace(/^pl-/, "PL ");
}

/** Extract year from a tag name (congress number) or fall back to ISO date string */
Expand Down
20 changes: 20 additions & 0 deletions packages/annotator/src/__tests__/annotator.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,26 @@ describe('annotationToYaml', () => {
expect(yaml).toContain('caseName: "Case with \\"quotes\\""');
expect(yaml).toContain('holdingSummary: "Summary with \\"quotes\\""');
});

// Regression test: backslashes in free-text fields must be doubled in the
// serialized YAML output.
it('escapes backslashes before double quotes (CWE-116)', () => {
// Backslashes must be escaped first so an input like `\"X\"` survives
// round-tripping as YAML — otherwise the output is unbalanced.
const yaml = annotationToYaml({
targetSection: '18 U.S.C. 111',
lastSyncedET: '2025-06-15T12:00:00.000Z',
cases: [{
// Runtime value contains ONE backslash between X and Y.
caseName: 'X \\ Y',
citation: '',
court: 'District',
date: '2024-01-01',
// Runtime value is a Windows-style path with single backslashes.
holdingSummary: 'path: C:\\Users\\test',
sourceUrl: 'https://example.com',
impact: 'historical',
}],
});
// Each input backslash is expected to appear as two backslashes in the output.
expect(yaml).toContain('caseName: "X \\\\ Y"');
expect(yaml).toContain('holdingSummary: "path: C:\\\\Users\\\\test"');
});
});

describe('getApiToken', () => {
Expand Down
11 changes: 8 additions & 3 deletions packages/annotator/src/annotator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -54,18 +54,23 @@ export function buildAnnotationPath(section: string): string {
return `annotations/title-${titleNum}/section-${sectionNum}.yaml`;
}

/**
 * Escape a string so it can be embedded between the quotes of a YAML
 * double-quoted scalar.
 *
 * Backslashes and double quotes are backslash-escaped. Line breaks, tabs and
 * other control characters are also escaped: a literal line break inside a
 * double-quoted scalar is folded by YAML parsers (changing the value), and
 * unescaped control characters are not valid in a YAML stream.
 *
 * The replacement is done in a single pass, so characters produced by one
 * escape are never re-escaped by another.
 *
 * @param s - Raw text to embed.
 * @returns The escaped text, without surrounding quotes.
 */
function yamlEscape(s: string): string {
  // C0 controls, DEL + C1 controls (includes NEL U+0085), and the Unicode
  // line/paragraph separators, plus the two characters that are syntactically
  // significant inside a double-quoted scalar.
  return s.replace(/[\\"\u0000-\u001f\u007f-\u009f\u2028\u2029]/g, (ch) => {
    switch (ch) {
      case '\\':
        return '\\\\';
      case '"':
        return '\\"';
      case '\n':
        return '\\n';
      case '\r':
        return '\\r';
      case '\t':
        return '\\t';
      default:
        // Generic 16-bit escape (\uXXXX) for every other matched character.
        return '\\u' + ch.charCodeAt(0).toString(16).padStart(4, '0');
    }
  });
}

/** Serialize a PrecedentAnnotation to simple YAML (no external deps) */
export function annotationToYaml(annotation: PrecedentAnnotation): string {
const lines: string[] = [];
lines.push(`targetSection: "${annotation.targetSection}"`);
lines.push(`lastSyncedET: "${annotation.lastSyncedET}"`);
lines.push('cases:');
for (const c of annotation.cases) {
lines.push(` - caseName: "${c.caseName.replace(/"/g, '\\"')}"`);
lines.push(` citation: "${c.citation.replace(/"/g, '\\"')}"`);
lines.push(` - caseName: "${yamlEscape(c.caseName)}"`);
lines.push(` citation: "${yamlEscape(c.citation)}"`);
lines.push(` court: "${c.court}"`);
lines.push(` date: "${c.date}"`);
lines.push(` holdingSummary: "${c.holdingSummary.replace(/"/g, '\\"')}"`);
lines.push(` holdingSummary: "${yamlEscape(c.holdingSummary)}"`);
lines.push(` sourceUrl: "${c.sourceUrl}"`);
lines.push(` impact: "${c.impact}"`);
}
Expand Down
11 changes: 11 additions & 0 deletions packages/fetcher/src/__tests__/fetcher.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,17 @@ describe('parseReleasePoints', () => {
expect(points).toHaveLength(1);
expect(points[0]?.publicLaw).toBe('');
});

// Regression test guarding the release-point link regex against slow matching
// on a long, malformed href attribute.
it('does not catastrophically backtrack on malformed input (CodeQL js/polynomial-redos)', () => {
// Long string of repeated non-quote chars that look like an unclosed href.
// The old [^"]* prefix would let the engine try every cut point.
// The input has no closing quote and no ".zip" suffix, so nothing should match.
const evil = `href="${'a'.repeat(40000)}/releasepoints/us/pl/118/200/xml_usc42@118-200X`;
const start = performance.now();
const points = parseReleasePoints(evil);
const elapsedMs = performance.now() - start;
expect(points).toEqual([]);
// NOTE(review): wall-clock bound — confirm 100 ms leaves enough headroom on
// slow or heavily loaded CI runners.
expect(elapsedMs).toBeLessThan(100);
});
});

// --- fetchWithRetry ---
Expand Down
4 changes: 3 additions & 1 deletion packages/fetcher/src/fetcher.ts
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,9 @@ export function parseReleasePoints(html: string): ReleasePoint[] {
const currentRelease = parseCurrentRelease(html);

// Match links like: /download/releasepoints/us/pl/118/42/xml_usc42@118-200.zip
const linkPattern = /href="([^"]*\/releasepoints\/us\/pl\/(\d+)\/([^/]+)\/[^"]*\.zip)"/g;
// Anchor segments to non-slash/non-quote chars so we don't get polynomial
// backtracking on malformed input (CodeQL js/polynomial-redos).
const linkPattern = /href="((?:https?:\/\/[^"/]+)?\/download\/releasepoints\/us\/pl\/(\d+)\/([^/"]+)\/xml_usc[^"/]+\.zip)"/g;
let match: RegExpExecArray | null;

// Extract unique title numbers from XML download links
Expand Down
26 changes: 11 additions & 15 deletions scripts/bulk-import.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@
* Requires: packages to be built first (`pnpm build`).
*/

import { writeFile, mkdir, readFile, rm } from 'node:fs/promises';
import { writeFile, mkdir, readFile, rm, mkdtemp } from 'node:fs/promises';
import { join, dirname } from 'node:path';
import { tmpdir } from 'node:os';
import { execFile } from 'node:child_process';
import { promisify } from 'node:util';
import { readdirSync, statSync } from 'node:fs';
import { readdirSync } from 'node:fs';

const execFileAsync = promisify(execFile);

Expand Down Expand Up @@ -50,27 +51,27 @@ interface TitleResult {
async function importTitle(paddedTitle: string): Promise<TitleResult> {
const displayTitle = parseInt(paddedTitle, 10).toString();
const url = buildUrl(paddedTitle);
const tmpZip = `/tmp/usc-title-${paddedTitle}.zip`;
const tmpDir = `/tmp/usc-title-${paddedTitle}`;
// Random per-invocation temp dir — no predictable path, no symlink race.
const workDir = await mkdtemp(join(tmpdir(), 'usc-title-'));
const tmpZip = join(workDir, 'download.zip');
const tmpDir = join(workDir, 'extract');

console.log(`\n=== Title ${displayTitle} ===`);
console.log(`Downloading: ${url}`);

try {
// Download
await execFileAsync('curl', ['-sL', '-o', tmpZip, url], { timeout: 60000 });
const zipStat = statSync(tmpZip);
console.log(` Downloaded ${(zipStat.size / 1024 / 1024).toFixed(2)} MB`);

// Check if it's a valid ZIP (non-empty, starts with PK)
// Validate as ZIP via the buffer we'll need anyway — derives size without a separate stat call.
const header = await readFile(tmpZip);
console.log(` Downloaded ${(header.length / 1024 / 1024).toFixed(2)} MB`);
if (header.length < 100 || header[0] !== 0x50 || header[1] !== 0x4b) {
console.error(' FAILED: Downloaded file is not a valid ZIP');
return { title: displayTitle, sections: 0, error: 'Invalid ZIP' };
}

// Extract
await rm(tmpDir, { recursive: true, force: true });
await mkdir(tmpDir, { recursive: true });
await execFileAsync('unzip', ['-o', '-q', tmpZip, '-d', tmpDir], { timeout: 60000 });

Expand Down Expand Up @@ -107,18 +108,13 @@ async function importTitle(paddedTitle: string): Promise<TitleResult> {

console.log(` Written to ${OUTPUT_ROOT}/statutes/title-${displayTitle}/`);

// Cleanup
await rm(tmpZip, { force: true });
await rm(tmpDir, { recursive: true, force: true });

return { title: displayTitle, sections: files.length };
} catch (error: unknown) {
const msg = error instanceof Error ? error.message : String(error);
console.error(` FAILED: ${msg}`);
// Cleanup on failure too
await rm(tmpZip, { force: true }).catch(() => {});
await rm(tmpDir, { recursive: true, force: true }).catch(() => {});
return { title: displayTitle, sections: 0, error: msg };
} finally {
await rm(workDir, { recursive: true, force: true }).catch(() => {});
}
}

Expand Down
14 changes: 8 additions & 6 deletions scripts/lib/import-helpers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
* state persistence, git operations, and ZIP download/extraction.
*/

import { writeFile, mkdir, readFile, rm } from 'node:fs/promises';
import { writeFile, mkdir, readFile, rm, mkdtemp } from 'node:fs/promises';
import { join } from 'node:path';
import { tmpdir } from 'node:os';
import { execFile } from 'node:child_process';
import { promisify } from 'node:util';
import { readdirSync } from 'node:fs';
Expand Down Expand Up @@ -154,8 +155,11 @@ export async function downloadAndExtractXml(
log: Logger
): Promise<string | null> {
const url = titleZipUrl(rp, paddedTitle);
const tmpZip = `/tmp/usc-hist-${paddedTitle}-${rp.congress}-${rp.law}.zip`;
const tmpDir = `/tmp/usc-hist-${paddedTitle}-${rp.congress}-${rp.law}`;
// Random per-invocation temp dir under the OS temp root — no predictable
// path, no symlink race.
const workDir = await mkdtemp(join(tmpdir(), 'usc-hist-'));
const tmpZip = join(workDir, 'download.zip');
const tmpDir = join(workDir, 'extract');

await rateLimiter.waitAndConsume();

Expand All @@ -176,7 +180,6 @@ export async function downloadAndExtractXml(
}

await writeFile(tmpZip, buf);
await rm(tmpDir, { recursive: true, force: true });
await mkdir(tmpDir, { recursive: true });
await execFileAsync('unzip', ['-o', '-q', tmpZip, '-d', tmpDir], { timeout: 60_000 });

Expand All @@ -185,7 +188,6 @@ export async function downloadAndExtractXml(

return await readFile(xmlPath, 'utf-8');
} finally {
await rm(tmpZip, { force: true }).catch(() => {});
await rm(tmpDir, { recursive: true, force: true }).catch(() => {});
await rm(workDir, { recursive: true, force: true }).catch(() => {});
}
}
Loading