Skip to content
Merged
2 changes: 2 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ PR4 introduced an observation/canonical/resolver tier on top of raw form storage

`src/sec.ts` invokes **`Sqlite.init()`** when the installed `workglow` package defines it (`typeof Sqlite.init === "function"`), so newer Workglow releases load the SQLite binding before `getDb()` opens a database. Older `workglow` versions without `init` skip this step.

**`getDb()` is SQLite-only.** It throws `SecCliConfigurationError` when `SEC_DB_TYPE !== "sqlite"` to prevent the silent data divergence that occurred before (`getDb()` would open a stray SQLite file even under Postgres, and rows written through it never reached the configured backend). Tasks that need a raw SQL fast path beyond what `ITabularStorage` exposes must branch on `SEC_DB_TYPE` themselves — see `src/storage/entity/cikNameBulkWriter.ts` for the pattern (SQLite → `getDb()`, Postgres → `getPgPool()`, otherwise → repository `putBulk` for tests).

### Dependency Injection

Dependency injection uses the `workglow` package’s `globalServiceRegistry` with typed tokens. Production uses `SqliteTabularRepository`; tests use `InMemoryTabularRepository`. Call `resetDependencyInjectionsForTesting()` from `src/config/TestingDI.ts` in test setup.
Expand Down
12 changes: 11 additions & 1 deletion src/cli/groups/init.ts
Original file line number Diff line number Diff line change
Expand Up @@ -138,11 +138,21 @@ export function addInitCommand(parent: Command): void {
mkdirSync(rawDataFolder, { recursive: true });
console.log(`Created directory: ${rawDataFolder}`);

// Re-read env so DI picks up the new values. Push every var the
// wizard collected, including the Postgres set: assertSecCliEnvConfigured
// now fails fast for a Postgres dbType with missing PG env, so the
// wizard would crash immediately after writing .env.local if we left
// these blank.
process.env.SEC_DB_TYPE = config.dbType;
process.env.SEC_DB_FOLDER = config.dbFolder;
process.env.SEC_DB_NAME = config.dbName;
process.env.SEC_RAW_DATA_FOLDER = config.rawDataFolder;
if (config.pgUrl !== undefined) process.env.SEC_PG_URL = config.pgUrl;
if (config.pgHost !== undefined) process.env.SEC_PG_HOST = config.pgHost;
if (config.pgPort !== undefined) process.env.SEC_PG_PORT = config.pgPort;
if (config.pgUser !== undefined) process.env.SEC_PG_USER = config.pgUser;
if (config.pgPassword !== undefined) process.env.SEC_PG_PASSWORD = config.pgPassword;
if (config.pgDatabase !== undefined) process.env.SEC_PG_DATABASE = config.pgDatabase;

EnvToDI();
if (config.dbType === "sqlite" && typeof Sqlite.init === "function") {
Expand Down
7 changes: 7 additions & 0 deletions src/cli/groups/query.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ export function addQueryCommands(program: Command): void {
renderTable(result.rows as Record<string, unknown>[], columns, {
format,
total: result.total,
totalApprox: result.totalApprox,
offset,
limit,
})
Expand Down Expand Up @@ -96,6 +97,7 @@ export function addQueryCommands(program: Command): void {
renderTable(result.rows as Record<string, unknown>[], columns, {
format,
total: result.total,
totalApprox: result.totalApprox,
offset,
limit,
})
Expand Down Expand Up @@ -138,6 +140,7 @@ export function addQueryCommands(program: Command): void {
renderTable(result.rows as Record<string, unknown>[], columns, {
format,
total: result.total,
totalApprox: result.totalApprox,
offset,
limit,
})
Expand Down Expand Up @@ -181,6 +184,7 @@ export function addQueryCommands(program: Command): void {
renderTable(result.rows as Record<string, unknown>[], columns, {
format,
total: result.total,
totalApprox: result.totalApprox,
offset,
limit,
})
Expand Down Expand Up @@ -222,6 +226,7 @@ export function addQueryCommands(program: Command): void {
renderTable(result.rows as Record<string, unknown>[], columns, {
format,
total: result.total,
totalApprox: result.totalApprox,
offset,
limit,
})
Expand Down Expand Up @@ -263,6 +268,7 @@ export function addQueryCommands(program: Command): void {
renderTable(result.rows as Record<string, unknown>[], columns, {
format,
total: result.total,
totalApprox: result.totalApprox,
offset,
limit,
})
Expand Down Expand Up @@ -300,6 +306,7 @@ export function addQueryCommands(program: Command): void {
renderTable(result.rows as Record<string, unknown>[], columns, {
format,
total: result.total,
totalApprox: result.totalApprox,
offset,
limit,
})
Expand Down
34 changes: 34 additions & 0 deletions src/cli/output/TableRenderer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,40 @@ describe("renderTable", () => {
const lines = result.split("\n");
expect(lines[1]).toBe("1,,");
});

it("defuses formula-injection prefixes by quoting them", () => {
  // A cell that starts with =, +, -, @ (or a leading TAB) can be evaluated
  // as a formula when the CSV is opened in a spreadsheet application. The
  // renderer is expected to put a single apostrophe in front of each such
  // value so the output of `sec query --format csv` is safe to open.
  const hostileNames = ["=cmd|' /C calc'!A0", "+1+1", "-1+1", "@SUM(A1:A9)", "\tleading tab"];
  const rows = hostileNames.map((name, index) => ({
    id: index + 1,
    name,
    value: index + 1,
  }));

  const csvLines = renderTable(rows, columns, { format: "csv" }).split("\n");

  // Line 0 is the header; data row i lives on line i + 1.
  hostileNames.forEach((name, index) => {
    const n = index + 1;
    expect(csvLines[n]).toBe(`${n},'${name},${n}`);
  });
});

it("does not prefix benign leading characters", () => {
  // Ordinary text, a value that merely contains "+", and the empty string
  // must all pass through the CSV renderer untouched.
  const benignNames = ["Alice", "1+1", ""];
  const rows = benignNames.map((name, index) => ({
    id: index + 1,
    name,
    value: index + 1,
  }));

  const csvLines = renderTable(rows, columns, { format: "csv" }).split("\n");

  // Line 0 is the header; data row i lives on line i + 1.
  benignNames.forEach((name, index) => {
    const n = index + 1;
    expect(csvLines[n]).toBe(`${n},${name},${n}`);
  });
});
});

describe("table format", () => {
Expand Down
36 changes: 32 additions & 4 deletions src/cli/output/TableRenderer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,16 @@ export interface RenderOptions {
readonly total?: number;
readonly offset?: number;
readonly limit?: number;
/**
* Set when the displayed `total` is a lower bound — the underlying
* query streamed and stopped after collecting offset+limit matches
* without exhausting the dataset. Rendered as "≥ N" with a hint to
* narrow the filter.
*/
readonly totalApprox?: {
readonly atLeast: number;
readonly exhausted: boolean;
};
}

function truncate(value: string, width: number): string {
Expand All @@ -26,10 +36,20 @@ function pad(value: string, width: number): string {
}

/**
 * Escapes a single cell value for CSV output.
 *
 * Two transformations are applied, in this order:
 *
 * 1. **Formula-injection defusing.** When a spreadsheet application opens a
 *    CSV, a cell starting with `=`, `+`, `-` or `@` (or with a leading TAB/CR
 *    that some loaders strip) is interpreted as a formula, which can
 *    exfiltrate data or run external commands. Such values get a single
 *    leading apostrophe so they are treated as literal text. Plain CSV
 *    consumers see the original text with one extra leading apostrophe.
 * 2. **Quoting.** If the (possibly prefixed) value contains a comma, a
 *    double quote, a carriage return or a line feed, it is wrapped in double
 *    quotes and every embedded double quote is doubled.
 *
 * @param value - Raw cell text.
 * @returns The text to write into the CSV cell.
 */
function escapeCsvValue(value: string): string {
  // The anchored pattern cannot match an empty string, so no separate
  // length check is required.
  const startsLikeFormula = /^[=+\-@\t\r]/.test(value);
  const escaped = startsLikeFormula ? "'" + value : value;

  // CR is included alongside LF: an unquoted carriage return would be read
  // as a record separator by CSV parsers that accept CR or CRLF line ends.
  if (/[",\r\n]/.test(escaped)) {
    return '"' + escaped.replace(/"/g, '""') + '"';
  }
  return escaped;
}

function cellValue(row: Record<string, unknown>, key: string): string {
Expand Down Expand Up @@ -74,7 +94,15 @@ function renderTextTable(
const start = count === 0 ? 0 : offset + 1;
const end = count === 0 ? 0 : offset + count;
lines.push("");
lines.push(`Showing ${start}-${end} of ${options.total} results`);
const isApprox =
options.totalApprox !== undefined && options.totalApprox.exhausted === false;
const totalLabel = isApprox ? `≥ ${options.total}` : `${options.total}`;
lines.push(`Showing ${start}-${end} of ${totalLabel} results`);
if (isApprox) {
lines.push(
`(streamed; narrow the filter for an exact count and full pagination)`
);
}

if (count > 0 && end < options.total) {
lines.push(`(use --offset ${end} for next page)`);
Expand Down
19 changes: 19 additions & 0 deletions src/cli/queries/CikQuery.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -123,4 +123,23 @@ describe("queryCiks", () => {
expect(result.rows).toEqual([]);
expect(result.tableEmpty).toBe(false);
});

it("paginates without scanning the full table when the needle is empty", async () => {
  // Regression guard: an empty needle used to walk every row. It is now
  // served by size() + getOffsetPage(), so the reported total is exact and
  // no lower-bound marker should be attached to the result.
  const names = ["AAA", "BBB", "CCC", "DDD", "EEE"];
  await seed(names.map((name, index) => ({ cik: index + 1, name })));

  const { total, rows, tableEmpty, totalApprox } = await queryCiks({ limit: 2, offset: 1 });

  expect(total).toBe(5);
  expect(rows.length).toBe(2);
  expect(tableEmpty).toBe(false);
  // An exact total must not be flagged as approximate.
  expect(totalApprox).toBeUndefined();
});
});
61 changes: 55 additions & 6 deletions src/cli/queries/CikQuery.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,58 @@ export interface CikQueryResult extends QueryResult<CikNameType> {
}

/**
* Soft cap on substring/prefix matches we'll collect before sorting. Stops
* the empty-needle case (which previously walked the entire ~1M-row table)
* and any pathologically broad needle from exhausting memory. Picked so a
* normal `offset+limit` of a few hundred has plenty of headroom for the
* rank-based reordering.
*/
const MAX_FUZZY_MATCHES = 1000;

/**
* Queries the `cik_names` table for companies whose name matches the given
* needle. Case-insensitive. Ranks exact match first, then prefix, then
* substring; ties broken by shorter name, then lower CIK.
*
* `tableEmpty` is true when the underlying table has no rows at all
* (distinct from "no matches") so callers can prompt the user to run the
* ingest.
*
* Two operating modes:
*
* 1. **Empty needle** — `size() + getOffsetPage()`. No scan, no
* sorting. The "rank" concept doesn't apply because there's no
* target; rows come back in PK order.
* 2. **Exact / prefix / substring** — streams via `records()` because
* workglow has no LIKE operator AND its equality is case-sensitive
* (so even `--exact` can't push down: SEC stores names as
* "Apple Inc." and a user querying "APPLE INC." would miss). Capped
* at `MAX_FUZZY_MATCHES` so the worst case is bounded; if the cap
* fires, `totalApprox.exhausted` is `false` and the UI renders
* "≥ N".
*/
export async function queryCiks(params: CikQueryParams): Promise<CikQueryResult> {
const repo = globalServiceRegistry.get(CIK_NAME_REPOSITORY_TOKEN);
const limit = params.limit ?? 25;
const offset = params.offset ?? 0;
const needle = params.name?.toLowerCase().trim() ?? "";

// Mode 1: empty needle — straight pagination.
if (needle === "" && !params.exact) {
const total = await repo.size();
const rows = (await repo.getOffsetPage(offset, limit)) ?? [];
return { rows, total, tableEmpty: total === 0 };
}

// Mode 2/3: stream and rank. We have to stream rather than `query()`
// because workglow's equality is case-sensitive and there is no LIKE
// operator — both case-insensitive exact match and prefix/substring
// matches have to be evaluated client-side. Capped at MAX_FUZZY_MATCHES
// so the worst case is bounded; if the cap fires, `totalApprox.exhausted`
// is `false` and the UI renders "≥ N".
const matches: { row: CikNameType; rank: number }[] = [];
let anyRowSeen = false;
let exhausted = true;
for await (const row of repo.records(5000)) {
anyRowSeen = true;
if (row.name === null || row.name === undefined) continue;
Expand All @@ -43,7 +80,7 @@ export async function queryCiks(params: CikQueryParams): Promise<CikQueryResult>
if (params.exact) {
if (hay !== needle) continue;
rank = 0;
} else if (needle === "" || hay === needle) {
} else if (hay === needle) {
rank = 0;
} else if (hay.startsWith(needle)) {
rank = 1;
Expand All @@ -53,6 +90,10 @@ export async function queryCiks(params: CikQueryParams): Promise<CikQueryResult>
continue;
}
matches.push({ row, rank });
if (matches.length >= MAX_FUZZY_MATCHES) {
exhausted = false;
break;
}
}

matches.sort(
Expand All @@ -62,9 +103,17 @@ export async function queryCiks(params: CikQueryParams): Promise<CikQueryResult>
a.row.cik - b.row.cik
);

// If we hit the cap, we don't know the true total — surface it as a
// lower bound so the renderer says "≥ N". When the stream drained
// (exhausted), matches.length IS the exact total and we omit
// totalApprox entirely so consumers can rely on its presence as the
// streamed-and-capped signal.
// tableEmpty stays correct because we still see at least one row
// before hitting the cap unless the table is empty.
return {
rows: matches.slice(offset, offset + limit).map((m) => m.row),
total: matches.length,
tableEmpty: !anyRowSeen,
...(exhausted ? {} : { totalApprox: { atLeast: matches.length, exhausted: false } }),
};
}
62 changes: 39 additions & 23 deletions src/cli/queries/CrowdfundingQuery.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import { globalServiceRegistry } from "workglow";
import type { SearchCriteria } from "workglow";
import type { Crowdfunding } from "../../storage/portal/CrowdfundingSchema";
import { CROWDFUNDING_REPOSITORY_TOKEN } from "../../storage/portal/CrowdfundingSchema";
import type { QueryResult } from "./EntityQuery";
import { collectPage, streamMatchingRows } from "./_streamMatches";

export interface CrowdfundingQueryParams {
readonly search?: string;
Expand All @@ -20,33 +22,47 @@ export async function queryCrowdfunding(
const limit = params.limit ?? 25;
const offset = params.offset ?? 0;

let items: Crowdfunding[];

if (params.cik !== undefined) {
items = (await repo.query({ cik: params.cik } as Partial<Crowdfunding>)) ?? [];
} else {
items = (await repo.getAll()) ?? [];
}

if (params.search) {
const searchLower = params.search.toLowerCase();
items = items.filter((c) => c.name.toLowerCase().includes(searchLower));
const criteria: SearchCriteria<Crowdfunding> = {};
if (params.cik !== undefined) (criteria as Partial<Crowdfunding>).cik = params.cik;
if (params.portal !== undefined) (criteria as Partial<Crowdfunding>).portal_cik = params.portal;
// One side of the date range pushes down via SearchCondition; both
// sides need the predicate (workglow takes one condition per column).
if (params.after !== undefined && params.before === undefined) {
(criteria as any).filing_date = { value: params.after, operator: ">=" };
} else if (params.before !== undefined && params.after === undefined) {
(criteria as any).filing_date = { value: params.before, operator: "<=" };
}

if (params.portal !== undefined) {
items = items.filter((c) => c.portal_cik === params.portal);
}

if (params.after !== undefined) {
items = items.filter((c) => c.filing_date >= params.after!);
}
const hasRange = params.after !== undefined && params.before !== undefined;
const hasSearch = params.search !== undefined && params.search !== "";
const needsPredicate = hasRange || hasSearch;

if (params.before !== undefined) {
items = items.filter((c) => c.filing_date <= params.before!);
if (!needsPredicate) {
const hasCriteria = Object.keys(criteria).length > 0;
if (hasCriteria) {
const total = await repo.count(criteria);
const rows = (await repo.query(criteria, { limit, offset })) ?? [];
return { rows, total };
}
const total = await repo.size();
const rows = (await repo.getOffsetPage(offset, limit)) ?? [];
return { rows, total };
}

const total = items.length;
const rows = items.slice(offset, offset + limit);
const searchLower = hasSearch ? params.search!.toLowerCase() : null;
const predicate = (c: Crowdfunding): boolean => {
if (params.after !== undefined && c.filing_date < params.after) return false;
if (params.before !== undefined && c.filing_date > params.before) return false;
if (searchLower !== null && !c.name.toLowerCase().includes(searchLower)) return false;
return true;
};

return { rows, total };
const { rows, total, exhausted } = await collectPage(
streamMatchingRows(repo, criteria, predicate),
offset,
limit
);
// totalApprox is the "this number is a lower bound" signal — only
// emit it when the stream was capped, not when it drained.
return exhausted ? { rows, total } : { rows, total, totalApprox: { atLeast: total, exhausted } };
}
Loading