Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions src/config/DefaultDI.ts
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,7 @@ import {
FORM_8K_EVENT_REPOSITORY_TOKEN,
Form8KEventPrimaryKeyNames,
Form8KEventSchema,
Form8KEventUniqueIndexes,
} from "../storage/form-8k-event/Form8KEventSchema";
import {
SPAC_REPOSITORY_TOKEN,
Expand Down Expand Up @@ -963,10 +964,17 @@ export const DefaultDI = () => {
// ------------------------------ Form 8-K Events --------------------------------
globalServiceRegistry.registerInstance(
FORM_8K_EVENT_REPOSITORY_TOKEN,
createStorage("form_8k_events", Form8KEventSchema, Form8KEventPrimaryKeyNames, [
["cik", "filing_date"],
["item_code"],
["accession_number"],
])
createStorage(
"form_8k_events",
Form8KEventSchema,
Form8KEventPrimaryKeyNames,
[
["cik", "filing_date"],
["item_code"],
["accession_number"],
["extractor_id", "extractor_version"],
],
Form8KEventUniqueIndexes
)
);
};
20 changes: 15 additions & 5 deletions src/config/TestingDI.ts
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,7 @@ import {
FORM_8K_EVENT_REPOSITORY_TOKEN,
Form8KEventPrimaryKeyNames,
Form8KEventSchema,
Form8KEventUniqueIndexes,
} from "../storage/form-8k-event/Form8KEventSchema";
import {
SPAC_REPOSITORY_TOKEN,
Expand Down Expand Up @@ -869,10 +870,19 @@ export function resetDependencyInjectionsForTesting() {
// Form 8-K Events
globalServiceRegistry.registerInstance(
FORM_8K_EVENT_REPOSITORY_TOKEN,
new InMemoryTabularStorage(Form8KEventSchema, Form8KEventPrimaryKeyNames, [
["cik", "filing_date"],
["item_code"],
["accession_number"],
])
new InMemoryTabularStorage(
Form8KEventSchema,
Form8KEventPrimaryKeyNames,
[
["cik", "filing_date"],
["item_code"],
["accession_number"],
["extractor_id", "extractor_version"],
],
undefined, // clientProvidedKeys (default)
undefined, // tabularMigrations
undefined, // migrationName
Form8KEventUniqueIndexes
)
);
}
5 changes: 5 additions & 0 deletions src/config/setupAllDatabases.ts
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ import { UNDERWRITER_LINK_REPOSITORY_TOKEN } from "../storage/canonical/Underwri
import { USE_OF_PROCEEDS_REPOSITORY_TOKEN } from "../storage/use-of-proceeds/UseOfProceedsSchema";
import { XBRL_FACT_REPOSITORY_TOKEN } from "../storage/xbrl/XbrlFactSchema";
import { FORM_8K_EVENT_REPOSITORY_TOKEN } from "../storage/form-8k-event/Form8KEventSchema";
import { migrateLegacyForm8KEventsTable } from "../storage/form-8k-event/Form8KEventLegacyMigration";
import { CANONICAL_COMPANY_REPOSITORY_TOKEN } from "../storage/canonical/CanonicalCompanySchema";
import {
CANONICAL_COMPANY_ADDRESS_REPOSITORY_TOKEN,
Expand Down Expand Up @@ -188,6 +189,10 @@ export async function setupAllDatabases(): Promise<void> {
await globalServiceRegistry.get(UNDERWRITER_LINK_REPOSITORY_TOKEN).setupDatabase();
await globalServiceRegistry.get(USE_OF_PROCEEDS_REPOSITORY_TOKEN).setupDatabase();
await globalServiceRegistry.get(XBRL_FACT_REPOSITORY_TOKEN).setupDatabase();
// Drop the legacy form_8k_events shape (no event_id / extractor_id /
// extractor_version) before creating the current one; the natural-key PK
// of the legacy table cannot be ALTERed away on either backend.
await migrateLegacyForm8KEventsTable();
await globalServiceRegistry.get(FORM_8K_EVENT_REPOSITORY_TOKEN).setupDatabase();
// View DDL is created here only on the SQLite path; the Postgres backend
// owns its own view bootstrap (and getDb() now throws when SEC_DB_TYPE
Expand Down
38 changes: 38 additions & 0 deletions src/sec/edgar/accessionNumber.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/**
* @license
* Copyright 2026 Steven Roussey <sroussey@gmail.com>
* SPDX-License-Identifier: Apache-2.0
*/

import { describe, expect, it } from "bun:test";
import { Type } from "typebox";
import Value from "typebox/value";
import { TypeAccessionNumber } from "./accessionNumber";

const Wrapper = Type.Object({ accessionNumber: TypeAccessionNumber() });

/**
 * Boundary checks for the accession-number schema: one well-formed value plus
 * one malformed value per rejection case. Each rejected input is built to
 * break a single rule where possible so a failure points at the rule that
 * regressed.
 */
describe("TypeAccessionNumber", () => {
  it("accepts a well-formed 20-character accession", () => {
    expect(Value.Check(Wrapper, { accessionNumber: "0001193125-21-066104" })).toBe(true);
  });

  it("rejects a 22-character (too-long) accession at the input boundary", () => {
    expect(Value.Check(Wrapper, { accessionNumber: "0001193125-21-066104XX" })).toBe(false);
  });

  it("rejects a 21-character accession with a trailing extra digit", () => {
    expect(Value.Check(Wrapper, { accessionNumber: "0001193125-21-0661040" })).toBe(false);
  });

  it("rejects an accession missing one of the hyphens", () => {
    // All digits and exactly 20 characters, so only the missing second hyphen
    // can be the reason for rejection.
    expect(Value.Check(Wrapper, { accessionNumber: "0001193125-210661040" })).toBe(false);
  });

  it("rejects a 20-character value with no hyphens", () => {
    expect(Value.Check(Wrapper, { accessionNumber: "000119312521066104XX" })).toBe(false);
  });

  it("rejects an accession with letters in the digit segments", () => {
    expect(Value.Check(Wrapper, { accessionNumber: "AAAA193125-21-066104" })).toBe(false);
  });

  it("rejects an empty string", () => {
    expect(Value.Check(Wrapper, { accessionNumber: "" })).toBe(false);
  });
});
26 changes: 26 additions & 0 deletions src/sec/edgar/accessionNumber.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/**
* @license
* Copyright 2026 Steven Roussey <sroussey@gmail.com>
* SPDX-License-Identifier: Apache-2.0
*/

import { Type } from "typebox";

/**
 * EDGAR accession numbers are exactly 20 characters: a 10-digit filer ID, a
 * 2-digit year, and a 6-digit sequence, joined by hyphens
 * (`NNNNNNNNNN-YY-NNNNNN`). The pattern and length cap are enforced wherever
 * an accession number crosses a trust boundary (task input, persisted
 * schema) so an over-long or malformed value cannot smuggle past the
 * validator and land in the database.
 */
export const ACCESSION_NUMBER_MAX_LENGTH = 20;
export const ACCESSION_NUMBER_PATTERN = "^\\d{10}-\\d{2}-\\d{6}$";

/**
 * Builds the string schema for an EDGAR accession number.
 *
 * @param annotations Extra schema annotations merged into the result. A
 *   caller-supplied `description` replaces the default one. `maxLength` and
 *   `pattern` are written after the spread, so an annotation object can never
 *   weaken or remove the format constraints.
 * @returns A string schema constrained by {@link ACCESSION_NUMBER_MAX_LENGTH}
 *   and {@link ACCESSION_NUMBER_PATTERN}.
 */
export const TypeAccessionNumber = (annotations: Record<string, unknown> = {}) =>
  Type.String({
    description: "EDGAR accession number (NNNNNNNNNN-YY-NNNNNN)",
    ...annotations,
    // Keep these last: later keys win in an object literal, which is what
    // makes the constraints non-overridable.
    maxLength: ACCESSION_NUMBER_MAX_LENGTH,
    pattern: ACCESSION_NUMBER_PATTERN,
  });
47 changes: 47 additions & 0 deletions src/sec/forms/Form.entity.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/**
* @license
* Copyright 2026 Steven Roussey <sroussey@gmail.com>
* SPDX-License-Identifier: Apache-2.0
*/

import { describe, expect, it } from "bun:test";
import { Form_8_K } from "./miscellaneous-filings/Form_8_K";

describe("Form XML parser entity expansion hardening", () => {
  /**
   * "Billion laughs" — geometric entity expansion. With expansion enabled the
   * 10-deep nested chain `lol9 -> 10 x lol8 -> ... -> 10^9 x "lol"` produces a
   * ~1 GB string and pegs CPU. With `processEntities: false` the parser leaves
   * the `&lolN;` byte sequences literal, so the parse is bounded by input size.
   */
  const BILLION_LAUGHS = `<?xml version="1.0"?>
<!DOCTYPE edgarSubmission [
<!ENTITY lol "lol">
<!ENTITY lol1 "&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;">
<!ENTITY lol2 "&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;">
<!ENTITY lol3 "&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;">
<!ENTITY lol4 "&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;">
<!ENTITY lol5 "&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;">
<!ENTITY lol6 "&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;">
<!ENTITY lol7 "&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;">
<!ENTITY lol8 "&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;">
<!ENTITY lol9 "&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;">
]>
<edgarSubmission>
<primaryDocumentDescription>&lol9;</primaryDocumentDescription>
</edgarSubmission>`;

  /**
   * Upper bound for the parse, in milliseconds. A non-expanding parse is
   * expected to finish well under a second; the bound is deliberately loose
   * so a cold start or a busy CI runner does not produce a false failure.
   * It is still far below the minutes an expanding parser would need.
   */
  const MAX_PARSE_MS = 5000;

  it("parses a billion-laughs payload quickly without expanding entities", async () => {
    const start = performance.now();
    const result = await Form_8_K.parse("8-K", BILLION_LAUGHS);
    const elapsed = performance.now() - start;

    // With expansion enabled the parser would spend minutes building a ~1 GB
    // string before this assertion is ever reached.
    expect(elapsed).toBeLessThan(MAX_PARSE_MS);
    // The parse succeeded and produced an object — no expansion crash, no OOM.
    expect(result).toBeDefined();
  });
});
8 changes: 8 additions & 0 deletions src/sec/forms/Form.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,14 @@ export abstract class Form {
trimValues: true,
parseTagValue: false,
parseAttributeValue: false,
// Disable entity expansion. A filer-controlled XML payload can declare
// nested entities, each one referencing the previous entity N times, so
// the expanded text grows geometrically ("billion laughs"), and the
// parser is handed untrusted SGML/XML directly from the filer.
// With expansion disabled the raw `&entity;` byte sequence is left
// literal — the downstream consumers either don't read it or HTML-decode
// their own entity references explicitly.
processEntities: false,
isArray: (_name, jpath) => {
return typeof jpath === "string" && paths.includes(jpath);
},
Expand Down
40 changes: 27 additions & 13 deletions src/sec/forms/miscellaneous-filings/Form_8_K.storage.ts
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ export async function processForm8K({
items,
report_date,
form8K,
extractor_id,
extractor_version,
fullSubmissionText,
model,
}: {
Expand All @@ -63,6 +65,8 @@ export async function processForm8K({
readonly items: string | undefined | null;
readonly report_date: string | undefined | null;
readonly form8K: Form8K;
readonly extractor_id: string;
readonly extractor_version: string;
readonly fullSubmissionText?: string;
readonly model?: ModelConfig;
}): Promise<void> {
Expand All @@ -73,20 +77,30 @@ export async function processForm8K({

const itemCodes = extractItemCodes(items, form8K);

for (const itemCode of itemCodes) {
const event: Form8KEvent = {
cik,
accession_number,
item_code: itemCode,
item_description: Form_8_K_ITEMS[itemCode] ?? null,
filing_date,
report_date: effectiveReportDate,
is_amendment: isAmendment,
};
await eventRepo.saveEvent(event);
}
// Build the full row set first so the atomic replace either lands all
// items for this (filing, version) or none of them. A torn write would
// otherwise leave the table with a partial item list that downstream
// queries can't distinguish from a real partial-disclosure filing.
const events: Array<Omit<Form8KEvent, "event_id">> = itemCodes.map((itemCode) => ({
cik,
accession_number,
extractor_id,
extractor_version,
item_code: itemCode,
item_description: Form_8_K_ITEMS[itemCode] ?? null,
filing_date,
report_date: effectiveReportDate,
is_amendment: isAmendment,
}));

await eventRepo.replaceEvents(
cik,
accession_number,
extractor_id,
extractor_version,
events
);

// --- Consolidated SPAC report: map de-SPAC milestone items (known SPACs only) ---
const spacRow = await new SpacRepo().getSpac(cik);
if (spacRow) {
// Skip when no usable date is available: an undated milestone (empty
Expand Down
Loading