From c0114531eb80f104714ccbbd1e1291c1e138ec3f Mon Sep 17 00:00:00 2001 From: hallelx2 Date: Wed, 17 Jun 2026 23:32:46 +0100 Subject: [PATCH 1/2] tree: reconcile section tree + LLM TOC into a canonical heading-path map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ingestion builds two independent structures — the parser's Section tree (content, summaries; the IDs citations resolve to) and the LLM-built TOC tree (the logical outline with clean headings + page anchors). They are never reconciled, so the map a citation resolves against and the map that holds the real headings can diverge. BuildHeadingPaths closes that gap without merging the trees: for every section it returns the canonical heading path it belongs under, matched by page-range containment (deepest containing TOC node wins; best overlap when a section straddles a boundary). Sections with no page range, and every section when the TOC is empty, are absent so callers fall back to existing behaviour. This is the reconciliation map HAL-70 needs to emit a real structural title_path on citations. --- pkg/tree/heading_path.go | 257 ++++++++++++++++++++++++++++++++++ pkg/tree/heading_path_test.go | 170 ++++++++++++++++++++++ 2 files changed, 427 insertions(+) create mode 100644 pkg/tree/heading_path.go create mode 100644 pkg/tree/heading_path_test.go diff --git a/pkg/tree/heading_path.go b/pkg/tree/heading_path.go new file mode 100644 index 0000000..2cdb2d7 --- /dev/null +++ b/pkg/tree/heading_path.go @@ -0,0 +1,257 @@ +package tree + +import "strings" + +// BuildHeadingPaths reconciles the parser's section tree with the +// LLM-built TOC tree into one canonical lookup: section ID → logical +// heading path (the chain of TOC titles from the document root down to +// the most specific TOC node that covers the section's pages). +// +// This is the bridge HAL-109 introduces. 
Ingestion builds two +// independent structures: +// +// - the parser's Section tree, which carries content, summaries, and +// candidate questions, and whose IDs every citation resolves to; its +// Title fields are whatever the parser recovered (often empty or a +// non-semantic chunk label for content leaves), and +// - the LLM-built TOC tree ([]TOCNode, persisted on documents.toc_tree), +// which carries the document's logical outline with the clean heading +// vocabulary clients actually expect ("Item 8" → "Balance Sheet") and +// page anchors, but no content. +// +// Because the two are never reconciled, the map a citation resolves +// against (parser titles) and the map that holds the real headings (the +// TOC) can — and do — diverge. BuildHeadingPaths closes that gap without +// merging the two trees: it returns, for every section, the canonical +// heading path it belongs under, so a citation can carry a real +// structural path instead of a parser chunk label. +// +// # Matching: page-range containment +// +// A section belongs under the TOC node whose effective page span best +// covers the section's own [PageStart, PageEnd]. Among the TOC nodes that +// overlap a section, the winner is chosen by, in order: +// +// 1. containment — a node that fully contains the section beats one that +// merely overlaps it (the section sits cleanly inside that heading); +// 2. depth — the deeper (more specific) heading wins, so a section under +// "Item 8 → Balance Sheet" maps to both, ending at "Balance Sheet" +// rather than stopping at "Item 8"; +// 3. overlap — more shared pages wins; +// 4. span — the tighter node wins (more specific); +// 5. start page — earlier wins, purely to make the result deterministic. +// +// # Degradation +// +// Sections with no page range (PageStart/PageEnd <= 0 — the normal state +// for non-paginated formats), and every section when the TOC is empty or +// nil, are simply absent from the returned map. 
Callers treat a missing +// entry as "no canonical heading path known" and fall back to existing +// behaviour, so wiring this in never makes a citation worse than today. +// +// The returned map is keyed by SectionID and never nil (an empty map is +// returned when nothing could be mapped) so callers can index it without +// a nil check. +func BuildHeadingPaths(root *Section, toc []TOCNode) map[SectionID][]string { + out := make(map[SectionID][]string) + if root == nil || len(toc) == 0 { + return out + } + + maxPage := documentMaxPage(root, toc) + entries := flattenTOC(toc, nil, maxPage) + if len(entries) == 0 { + return out + } + + root.Walk(func(s *Section) bool { + if s == nil || s.PageStart <= 0 || s.PageEnd <= 0 { + return true + } + if path, ok := bestHeadingPath(entries, s.PageStart, s.PageEnd); ok { + out[s.ID] = path + } + return true + }) + return out +} + +// tocEntry is a flattened TOC node with its effective (resolved) page +// span and the full heading path leading to it. depth is the node's +// 0-indexed nesting level, used to prefer more specific headings. +type tocEntry struct { + start int + end int + depth int + path []string +} + +// span is the inclusive page count the entry covers. A zero/negative +// span (malformed node that survived resolution) sorts last. +func (e tocEntry) span() int { + if e.end < e.start { + return 1 << 30 + } + return e.end - e.start + 1 +} + +// flattenTOC walks the TOC forest depth-first, resolving each node's +// effective end page and accumulating the heading path. parentPath is +// the chain of titles above this level; parentEnd bounds open-ended +// nodes (a node whose EndPage is 0 runs until the next sibling's start, +// or — for the last sibling — its parent's end, or the document end). +// +// Empty titles are skipped in the accumulated path so a structural +// wrapper node with no heading doesn't inject a blank segment, but its +// children still inherit the correct ancestry. 
+func flattenTOC(nodes []TOCNode, parentPath []string, parentEnd int) []tocEntry { + return flattenTOCAt(nodes, parentPath, parentEnd, 0) +} + +func flattenTOCAt(nodes []TOCNode, parentPath []string, parentEnd, depth int) []tocEntry { + var out []tocEntry + for i, n := range nodes { + start := n.StartPage + end := resolveEndPage(nodes, i, parentEnd) + + path := parentPath + if t := normaliseTitle(n.Title); t != "" { + // Copy so sibling branches never share/alias the backing array. + path = append(append([]string(nil), parentPath...), t) + } + + if start > 0 && end >= start { + out = append(out, tocEntry{start: start, end: end, depth: depth, path: path}) + } + if len(n.Nodes) > 0 { + // A child can't extend past its parent's end; pass end down so + // an open-ended deepest child is bounded by its ancestor. + childBound := end + if childBound <= 0 { + childBound = parentEnd + } + out = append(out, flattenTOCAt(n.Nodes, path, childBound, depth+1)...) + } + } + return out +} + +// resolveEndPage computes the effective inclusive end page for nodes[i]. +// An explicit EndPage wins. Otherwise the node runs until the page +// before the next sibling that carries a StartPage; if there is no such +// sibling it inherits parentEnd (the enclosing node's end, or the +// document's last page at the top level). +func resolveEndPage(nodes []TOCNode, i, parentEnd int) int { + if nodes[i].EndPage > 0 { + return nodes[i].EndPage + } + for j := i + 1; j < len(nodes); j++ { + if nodes[j].StartPage > 0 { + if nodes[j].StartPage-1 >= nodes[i].StartPage { + return nodes[j].StartPage - 1 + } + return nodes[i].StartPage // degenerate ordering: single page + } + } + if parentEnd > 0 { + return parentEnd + } + return nodes[i].StartPage +} + +// bestHeadingPath picks the heading path for a section spanning +// [secStart, secEnd] using the precedence documented on +// BuildHeadingPaths. Returns ok=false when no TOC entry overlaps the +// section at all. 
+func bestHeadingPath(entries []tocEntry, secStart, secEnd int) ([]string, bool) { + bestIdx := -1 + for i, e := range entries { + ov := overlapPages(e.start, e.end, secStart, secEnd) + if ov <= 0 { + continue + } + if bestIdx < 0 || lessSpecific(entries[bestIdx], e, secStart, secEnd) { + bestIdx = i + } + } + if bestIdx < 0 || len(entries[bestIdx].path) == 0 { + return nil, false + } + // Defensive copy so callers can't mutate our internal slices. + return append([]string(nil), entries[bestIdx].path...), true +} + +// lessSpecific reports whether the current best entry a is a WORSE match +// for the section than candidate b — i.e. b should replace a. The +// ordering mirrors the BuildHeadingPaths precedence list. +func lessSpecific(a, b tocEntry, secStart, secEnd int) bool { + aContains := contains(a.start, a.end, secStart, secEnd) + bContains := contains(b.start, b.end, secStart, secEnd) + if aContains != bContains { + return bContains // prefer the container + } + if a.depth != b.depth { + return b.depth > a.depth // prefer deeper / more specific + } + aOv := overlapPages(a.start, a.end, secStart, secEnd) + bOv := overlapPages(b.start, b.end, secStart, secEnd) + if aOv != bOv { + return bOv > aOv // prefer more overlap + } + if a.span() != b.span() { + return b.span() < a.span() // prefer the tighter node + } + return b.start < a.start // deterministic tie-break +} + +// contains reports whether [oStart,oEnd] fully encloses [iStart,iEnd]. +func contains(oStart, oEnd, iStart, iEnd int) bool { + return oStart <= iStart && iEnd <= oEnd +} + +// overlapPages returns the count of shared inclusive pages between two +// ranges, or 0 when they don't intersect. 
+func overlapPages(aStart, aEnd, bStart, bEnd int) int { + if aStart <= 0 || aEnd <= 0 || bStart <= 0 || bEnd <= 0 { + return 0 + } + lo := max(aStart, bStart) + hi := min(aEnd, bEnd) + if hi < lo { + return 0 + } + return hi - lo + 1 +} + +// documentMaxPage is the highest page the document is known to reach, +// used to bound open-ended top-level TOC nodes. It takes the max across +// section PageEnds and TOC Start/EndPages so a TOC whose last node has no +// EndPage still resolves to something sane. +func documentMaxPage(root *Section, toc []TOCNode) int { + hi := 0 + if root != nil { + root.Walk(func(s *Section) bool { + if s != nil { + hi = max(hi, s.PageEnd) + } + return true + }) + } + var scan func(nodes []TOCNode) + scan = func(nodes []TOCNode) { + for _, n := range nodes { + hi = max(hi, n.EndPage, n.StartPage) + scan(n.Nodes) + } + } + scan(toc) + return hi +} + +// normaliseTitle trims a TOC title for use as a path segment. It only +// strips surrounding whitespace — the bench's anchor matcher already +// handles case/punctuation/ordinal normalisation, so we keep the +// heading verbatim here and let the consumer normalise for comparison. +func normaliseTitle(s string) string { + return strings.TrimSpace(s) +} diff --git a/pkg/tree/heading_path_test.go b/pkg/tree/heading_path_test.go new file mode 100644 index 0000000..711d845 --- /dev/null +++ b/pkg/tree/heading_path_test.go @@ -0,0 +1,170 @@ +package tree + +import ( + "reflect" + "testing" +) + +// sec is a tiny helper for building a parser section with a page range.
+func sec(id string, start, end int, children ...*Section) *Section { + return &Section{ + ID: SectionID(id), + Title: id, // parser title is deliberately non-semantic here + PageStart: start, + PageEnd: end, + Children: children, + } +} + +// financialTOC mirrors the shape an SEC-filing TOC builder produces: +// Part II → Item 8 → the individual statements, with the deepest nodes +// carrying the headings the gold anchors are written against. +func financialTOC() []TOCNode { + return []TOCNode{ + { + Structure: "1", Title: "Part I", StartPage: 1, EndPage: 9, + }, + { + Structure: "2", Title: "Part II", StartPage: 10, EndPage: 60, + Nodes: []TOCNode{ + { + Structure: "2.1", Title: "Item 7 — MD&A", StartPage: 10, EndPage: 39, + }, + { + Structure: "2.2", Title: "Item 8 — Financial Statements", StartPage: 40, EndPage: 60, + Nodes: []TOCNode{ + {Structure: "2.2.1", Title: "Balance Sheet", StartPage: 41, EndPage: 42}, + {Structure: "2.2.2", Title: "Statements of Operations", StartPage: 43, EndPage: 45}, + // Open-ended last child: EndPage 0 must resolve to the + // parent's end (60), not leak past it. + {Structure: "2.2.3", Title: "Notes to Financial Statements", StartPage: 46, EndPage: 0}, + }, + }, + }, + }, + } +} + +func TestBuildHeadingPaths_DeepestContainingWins(t *testing.T) { + // A content leaf sitting inside the Balance Sheet pages must map to + // the full logical path ending at the most specific heading. 
+ root := sec("root", 0, 0, + sec("sec_balance", 41, 41), + sec("sec_ops", 44, 44), + ) + got := BuildHeadingPaths(root, financialTOC()) + + want := map[SectionID][]string{ + "sec_balance": {"Part II", "Item 8 — Financial Statements", "Balance Sheet"}, + "sec_ops": {"Part II", "Item 8 — Financial Statements", "Statements of Operations"}, + } + if !reflect.DeepEqual(got, want) { + t.Fatalf("heading paths mismatch:\n got=%v\nwant=%v", got, want) + } +} + +func TestBuildHeadingPaths_OpenEndedLastChildBoundedByParent(t *testing.T) { + // Page 50 falls under the open-ended "Notes" node, which must resolve + // its end from the parent's end page (60). + root := sec("root", 0, 0, sec("sec_notes", 50, 50)) + got := BuildHeadingPaths(root, financialTOC()) + + want := []string{"Part II", "Item 8 — Financial Statements", "Notes to Financial Statements"} + if !reflect.DeepEqual(got["sec_notes"], want) { + t.Fatalf("open-ended child path:\n got=%v\nwant=%v", got["sec_notes"], want) + } +} + +func TestBuildHeadingPaths_SectionUnderTopLevelOnly(t *testing.T) { + // A section in Part I (which has no children) maps to just that node. + root := sec("root", 0, 0, sec("sec_intro", 3, 4)) + got := BuildHeadingPaths(root, financialTOC()) + if want := []string{"Part I"}; !reflect.DeepEqual(got["sec_intro"], want) { + t.Fatalf("top-level-only path: got=%v want=%v", got["sec_intro"], want) + } +} + +func TestBuildHeadingPaths_StraddlingSectionPicksBestOverlap(t *testing.T) { + // A coarse section spanning 41-44 isn't fully contained by any single + // statement node; it overlaps Balance Sheet (41-42, 2 pages) and + // Statements of Operations (43-45, 2 pages). Neither statement node + // contains it, and containment outranks depth, so the result is the + // deepest node that DOES contain it: Item 8.
+ root := sec("root", 0, 0, sec("sec_wide", 41, 44)) + got := BuildHeadingPaths(root, financialTOC()) + + want := []string{"Part II", "Item 8 — Financial Statements"} + if !reflect.DeepEqual(got["sec_wide"], want) { + t.Fatalf("straddling section path:\n got=%v\nwant=%v", got["sec_wide"], want) + } +} + +func TestBuildHeadingPaths_NoPageRangeSkipped(t *testing.T) { + // Non-paginated sections (PageStart/End 0) must not appear in the map. + root := sec("root", 0, 0, + &Section{ID: "sec_nopages", Title: "Intro"}, // no pages + sec("sec_p", 41, 41), + ) + got := BuildHeadingPaths(root, financialTOC()) + if _, ok := got["sec_nopages"]; ok { + t.Fatalf("section without a page range should be absent, got %v", got["sec_nopages"]) + } + if _, ok := got["sec_p"]; !ok { + t.Fatalf("paginated section should be present") + } +} + +func TestBuildHeadingPaths_EmptyTOCDegrades(t *testing.T) { + root := sec("root", 0, 0, sec("sec_a", 1, 2)) + if got := BuildHeadingPaths(root, nil); len(got) != 0 { + t.Fatalf("nil TOC must yield an empty map, got %v", got) + } + if got := BuildHeadingPaths(root, []TOCNode{}); got == nil || len(got) != 0 { + t.Fatalf("empty TOC must yield a non-nil empty map, got %v", got) + } +} + +func TestBuildHeadingPaths_NilRootSafe(t *testing.T) { + if got := BuildHeadingPaths(nil, financialTOC()); got == nil || len(got) != 0 { + t.Fatalf("nil root must yield a non-nil empty map, got %v", got) + } +} + +func TestBuildHeadingPaths_OutsideTOCRangeAbsent(t *testing.T) { + // A section on a page beyond every TOC node's reach maps to nothing + // rather than guessing. 
+ root := sec("root", 0, 0, sec("sec_far", 999, 1000)) + got := BuildHeadingPaths(root, financialTOC()) + if _, ok := got["sec_far"]; ok { + t.Fatalf("section outside all TOC ranges should be absent, got %v", got["sec_far"]) + } +} + +func TestBuildHeadingPaths_EmptyTitleNodeSkippedInPath(t *testing.T) { + // A structural wrapper node with no title must not inject a blank + // segment, but its children still inherit correct ancestry. + toc := []TOCNode{ + { + Structure: "", Title: "", StartPage: 1, EndPage: 20, + Nodes: []TOCNode{ + {Structure: "1", Title: "Overview", StartPage: 1, EndPage: 20}, + }, + }, + } + root := sec("root", 0, 0, sec("sec_x", 5, 6)) + got := BuildHeadingPaths(root, toc) + if want := []string{"Overview"}; !reflect.DeepEqual(got["sec_x"], want) { + t.Fatalf("empty-title wrapper should be skipped: got=%v want=%v", got["sec_x"], want) + } +} + +func TestBuildHeadingPaths_ResultIsDefensivelyCopied(t *testing.T) { + root := sec("root", 0, 0, sec("sec_balance", 41, 41)) + got := BuildHeadingPaths(root, financialTOC()) + got["sec_balance"][0] = "MUTATED" + // A second call must be unaffected by a caller mutating the first. + again := BuildHeadingPaths(root, financialTOC()) + if again["sec_balance"][0] != "Part II" { + t.Fatalf("returned slices must not alias internal state; got %v", again["sec_balance"]) + } +} From b0279563e0b7b3b22a7703a0bab4e29897df25d1 Mon Sep 17 00:00:00 2001 From: hallelx2 Date: Wed, 17 Jun 2026 23:55:35 +0100 Subject: [PATCH 2/2] tree: use math.MaxInt for the malformed-span sentinel (review) Replaces the 1<<30 magic sentinel in tocEntry.span with math.MaxInt so the intent is explicit and it doesn't depend on int width. Per Sourcery review on #39. 
--- pkg/tree/heading_path.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pkg/tree/heading_path.go b/pkg/tree/heading_path.go index 2cdb2d7..14be2bc 100644 --- a/pkg/tree/heading_path.go +++ b/pkg/tree/heading_path.go @@ -1,6 +1,9 @@ package tree -import "strings" +import ( + "math" + "strings" +) // BuildHeadingPaths reconciles the parser's section tree with the // LLM-built TOC tree into one canonical lookup: section ID → logical @@ -86,11 +89,12 @@ type tocEntry struct { path []string } -// span is the inclusive page count the entry covers. A zero/negative -// span (malformed node that survived resolution) sorts last. +// span is the inclusive page count the entry covers. A malformed node +// (end < start) that survived resolution reports the maximum span so it +// sorts last in specificity comparisons, regardless of int width. func (e tocEntry) span() int { if e.end < e.start { - return 1 << 30 + return math.MaxInt } return e.end - e.start + 1 }