From 81dd4765ae38bd9c4a924e240c3ee10b68201fef Mon Sep 17 00:00:00 2001 From: hallelx2 Date: Wed, 17 Jun 2026 23:46:00 +0100 Subject: [PATCH 1/2] api: emit canonical heading path (title_path) on treewalk citations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Citations carried only {start_page, end_page, section_ids, quote} — no structural heading path. The bench's path_correct@1 reads a title_path it could only reconstruct from the parser's chunk titles, which are the wrong vocabulary, so the metric was structurally 0%. buildTreeWalkCitations now resolves the document's LLM TOC tree (via the strategy's TOC provider) into a section-ID -> heading-path map (tree.BuildHeadingPaths, HAL-109) and attaches the primary section's heading path as title_path on each citation. Degrades cleanly: no TOC persisted -> field omitted, prior behaviour unchanged. Part of HAL-70. --- internal/api/treewalk.go | 46 ++++++++++ internal/api/treewalk_citations_test.go | 107 ++++++++++++++++++++++++ 2 files changed, 153 insertions(+) create mode 100644 internal/api/treewalk_citations_test.go diff --git a/internal/api/treewalk.go b/internal/api/treewalk.go index f6f8a54..d88e926 100644 --- a/internal/api/treewalk.go +++ b/internal/api/treewalk.go @@ -310,6 +310,15 @@ func (d Deps) buildTreeWalkCitations(ctx context.Context, t *tree.Tree, res *ret sources := retrieval.CitationSources(res) citations := make([]map[string]any, 0, len(sources)) + // Canonical heading paths (section ID → logical heading path), resolved + // from the document's LLM TOC tree by page-range containment (HAL-109). + // This is what makes a citation carry a real STRUCTURAL path + // ("Item 8" → "Balance Sheet") rather than leaving the consumer to + // reverse-engineer one from the parser's chunk titles. Nil/empty when no + // TOC was persisted — the citation then simply omits title_path and the + // consumer falls back to its prior behaviour. + headingPaths := d.headingPathsForDoc(ctx, t) + for _, src := range sources { sectionIDs := src.SectionIDs if sectionIDs == nil { @@ -321,6 +330,16 @@ func (d Deps) buildTreeWalkCitations(ctx context.Context, t *tree.Tree, res *ret "section_ids": sectionIDs, } + // Attach the heading path of the citation's primary (first, i.e. + // earliest-page) section. This mirrors how a consumer anchors a + // page-range citation to one structural location, and is exactly + // the field the bench's path-correctness metric reads. + if len(sectionIDs) > 0 { + if hp := headingPaths[sectionIDs[0]]; len(hp) > 0 { + c["title_path"] = hp + } + } + // Quote extraction is best-effort: an LLM blip or empty // content returns no quote, which is a normal degradation // path. We materialise the cited content from storage and @@ -354,6 +373,33 @@ func (d Deps) buildTreeWalkCitations(ctx context.Context, t *tree.Tree, res *ret return citations } +// headingPathsForDoc resolves the canonical section-ID → heading-path +// map for the document, reading the persisted LLM TOC tree through the +// strategy's TOC provider and reconciling it with the section tree via +// tree.BuildHeadingPaths (HAL-109). +// +// Every failure mode degrades to nil (no heading paths): no strategy / +// provider wired, no TOC persisted (retrieval.ErrNoTOC), a fetch error, +// or unparseable TOC JSON. A nil map is safe to index — citations then +// omit title_path and the consumer falls back to its prior behaviour, so +// this never makes a response worse than before the TOC was available. +func (d Deps) headingPathsForDoc(ctx context.Context, t *tree.Tree) map[tree.SectionID][]string { + if t == nil || t.Root == nil || d.TreeWalkStrategy == nil || d.TreeWalkStrategy.TOC == nil { + return nil + } + raw, err := d.TreeWalkStrategy.TOC.GetTOC(ctx, t.DocumentID) + if err != nil || len(raw) == 0 { + return nil + } + var nodes []tree.TOCNode + if err := json.Unmarshal(raw, &nodes); err != nil { + d.Logger.Warn("answer/treewalk: TOC parse failed; citations omit heading paths", + "err", err, "document_id", t.DocumentID) + return nil + } + return tree.BuildHeadingPaths(t.Root, nodes) +} + // materialiseCitedContent loads + concatenates every cited // section's content. Used for answer-span extraction over the // pages the model relied on, so the quote can have real byte diff --git a/internal/api/treewalk_citations_test.go b/internal/api/treewalk_citations_test.go new file mode 100644 index 0000000..c4b2eda --- /dev/null +++ b/internal/api/treewalk_citations_test.go @@ -0,0 +1,107 @@ +package api + +import ( + "context" + "encoding/json" + "log/slog" + "reflect" + "testing" + + "github.com/hallelx2/vectorless-engine/pkg/retrieval" + "github.com/hallelx2/vectorless-engine/pkg/tree" +) + +// stubTOCProvider returns canned TOC bytes (or an error) for the +// citation builder's heading-path lookup. +type stubTOCProvider struct { + raw []byte + err error +} + +func (s stubTOCProvider) GetTOC(context.Context, tree.DocumentID) ([]byte, error) { + return s.raw, s.err +} + +// citationTestTOC mirrors buildTreeWalkTestTree's page layout as a +// logical outline: Setup{Install 1-2, Configuration 3-4}, +// Usage{Querying 5-7, Debt 8-9}. +func citationTestTOC(t *testing.T) []byte { + t.Helper() + toc := []tree.TOCNode{ + {Title: "Setup", StartPage: 1, EndPage: 4, Nodes: []tree.TOCNode{ + {Title: "Install", StartPage: 1, EndPage: 2}, + {Title: "Configuration", StartPage: 3, EndPage: 4}, + }}, + {Title: "Usage", StartPage: 5, EndPage: 9, Nodes: []tree.TOCNode{ + {Title: "Querying", StartPage: 5, EndPage: 7}, + {Title: "Debt", StartPage: 8, EndPage: 9}, + }}, + } + raw, err := json.Marshal(toc) + if err != nil { + t.Fatalf("marshal toc: %v", err) + } + return raw +} + +// depsWithTOC builds a minimal Deps for buildTreeWalkCitations: no LLM +// (so quote extraction is skipped) and a stub TOC provider on the +// strategy. +func depsWithTOC(toc []byte, tocErr error) Deps { + return Deps{ + Logger: slog.Default(), + TreeWalkStrategy: &retrieval.TreeWalkStrategy{ + TOC: stubTOCProvider{raw: toc, err: tocErr}, + }, + } +} + +// TestBuildTreeWalkCitations_EmitsHeadingPath is the HAL-70 regression: +// a citation must carry the canonical heading path of its primary +// section, resolved from the TOC — the field the bench's +// path_correct@1 metric reads. +func TestBuildTreeWalkCitations_EmitsHeadingPath(t *testing.T) { + d := depsWithTOC(citationTestTOC(t), nil) + tr := buildTreeWalkTestTree() + res := &retrieval.Result{CitedPages: [][2]int{{1, 2}}} + + cites := d.buildTreeWalkCitations(context.Background(), tr, res, "how do I install?", "") + if len(cites) != 1 { + t.Fatalf("want 1 citation, got %d: %v", len(cites), cites) + } + c := cites[0] + + // Primary section for pages 1-2 is the leaf sec_a1 (Install). + ids, _ := c["section_ids"].([]tree.SectionID) + if len(ids) == 0 || ids[0] != "sec_a1" { + t.Fatalf("expected primary section sec_a1, got %v", c["section_ids"]) + } + + got, ok := c["title_path"].([]string) + if !ok { + t.Fatalf("citation is missing a title_path: %#v", c) + } + if want := []string{"Setup", "Install"}; !reflect.DeepEqual(got, want) { + t.Fatalf("title_path mismatch: got=%v want=%v", got, want) + } +} + +// TestBuildTreeWalkCitations_NoTOCOmitsHeadingPath: when no TOC is +// persisted (ErrNoTOC), the citation degrades gracefully — section_ids +// and pages are still present, title_path is simply absent. +func TestBuildTreeWalkCitations_NoTOCOmitsHeadingPath(t *testing.T) { + d := depsWithTOC(nil, retrieval.ErrNoTOC) + tr := buildTreeWalkTestTree() + res := &retrieval.Result{CitedPages: [][2]int{{5, 7}}} + + cites := d.buildTreeWalkCitations(context.Background(), tr, res, "how do I query?", "") + if len(cites) != 1 { + t.Fatalf("want 1 citation, got %d", len(cites)) + } + if _, present := cites[0]["title_path"]; present { + t.Fatalf("title_path must be absent without a TOC, got %v", cites[0]["title_path"]) + } + if _, present := cites[0]["section_ids"]; !present { + t.Fatalf("section_ids must still be present") + } +} From 44c8753352ba15972489963f5f4425eab6602b02 Mon Sep 17 00:00:00 2001 From: hallelx2 Date: Wed, 17 Jun 2026 23:57:20 +0100 Subject: [PATCH 2/2] api: log non-ErrNoTOC failures in headingPathsForDoc (review) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GetTOC errors were all silently treated as 'no heading paths'. Now retrieval.ErrNoTOC (the expected missing-TOC case) stays silent while any other error is logged at debug for observability — still degrading gracefully. Per Sourcery review on #40. --- internal/api/treewalk.go | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/internal/api/treewalk.go b/internal/api/treewalk.go index d88e926..6bd895a 100644 --- a/internal/api/treewalk.go +++ b/internal/api/treewalk.go @@ -388,7 +388,18 @@ func (d Deps) headingPathsForDoc(ctx context.Context, t *tree.Tree) map[tree.Sec return nil } raw, err := d.TreeWalkStrategy.TOC.GetTOC(ctx, t.DocumentID) - if err != nil || len(raw) == 0 { + if err != nil { + // retrieval.ErrNoTOC is the expected "no TOC persisted yet" signal — + // silent. Any other error (DB/transport) is an operational issue worth + // surfacing for diagnosis, even though we still degrade to no heading + // paths rather than failing the request. + if !errors.Is(err, retrieval.ErrNoTOC) { + d.Logger.Debug("answer/treewalk: TOC fetch failed; citations omit heading paths", + "err", err, "document_id", t.DocumentID) + } + return nil + } + if len(raw) == 0 { return nil } var nodes []tree.TOCNode