From ad8db2e87c77978928d046c95565e9e60c1b1f4e Mon Sep 17 00:00:00 2001 From: suguanyang Date: Fri, 15 May 2026 05:15:19 -0700 Subject: [PATCH] feat: sync SDK with current worker ZIP contract and agentic retrieval API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add typed ChunkMetadata model for known worker metadata fields while preserving forward compatibility via extra="allow". Issue #21 — ParseResult ZIP models Issue #22 — Agentic retrieval API --- docs/usage.md | 77 ++++--- src/knowhere/lib/result_parser.py | 77 ++----- src/knowhere/resources/retrieval.py | 6 + src/knowhere/types/result.py | 145 +++++++++---- src/knowhere/types/retrieval.py | 13 +- tests/test_models.py | 57 +----- tests/test_result_parser.py | 305 ++++++++++++++++++---------- tests/test_retrieval.py | 93 +++++++++ 8 files changed, 490 insertions(+), 283 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index e1c0714..d982cb0 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,5 +1,9 @@ # Knowhere Python SDK — Usage Guide +> **Recent changes:** Chunk metadata fields (`tokens`, `keywords`, `summary`, +> `length`, etc.) are no longer flattened to the chunk surface. Access them +> through `chunk.metadata` instead. See [Chunk Types](#chunk-types). + Comprehensive reference for every feature, parameter, and pattern in the SDK. 
## Table of Contents @@ -219,8 +223,13 @@ result.table_chunks # List[TableChunk] # Lookup by ID chunk = result.getChunk("chunk_42") -# Hierarchy data (document structure tree, if available) -result.hierarchy +# Document navigation tree (from doc_nav.json, current worker output) +result.doc_nav # DocNav | None +result.doc_nav.sections # List[DocNavSection] — tree of titles/paths/levels +result.doc_nav.resources # DocNavResources — image/table resource summaries + +# Legacy hierarchy (from hierarchy.json, older worker output) +result.hierarchy # Any | None # Raw ZIP bytes (for archival) result.raw_zip @@ -239,49 +248,48 @@ result.save("./output/report/") ## Chunk Types -Every chunk shares a base set of fields (`chunk_id`, `type`, `content`, `path`). Each type adds its own fields. +Every chunk shares a base set of fields (`chunk_id`, `type`, `content`, `path`, +`metadata`). Worker metadata is kept in the typed `metadata` model — it is **not** +flattened to top-level chunk properties. -### TextChunk +### Base fields (all chunk types) | Field | Type | Description | |-------|------|-------------| -| `chunk_id` | `str` | Unique identifier | -| `type` | `str` | Always `"text"` | -| `content` | `str` | The text content | -| `path` | `str \| None` | Document structure path (e.g. `"Section 1 > Subsection 2"`) | -| `length` | `int` | Character count | -| `tokens` | `List[str] \| None` | Tokenized words returned by the parser pipeline | -| `keywords` | `List[str] \| None` | Extracted keywords (requires `summary_txt: True`) | -| `summary` | `str \| None` | AI-generated summary (requires `summary_txt: True`) | -| `relationships` | `List \| None` | Relationships to other chunks | +| `type` | `str` | `"text"`, `"image"`, or `"table"` | +| `content` | `str` | Text content or placeholder | +| `path` | `str \| None` | Document structure path | +| `metadata` | `ChunkMetadata` | Worker metadata, read as attributes (tokens, keywords, summary, length, page_nums, etc.); unknown fields are preserved 
| + +### TextChunk ```python for chunk in result.text_chunks: print(f"[{chunk.chunk_id}] {chunk.content[:60]}...") - if chunk.keywords: - print(f" Keywords: {', '.join(chunk.keywords)}") - if chunk.summary: - print(f" Summary: {chunk.summary}") + # Metadata is in chunk.metadata (attribute access), not flattened: + keywords = chunk.metadata.keywords or [] + summary = chunk.metadata.summary + if keywords: + print(f" Keywords: {', '.join(keywords)}") + if summary: + print(f" Summary: {summary}") ``` ### ImageChunk | Field | Type | Description | |-------|------|-------------| -| `chunk_id` | `str` | Unique identifier | -| `type` | `str` | Always `"image"` | -| `content` | `str` | Text content associated with the image | | `file_path` | `str \| None` | Path within the ZIP | -| `original_name` | `str \| None` | Original filename | -| `summary` | `str \| None` | AI-generated image description (requires `summary_image: True`) | | `data` | `bytes` | Raw image bytes (loaded from ZIP) | | `format` | `str \| None` | Image format inferred from extension (property) | ```python for img in result.image_chunks: print(f"{img.file_path} ({len(img.data)} bytes, {img.format})") - if img.summary: - print(f" Description: {img.summary}") + summary = img.metadata.summary + if summary: + print(f" Description: {summary}") img.save("./output/images/") # writes to disk ``` @@ -289,13 +297,7 @@ for img in result.image_chunks: | Field | Type | Description | |-------|------|-------------| -| `chunk_id` | `str` | Unique identifier | -| `type` | `str` | Always `"table"` | -| `content` | `str` | Text representation of the table | | `file_path` | `str \| None` | Path within the ZIP | -| `original_name` | `str \| None` | Original filename | -| `table_type` | `str \| None` | Table classification | -| `summary` | `str \| None` | AI-generated table summary (requires `summary_table: True`) | | `html` | `str` | Full HTML of the table (loaded from ZIP) | ```python @@ -471,6 +473,19 @@ response = 
client.retrieval.query( top_k=5, ) +# Agentic mode (LLM navigation + answer synthesis) +response = client.retrieval.query( + namespace="support-center", + query="How do I pair a Bluetooth headset?", + use_agentic=True, + top_k=5, +) +print(response.answer_text) # LLM-generated natural-language answer +print(response.router_used) # "workflow_single_step", "small_kb_all", etc. +for ref in response.referenced_chunks: + print(ref.get("chunk_id"), ref.get("asset_url")) + +# Legacy results are always available for result in response.results: print(result.content) print(result.score) @@ -479,6 +494,10 @@ for result in response.results: print(result.source.section_path) ``` +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `use_agentic` | `bool \| None` | `None` | Force agentic (`True`) or legacy (`False`) retrieval. `None` uses server default. | + Retrieval results expose `content`, not the older parse-result `text` field. Media results may include `asset_url` when the server can sign the referenced artifact. 
diff --git a/src/knowhere/lib/result_parser.py b/src/knowhere/lib/result_parser.py index 95f6241..eac4579 100644 --- a/src/knowhere/lib/result_parser.py +++ b/src/knowhere/lib/result_parser.py @@ -13,13 +13,13 @@ from knowhere._logging import getLogger from knowhere.types.result import ( Chunk, + DocNav, ImageChunk, Manifest, ParseResult, SlimChunk, TableChunk, TextChunk, - TextChunkTokens, ) _logger = getLogger() @@ -81,38 +81,6 @@ def _extractFilePath(raw: Dict[str, Any]) -> Optional[str]: return fallback -def _normalizeTokenList(raw_tokens: List[Any]) -> List[str]: - """Return a string-only token list with empty values removed.""" - normalized_tokens: List[str] = [] - for raw_token in raw_tokens: - token_text: str = str(raw_token).strip() - if token_text: - normalized_tokens.append(token_text) - return normalized_tokens - - -def _parseTextChunkTokens( - raw_tokens: Any, - *, - chunk_id: str, -) -> Optional[TextChunkTokens]: - """Normalize text chunk tokens from the current backend payload.""" - if raw_tokens is None: - return None - if isinstance(raw_tokens, bool): - raise KnowhereError( - f"Invalid tokens payload for text chunk '{chunk_id}': expected list[str], got bool." - ) - if isinstance(raw_tokens, list): - return _normalizeTokenList(raw_tokens) - - raise KnowhereError( - "Invalid tokens payload for text chunk " - f"'{chunk_id}': expected list[str], " - f"got {type(raw_tokens).__name__}." 
- ) - - def _buildChunks( raw_chunks: List[Dict[str, Any]], zf: zipfile.ZipFile, @@ -125,58 +93,39 @@ def _buildChunks( if chunk_type == "image": image_data: bytes = b"" - # file_path may be at top level, inside metadata, or use path as fallback file_path: Optional[str] = _extractFilePath(raw) if file_path: image_data = _readZipBytes(zf, file_path) or b"" - metadata: Dict[str, Any] = raw.get("metadata", {}) chunk: Chunk = ImageChunk( chunk_id=raw.get("chunk_id", ""), type="image", content=raw.get("content", ""), path=raw.get("path"), - page_nums=metadata.get("page_nums", raw.get("page_nums")), - length=metadata.get("length", raw.get("length", 0)), file_path=file_path, - original_name=metadata.get("original_name", raw.get("original_name")), - summary=metadata.get("summary", raw.get("summary")), data=image_data, + metadata=raw.get("metadata", {}), ) elif chunk_type == "table": table_html: str = "" file_path = _extractFilePath(raw) if file_path: table_html = _readZipText(zf, file_path) or "" - metadata = raw.get("metadata", {}) chunk = TableChunk( chunk_id=raw.get("chunk_id", ""), type="table", content=raw.get("content", ""), path=raw.get("path"), - page_nums=metadata.get("page_nums", raw.get("page_nums")), - length=metadata.get("length", raw.get("length", 0)), file_path=file_path, - original_name=metadata.get("original_name", raw.get("original_name")), - table_type=metadata.get("table_type", raw.get("table_type")), - summary=metadata.get("summary", raw.get("summary")), html=table_html, + metadata=raw.get("metadata", {}), ) else: - metadata = raw.get("metadata", {}) - chunk_id: str = raw.get("chunk_id", "") - raw_tokens: Any = metadata.get("tokens", raw.get("tokens")) chunk = TextChunk( - chunk_id=chunk_id, + chunk_id=raw.get("chunk_id", ""), type="text", content=raw.get("content", ""), path=raw.get("path"), - page_nums=metadata.get("page_nums", raw.get("page_nums")), - length=metadata.get("length", raw.get("length", 0)), - tokens=_parseTextChunkTokens(raw_tokens, 
chunk_id=chunk_id), - keywords=metadata.get("keywords", raw.get("keywords")), - summary=metadata.get("summary", raw.get("summary")), - connect_to=metadata.get("connect_to", raw.get("connect_to")), - relationships=metadata.get("relationships", raw.get("relationships")), + metadata=raw.get("metadata", {}), ) chunks.append(chunk) @@ -229,7 +178,15 @@ def parseResultZip( # -- Full markdown -- full_markdown: str = _readZipText(zf, "full.md") or "" - # -- Hierarchy -- + # -- DocNav (current worker output) -- + doc_nav_text: Optional[str] = _readZipText(zf, "doc_nav.json") + doc_nav: Optional[DocNav] = ( + DocNav.model_validate(json.loads(doc_nav_text)) + if doc_nav_text + else None + ) + + # -- Hierarchy (legacy — current worker no longer emits this) -- hierarchy_text: Optional[str] = _readZipText(zf, "hierarchy.json") hierarchy: Optional[Any] = ( json.loads(hierarchy_text) if hierarchy_text else None @@ -263,11 +220,13 @@ def parseResultZip( return ParseResult( manifest=manifest, chunks=chunks, - chunks_slim=chunks_slim, full_markdown=full_markdown, + raw_zip=zip_bytes, + doc_nav=doc_nav, + # Legacy — the current worker no longer emits these files + chunks_slim=chunks_slim, hierarchy=hierarchy, toc_hierarchies=toc_hierarchies, kb_csv=kb_csv, hierarchy_view_html=hierarchy_view_html, - raw_zip=zip_bytes, ) diff --git a/src/knowhere/resources/retrieval.py b/src/knowhere/resources/retrieval.py index 3b6b36c..9100be8 100644 --- a/src/knowhere/resources/retrieval.py +++ b/src/knowhere/resources/retrieval.py @@ -22,6 +22,7 @@ def query( query: str, namespace: Optional[str] = None, top_k: Optional[int] = None, + use_agentic: Optional[bool] = None, data_type: Optional[int] = None, signal_paths: Optional[list[str]] = None, filter_mode: Optional[RetrievalFilterMode] = None, @@ -39,6 +40,8 @@ def query( body["namespace"] = namespace if top_k is not None: body["top_k"] = top_k + if use_agentic is not None: + body["use_agentic"] = use_agentic if data_type is not None: 
body["data_type"] = data_type if signal_paths is not None: @@ -77,6 +80,7 @@ async def query( query: str, namespace: Optional[str] = None, top_k: Optional[int] = None, + use_agentic: Optional[bool] = None, data_type: Optional[int] = None, signal_paths: Optional[list[str]] = None, filter_mode: Optional[RetrievalFilterMode] = None, @@ -94,6 +98,8 @@ async def query( body["namespace"] = namespace if top_k is not None: body["top_k"] = top_k + if use_agentic is not None: + body["use_agentic"] = use_agentic if data_type is not None: body["data_type"] = data_type if signal_paths is not None: diff --git a/src/knowhere/types/result.py b/src/knowhere/types/result.py index df83c19..0cce2da 100644 --- a/src/knowhere/types/result.py +++ b/src/knowhere/types/result.py @@ -9,7 +9,6 @@ from typing import Any, Dict, List, Optional, Union from pydantic import BaseModel, Field -from typing_extensions import TypeAlias from knowhere._exceptions import ValidationError @@ -138,6 +137,44 @@ class Manifest(BaseModel): checksum: Optional[Checksum] = None statistics: Optional[Statistics] = None files: Optional[FileIndex] = None + hierarchy: Optional[Any] = Field(default=None, alias="HIERARCHY") + + +# --------------------------------------------------------------------------- +# DocNav models +# --------------------------------------------------------------------------- + + +class DocNavResourceItem(BaseModel): + """A single image or table resource entry in ``doc_nav.json``.""" + + path: str + summary: Optional[str] = None + + +class DocNavResources(BaseModel): + """Image and table resource summaries from ``doc_nav.json``.""" + + images: List[DocNavResourceItem] = Field(default_factory=list) + tables: List[DocNavResourceItem] = Field(default_factory=list) + + +class DocNavSection(BaseModel): + """A document section entry in the ``doc_nav.json`` navigation tree.""" + + title: str + path: str + level: int + summary: Optional[str] = None + chunk_count: int = 0 + children: List["DocNavSection"] 
= Field(default_factory=list) + + +class DocNav(BaseModel): + """Top-level document navigation structure from ``doc_nav.json``.""" + + sections: List[DocNavSection] = Field(default_factory=list) + resources: Optional[DocNavResources] = None # --------------------------------------------------------------------------- @@ -145,6 +182,27 @@ class Manifest(BaseModel): # --------------------------------------------------------------------------- +class ChunkMetadata(BaseModel): + """Known worker metadata fields for a chunk. + + All fields are optional. Unknown fields added by future worker + versions are preserved thanks to ``model_config``. + """ + + model_config = {"extra": "allow"} + + length: Optional[int] = None + page_nums: Optional[List[int]] = None + tokens: Optional[List[str]] = None + keywords: Optional[List[str]] = None + summary: Optional[str] = None + connect_to: Optional[List[Dict[str, Any]]] = None + file_path: Optional[str] = None + original_name: Optional[str] = None + table_type: Optional[str] = None + document_top_summary: Optional[str] = None + + class BaseChunk(BaseModel): """Fields shared by every chunk type.""" @@ -152,32 +210,20 @@ class BaseChunk(BaseModel): type: str content: str = "" path: Optional[str] = None - page_nums: Optional[List[int]] = None - - -TextChunkTokens: TypeAlias = List[str] + metadata: ChunkMetadata = Field(default_factory=ChunkMetadata) class TextChunk(BaseChunk): """A text chunk extracted from the document.""" type: str = "text" - length: int = 0 - tokens: Optional[TextChunkTokens] = None - keywords: Optional[List[str]] = None - summary: Optional[str] = None - connect_to: Optional[List[Dict[str, Any]]] = None - relationships: Optional[List[Union[Dict[str, Any], str]]] = None class ImageChunk(BaseChunk): """An image chunk — carries raw bytes loaded from the ZIP.""" type: str = "image" - length: int = 0 file_path: Optional[str] = None - original_name: Optional[str] = None - summary: Optional[str] = None data: bytes = 
Field(default=b"", exclude=True) model_config = {"arbitrary_types_allowed": True} @@ -193,13 +239,13 @@ def format(self) -> Optional[str]: def save(self, directory: Union[str, Path]) -> Path: """Write the image bytes to *directory*, returning the output path. - The filename is derived from ``original_name`` or ``file_path``, - sanitised for cross-platform safety. + The filename is derived from ``file_path``, sanitised for + cross-platform safety. """ dir_path: Path = Path(directory) dir_path.mkdir(parents=True, exist_ok=True) - raw_name: str = self.original_name or os.path.basename( + raw_name: str = os.path.basename( self.file_path or f"{self.chunk_id}.bin" ) safe_name: str = _sanitizeFilename(raw_name) @@ -214,11 +260,7 @@ class TableChunk(BaseChunk): """A table chunk — carries HTML loaded from the ZIP.""" type: str = "table" - length: int = 0 file_path: Optional[str] = None - original_name: Optional[str] = None - table_type: Optional[str] = None - summary: Optional[str] = None html: str = Field(default="", exclude=True) def save(self, directory: Union[str, Path]) -> Path: @@ -226,7 +268,7 @@ def save(self, directory: Union[str, Path]) -> Path: dir_path: Path = Path(directory) dir_path.mkdir(parents=True, exist_ok=True) - raw_name: str = self.original_name or os.path.basename( + raw_name: str = os.path.basename( self.file_path or f"{self.chunk_id}.html" ) safe_name: str = _sanitizeFilename(raw_name) @@ -242,12 +284,11 @@ def save(self, directory: Union[str, Path]) -> Path: class SlimChunk(BaseModel): - """Minimal chunk entry emitted in chunks_slim.json.""" + """Minimal chunk entry emitted in chunks_slim.json (legacy).""" type: str path: Optional[str] = None content: str = "" - summary: Optional[str] = None # --------------------------------------------------------------------------- @@ -259,48 +300,59 @@ class ParseResult: """Eagerly-loaded result of a document parsing job. 
Contains the manifest, all chunks (with image bytes and table HTML - already loaded), the full markdown, hierarchy data, and the raw ZIP - bytes for archival purposes. + already loaded), the full markdown, the document navigation tree, + and the raw ZIP bytes for archival purposes. + + Legacy fields (``chunks_slim``, ``hierarchy``, ``toc_hierarchies``, + ``kb_csv``, ``hierarchy_view_html``) are kept for backward + compatibility with older result ZIPs. The current worker does not + emit ``chunks_slim.json`` or ``hierarchy.json``. """ manifest: Manifest chunks: List[Chunk] - chunks_slim: Optional[List[SlimChunk]] full_markdown: str + raw_zip: bytes + namespace: Optional[str] + document_id: Optional[str] + # Current worker output + doc_nav: Optional[DocNav] + # Legacy — the current worker no longer emits these files + chunks_slim: Optional[List[SlimChunk]] hierarchy: Optional[Any] toc_hierarchies: Optional[Any] kb_csv: Optional[str] hierarchy_view_html: Optional[str] - raw_zip: bytes - namespace: Optional[str] - document_id: Optional[str] def __init__( self, *, manifest: Manifest, chunks: List[Chunk], - chunks_slim: Optional[List[SlimChunk]], full_markdown: str, - hierarchy: Optional[Any], - toc_hierarchies: Optional[Any], - kb_csv: Optional[str], - hierarchy_view_html: Optional[str], raw_zip: bytes, + doc_nav: Optional[DocNav] = None, namespace: Optional[str] = None, document_id: Optional[str] = None, + # Legacy — the current worker no longer emits these files + chunks_slim: Optional[List[SlimChunk]] = None, + hierarchy: Optional[Any] = None, + toc_hierarchies: Optional[Any] = None, + kb_csv: Optional[str] = None, + hierarchy_view_html: Optional[str] = None, ) -> None: self.manifest = manifest self.chunks = chunks - self.chunks_slim = chunks_slim self.full_markdown = full_markdown + self.raw_zip = raw_zip + self.doc_nav = doc_nav + self.namespace = namespace + self.document_id = document_id + self.chunks_slim = chunks_slim self.hierarchy = hierarchy 
self.toc_hierarchies = toc_hierarchies self.kb_csv = kb_csv self.hierarchy_view_html = hierarchy_view_html - self.raw_zip = raw_zip - self.namespace = namespace - self.document_id = document_id # -- convenience properties -- @@ -344,11 +396,17 @@ def save(self, directory: Union[str, Path]) -> Path: """Save the full result to *directory*. Creates the directory if needed and writes: + * ``manifest.json`` — result manifest + * ``chunks.json`` — all chunks + * ``doc_nav.json`` — document navigation tree (if present) * ``full.md`` — the full markdown * ``images/`` — all image chunks * ``tables/`` — all table chunks * ``result.zip`` — the raw ZIP archive + Legacy files (``chunks_slim.json``, ``hierarchy.json``, etc.) are + also written when present for backward compatibility. + Returns the resolved directory path. """ dir_path: Path = Path(directory) @@ -357,7 +415,7 @@ def save(self, directory: Union[str, Path]) -> Path: # Manifest / chunks manifest_path: Path = dir_path / "manifest.json" manifest_path.write_text( - self.manifest.model_dump_json(indent=2), + self.manifest.model_dump_json(indent=2, by_alias=True), encoding="utf-8", ) @@ -367,6 +425,13 @@ def save(self, directory: Union[str, Path]) -> Path: encoding="utf-8", ) + if self.doc_nav is not None: + doc_nav_path: Path = dir_path / "doc_nav.json" + doc_nav_path.write_text( + self.doc_nav.model_dump_json(indent=2), + encoding="utf-8", + ) + if self.chunks_slim is not None: chunks_slim_path: Path = dir_path / "chunks_slim.json" chunks_slim_path.write_text( diff --git a/src/knowhere/types/retrieval.py b/src/knowhere/types/retrieval.py index 47b07a8..ebb13cf 100644 --- a/src/knowhere/types/retrieval.py +++ b/src/knowhere/types/retrieval.py @@ -2,9 +2,9 @@ from __future__ import annotations -from typing import Literal, Optional, TypedDict +from typing import Any, Dict, List, Literal, Optional, TypedDict -from pydantic import BaseModel +from pydantic import BaseModel, Field RetrievalChannel = Literal["path", "content", 
"term"] @@ -37,9 +37,16 @@ class RetrievalResult(BaseModel): class RetrievalQueryResponse(BaseModel): - """Response from ``POST /v1/retrieval/query``.""" + """Response from ``POST /v1/retrieval/query``. + + Agentic fields (``answer_text``, ``referenced_chunks``) are only + populated when ``use_agentic=True``. In legacy retrieval mode they + default to ``None`` and ``[]`` respectively. + """ namespace: str query: str router_used: Optional[str] = None + answer_text: Optional[str] = None + referenced_chunks: List[Dict[str, Any]] = Field(default_factory=list) results: list[RetrievalResult] diff --git a/tests/test_models.py b/tests/test_models.py index 92b9732..4314cfa 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -414,13 +414,17 @@ def test_defaults(self) -> None: chunk: BaseChunk = BaseChunk(chunk_id="chunk_2", type="text") assert chunk.content == "" assert chunk.path is None - assert chunk.page_nums is None + assert chunk.metadata.length is None + assert chunk.metadata.tokens is None - def test_page_nums_supported(self) -> None: + def test_metadata_accessible(self) -> None: chunk: BaseChunk = BaseChunk( - chunk_id="chunk_3", type="text", page_nums=[1, 2] + chunk_id="chunk_3", + type="text", + metadata={"tokens": ["a", "b"], "length": 10}, ) - assert chunk.page_nums == [1, 2] + assert chunk.metadata.tokens == ["a", "b"] + assert chunk.metadata.length == 10 # --------------------------------------------------------------------------- @@ -436,48 +440,21 @@ def test_from_dict(self) -> None: chunk_id="text_1", content="Some text content", path="doc/section1", - length=17, - page_nums=[1, 2], - tokens=["Some", "text", "content"], - keywords=["text", "content"], - summary="A text chunk", - connect_to=[{"target": "img_1", "relation": "embeds"}], - relationships=[{"target": "text_2", "type": "follows"}], ) assert chunk.chunk_id == "text_1" assert chunk.type == "text" assert chunk.content == "Some text content" - assert chunk.length == 17 - assert 
chunk.page_nums == [1, 2] - assert chunk.tokens == ["Some", "text", "content"] - assert chunk.keywords == ["text", "content"] - assert chunk.summary == "A text chunk" - assert chunk.connect_to is not None - assert len(chunk.connect_to) == 1 - assert chunk.relationships is not None - assert len(chunk.relationships) == 1 def test_defaults(self) -> None: chunk: TextChunk = TextChunk(chunk_id="text_2") assert chunk.type == "text" - assert chunk.length == 0 - assert chunk.tokens is None - assert chunk.keywords is None - assert chunk.summary is None - assert chunk.connect_to is None - assert chunk.relationships is None + assert chunk.content == "" + assert chunk.path is None def test_is_instance_of_base_chunk(self) -> None: chunk: TextChunk = TextChunk(chunk_id="text_3") assert isinstance(chunk, BaseChunk) - def test_accepts_tokens_list(self) -> None: - chunk: TextChunk = TextChunk( - chunk_id="text_4", - tokens=["attention", "transformer"], - ) - assert chunk.tokens == ["attention", "transformer"] - # --------------------------------------------------------------------------- # ImageChunk model @@ -492,24 +469,18 @@ def test_from_dict(self) -> None: chunk_id="IMG_1", content="A photo of a cat", file_path="images/IMG_1.jpg", - original_name="cat.jpg", - summary="Cat photo", data=b"\xff\xd8\xff\xe0", ) assert chunk.chunk_id == "IMG_1" assert chunk.type == "image" assert chunk.content == "A photo of a cat" assert chunk.file_path == "images/IMG_1.jpg" - assert chunk.original_name == "cat.jpg" assert chunk.data == b"\xff\xd8\xff\xe0" def test_defaults(self) -> None: chunk: ImageChunk = ImageChunk(chunk_id="IMG_2") assert chunk.type == "image" - assert chunk.length == 0 assert chunk.file_path is None - assert chunk.original_name is None - assert chunk.summary is None assert chunk.data == b"" def test_format_property_from_file_path(self) -> None: @@ -547,22 +518,16 @@ def test_from_dict(self) -> None: chunk_id="TBL_1", content="Revenue table", file_path="tables/TBL_1.html", - 
original_name="revenue.html", - table_type="financial", - summary="Revenue data", html="
100
", ) assert chunk.chunk_id == "TBL_1" assert chunk.type == "table" - assert chunk.table_type == "financial" assert chunk.html == "
100
" def test_defaults(self) -> None: chunk: TableChunk = TableChunk(chunk_id="TBL_2") assert chunk.type == "table" - assert chunk.length == 0 assert chunk.file_path is None - assert chunk.table_type is None assert chunk.html == "" def test_is_instance_of_base_chunk(self) -> None: @@ -602,7 +567,6 @@ def _build_parse_result( TextChunk( chunk_id="text_1", content="Hello world", - length=11, ), ImageChunk( chunk_id="img_1", @@ -624,7 +588,6 @@ def _build_parse_result( type="text", path="doc/section1", content="Hello world", - summary="Greeting", ) ], full_markdown="# Test\n\nHello world", diff --git a/tests/test_result_parser.py b/tests/test_result_parser.py index dfa276c..3ff3f24 100644 --- a/tests/test_result_parser.py +++ b/tests/test_result_parser.py @@ -14,6 +14,7 @@ from knowhere._exceptions import ChecksumError, KnowhereError from knowhere.lib.result_parser import parseResultZip from knowhere.types.result import ( + DocNav, ImageChunk, Manifest, ParseResult, @@ -34,30 +35,56 @@ "type": "text", "content": "Hello world", "path": "test/section1", - "length": 11, - "tokens": ["Hello", "world"], - "keywords": ["hello"], - "summary": "A greeting", - "relationships": [], }, { "chunk_id": "IMAGE_test1", "type": "image", "content": "A test image", "path": "test/images", - "length": 12, "file_path": "images/IMAGE_test1.jpg", - "original_name": "test-image.jpg", - "summary": "Test image", }, ] -TEXT_TOKENS_LIST: List[str] = ["Ashish", "Vaswani", "attention", "transformer"] - MARKDOWN: str = "# Test\n\nHello world" IMAGE_BYTES: bytes = b"\xff\xd8\xff\xe0" TABLE_HTML: str = "
Optimized
" +DOC_NAV_JSON: Dict[str, Any] = { + "sections": [ + { + "title": "Introduction", + "path": "Default_Root/test.pdf-->Introduction", + "level": 1, + "summary": "Overview of the topic", + "chunk_count": 3, + "children": [ + { + "title": "Background", + "path": "Default_Root/test.pdf-->Introduction-->Background", + "level": 2, + "summary": "Historical context", + "chunk_count": 2, + "children": [], + } + ], + } + ], + "resources": { + "images": [ + { + "path": "images/IMAGE_test1.jpg", + "summary": "Test image summary", + } + ], + "tables": [ + { + "path": "tables/table-optimized.html", + "summary": "Optimized table", + } + ], + }, +} + def _build_zip( manifest: Dict[str, Any], @@ -160,48 +187,20 @@ def _make_optimized_chunks() -> List[Dict[str, Any]]: "type": "text", "content": "Text chunk with embedded resources.", "path": "Default_Root/optimized.pdf-->Section 1", - "metadata": { - "length": 35, - "summary": "", - "page_nums": [1, 2], - "tokens": ["Text", "chunk"], - "keywords": ["optimized"], - "connect_to": [ - { - "target": "image_chunk_optimized", - "relation": "embeds", - "ref": "[images/IMAGE_test1.jpg]", - } - ], - }, }, { "chunk_id": "image_chunk_optimized", "type": "image", "content": "[images/IMAGE_test1.jpg]", "path": "images/IMAGE_test1.jpg", - "metadata": { - "length": 1, - "summary": "Optimized image chunk", - "page_nums": [2], - "file_path": "images/IMAGE_test1.jpg", - "keywords": [], - "tokens": [], - }, + "file_path": "images/IMAGE_test1.jpg", }, { "chunk_id": "table_chunk_optimized", "type": "table", "content": TABLE_HTML, "path": "tables/table-optimized.html", - "metadata": { - "length": 1, - "summary": "Optimized table chunk", - "page_nums": [3], - "file_path": "tables/table-optimized.html", - "keywords": ["optimized"], - "tokens": [], - }, + "file_path": "tables/table-optimized.html", }, ] @@ -235,20 +234,18 @@ def test_loads_text_chunks(self) -> None: assert text_chunks[0].chunk_id == "text_chunk_1" assert text_chunks[0].content == "Hello 
world" - def test_accepts_text_chunk_tokens_as_list(self) -> None: + def test_metadata_accessible_on_chunks(self) -> None: manifest: Dict[str, Any] = _make_manifest() chunks: List[Dict[str, Any]] = [ { - "chunk_id": "text_chunk_tokens_list", + "chunk_id": "text_with_meta", "type": "text", - "content": "Attention is all you need", - "path": "paper/abstract", + "content": "Text with metadata", + "path": "doc/section1", "metadata": { - "length": 25, - "tokens": TEXT_TOKENS_LIST, - "keywords": ["attention", "transformer"], - "summary": "Transformer introduction", - "relationships": [], + "length": 42, + "tokens": ["hello", "world"], + "summary": "A summary", }, } ] @@ -256,52 +253,10 @@ def test_accepts_text_chunk_tokens_as_list(self) -> None: result: ParseResult = parseResultZip(zip_bytes, verify_checksum=False) - assert len(result.text_chunks) == 1 - assert result.text_chunks[0].tokens == TEXT_TOKENS_LIST - - def test_rejects_legacy_text_chunk_tokens_string(self) -> None: - manifest: Dict[str, Any] = _make_manifest() - chunks: List[Dict[str, Any]] = [ - { - "chunk_id": "text_chunk_tokens_string", - "type": "text", - "content": "Attention is all you need", - "path": "paper/abstract", - "metadata": { - "length": 25, - "tokens": "Ashish;Vaswani;attention;transformer", - "keywords": ["attention", "transformer"], - "summary": "Transformer introduction", - "relationships": [], - }, - } - ] - zip_bytes: bytes = _build_zip(manifest, chunks=chunks) - - with pytest.raises(KnowhereError, match="expected list\\[str\\]"): - parseResultZip(zip_bytes, verify_checksum=False) - - def test_rejects_integer_text_chunk_tokens(self) -> None: - manifest: Dict[str, Any] = _make_manifest() - chunks: List[Dict[str, Any]] = [ - { - "chunk_id": "text_chunk_tokens_int", - "type": "text", - "content": "Attention is all you need", - "path": "paper/abstract", - "metadata": { - "length": 25, - "tokens": 4, - "keywords": ["attention", "transformer"], - "summary": "Transformer introduction", - 
"relationships": [], - }, - } - ] - zip_bytes: bytes = _build_zip(manifest, chunks=chunks) - - with pytest.raises(KnowhereError, match="expected list\\[str\\]"): - parseResultZip(zip_bytes, verify_checksum=False) + chunk = result.text_chunks[0] + assert chunk.metadata.length == 42 + assert chunk.metadata.tokens == ["hello", "world"] + assert chunk.metadata.summary == "A summary" def test_loads_image_chunks_with_data(self) -> None: manifest: Dict[str, Any] = _make_manifest() @@ -381,16 +336,9 @@ def test_exposes_optimized_payload_metadata_and_sidecar_assets(self) -> None: assert result.manifest.processing.billing_status == "charged" assert result.manifest.processing.cost is not None assert result.manifest.processing.cost.micro_dollars == 60000 - assert result.text_chunks[0].page_nums == [1, 2] - assert result.image_chunks[0].page_nums == [2] - assert result.table_chunks[0].page_nums == [3] - assert result.text_chunks[0].connect_to == [ - { - "target": "image_chunk_optimized", - "relation": "embeds", - "ref": "[images/IMAGE_test1.jpg]", - } - ] + assert result.text_chunks[0].chunk_id == "text_chunk_optimized" + assert result.image_chunks[0].chunk_id == "image_chunk_optimized" + assert result.table_chunks[0].chunk_id == "table_chunk_optimized" assert result.chunks_slim is not None assert len(result.chunks_slim) == 1 assert result.kb_csv == "chunk_id,type\ntext_chunk_optimized,text\n" @@ -449,6 +397,153 @@ def test_save_preserves_optimized_sidecar_files(self, tmp_path: Path) -> None: assert (output_dir / "result.zip").exists() +# --------------------------------------------------------------------------- +# Current worker contract tests (doc_nav, HIERARCHY) +# --------------------------------------------------------------------------- + + +def _make_current_contract_manifest() -> Dict[str, Any]: + """Manifest matching the current worker contract with HIERARCHY.""" + return { + "version": "2.0", + "job_id": "job_current123", + "data_id": None, + "source_file_name": 
"current.pdf", + "processing_date": "2026-05-01T00:00:00Z", + "HIERARCHY": { + "Default_Root": { + "current.pdf": { + "sections": ["Introduction", "Methods", "Results"], + } + } + }, + "statistics": { + "total_chunks": 2, + "text_chunks": 1, + "image_chunks": 1, + "table_chunks": 0, + "total_pages": None, + }, + } + + +class TestCurrentWorkerContract: + """Tests against the current worker output contract.""" + + # -- doc_nav.json -- + + def test_parses_doc_nav(self) -> None: + manifest = _make_optimized_manifest() + chunks = _make_optimized_chunks() + zip_bytes = _build_zip( + manifest, + chunks=chunks, + extra_entries={ + "doc_nav.json": json.dumps(DOC_NAV_JSON).encode("utf-8"), + "tables/table-optimized.html": TABLE_HTML.encode("utf-8"), + }, + ) + + result = parseResultZip(zip_bytes, verify_checksum=False) + + assert result.doc_nav is not None + doc_nav: DocNav = result.doc_nav + assert len(doc_nav.sections) == 1 + assert doc_nav.sections[0].title == "Introduction" + assert doc_nav.sections[0].level == 1 + assert doc_nav.sections[0].chunk_count == 3 + assert len(doc_nav.sections[0].children) == 1 + assert doc_nav.sections[0].children[0].title == "Background" + assert doc_nav.resources is not None + assert len(doc_nav.resources.images) == 1 + assert doc_nav.resources.images[0].path == "images/IMAGE_test1.jpg" + assert len(doc_nav.resources.tables) == 1 + assert doc_nav.resources.tables[0].path == "tables/table-optimized.html" + + def test_doc_nav_none_when_missing(self) -> None: + manifest = _make_optimized_manifest() + zip_bytes = _build_zip(manifest) + + result = parseResultZip(zip_bytes, verify_checksum=False) + + assert result.doc_nav is None + + def test_save_writes_doc_nav(self, tmp_path: Path) -> None: + manifest = _make_optimized_manifest() + chunks = _make_optimized_chunks() + zip_bytes = _build_zip( + manifest, + chunks=chunks, + extra_entries={ + "doc_nav.json": json.dumps(DOC_NAV_JSON).encode("utf-8"), + "tables/table-optimized.html": 
TABLE_HTML.encode("utf-8"), + }, + ) + + result = parseResultZip(zip_bytes, verify_checksum=False) + output_dir = tmp_path / "with-doc-nav" + result.save(output_dir) + + assert (output_dir / "doc_nav.json").exists() + + # -- Manifest HIERARCHY -- + + def test_manifest_hierarchy_alias(self) -> None: + manifest = _make_current_contract_manifest() + zip_bytes = _build_zip(manifest) + + result = parseResultZip(zip_bytes, verify_checksum=False) + + assert result.manifest.hierarchy is not None + assert "Default_Root" in result.manifest.hierarchy + + def test_manifest_without_hierarchy(self) -> None: + manifest = _make_optimized_manifest() + zip_bytes = _build_zip(manifest) + + result = parseResultZip(zip_bytes, verify_checksum=False) + + assert result.manifest.hierarchy is None + + # -- Graceful handling of missing legacy files -- + + def test_parses_without_chunks_slim(self) -> None: + """Current worker doesn't emit chunks_slim.json — parse must succeed.""" + manifest = _make_optimized_manifest() + chunks = _make_optimized_chunks() + zip_bytes = _build_zip( + manifest, + chunks=chunks, + extra_entries={ + "tables/table-optimized.html": TABLE_HTML.encode("utf-8"), + }, + # No chunks_slim.json in extra_entries + ) + + result = parseResultZip(zip_bytes, verify_checksum=False) + + assert result.chunks_slim is None + assert len(result.chunks) == 3 + + def test_parses_without_hierarchy_json(self) -> None: + """Current worker doesn't emit hierarchy.json — parse must succeed.""" + manifest = _make_optimized_manifest() + chunks = _make_optimized_chunks() + zip_bytes = _build_zip( + manifest, + chunks=chunks, + extra_entries={ + "tables/table-optimized.html": TABLE_HTML.encode("utf-8"), + }, + # No hierarchy.json in extra_entries + ) + + result = parseResultZip(zip_bytes, verify_checksum=False) + + assert result.hierarchy is None + assert result.manifest is not None + + # --------------------------------------------------------------------------- # Checksum verification # 
--------------------------------------------------------------------------- diff --git a/tests/test_retrieval.py b/tests/test_retrieval.py index 4925e30..2029120 100644 --- a/tests/test_retrieval.py +++ b/tests/test_retrieval.py @@ -20,6 +20,35 @@ def _make_retrieval_response() -> Dict[str, Any]: "namespace": "support-center", "query": "refund policy", "router_used": "discovery+agent", + "answer_text": "Annual plans may be refunded within 30 days of purchase.", + "referenced_chunks": [ + { + "chunk_id": "chunk_001", + "document_id": "doc_123", + "asset_url": "https://example.com/assets/chunk_001", + } + ], + "results": [ + { + "chunk_type": "text", + "content": "Annual plans may be refunded within 30 days.", + "score": 1.0, + "source": { + "document_id": "doc_123", + "source_file_name": "refund-policy.md", + "section_path": "Policies / Billing / Refunds", + }, + } + ], + } + + +def _make_legacy_retrieval_response() -> Dict[str, Any]: + """Legacy-mode response without agentic fields (backward compatibility).""" + return { + "namespace": "support-center", + "query": "refund policy", + "router_used": "discovery+legacy", "results": [ { "chunk_type": "text", @@ -93,6 +122,11 @@ def test_query_sends_request_and_returns_results(self, sync_client: Any) -> None assert response.results[0].source.document_id == "doc_123" assert response.results[0].source.source_file_name == "refund-policy.md" assert response.results[0].source.section_path == "Policies / Billing / Refunds" + assert response.answer_text == ( + "Annual plans may be refunded within 30 days of purchase." 
+ ) + assert len(response.referenced_chunks) == 1 + assert response.referenced_chunks[0]["chunk_id"] == "chunk_001" assert not hasattr(response.results[0], "citation") assert not hasattr(response.results[0], "chunk_id") assert not hasattr(response.results[0], "section_id") @@ -127,3 +161,62 @@ async def test_async_query_sends_request_and_returns_results( assert route.called assert response.router_used == "discovery+agent" assert response.results[0].source.document_id == "doc_123" + + @respx.mock + def test_use_agentic_sends_parameter(self, sync_client: Any) -> None: + """use_agentic=True is sent to the server.""" + route = respx.post(RETRIEVAL_QUERY_URL).mock( + return_value=httpx.Response(200, json=_make_retrieval_response()) + ) + + sync_client.retrieval.query(query="refund policy", use_agentic=True) + + request_body: Dict[str, Any] = json.loads(route.calls[0].request.read()) + assert request_body["use_agentic"] is True + + @respx.mock + def test_use_agentic_omitted_when_none(self, sync_client: Any) -> None: + """use_agentic=None omits the parameter (server default).""" + route = respx.post(RETRIEVAL_QUERY_URL).mock( + return_value=httpx.Response(200, json=_make_retrieval_response()) + ) + + sync_client.retrieval.query(query="refund policy") + + request_body: Dict[str, Any] = json.loads(route.calls[0].request.read()) + assert "use_agentic" not in request_body + + @respx.mock + def test_agentic_response_fields(self, sync_client: Any) -> None: + """Agentic response exposes answer_text and referenced_chunks.""" + route = respx.post(RETRIEVAL_QUERY_URL).mock( + return_value=httpx.Response(200, json=_make_retrieval_response()) + ) + + response = sync_client.retrieval.query( + query="refund policy", + use_agentic=True, + ) + + assert response.answer_text == ( + "Annual plans may be refunded within 30 days of purchase." 
+ ) + assert len(response.referenced_chunks) == 1 + assert response.referenced_chunks[0]["chunk_id"] == "chunk_001" + assert response.referenced_chunks[0]["asset_url"] == ( + "https://example.com/assets/chunk_001" + ) + + @respx.mock + def test_legacy_response_without_agentic_fields(self, sync_client: Any) -> None: + """Legacy-mode response (no agentic fields) parses without error.""" + route = respx.post(RETRIEVAL_QUERY_URL).mock( + return_value=httpx.Response( + 200, json=_make_legacy_retrieval_response() + ) + ) + + response = sync_client.retrieval.query(query="refund policy") + + assert response.answer_text is None + assert response.referenced_chunks == []