diff --git a/docs/usage.md b/docs/usage.md
index e1c0714..d982cb0 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -1,5 +1,9 @@
# Knowhere Python SDK — Usage Guide
+> **Recent changes:** Chunk metadata fields (`tokens`, `keywords`, `summary`,
+> `length`, etc.) are no longer exposed as top-level chunk attributes. Access
+> them through `chunk.metadata` instead. See [Chunk Types](#chunk-types).
+
Comprehensive reference for every feature, parameter, and pattern in the SDK.
## Table of Contents
@@ -219,8 +223,13 @@ result.table_chunks # List[TableChunk]
# Lookup by ID
chunk = result.getChunk("chunk_42")
-# Hierarchy data (document structure tree, if available)
-result.hierarchy
+# Document navigation tree (from doc_nav.json, current worker output)
+result.doc_nav # DocNav | None
+result.doc_nav.sections # List[DocNavSection] — tree of titles/paths/levels
+result.doc_nav.resources # DocNavResources | None — image/table resource summaries
+
+# Legacy hierarchy (from hierarchy.json, older worker output)
+result.hierarchy # Any | None
# Raw ZIP bytes (for archival)
result.raw_zip
@@ -239,49 +248,48 @@ result.save("./output/report/")
## Chunk Types
-Every chunk shares a base set of fields (`chunk_id`, `type`, `content`, `path`). Each type adds its own fields.
+Every chunk shares a base set of fields (`chunk_id`, `type`, `content`, `path`,
+`metadata`). Worker metadata is kept in the `metadata` model (`ChunkMetadata`) —
+it is **not** flattened to top-level chunk properties.
-### TextChunk
+### Base fields (all chunk types)
| Field | Type | Description |
|-------|------|-------------|
| `chunk_id` | `str` | Unique identifier |
-| `type` | `str` | Always `"text"` |
-| `content` | `str` | The text content |
-| `path` | `str \| None` | Document structure path (e.g. `"Section 1 > Subsection 2"`) |
-| `length` | `int` | Character count |
-| `tokens` | `List[str] \| None` | Tokenized words returned by the parser pipeline |
-| `keywords` | `List[str] \| None` | Extracted keywords (requires `summary_txt: True`) |
-| `summary` | `str \| None` | AI-generated summary (requires `summary_txt: True`) |
-| `relationships` | `List \| None` | Relationships to other chunks |
+| `type` | `str` | `"text"`, `"image"`, or `"table"` |
+| `content` | `str` | Text content or placeholder |
+| `path` | `str \| None` | Document structure path |
+| `metadata` | `ChunkMetadata` | Worker metadata model (tokens, keywords, summary, length, page_nums, etc.); unknown fields are preserved |
+
+### TextChunk
```python
for chunk in result.text_chunks:
print(f"[{chunk.chunk_id}] {chunk.content[:60]}...")
- if chunk.keywords:
- print(f" Keywords: {', '.join(chunk.keywords)}")
- if chunk.summary:
- print(f" Summary: {chunk.summary}")
+ # Metadata is in chunk.metadata, not flattened:
+ keywords = chunk.metadata.keywords or []
+ summary = chunk.metadata.summary
+ if keywords:
+ print(f" Keywords: {', '.join(keywords)}")
+ if summary:
+ print(f" Summary: {summary}")
```
### ImageChunk
| Field | Type | Description |
|-------|------|-------------|
-| `chunk_id` | `str` | Unique identifier |
-| `type` | `str` | Always `"image"` |
-| `content` | `str` | Text content associated with the image |
| `file_path` | `str \| None` | Path within the ZIP |
-| `original_name` | `str \| None` | Original filename |
-| `summary` | `str \| None` | AI-generated image description (requires `summary_image: True`) |
| `data` | `bytes` | Raw image bytes (loaded from ZIP) |
| `format` | `str \| None` | Image format inferred from extension (property) |
```python
for img in result.image_chunks:
print(f"{img.file_path} ({len(img.data)} bytes, {img.format})")
- if img.summary:
- print(f" Description: {img.summary}")
+ summary = img.metadata.summary
+ if summary:
+ print(f" Description: {summary}")
img.save("./output/images/") # writes to disk
```
@@ -289,13 +297,7 @@ for img in result.image_chunks:
| Field | Type | Description |
|-------|------|-------------|
-| `chunk_id` | `str` | Unique identifier |
-| `type` | `str` | Always `"table"` |
-| `content` | `str` | Text representation of the table |
| `file_path` | `str \| None` | Path within the ZIP |
-| `original_name` | `str \| None` | Original filename |
-| `table_type` | `str \| None` | Table classification |
-| `summary` | `str \| None` | AI-generated table summary (requires `summary_table: True`) |
| `html` | `str` | Full HTML of the table (loaded from ZIP) |
```python
@@ -471,6 +473,19 @@ response = client.retrieval.query(
top_k=5,
)
+# Agentic mode (LLM navigation + answer synthesis)
+response = client.retrieval.query(
+ namespace="support-center",
+ query="How do I pair a Bluetooth headset?",
+ use_agentic=True,
+ top_k=5,
+)
+print(response.answer_text) # LLM-generated natural-language answer
+print(response.router_used) # "workflow_single_step", "small_kb_all", etc.
+for ref in response.referenced_chunks:
+ print(ref.get("chunk_id"), ref.get("asset_url"))
+
+# Legacy results are always available
for result in response.results:
print(result.content)
print(result.score)
@@ -479,6 +494,10 @@ for result in response.results:
print(result.source.section_path)
```
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `use_agentic` | `bool \| None` | `None` | Force agentic (`True`) or legacy (`False`) retrieval. `None` uses server default. |
+
Retrieval results expose `content`, not the older parse-result `text` field.
Media results may include `asset_url` when the server can sign the referenced
artifact.
diff --git a/src/knowhere/lib/result_parser.py b/src/knowhere/lib/result_parser.py
index 95f6241..eac4579 100644
--- a/src/knowhere/lib/result_parser.py
+++ b/src/knowhere/lib/result_parser.py
@@ -13,13 +13,13 @@
from knowhere._logging import getLogger
from knowhere.types.result import (
Chunk,
+ DocNav,
ImageChunk,
Manifest,
ParseResult,
SlimChunk,
TableChunk,
TextChunk,
- TextChunkTokens,
)
_logger = getLogger()
@@ -81,38 +81,6 @@ def _extractFilePath(raw: Dict[str, Any]) -> Optional[str]:
return fallback
-def _normalizeTokenList(raw_tokens: List[Any]) -> List[str]:
- """Return a string-only token list with empty values removed."""
- normalized_tokens: List[str] = []
- for raw_token in raw_tokens:
- token_text: str = str(raw_token).strip()
- if token_text:
- normalized_tokens.append(token_text)
- return normalized_tokens
-
-
-def _parseTextChunkTokens(
- raw_tokens: Any,
- *,
- chunk_id: str,
-) -> Optional[TextChunkTokens]:
- """Normalize text chunk tokens from the current backend payload."""
- if raw_tokens is None:
- return None
- if isinstance(raw_tokens, bool):
- raise KnowhereError(
- f"Invalid tokens payload for text chunk '{chunk_id}': expected list[str], got bool."
- )
- if isinstance(raw_tokens, list):
- return _normalizeTokenList(raw_tokens)
-
- raise KnowhereError(
- "Invalid tokens payload for text chunk "
- f"'{chunk_id}': expected list[str], "
- f"got {type(raw_tokens).__name__}."
- )
-
-
def _buildChunks(
raw_chunks: List[Dict[str, Any]],
zf: zipfile.ZipFile,
@@ -125,58 +93,39 @@ def _buildChunks(
if chunk_type == "image":
image_data: bytes = b""
- # file_path may be at top level, inside metadata, or use path as fallback
file_path: Optional[str] = _extractFilePath(raw)
if file_path:
image_data = _readZipBytes(zf, file_path) or b""
- metadata: Dict[str, Any] = raw.get("metadata", {})
chunk: Chunk = ImageChunk(
chunk_id=raw.get("chunk_id", ""),
type="image",
content=raw.get("content", ""),
path=raw.get("path"),
- page_nums=metadata.get("page_nums", raw.get("page_nums")),
- length=metadata.get("length", raw.get("length", 0)),
file_path=file_path,
- original_name=metadata.get("original_name", raw.get("original_name")),
- summary=metadata.get("summary", raw.get("summary")),
data=image_data,
+ metadata=raw.get("metadata", {}),
)
elif chunk_type == "table":
table_html: str = ""
file_path = _extractFilePath(raw)
if file_path:
table_html = _readZipText(zf, file_path) or ""
- metadata = raw.get("metadata", {})
chunk = TableChunk(
chunk_id=raw.get("chunk_id", ""),
type="table",
content=raw.get("content", ""),
path=raw.get("path"),
- page_nums=metadata.get("page_nums", raw.get("page_nums")),
- length=metadata.get("length", raw.get("length", 0)),
file_path=file_path,
- original_name=metadata.get("original_name", raw.get("original_name")),
- table_type=metadata.get("table_type", raw.get("table_type")),
- summary=metadata.get("summary", raw.get("summary")),
html=table_html,
+ metadata=raw.get("metadata", {}),
)
else:
- metadata = raw.get("metadata", {})
- chunk_id: str = raw.get("chunk_id", "")
- raw_tokens: Any = metadata.get("tokens", raw.get("tokens"))
chunk = TextChunk(
- chunk_id=chunk_id,
+ chunk_id=raw.get("chunk_id", ""),
type="text",
content=raw.get("content", ""),
path=raw.get("path"),
- page_nums=metadata.get("page_nums", raw.get("page_nums")),
- length=metadata.get("length", raw.get("length", 0)),
- tokens=_parseTextChunkTokens(raw_tokens, chunk_id=chunk_id),
- keywords=metadata.get("keywords", raw.get("keywords")),
- summary=metadata.get("summary", raw.get("summary")),
- connect_to=metadata.get("connect_to", raw.get("connect_to")),
- relationships=metadata.get("relationships", raw.get("relationships")),
+ metadata=raw.get("metadata", {}),
)
chunks.append(chunk)
@@ -229,7 +178,15 @@ def parseResultZip(
# -- Full markdown --
full_markdown: str = _readZipText(zf, "full.md") or ""
- # -- Hierarchy --
+ # -- DocNav (current worker output) --
+ doc_nav_text: Optional[str] = _readZipText(zf, "doc_nav.json")
+ doc_nav: Optional[DocNav] = (
+ DocNav.model_validate(json.loads(doc_nav_text))
+ if doc_nav_text
+ else None
+ )
+
+ # -- Hierarchy (legacy — current worker no longer emits this) --
hierarchy_text: Optional[str] = _readZipText(zf, "hierarchy.json")
hierarchy: Optional[Any] = (
json.loads(hierarchy_text) if hierarchy_text else None
@@ -263,11 +220,13 @@ def parseResultZip(
return ParseResult(
manifest=manifest,
chunks=chunks,
- chunks_slim=chunks_slim,
full_markdown=full_markdown,
+ raw_zip=zip_bytes,
+ doc_nav=doc_nav,
+ # Legacy — the current worker no longer emits these files
+ chunks_slim=chunks_slim,
hierarchy=hierarchy,
toc_hierarchies=toc_hierarchies,
kb_csv=kb_csv,
hierarchy_view_html=hierarchy_view_html,
- raw_zip=zip_bytes,
)
diff --git a/src/knowhere/resources/retrieval.py b/src/knowhere/resources/retrieval.py
index 3b6b36c..9100be8 100644
--- a/src/knowhere/resources/retrieval.py
+++ b/src/knowhere/resources/retrieval.py
@@ -22,6 +22,7 @@ def query(
query: str,
namespace: Optional[str] = None,
top_k: Optional[int] = None,
+ use_agentic: Optional[bool] = None,
data_type: Optional[int] = None,
signal_paths: Optional[list[str]] = None,
filter_mode: Optional[RetrievalFilterMode] = None,
@@ -39,6 +40,8 @@ def query(
body["namespace"] = namespace
if top_k is not None:
body["top_k"] = top_k
+ if use_agentic is not None:
+ body["use_agentic"] = use_agentic
if data_type is not None:
body["data_type"] = data_type
if signal_paths is not None:
@@ -77,6 +80,7 @@ async def query(
query: str,
namespace: Optional[str] = None,
top_k: Optional[int] = None,
+ use_agentic: Optional[bool] = None,
data_type: Optional[int] = None,
signal_paths: Optional[list[str]] = None,
filter_mode: Optional[RetrievalFilterMode] = None,
@@ -94,6 +98,8 @@ async def query(
body["namespace"] = namespace
if top_k is not None:
body["top_k"] = top_k
+ if use_agentic is not None:
+ body["use_agentic"] = use_agentic
if data_type is not None:
body["data_type"] = data_type
if signal_paths is not None:
diff --git a/src/knowhere/types/result.py b/src/knowhere/types/result.py
index df83c19..0cce2da 100644
--- a/src/knowhere/types/result.py
+++ b/src/knowhere/types/result.py
@@ -9,7 +9,6 @@
from typing import Any, Dict, List, Optional, Union
from pydantic import BaseModel, Field
-from typing_extensions import TypeAlias
from knowhere._exceptions import ValidationError
@@ -138,6 +137,44 @@ class Manifest(BaseModel):
checksum: Optional[Checksum] = None
statistics: Optional[Statistics] = None
files: Optional[FileIndex] = None
+ hierarchy: Optional[Any] = Field(default=None, alias="HIERARCHY")
+
+
+# ---------------------------------------------------------------------------
+# DocNav models
+# ---------------------------------------------------------------------------
+
+
+class DocNavResourceItem(BaseModel):
+ """A single image or table resource entry in ``doc_nav.json``."""
+
+ path: str
+ summary: Optional[str] = None
+
+
+class DocNavResources(BaseModel):
+ """Image and table resource summaries from ``doc_nav.json``."""
+
+ images: List[DocNavResourceItem] = Field(default_factory=list)
+ tables: List[DocNavResourceItem] = Field(default_factory=list)
+
+
+class DocNavSection(BaseModel):
+ """A document section entry in the ``doc_nav.json`` navigation tree."""
+
+ title: str
+ path: str
+ level: int
+ summary: Optional[str] = None
+ chunk_count: int = 0
+ children: List["DocNavSection"] = Field(default_factory=list)
+
+
+class DocNav(BaseModel):
+ """Top-level document navigation structure from ``doc_nav.json``."""
+
+ sections: List[DocNavSection] = Field(default_factory=list)
+ resources: Optional[DocNavResources] = None
# ---------------------------------------------------------------------------
@@ -145,6 +182,27 @@ class Manifest(BaseModel):
# ---------------------------------------------------------------------------
+class ChunkMetadata(BaseModel):
+ """Known worker metadata fields for a chunk.
+
+ All fields are optional. Unknown fields added by future worker
+ versions are preserved because ``model_config`` sets ``extra="allow"``.
+ """
+
+ model_config = {"extra": "allow"}
+
+ length: Optional[int] = None
+ page_nums: Optional[List[int]] = None
+ tokens: Optional[List[str]] = None
+ keywords: Optional[List[str]] = None
+ summary: Optional[str] = None
+ connect_to: Optional[List[Dict[str, Any]]] = None
+ file_path: Optional[str] = None
+ original_name: Optional[str] = None
+ table_type: Optional[str] = None
+ document_top_summary: Optional[str] = None
+
+
class BaseChunk(BaseModel):
"""Fields shared by every chunk type."""
@@ -152,32 +210,20 @@ class BaseChunk(BaseModel):
type: str
content: str = ""
path: Optional[str] = None
- page_nums: Optional[List[int]] = None
-
-
-TextChunkTokens: TypeAlias = List[str]
+ metadata: ChunkMetadata = Field(default_factory=ChunkMetadata)
class TextChunk(BaseChunk):
"""A text chunk extracted from the document."""
type: str = "text"
- length: int = 0
- tokens: Optional[TextChunkTokens] = None
- keywords: Optional[List[str]] = None
- summary: Optional[str] = None
- connect_to: Optional[List[Dict[str, Any]]] = None
- relationships: Optional[List[Union[Dict[str, Any], str]]] = None
class ImageChunk(BaseChunk):
"""An image chunk — carries raw bytes loaded from the ZIP."""
type: str = "image"
- length: int = 0
file_path: Optional[str] = None
- original_name: Optional[str] = None
- summary: Optional[str] = None
data: bytes = Field(default=b"", exclude=True)
model_config = {"arbitrary_types_allowed": True}
@@ -193,13 +239,13 @@ def format(self) -> Optional[str]:
def save(self, directory: Union[str, Path]) -> Path:
"""Write the image bytes to *directory*, returning the output path.
- The filename is derived from ``original_name`` or ``file_path``,
- sanitised for cross-platform safety.
+ The filename is derived from ``file_path``, sanitised for
+ cross-platform safety.
"""
dir_path: Path = Path(directory)
dir_path.mkdir(parents=True, exist_ok=True)
- raw_name: str = self.original_name or os.path.basename(
+ raw_name: str = os.path.basename(
self.file_path or f"{self.chunk_id}.bin"
)
safe_name: str = _sanitizeFilename(raw_name)
@@ -214,11 +260,7 @@ class TableChunk(BaseChunk):
"""A table chunk — carries HTML loaded from the ZIP."""
type: str = "table"
- length: int = 0
file_path: Optional[str] = None
- original_name: Optional[str] = None
- table_type: Optional[str] = None
- summary: Optional[str] = None
html: str = Field(default="", exclude=True)
def save(self, directory: Union[str, Path]) -> Path:
@@ -226,7 +268,7 @@ def save(self, directory: Union[str, Path]) -> Path:
dir_path: Path = Path(directory)
dir_path.mkdir(parents=True, exist_ok=True)
- raw_name: str = self.original_name or os.path.basename(
+ raw_name: str = os.path.basename(
self.file_path or f"{self.chunk_id}.html"
)
safe_name: str = _sanitizeFilename(raw_name)
@@ -242,12 +284,11 @@ def save(self, directory: Union[str, Path]) -> Path:
class SlimChunk(BaseModel):
- """Minimal chunk entry emitted in chunks_slim.json."""
+ """Minimal chunk entry emitted in chunks_slim.json (legacy)."""
type: str
path: Optional[str] = None
content: str = ""
- summary: Optional[str] = None
# ---------------------------------------------------------------------------
@@ -259,48 +300,59 @@ class ParseResult:
"""Eagerly-loaded result of a document parsing job.
Contains the manifest, all chunks (with image bytes and table HTML
- already loaded), the full markdown, hierarchy data, and the raw ZIP
- bytes for archival purposes.
+ already loaded), the full markdown, the document navigation tree,
+ and the raw ZIP bytes for archival purposes.
+
+ Legacy fields (``chunks_slim``, ``hierarchy``, ``toc_hierarchies``,
+ ``kb_csv``, ``hierarchy_view_html``) are kept for backward
+ compatibility with older result ZIPs. The current worker does not
+ emit ``chunks_slim.json`` or ``hierarchy.json``.
"""
manifest: Manifest
chunks: List[Chunk]
- chunks_slim: Optional[List[SlimChunk]]
full_markdown: str
+ raw_zip: bytes
+ namespace: Optional[str]
+ document_id: Optional[str]
+ # Current worker output
+ doc_nav: Optional[DocNav]
+ # Legacy — the current worker no longer emits these files
+ chunks_slim: Optional[List[SlimChunk]]
hierarchy: Optional[Any]
toc_hierarchies: Optional[Any]
kb_csv: Optional[str]
hierarchy_view_html: Optional[str]
- raw_zip: bytes
- namespace: Optional[str]
- document_id: Optional[str]
def __init__(
self,
*,
manifest: Manifest,
chunks: List[Chunk],
- chunks_slim: Optional[List[SlimChunk]],
full_markdown: str,
- hierarchy: Optional[Any],
- toc_hierarchies: Optional[Any],
- kb_csv: Optional[str],
- hierarchy_view_html: Optional[str],
raw_zip: bytes,
+ doc_nav: Optional[DocNav] = None,
namespace: Optional[str] = None,
document_id: Optional[str] = None,
+ # Legacy — the current worker no longer emits these files
+ chunks_slim: Optional[List[SlimChunk]] = None,
+ hierarchy: Optional[Any] = None,
+ toc_hierarchies: Optional[Any] = None,
+ kb_csv: Optional[str] = None,
+ hierarchy_view_html: Optional[str] = None,
) -> None:
self.manifest = manifest
self.chunks = chunks
- self.chunks_slim = chunks_slim
self.full_markdown = full_markdown
+ self.raw_zip = raw_zip
+ self.doc_nav = doc_nav
+ self.namespace = namespace
+ self.document_id = document_id
+ self.chunks_slim = chunks_slim
self.hierarchy = hierarchy
self.toc_hierarchies = toc_hierarchies
self.kb_csv = kb_csv
self.hierarchy_view_html = hierarchy_view_html
- self.raw_zip = raw_zip
- self.namespace = namespace
- self.document_id = document_id
# -- convenience properties --
@@ -344,11 +396,17 @@ def save(self, directory: Union[str, Path]) -> Path:
"""Save the full result to *directory*.
Creates the directory if needed and writes:
+ * ``manifest.json`` — result manifest
+ * ``chunks.json`` — all chunks
+ * ``doc_nav.json`` — document navigation tree (if present)
* ``full.md`` — the full markdown
* ``images/`` — all image chunks
* ``tables/`` — all table chunks
* ``result.zip`` — the raw ZIP archive
+ Legacy files (``chunks_slim.json``, ``hierarchy.json``, etc.) are
+ also written when present for backward compatibility.
+
Returns the resolved directory path.
"""
dir_path: Path = Path(directory)
@@ -357,7 +415,7 @@ def save(self, directory: Union[str, Path]) -> Path:
# Manifest / chunks
manifest_path: Path = dir_path / "manifest.json"
manifest_path.write_text(
- self.manifest.model_dump_json(indent=2),
+ self.manifest.model_dump_json(indent=2, by_alias=True),
encoding="utf-8",
)
@@ -367,6 +425,13 @@ def save(self, directory: Union[str, Path]) -> Path:
encoding="utf-8",
)
+ if self.doc_nav is not None:
+ doc_nav_path: Path = dir_path / "doc_nav.json"
+ doc_nav_path.write_text(
+ self.doc_nav.model_dump_json(indent=2),
+ encoding="utf-8",
+ )
+
if self.chunks_slim is not None:
chunks_slim_path: Path = dir_path / "chunks_slim.json"
chunks_slim_path.write_text(
diff --git a/src/knowhere/types/retrieval.py b/src/knowhere/types/retrieval.py
index 47b07a8..ebb13cf 100644
--- a/src/knowhere/types/retrieval.py
+++ b/src/knowhere/types/retrieval.py
@@ -2,9 +2,9 @@
from __future__ import annotations
-from typing import Literal, Optional, TypedDict
+from typing import Any, Dict, List, Literal, Optional, TypedDict
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
RetrievalChannel = Literal["path", "content", "term"]
@@ -37,9 +37,16 @@ class RetrievalResult(BaseModel):
class RetrievalQueryResponse(BaseModel):
- """Response from ``POST /v1/retrieval/query``."""
+ """Response from ``POST /v1/retrieval/query``.
+
+ Agentic fields (``answer_text``, ``referenced_chunks``) are only
+ populated when the server runs agentic retrieval (e.g. ``use_agentic=True``).
+ In legacy retrieval mode they default to ``None`` and ``[]`` respectively.
+ """
namespace: str
query: str
router_used: Optional[str] = None
+ answer_text: Optional[str] = None
+ referenced_chunks: List[Dict[str, Any]] = Field(default_factory=list)
results: list[RetrievalResult]
diff --git a/tests/test_models.py b/tests/test_models.py
index 92b9732..4314cfa 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -414,13 +414,17 @@ def test_defaults(self) -> None:
chunk: BaseChunk = BaseChunk(chunk_id="chunk_2", type="text")
assert chunk.content == ""
assert chunk.path is None
- assert chunk.page_nums is None
+ assert chunk.metadata.length is None
+ assert chunk.metadata.tokens is None
- def test_page_nums_supported(self) -> None:
+ def test_metadata_accessible(self) -> None:
chunk: BaseChunk = BaseChunk(
- chunk_id="chunk_3", type="text", page_nums=[1, 2]
+ chunk_id="chunk_3",
+ type="text",
+ metadata={"tokens": ["a", "b"], "length": 10},
)
- assert chunk.page_nums == [1, 2]
+ assert chunk.metadata.tokens == ["a", "b"]
+ assert chunk.metadata.length == 10
# ---------------------------------------------------------------------------
@@ -436,48 +440,21 @@ def test_from_dict(self) -> None:
chunk_id="text_1",
content="Some text content",
path="doc/section1",
- length=17,
- page_nums=[1, 2],
- tokens=["Some", "text", "content"],
- keywords=["text", "content"],
- summary="A text chunk",
- connect_to=[{"target": "img_1", "relation": "embeds"}],
- relationships=[{"target": "text_2", "type": "follows"}],
)
assert chunk.chunk_id == "text_1"
assert chunk.type == "text"
assert chunk.content == "Some text content"
- assert chunk.length == 17
- assert chunk.page_nums == [1, 2]
- assert chunk.tokens == ["Some", "text", "content"]
- assert chunk.keywords == ["text", "content"]
- assert chunk.summary == "A text chunk"
- assert chunk.connect_to is not None
- assert len(chunk.connect_to) == 1
- assert chunk.relationships is not None
- assert len(chunk.relationships) == 1
def test_defaults(self) -> None:
chunk: TextChunk = TextChunk(chunk_id="text_2")
assert chunk.type == "text"
- assert chunk.length == 0
- assert chunk.tokens is None
- assert chunk.keywords is None
- assert chunk.summary is None
- assert chunk.connect_to is None
- assert chunk.relationships is None
+ assert chunk.content == ""
+ assert chunk.path is None
def test_is_instance_of_base_chunk(self) -> None:
chunk: TextChunk = TextChunk(chunk_id="text_3")
assert isinstance(chunk, BaseChunk)
- def test_accepts_tokens_list(self) -> None:
- chunk: TextChunk = TextChunk(
- chunk_id="text_4",
- tokens=["attention", "transformer"],
- )
- assert chunk.tokens == ["attention", "transformer"]
-
# ---------------------------------------------------------------------------
# ImageChunk model
@@ -492,24 +469,18 @@ def test_from_dict(self) -> None:
chunk_id="IMG_1",
content="A photo of a cat",
file_path="images/IMG_1.jpg",
- original_name="cat.jpg",
- summary="Cat photo",
data=b"\xff\xd8\xff\xe0",
)
assert chunk.chunk_id == "IMG_1"
assert chunk.type == "image"
assert chunk.content == "A photo of a cat"
assert chunk.file_path == "images/IMG_1.jpg"
- assert chunk.original_name == "cat.jpg"
assert chunk.data == b"\xff\xd8\xff\xe0"
def test_defaults(self) -> None:
chunk: ImageChunk = ImageChunk(chunk_id="IMG_2")
assert chunk.type == "image"
- assert chunk.length == 0
assert chunk.file_path is None
- assert chunk.original_name is None
- assert chunk.summary is None
assert chunk.data == b""
def test_format_property_from_file_path(self) -> None:
@@ -547,22 +518,16 @@ def test_from_dict(self) -> None:
chunk_id="TBL_1",
content="Revenue table",
file_path="tables/TBL_1.html",
- original_name="revenue.html",
- table_type="financial",
- summary="Revenue data",
html="
",
)
assert chunk.chunk_id == "TBL_1"
assert chunk.type == "table"
- assert chunk.table_type == "financial"
assert chunk.html == ""
def test_defaults(self) -> None:
chunk: TableChunk = TableChunk(chunk_id="TBL_2")
assert chunk.type == "table"
- assert chunk.length == 0
assert chunk.file_path is None
- assert chunk.table_type is None
assert chunk.html == ""
def test_is_instance_of_base_chunk(self) -> None:
@@ -602,7 +567,6 @@ def _build_parse_result(
TextChunk(
chunk_id="text_1",
content="Hello world",
- length=11,
),
ImageChunk(
chunk_id="img_1",
@@ -624,7 +588,6 @@ def _build_parse_result(
type="text",
path="doc/section1",
content="Hello world",
- summary="Greeting",
)
],
full_markdown="# Test\n\nHello world",
diff --git a/tests/test_result_parser.py b/tests/test_result_parser.py
index dfa276c..3ff3f24 100644
--- a/tests/test_result_parser.py
+++ b/tests/test_result_parser.py
@@ -14,6 +14,7 @@
from knowhere._exceptions import ChecksumError, KnowhereError
from knowhere.lib.result_parser import parseResultZip
from knowhere.types.result import (
+ DocNav,
ImageChunk,
Manifest,
ParseResult,
@@ -34,30 +35,56 @@
"type": "text",
"content": "Hello world",
"path": "test/section1",
- "length": 11,
- "tokens": ["Hello", "world"],
- "keywords": ["hello"],
- "summary": "A greeting",
- "relationships": [],
},
{
"chunk_id": "IMAGE_test1",
"type": "image",
"content": "A test image",
"path": "test/images",
- "length": 12,
"file_path": "images/IMAGE_test1.jpg",
- "original_name": "test-image.jpg",
- "summary": "Test image",
},
]
-TEXT_TOKENS_LIST: List[str] = ["Ashish", "Vaswani", "attention", "transformer"]
-
MARKDOWN: str = "# Test\n\nHello world"
IMAGE_BYTES: bytes = b"\xff\xd8\xff\xe0"
TABLE_HTML: str = ""
+DOC_NAV_JSON: Dict[str, Any] = {
+ "sections": [
+ {
+ "title": "Introduction",
+ "path": "Default_Root/test.pdf-->Introduction",
+ "level": 1,
+ "summary": "Overview of the topic",
+ "chunk_count": 3,
+ "children": [
+ {
+ "title": "Background",
+ "path": "Default_Root/test.pdf-->Introduction-->Background",
+ "level": 2,
+ "summary": "Historical context",
+ "chunk_count": 2,
+ "children": [],
+ }
+ ],
+ }
+ ],
+ "resources": {
+ "images": [
+ {
+ "path": "images/IMAGE_test1.jpg",
+ "summary": "Test image summary",
+ }
+ ],
+ "tables": [
+ {
+ "path": "tables/table-optimized.html",
+ "summary": "Optimized table",
+ }
+ ],
+ },
+}
+
def _build_zip(
manifest: Dict[str, Any],
@@ -160,48 +187,20 @@ def _make_optimized_chunks() -> List[Dict[str, Any]]:
"type": "text",
"content": "Text chunk with embedded resources.",
"path": "Default_Root/optimized.pdf-->Section 1",
- "metadata": {
- "length": 35,
- "summary": "",
- "page_nums": [1, 2],
- "tokens": ["Text", "chunk"],
- "keywords": ["optimized"],
- "connect_to": [
- {
- "target": "image_chunk_optimized",
- "relation": "embeds",
- "ref": "[images/IMAGE_test1.jpg]",
- }
- ],
- },
},
{
"chunk_id": "image_chunk_optimized",
"type": "image",
"content": "[images/IMAGE_test1.jpg]",
"path": "images/IMAGE_test1.jpg",
- "metadata": {
- "length": 1,
- "summary": "Optimized image chunk",
- "page_nums": [2],
- "file_path": "images/IMAGE_test1.jpg",
- "keywords": [],
- "tokens": [],
- },
+ "file_path": "images/IMAGE_test1.jpg",
},
{
"chunk_id": "table_chunk_optimized",
"type": "table",
"content": TABLE_HTML,
"path": "tables/table-optimized.html",
- "metadata": {
- "length": 1,
- "summary": "Optimized table chunk",
- "page_nums": [3],
- "file_path": "tables/table-optimized.html",
- "keywords": ["optimized"],
- "tokens": [],
- },
+ "file_path": "tables/table-optimized.html",
},
]
@@ -235,20 +234,18 @@ def test_loads_text_chunks(self) -> None:
assert text_chunks[0].chunk_id == "text_chunk_1"
assert text_chunks[0].content == "Hello world"
- def test_accepts_text_chunk_tokens_as_list(self) -> None:
+ def test_metadata_accessible_on_chunks(self) -> None:
manifest: Dict[str, Any] = _make_manifest()
chunks: List[Dict[str, Any]] = [
{
- "chunk_id": "text_chunk_tokens_list",
+ "chunk_id": "text_with_meta",
"type": "text",
- "content": "Attention is all you need",
- "path": "paper/abstract",
+ "content": "Text with metadata",
+ "path": "doc/section1",
"metadata": {
- "length": 25,
- "tokens": TEXT_TOKENS_LIST,
- "keywords": ["attention", "transformer"],
- "summary": "Transformer introduction",
- "relationships": [],
+ "length": 42,
+ "tokens": ["hello", "world"],
+ "summary": "A summary",
},
}
]
@@ -256,52 +253,10 @@ def test_accepts_text_chunk_tokens_as_list(self) -> None:
result: ParseResult = parseResultZip(zip_bytes, verify_checksum=False)
- assert len(result.text_chunks) == 1
- assert result.text_chunks[0].tokens == TEXT_TOKENS_LIST
-
- def test_rejects_legacy_text_chunk_tokens_string(self) -> None:
- manifest: Dict[str, Any] = _make_manifest()
- chunks: List[Dict[str, Any]] = [
- {
- "chunk_id": "text_chunk_tokens_string",
- "type": "text",
- "content": "Attention is all you need",
- "path": "paper/abstract",
- "metadata": {
- "length": 25,
- "tokens": "Ashish;Vaswani;attention;transformer",
- "keywords": ["attention", "transformer"],
- "summary": "Transformer introduction",
- "relationships": [],
- },
- }
- ]
- zip_bytes: bytes = _build_zip(manifest, chunks=chunks)
-
- with pytest.raises(KnowhereError, match="expected list\\[str\\]"):
- parseResultZip(zip_bytes, verify_checksum=False)
-
- def test_rejects_integer_text_chunk_tokens(self) -> None:
- manifest: Dict[str, Any] = _make_manifest()
- chunks: List[Dict[str, Any]] = [
- {
- "chunk_id": "text_chunk_tokens_int",
- "type": "text",
- "content": "Attention is all you need",
- "path": "paper/abstract",
- "metadata": {
- "length": 25,
- "tokens": 4,
- "keywords": ["attention", "transformer"],
- "summary": "Transformer introduction",
- "relationships": [],
- },
- }
- ]
- zip_bytes: bytes = _build_zip(manifest, chunks=chunks)
-
- with pytest.raises(KnowhereError, match="expected list\\[str\\]"):
- parseResultZip(zip_bytes, verify_checksum=False)
+ chunk = result.text_chunks[0]
+ assert chunk.metadata.length == 42
+ assert chunk.metadata.tokens == ["hello", "world"]
+ assert chunk.metadata.summary == "A summary"
def test_loads_image_chunks_with_data(self) -> None:
manifest: Dict[str, Any] = _make_manifest()
@@ -381,16 +336,9 @@ def test_exposes_optimized_payload_metadata_and_sidecar_assets(self) -> None:
assert result.manifest.processing.billing_status == "charged"
assert result.manifest.processing.cost is not None
assert result.manifest.processing.cost.micro_dollars == 60000
- assert result.text_chunks[0].page_nums == [1, 2]
- assert result.image_chunks[0].page_nums == [2]
- assert result.table_chunks[0].page_nums == [3]
- assert result.text_chunks[0].connect_to == [
- {
- "target": "image_chunk_optimized",
- "relation": "embeds",
- "ref": "[images/IMAGE_test1.jpg]",
- }
- ]
+ assert result.text_chunks[0].chunk_id == "text_chunk_optimized"
+ assert result.image_chunks[0].chunk_id == "image_chunk_optimized"
+ assert result.table_chunks[0].chunk_id == "table_chunk_optimized"
assert result.chunks_slim is not None
assert len(result.chunks_slim) == 1
assert result.kb_csv == "chunk_id,type\ntext_chunk_optimized,text\n"
@@ -449,6 +397,153 @@ def test_save_preserves_optimized_sidecar_files(self, tmp_path: Path) -> None:
assert (output_dir / "result.zip").exists()
+# ---------------------------------------------------------------------------
+# Current worker contract tests (doc_nav, HIERARCHY, absent legacy files)
+# ---------------------------------------------------------------------------
+
+
+def _make_current_contract_manifest() -> Dict[str, Any]:
+ """Return a manifest in the current worker format, including a HIERARCHY key."""
+ return {
+ "version": "2.0",
+ "job_id": "job_current123",
+ "data_id": None,
+ "source_file_name": "current.pdf",
+ "processing_date": "2026-05-01T00:00:00Z",
+ "HIERARCHY": {
+ "Default_Root": {
+ "current.pdf": {
+ "sections": ["Introduction", "Methods", "Results"],
+ }
+ }
+ },
+ "statistics": {
+ "total_chunks": 2,
+ "text_chunks": 1,
+ "image_chunks": 1,
+ "table_chunks": 0,
+ "total_pages": None,
+ },
+ }
+
+
+class TestCurrentWorkerContract:
+ """Tests against the current worker output contract."""
+
+ # -- doc_nav.json --
+
+ def test_parses_doc_nav(self) -> None:
+ manifest = _make_optimized_manifest()
+ chunks = _make_optimized_chunks()
+ zip_bytes = _build_zip(
+ manifest,
+ chunks=chunks,
+ extra_entries={
+ "doc_nav.json": json.dumps(DOC_NAV_JSON).encode("utf-8"),
+ "tables/table-optimized.html": TABLE_HTML.encode("utf-8"),
+ },
+ )
+
+ result = parseResultZip(zip_bytes, verify_checksum=False)
+
+ assert result.doc_nav is not None
+ doc_nav: DocNav = result.doc_nav
+ assert len(doc_nav.sections) == 1
+ assert doc_nav.sections[0].title == "Introduction"
+ assert doc_nav.sections[0].level == 1
+ assert doc_nav.sections[0].chunk_count == 3
+ assert len(doc_nav.sections[0].children) == 1
+ assert doc_nav.sections[0].children[0].title == "Background"
+ assert doc_nav.resources is not None
+ assert len(doc_nav.resources.images) == 1
+ assert doc_nav.resources.images[0].path == "images/IMAGE_test1.jpg"
+ assert len(doc_nav.resources.tables) == 1
+ assert doc_nav.resources.tables[0].path == "tables/table-optimized.html"
+
+ def test_doc_nav_none_when_missing(self) -> None:
+ manifest = _make_optimized_manifest()
+ zip_bytes = _build_zip(manifest)
+
+ result = parseResultZip(zip_bytes, verify_checksum=False)
+
+ assert result.doc_nav is None
+
+ def test_save_writes_doc_nav(self, tmp_path: Path) -> None:
+ manifest = _make_optimized_manifest()
+ chunks = _make_optimized_chunks()
+ zip_bytes = _build_zip(
+ manifest,
+ chunks=chunks,
+ extra_entries={
+ "doc_nav.json": json.dumps(DOC_NAV_JSON).encode("utf-8"),
+ "tables/table-optimized.html": TABLE_HTML.encode("utf-8"),
+ },
+ )
+
+ result = parseResultZip(zip_bytes, verify_checksum=False)
+ output_dir = tmp_path / "with-doc-nav"
+ result.save(output_dir)
+
+ assert (output_dir / "doc_nav.json").exists()
+
+ # -- Manifest HIERARCHY --
+
+ def test_manifest_hierarchy_alias(self) -> None:
+ manifest = _make_current_contract_manifest()
+ zip_bytes = _build_zip(manifest)
+
+ result = parseResultZip(zip_bytes, verify_checksum=False)
+
+ assert result.manifest.hierarchy is not None
+ assert "Default_Root" in result.manifest.hierarchy
+
+ def test_manifest_without_hierarchy(self) -> None:
+ manifest = _make_optimized_manifest()
+ zip_bytes = _build_zip(manifest)
+
+ result = parseResultZip(zip_bytes, verify_checksum=False)
+
+ assert result.manifest.hierarchy is None
+
+ # -- Graceful handling of missing legacy files --
+
+ def test_parses_without_chunks_slim(self) -> None:
+ """Current worker doesn't emit chunks_slim.json — parse must succeed."""
+ manifest = _make_optimized_manifest()
+ chunks = _make_optimized_chunks()
+ zip_bytes = _build_zip(
+ manifest,
+ chunks=chunks,
+ extra_entries={
+ "tables/table-optimized.html": TABLE_HTML.encode("utf-8"),
+ },
+ # No chunks_slim.json in extra_entries
+ )
+
+ result = parseResultZip(zip_bytes, verify_checksum=False)
+
+ assert result.chunks_slim is None
+ assert len(result.chunks) == 3
+
+ def test_parses_without_hierarchy_json(self) -> None:
+ """Current worker doesn't emit hierarchy.json — parse must succeed."""
+ manifest = _make_optimized_manifest()
+ chunks = _make_optimized_chunks()
+ zip_bytes = _build_zip(
+ manifest,
+ chunks=chunks,
+ extra_entries={
+ "tables/table-optimized.html": TABLE_HTML.encode("utf-8"),
+ },
+ # No hierarchy.json in extra_entries
+ )
+
+ result = parseResultZip(zip_bytes, verify_checksum=False)
+
+ assert result.hierarchy is None
+ assert result.manifest is not None
+
+
# ---------------------------------------------------------------------------
# Checksum verification
# ---------------------------------------------------------------------------
diff --git a/tests/test_retrieval.py b/tests/test_retrieval.py
index 4925e30..2029120 100644
--- a/tests/test_retrieval.py
+++ b/tests/test_retrieval.py
@@ -20,6 +20,35 @@ def _make_retrieval_response() -> Dict[str, Any]:
"namespace": "support-center",
"query": "refund policy",
"router_used": "discovery+agent",
+ "answer_text": "Annual plans may be refunded within 30 days of purchase.",
+ "referenced_chunks": [
+ {
+ "chunk_id": "chunk_001",
+ "document_id": "doc_123",
+ "asset_url": "https://example.com/assets/chunk_001",
+ }
+ ],
+ "results": [
+ {
+ "chunk_type": "text",
+ "content": "Annual plans may be refunded within 30 days.",
+ "score": 1.0,
+ "source": {
+ "document_id": "doc_123",
+ "source_file_name": "refund-policy.md",
+ "section_path": "Policies / Billing / Refunds",
+ },
+ }
+ ],
+ }
+
+
+def _make_legacy_retrieval_response() -> Dict[str, Any]:
+ """Legacy-mode response without agentic fields (backward compatibility)."""
+ return {
+ "namespace": "support-center",
+ "query": "refund policy",
+ "router_used": "discovery+legacy",
"results": [
{
"chunk_type": "text",
@@ -93,6 +122,11 @@ def test_query_sends_request_and_returns_results(self, sync_client: Any) -> None
assert response.results[0].source.document_id == "doc_123"
assert response.results[0].source.source_file_name == "refund-policy.md"
assert response.results[0].source.section_path == "Policies / Billing / Refunds"
+ assert response.answer_text == (
+ "Annual plans may be refunded within 30 days of purchase."
+ )
+ assert len(response.referenced_chunks) == 1
+ assert response.referenced_chunks[0]["chunk_id"] == "chunk_001"
assert not hasattr(response.results[0], "citation")
assert not hasattr(response.results[0], "chunk_id")
assert not hasattr(response.results[0], "section_id")
@@ -127,3 +161,62 @@ async def test_async_query_sends_request_and_returns_results(
assert route.called
assert response.router_used == "discovery+agent"
assert response.results[0].source.document_id == "doc_123"
+
+ @respx.mock
+ def test_use_agentic_sends_parameter(self, sync_client: Any) -> None:
+ """use_agentic=True is included in the JSON request body."""
+ route = respx.post(RETRIEVAL_QUERY_URL).mock(
+ return_value=httpx.Response(200, json=_make_retrieval_response())
+ )
+
+ sync_client.retrieval.query(query="refund policy", use_agentic=True)
+
+ request_body: Dict[str, Any] = json.loads(route.calls[0].request.read())
+ assert request_body["use_agentic"] is True
+
+ @respx.mock
+ def test_use_agentic_omitted_when_none(self, sync_client: Any) -> None:
+ """Not passing use_agentic omits it from the request body (server default)."""
+ route = respx.post(RETRIEVAL_QUERY_URL).mock(
+ return_value=httpx.Response(200, json=_make_retrieval_response())
+ )
+
+ sync_client.retrieval.query(query="refund policy")
+
+ request_body: Dict[str, Any] = json.loads(route.calls[0].request.read())
+ assert "use_agentic" not in request_body
+
+ @respx.mock
+ def test_agentic_response_fields(self, sync_client: Any) -> None:
+ """Agentic response exposes answer_text and referenced_chunks."""
+ route = respx.post(RETRIEVAL_QUERY_URL).mock(
+ return_value=httpx.Response(200, json=_make_retrieval_response())
+ )
+
+ response = sync_client.retrieval.query(
+ query="refund policy",
+ use_agentic=True,
+ )
+
+ assert response.answer_text == (
+ "Annual plans may be refunded within 30 days of purchase."
+ )
+ assert len(response.referenced_chunks) == 1
+ assert response.referenced_chunks[0]["chunk_id"] == "chunk_001"
+ assert response.referenced_chunks[0]["asset_url"] == (
+ "https://example.com/assets/chunk_001"
+ )
+
+ @respx.mock
+ def test_legacy_response_without_agentic_fields(self, sync_client: Any) -> None:
+ """Legacy-mode response parses: answer_text is None, referenced_chunks is []."""
+ route = respx.post(RETRIEVAL_QUERY_URL).mock(
+ return_value=httpx.Response(
+ 200, json=_make_legacy_retrieval_response()
+ )
+ )
+
+ response = sync_client.retrieval.query(query="refund policy")
+
+ assert response.answer_text is None
+ assert response.referenced_chunks == []