From 73094d4f95ef693785fa3965f6f2a223dfd2a350 Mon Sep 17 00:00:00 2001
From: Will <will@local>
Date: Mon, 27 Apr 2026 04:15:46 +0000
Subject: [PATCH] feat: add document chunks resource methods

---
 README.md                           |  15 ++++
 docs/usage.md                       |  16 ++++
 src/knowhere/__init__.py            |  15 +++-
 src/knowhere/resources/documents.py | 119 +++++++++++++++++++++++++++-
 src/knowhere/types/__init__.py      |  15 +++-
 src/knowhere/types/document.py      |  52 +++++++++++-
 tests/test_documents.py             | 113 ++++++++++++++++++++++++++
 7 files changed, 341 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 1cacf09..dd366b6 100644
--- a/README.md
+++ b/README.md
@@ -85,6 +85,21 @@ update_job = client.jobs.create(
 document = client.documents.get(document_id)
 print(document.status)
 
+chunks = client.documents.list_chunks(
+    document_id,
+    page=1,
+    page_size=50,
+    chunk_type="text",
+)
+print(chunks.pagination.total)
+if chunks.chunks:
+    chunk = client.documents.get_chunk(
+        document_id,
+        chunks.chunks[0].id,
+        include_asset_urls=True,
+    )
+    print(chunk.chunk.content)
+
 client.documents.archive(document_id)
 ```
 
diff --git a/docs/usage.md b/docs/usage.md
index 507f5f1..e1c0714 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -521,6 +521,22 @@ for document in document_list.documents:
 document = client.documents.get("doc_123")
 print(document.current_job_result_id)
 
+chunks = client.documents.list_chunks(
+    "doc_123",
+    page=1,
+    page_size=50,
+    chunk_type="text",
+)
+for chunk in chunks.chunks:
+    print(chunk.id, chunk.content)
+
+image_chunk = client.documents.get_chunk(
+    "doc_123",
+    "dchk_123",
+    include_asset_urls=True,
+)
+print(image_chunk.chunk.asset_url)
+
 archived = client.documents.archive("doc_123")
 print(archived.status)  # "archived"
 ```
diff --git a/src/knowhere/__init__.py b/src/knowhere/__init__.py
index b136805..87c5446 100644
--- a/src/knowhere/__init__.py
+++ b/src/knowhere/__init__.py
@@ -35,7 +35,15 @@
 )
 from knowhere._types import PollProgressCallback, UploadProgressCallback
 from knowhere._version import __version__
-from knowhere.types.document import Document, DocumentListResponse
+from knowhere.types.document import (
+    Document,
+    DocumentChunk,
+    DocumentChunkListResponse,
+    DocumentChunkPagination,
+    DocumentChunkResponse,
+    DocumentChunkType,
+    DocumentListResponse,
+)
 from knowhere.types.job import Job, JobError, JobProgress, JobResult
 from knowhere.types.params import ParsingParams, WebhookConfig
 from knowhere.types.retrieval import (
@@ -98,6 +106,11 @@
     "JobResult",
     # Document types
     "Document",
+    "DocumentChunk",
+    "DocumentChunkListResponse",
+    "DocumentChunkPagination",
+    "DocumentChunkResponse",
+    "DocumentChunkType",
     "DocumentListResponse",
     # Retrieval types
     "RetrievalChannel",
diff --git a/src/knowhere/resources/documents.py b/src/knowhere/resources/documents.py
index c826d64..c58aebb 100644
--- a/src/knowhere/resources/documents.py
+++ b/src/knowhere/resources/documents.py
@@ -5,7 +5,13 @@
 from typing import Any, Dict, Optional
 
 from knowhere.resources._base import AsyncAPIResource, SyncAPIResource
-from knowhere.types.document import Document, DocumentListResponse
+from knowhere.types.document import (
+    Document,
+    DocumentChunkListResponse,
+    DocumentChunkResponse,
+    DocumentChunkType,
+    DocumentListResponse,
+)
 
 
 class Documents(SyncAPIResource):
@@ -32,6 +38,49 @@ def get(self, document_id: str) -> Document:
             cast_to=Document,
         )
 
+    def list_chunks(
+        self,
+        document_id: str,
+        *,
+        page: int = 1,
+        page_size: int = 50,
+        chunk_type: Optional[DocumentChunkType] = None,
+        include_asset_urls: bool = False,
+    ) -> DocumentChunkListResponse:
+        """Return one page of current-revision chunks for a canonical document."""
+        query: Dict[str, Any] = _build_chunk_list_params(
+            page=page,
+            page_size=page_size,
+            chunk_type=chunk_type,
+            include_asset_urls=include_asset_urls,
+        )
+        # Pass ``None`` instead of an empty dict when there is nothing to send.
+        return self._request(
+            "GET",
+            f"v1/documents/{document_id}/chunks",
+            params=query if query else None,
+            cast_to=DocumentChunkListResponse,
+        )
+
+    def get_chunk(
+        self,
+        document_id: str,
+        document_chunk_id: str,
+        *,
+        include_asset_urls: bool = False,
+    ) -> DocumentChunkResponse:
+        """Fetch a single current-revision chunk of a canonical document."""
+        query: Dict[str, Any] = _build_chunk_get_params(
+            include_asset_urls=include_asset_urls,
+        )
+        # Pass ``None`` instead of an empty dict when there is nothing to send.
+        return self._request(
+            "GET",
+            f"v1/documents/{document_id}/chunks/{document_chunk_id}",
+            params=query if query else None,
+            cast_to=DocumentChunkResponse,
+        )
+
     def archive(self, document_id: str) -> Document:
         """Archive one canonical document by ID."""
         return self._request(
@@ -65,6 +114,49 @@ async def get(self, document_id: str) -> Document:
             cast_to=Document,
         )
 
+    async def list_chunks(
+        self,
+        document_id: str,
+        *,
+        page: int = 1,
+        page_size: int = 50,
+        chunk_type: Optional[DocumentChunkType] = None,
+        include_asset_urls: bool = False,
+    ) -> DocumentChunkListResponse:
+        """Return one page of current-revision chunks for a canonical document."""
+        query: Dict[str, Any] = _build_chunk_list_params(
+            page=page,
+            page_size=page_size,
+            chunk_type=chunk_type,
+            include_asset_urls=include_asset_urls,
+        )
+        # Pass ``None`` instead of an empty dict when there is nothing to send.
+        return await self._request(
+            "GET",
+            f"v1/documents/{document_id}/chunks",
+            params=query if query else None,
+            cast_to=DocumentChunkListResponse,
+        )
+
+    async def get_chunk(
+        self,
+        document_id: str,
+        document_chunk_id: str,
+        *,
+        include_asset_urls: bool = False,
+    ) -> DocumentChunkResponse:
+        """Fetch a single current-revision chunk of a canonical document."""
+        query: Dict[str, Any] = _build_chunk_get_params(
+            include_asset_urls=include_asset_urls,
+        )
+        # Pass ``None`` instead of an empty dict when there is nothing to send.
+        return await self._request(
+            "GET",
+            f"v1/documents/{document_id}/chunks/{document_chunk_id}",
+            params=query if query else None,
+            cast_to=DocumentChunkResponse,
+        )
+
     async def archive(self, document_id: str) -> Document:
         """Archive one canonical document by ID."""
         return await self._request(
@@ -72,3 +164,28 @@ async def archive(self, document_id: str) -> Document:
             f"v1/documents/{document_id}/archive",
             cast_to=Document,
         )
+
+
+def _build_chunk_list_params(
+    *,
+    page: int,
+    page_size: int,
+    chunk_type: Optional[DocumentChunkType],
+    include_asset_urls: bool,
+) -> Dict[str, Any]:
+    """Build list-chunks query params, skipping values that match the defaults."""
+    # Rows are (query key, value to send, include?) in the order keys are emitted.
+    candidates = (
+        ("page", page, page != 1),
+        ("page_size", page_size, page_size != 50),
+        ("chunk_type", chunk_type, chunk_type is not None),
+        ("include_asset_urls", True, include_asset_urls),
+    )
+    selected = {key: value for key, value, include in candidates if include}
+    return selected
+
+
+def _build_chunk_get_params(*, include_asset_urls: bool) -> Dict[str, Any]:
+    """Return the single-chunk query params: the asset-URL flag, or nothing."""
+    flag: Dict[str, Any] = {"include_asset_urls": True}
+    return flag if include_asset_urls else {}
diff --git a/src/knowhere/types/__init__.py b/src/knowhere/types/__init__.py
index a492955..e2323fe 100644
--- a/src/knowhere/types/__init__.py
+++ b/src/knowhere/types/__init__.py
@@ -2,7 +2,15 @@
 
 from __future__ import annotations
 
-from knowhere.types.document import Document, DocumentListResponse
+from knowhere.types.document import (
+    Document,
+    DocumentChunk,
+    DocumentChunkListResponse,
+    DocumentChunkPagination,
+    DocumentChunkResponse,
+    DocumentChunkType,
+    DocumentListResponse,
+)
 from knowhere.types.job import Job, JobError, JobResult
 from knowhere.types.params import ParsingParams, WebhookConfig
 from knowhere.types.retrieval import (
@@ -39,6 +47,11 @@
     "JobResult",
     # document
     "Document",
+    "DocumentChunk",
+    "DocumentChunkListResponse",
+    "DocumentChunkPagination",
+    "DocumentChunkResponse",
+    "DocumentChunkType",
     "DocumentListResponse",
     # retrieval
     "RetrievalChannel",
diff --git a/src/knowhere/types/document.py b/src/knowhere/types/document.py
index f41a438..fcd1b80 100644
--- a/src/knowhere/types/document.py
+++ b/src/knowhere/types/document.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 from datetime import datetime
-from typing import Optional
+from typing import Any, Dict, Literal, Optional
 
 from pydantic import BaseModel
 
@@ -26,3 +26,53 @@ class DocumentListResponse(BaseModel):
 
     namespace: str
     documents: list[Document]
+
+
+DocumentChunkType = Literal["text", "image", "table"]  # allowed ``chunk_type`` values
+
+
+class DocumentChunkPagination(BaseModel):
+    """Pagination metadata returned by document chunk list endpoints."""
+
+    page: int  # presumably the 1-based page that was returned — confirm
+    page_size: int  # presumably the page size the server applied — confirm
+    total: int  # presumably the chunk count across all pages — confirm
+    total_pages: int
+
+
+class DocumentChunk(BaseModel):
+    """One current-revision document chunk."""
+
+    id: str  # value callers pass to ``get_chunk`` as ``document_chunk_id``
+    chunk_id: str  # separate from ``id``; presumably the parser's own chunk id — confirm
+    chunk_type: DocumentChunkType
+    content: Optional[str] = None
+    section_id: Optional[str] = None
+    section_path: Optional[str] = None
+    source_chunk_path: Optional[str] = None
+    file_path: Optional[str] = None
+    sort_order: int  # presumably the chunk's position within the document — confirm
+    metadata: Dict[str, Any]  # required; free-form mapping, no schema enforced here
+    asset_url: Optional[str] = None  # presumably set only when asset URLs were requested
+    created_at: Optional[datetime] = None
+
+
+class DocumentChunkListResponse(BaseModel):
+    """Response from ``GET /v1/documents/{document_id}/chunks``."""
+
+    document_id: str
+    namespace: str
+    job_result_id: Optional[str] = None  # may be null or absent in the payload
+    job_id: Optional[str] = None  # may be null or absent in the payload
+    chunks: list[DocumentChunk]  # presumably only the returned page's chunks — confirm
+    pagination: DocumentChunkPagination
+
+
+class DocumentChunkResponse(BaseModel):
+    """Response from ``GET /v1/documents/{document_id}/chunks/{document_chunk_id}``."""
+
+    document_id: str
+    namespace: str
+    job_result_id: Optional[str] = None  # may be null or absent in the payload
+    job_id: Optional[str] = None  # may be null or absent in the payload
+    chunk: DocumentChunk
diff --git a/tests/test_documents.py b/tests/test_documents.py
index 8869642..205d57e 100644
--- a/tests/test_documents.py
+++ b/tests/test_documents.py
@@ -27,6 +27,23 @@ def _make_document(status: str = "active") -> Dict[str, Any]:
     }
 
 
+def _make_document_chunk(chunk_type: str = "text") -> Dict[str, Any]:
+    return {  # ``file_path`` and ``asset_url`` are non-null only for image chunks
+        "id": "dchk_123",
+        "chunk_id": "parser-chunk-1",
+        "chunk_type": chunk_type,
+        "content": "Chunk content",
+        "section_id": "sec_123",
+        "section_path": "Chapter 1",
+        "source_chunk_path": "Chapter 1/Intro",
+        "file_path": "images/figure-1.png" if chunk_type == "image" else None,
+        "sort_order": 0,
+        "metadata": {"summary": "Intro", "page_nums": [1]},
+        "asset_url": "https://assets.example/figure-1.png" if chunk_type == "image" else None,
+        "created_at": "2026-04-27T04:00:00Z",
+    }
+
+
 class TestDocumentsResource:
     """Verify document lifecycle calls."""
 
@@ -77,6 +94,102 @@ def test_get_document_returns_document_state(self, sync_client: Any) -> None:
         assert document.document_id == "doc_123"
         assert document.status == "active"
 
+    @respx.mock
+    def test_list_chunks_sends_optional_query_params(self, sync_client: Any) -> None:
+        """Every non-default list argument must reach the server as a query param."""
+        chunks_route = respx.get(f"{DOCUMENTS_URL}/doc_123/chunks").mock(
+            return_value=httpx.Response(
+                200,
+                json={
+                    "document_id": "doc_123",
+                    "namespace": "support-center",
+                    "job_result_id": "result_123",
+                    "job_id": "job_123",
+                    "chunks": [_make_document_chunk(chunk_type="table")],
+                    "pagination": {
+                        "page": 2,
+                        "page_size": 10,
+                        "total": 11,
+                        "total_pages": 2,
+                    },
+                },
+            )
+        )
+        listing = sync_client.documents.list_chunks(
+            "doc_123",
+            page=2,
+            page_size=10,
+            chunk_type="table",
+            include_asset_urls=True,
+        )
+
+        assert chunks_route.called
+        sent = chunks_route.calls[0].request.url.params
+        assert (sent["page"], sent["page_size"]) == ("2", "10")
+        assert sent["chunk_type"] == "table"
+        assert sent["include_asset_urls"] == "true"
+        assert listing.document_id == "doc_123"
+        assert listing.chunks[0].id == "dchk_123"
+        assert listing.pagination.total_pages == 2
+
+    @respx.mock
+    def test_list_chunks_omits_default_query_params(self, sync_client: Any) -> None:
+        """Calling with defaults only must produce a request without query params."""
+        chunks_route = respx.get(f"{DOCUMENTS_URL}/doc_123/chunks").mock(
+            return_value=httpx.Response(
+                200,
+                json={
+                    "document_id": "doc_123",
+                    "namespace": "support-center",
+                    "job_result_id": None,
+                    "job_id": None,
+                    "chunks": [],
+                    "pagination": {
+                        "page": 1,
+                        "page_size": 50,
+                        "total": 0,
+                        "total_pages": 0,
+                    },
+                },
+            )
+        )
+        listing = sync_client.documents.list_chunks("doc_123")
+
+        assert chunks_route.called
+        assert not dict(chunks_route.calls[0].request.url.params)
+        assert listing.chunks == []
+        assert listing.pagination.total == 0
+
+    @respx.mock
+    @pytest.mark.asyncio
+    async def test_async_get_chunk_requests_asset_urls_only_when_needed(
+        self,
+        async_client: Any,
+    ) -> None:
+        """Requesting asset URLs must send ``include_asset_urls=true`` on the chunk GET."""
+        chunk_route = respx.get(f"{DOCUMENTS_URL}/doc_123/chunks/dchk_123").mock(
+            return_value=httpx.Response(
+                200,
+                json={
+                    "document_id": "doc_123",
+                    "namespace": "support-center",
+                    "job_result_id": "result_123",
+                    "job_id": "job_123",
+                    "chunk": _make_document_chunk(chunk_type="image"),
+                },
+            )
+        )
+        fetched = await async_client.documents.get_chunk(
+            "doc_123",
+            "dchk_123",
+            include_asset_urls=True,
+        )
+
+        assert chunk_route.called
+        assert chunk_route.calls[0].request.url.params["include_asset_urls"] == "true"
+        assert fetched.chunk.id == "dchk_123"
+        assert fetched.chunk.asset_url == "https://assets.example/figure-1.png"
+
     @respx.mock
     def test_archive_document_returns_archived_state(self, sync_client: Any) -> None:
         route = respx.post(f"{DOCUMENTS_URL}/doc_123/archive").mock(