From 73094d4f95ef693785fa3965f6f2a223dfd2a350 Mon Sep 17 00:00:00 2001 From: Will Date: Mon, 27 Apr 2026 04:15:46 +0000 Subject: [PATCH] feat: add document chunks resource methods --- README.md | 15 ++++ docs/usage.md | 16 ++++ src/knowhere/__init__.py | 15 +++- src/knowhere/resources/documents.py | 119 +++++++++++++++++++++++++++- src/knowhere/types/__init__.py | 15 +++- src/knowhere/types/document.py | 52 +++++++++++- tests/test_documents.py | 113 ++++++++++++++++++++++++++ 7 files changed, 341 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 1cacf09..dd366b6 100644 --- a/README.md +++ b/README.md @@ -85,6 +85,21 @@ update_job = client.jobs.create( document = client.documents.get(document_id) print(document.status) +chunks = client.documents.list_chunks( + document_id, + page=1, + page_size=50, + chunk_type="text", +) +print(chunks.pagination.total) +if chunks.chunks: + chunk = client.documents.get_chunk( + document_id, + chunks.chunks[0].id, + include_asset_urls=True, + ) + print(chunk.chunk.content) + client.documents.archive(document_id) ``` diff --git a/docs/usage.md b/docs/usage.md index 507f5f1..e1c0714 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -521,6 +521,22 @@ for document in document_list.documents: document = client.documents.get("doc_123") print(document.current_job_result_id) +chunks = client.documents.list_chunks( + "doc_123", + page=1, + page_size=50, + chunk_type="text", +) +for chunk in chunks.chunks: + print(chunk.id, chunk.content) + +image_chunk = client.documents.get_chunk( + "doc_123", + "dchk_123", + include_asset_urls=True, +) +print(image_chunk.chunk.asset_url) + archived = client.documents.archive("doc_123") print(archived.status) # "archived" ``` diff --git a/src/knowhere/__init__.py b/src/knowhere/__init__.py index b136805..87c5446 100644 --- a/src/knowhere/__init__.py +++ b/src/knowhere/__init__.py @@ -35,7 +35,15 @@ ) from knowhere._types import PollProgressCallback, UploadProgressCallback from knowhere._version import __version__ -from knowhere.types.document import Document, DocumentListResponse +from knowhere.types.document import ( + Document, + DocumentChunk, + DocumentChunkListResponse, + DocumentChunkPagination, + DocumentChunkResponse, + DocumentChunkType, + DocumentListResponse, +) from knowhere.types.job import Job, JobError, JobProgress, JobResult from knowhere.types.params import ParsingParams, WebhookConfig from knowhere.types.retrieval import ( @@ -98,6 +106,11 @@ "JobResult", # Document types "Document", + "DocumentChunk", + "DocumentChunkListResponse", + "DocumentChunkPagination", + "DocumentChunkResponse", + "DocumentChunkType", "DocumentListResponse", # Retrieval types "RetrievalChannel", diff --git a/src/knowhere/resources/documents.py b/src/knowhere/resources/documents.py index c826d64..c58aebb 100644 --- a/src/knowhere/resources/documents.py +++ b/src/knowhere/resources/documents.py @@ -5,7 +5,13 @@ from typing import Any, Dict, Optional from knowhere.resources._base import AsyncAPIResource, SyncAPIResource -from knowhere.types.document import Document, DocumentListResponse +from knowhere.types.document import ( + Document, + DocumentChunkListResponse, + DocumentChunkResponse, + DocumentChunkType, + DocumentListResponse, +) class Documents(SyncAPIResource): @@ -32,6 +38,49 @@ def get(self, document_id: str) -> Document: cast_to=Document, ) + def list_chunks( + self, + document_id: str, + *, + page: int = 1, + page_size: int = 50, + chunk_type: Optional[DocumentChunkType] = None, + include_asset_urls: bool = False, + ) -> DocumentChunkListResponse: + """List current-revision chunks for one canonical document.""" + params: Dict[str, Any] = _build_chunk_list_params( + page=page, + page_size=page_size, + chunk_type=chunk_type, + include_asset_urls=include_asset_urls, + ) + + return self._request( + "GET", + f"v1/documents/{document_id}/chunks", + params=params or None, + cast_to=DocumentChunkListResponse, + ) + + def get_chunk( + self, + document_id: str, + document_chunk_id: str, + *, + include_asset_urls: bool = False, + ) -> DocumentChunkResponse: + """Get one current-revision chunk for one canonical document.""" + params: Dict[str, Any] = _build_chunk_get_params( + include_asset_urls=include_asset_urls, + ) + + return self._request( + "GET", + f"v1/documents/{document_id}/chunks/{document_chunk_id}", + params=params or None, + cast_to=DocumentChunkResponse, + ) + def archive(self, document_id: str) -> Document: """Archive one canonical document by ID.""" return self._request( @@ -65,6 +114,49 @@ async def get(self, document_id: str) -> Document: cast_to=Document, ) + async def list_chunks( + self, + document_id: str, + *, + page: int = 1, + page_size: int = 50, + chunk_type: Optional[DocumentChunkType] = None, + include_asset_urls: bool = False, + ) -> DocumentChunkListResponse: + """List current-revision chunks for one canonical document.""" + params: Dict[str, Any] = _build_chunk_list_params( + page=page, + page_size=page_size, + chunk_type=chunk_type, + include_asset_urls=include_asset_urls, + ) + + return await self._request( + "GET", + f"v1/documents/{document_id}/chunks", + params=params or None, + cast_to=DocumentChunkListResponse, + ) + + async def get_chunk( + self, + document_id: str, + document_chunk_id: str, + *, + include_asset_urls: bool = False, + ) -> DocumentChunkResponse: + """Get one current-revision chunk for one canonical document.""" + params: Dict[str, Any] = _build_chunk_get_params( + include_asset_urls=include_asset_urls, + ) + + return await self._request( + "GET", + f"v1/documents/{document_id}/chunks/{document_chunk_id}", + params=params or None, + cast_to=DocumentChunkResponse, + ) + async def archive(self, document_id: str) -> Document: """Archive one canonical document by ID.""" return await self._request( @@ -72,3 +164,28 @@ async def archive(self, document_id: str) -> Document: f"v1/documents/{document_id}/archive", cast_to=Document, ) + + +def _build_chunk_list_params( + *, + page: int, + page_size: int, + chunk_type: Optional[DocumentChunkType], + include_asset_urls: bool, +) -> Dict[str, Any]: + params: Dict[str, Any] = {} + if page != 1: + params["page"] = page + if page_size != 50: + params["page_size"] = page_size + if chunk_type is not None: + params["chunk_type"] = chunk_type + if include_asset_urls: + params["include_asset_urls"] = True + return params + + +def _build_chunk_get_params(*, include_asset_urls: bool) -> Dict[str, Any]: + if not include_asset_urls: + return {} + return {"include_asset_urls": True} diff --git a/src/knowhere/types/__init__.py b/src/knowhere/types/__init__.py index a492955..e2323fe 100644 --- a/src/knowhere/types/__init__.py +++ b/src/knowhere/types/__init__.py @@ -2,7 +2,15 @@ from __future__ import annotations -from knowhere.types.document import Document, DocumentListResponse +from knowhere.types.document import ( + Document, + DocumentChunk, + DocumentChunkListResponse, + DocumentChunkPagination, + DocumentChunkResponse, + DocumentChunkType, + DocumentListResponse, +) from knowhere.types.job import Job, JobError, JobResult from knowhere.types.params import ParsingParams, WebhookConfig from knowhere.types.retrieval import ( @@ -39,6 +47,11 @@ "JobResult", # document "Document", + "DocumentChunk", + "DocumentChunkListResponse", + "DocumentChunkPagination", + "DocumentChunkResponse", + "DocumentChunkType", "DocumentListResponse", # retrieval "RetrievalChannel", diff --git a/src/knowhere/types/document.py b/src/knowhere/types/document.py index f41a438..fcd1b80 100644 --- a/src/knowhere/types/document.py +++ b/src/knowhere/types/document.py @@ -3,7 +3,7 @@ from __future__ import annotations from datetime import datetime -from typing import Optional +from typing import Any, Dict, Literal, Optional from pydantic import BaseModel @@ -26,3 +26,53 @@ class DocumentListResponse(BaseModel): namespace: str documents: list[Document] + + +DocumentChunkType = Literal["text", "image", "table"] + + +class DocumentChunkPagination(BaseModel): + """Pagination metadata returned by document chunk list endpoints.""" + + page: int + page_size: int + total: int + total_pages: int + + +class DocumentChunk(BaseModel): + """One current-revision document chunk.""" + + id: str + chunk_id: str + chunk_type: DocumentChunkType + content: Optional[str] = None + section_id: Optional[str] = None + section_path: Optional[str] = None + source_chunk_path: Optional[str] = None + file_path: Optional[str] = None + sort_order: int + metadata: Dict[str, Any] + asset_url: Optional[str] = None + created_at: Optional[datetime] = None + + +class DocumentChunkListResponse(BaseModel): + """Response from ``GET /v1/documents/{document_id}/chunks``.""" + + document_id: str + namespace: str + job_result_id: Optional[str] = None + job_id: Optional[str] = None + chunks: list[DocumentChunk] + pagination: DocumentChunkPagination + + +class DocumentChunkResponse(BaseModel): + """Response from ``GET /v1/documents/{document_id}/chunks/{chunk_id}``.""" + + document_id: str + namespace: str + job_result_id: Optional[str] = None + job_id: Optional[str] = None + chunk: DocumentChunk diff --git a/tests/test_documents.py b/tests/test_documents.py index 8869642..205d57e 100644 --- a/tests/test_documents.py +++ b/tests/test_documents.py @@ -27,6 +27,23 @@ def _make_document(status: str = "active") -> Dict[str, Any]: } +def _make_document_chunk(chunk_type: str = "text") -> Dict[str, Any]: + return { + "id": "dchk_123", + "chunk_id": "parser-chunk-1", + "chunk_type": chunk_type, + "content": "Chunk content", + "section_id": "sec_123", + "section_path": "Chapter 1", + "source_chunk_path": "Chapter 1/Intro", + "file_path": "images/figure-1.png" if chunk_type == "image" else None, + "sort_order": 0, + "metadata": {"summary": "Intro", "page_nums": [1]}, + "asset_url": "https://assets.example/figure-1.png" if chunk_type == "image" else None, + "created_at": "2026-04-27T04:00:00Z", + } + + class TestDocumentsResource: """Verify document lifecycle calls.""" @@ -77,6 +94,102 @@ def test_get_document_returns_document_state(self, sync_client: Any) -> None: assert document.document_id == "doc_123" assert document.status == "active" + @respx.mock + def test_list_chunks_sends_optional_query_params(self, sync_client: Any) -> None: + route = respx.get(f"{DOCUMENTS_URL}/doc_123/chunks").mock( + return_value=httpx.Response( + 200, + json={ + "document_id": "doc_123", + "namespace": "support-center", + "job_result_id": "result_123", + "job_id": "job_123", + "chunks": [_make_document_chunk(chunk_type="table")], + "pagination": { + "page": 2, + "page_size": 10, + "total": 11, + "total_pages": 2, + }, + }, + ) + ) + + response = sync_client.documents.list_chunks( + "doc_123", + page=2, + page_size=10, + chunk_type="table", + include_asset_urls=True, + ) + + assert route.called + assert route.calls[0].request.url.params["page"] == "2" + assert route.calls[0].request.url.params["page_size"] == "10" + assert route.calls[0].request.url.params["chunk_type"] == "table" + assert route.calls[0].request.url.params["include_asset_urls"] == "true" + assert response.document_id == "doc_123" + assert response.chunks[0].id == "dchk_123" + assert response.pagination.total_pages == 2 + + @respx.mock + def test_list_chunks_omits_default_query_params(self, sync_client: Any) -> None: + route = respx.get(f"{DOCUMENTS_URL}/doc_123/chunks").mock( + return_value=httpx.Response( + 200, + json={ + "document_id": "doc_123", + "namespace": "support-center", + "job_result_id": None, + "job_id": None, + "chunks": [], + "pagination": { + "page": 1, + "page_size": 50, + "total": 0, + "total_pages": 0, + }, + }, + ) + ) + + response = sync_client.documents.list_chunks("doc_123") + + assert route.called + assert dict(route.calls[0].request.url.params) == {} + assert response.chunks == [] + assert response.pagination.total == 0 + + @respx.mock + @pytest.mark.asyncio + async def test_async_get_chunk_requests_asset_urls_only_when_needed( + self, + async_client: Any, + ) -> None: + route = respx.get(f"{DOCUMENTS_URL}/doc_123/chunks/dchk_123").mock( + return_value=httpx.Response( + 200, + json={ + "document_id": "doc_123", + "namespace": "support-center", + "job_result_id": "result_123", + "job_id": "job_123", + "chunk": _make_document_chunk(chunk_type="image"), + }, + ) + ) + + response = await async_client.documents.get_chunk( + "doc_123", + "dchk_123", + include_asset_urls=True, + ) + + assert route.called + assert route.calls[0].request.url.params["include_asset_urls"] == "true" + assert response.chunk.id == "dchk_123" + assert response.chunk.asset_url == "https://assets.example/figure-1.png" + @respx.mock def test_archive_document_returns_archived_state(self, sync_client: Any) -> None: route = respx.post(f"{DOCUMENTS_URL}/doc_123/archive").mock(