Haystack Document Store and Retriever backed by Pixeltable — persistent, versioned, multimodal data infrastructure for AI applications.
pip install pixeltable-haystack

from haystack import Document
from haystack_pixeltable import PixeltableDocumentStore
store = PixeltableDocumentStore(
table_name="myproject.docs",
embedding_dimension=1536,
)
# Write documents
store.write_documents([
Document(content="Pixeltable is multimodal data infrastructure.", embedding=[...]),
Document(content="Haystack is a framework for building RAG pipelines.", embedding=[...]),
])
# Filter documents
results = store.filter_documents(
filters={"field": "meta.category", "operator": "==", "value": "docs"}
)
# Count
print(store.count_documents())

from haystack_pixeltable import PixeltableDocumentStore, PixeltableRetriever
store = PixeltableDocumentStore(
table_name="myproject.docs",
embedding_dimension=1536,
)
retriever = PixeltableRetriever(document_store=store, top_k=5)
# Search by embedding vector
result = retriever.run(query_embedding=[0.1, 0.2, ...])
for doc in result["documents"]:
    print(f"{doc.content} (score: {doc.score:.3f})")

from haystack import Pipeline
from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack_pixeltable import PixeltableDocumentStore, PixeltableRetriever
store = PixeltableDocumentStore(
table_name="rag.knowledge",
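embedding_dimension=384,  # must equal the embedder's output size (e.g. all-MiniLM-L6-v2 -> 384; Haystack's default all-mpnet-base-v2 -> 768)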
)
# Indexing pipeline
indexing = Pipeline()
indexing.add_component("embedder", SentenceTransformersDocumentEmbedder())
indexing.add_component("writer", DocumentWriter(document_store=store))
indexing.connect("embedder", "writer")
# Query pipeline
query = Pipeline()
query.add_component("embedder", SentenceTransformersTextEmbedder())
query.add_component("retriever", PixeltableRetriever(document_store=store, top_k=5))
query.connect("embedder.embedding", "retriever.query_embedding")

The Document Store supports the Haystack filter specification:
# Simple equality
store.filter_documents(filters={"field": "meta.category", "operator": "==", "value": "science"})
# Comparison operators: ==, !=, >, >=, <, <=
store.filter_documents(filters={"field": "meta.score", "operator": ">", "value": 0.8})
# Compound AND
store.filter_documents(filters={
"operator": "AND",
"conditions": [
{"field": "meta.category", "operator": "==", "value": "science"},
{"field": "meta.score", "operator": ">", "value": 0.5},
],
})
# Compound OR
store.filter_documents(filters={
"operator": "OR",
"conditions": [
{"field": "meta.source", "operator": "==", "value": "arxiv"},
{"field": "meta.source", "operator": "==", "value": "pubmed"},
],
})

The .table property gives direct access to the underlying Pixeltable table for operations beyond the Haystack interface:
store = PixeltableDocumentStore(table_name="myproject.docs", embedding_dimension=1536)
t = store.table
# Add a computed column
import pixeltable.functions.openai as openai
t.add_computed_column(
summary=openai.chat_completions(
messages=[{"role": "user", "content": t.content}],
model="gpt-4o-mini",
)
)
# Use arbitrary Pixeltable queries
results = t.where(t.meta["category"] == "science").select(t.content, t.summary).collect()
# Version history
print(t.count(version=-1))  # row count at previous version

| Feature | Pixeltable | Chroma | Qdrant | pgvector |
|---|---|---|---|---|
| Persistent storage | Built-in | Opt-in | Opt-in | Built-in |
| Computed columns | Native | No | No | No |
| Version history | Native | No | No | No |
| Multimodal types | Image, Video, Audio, Document | Text only | Text only | Text only |
| Metadata filtering | JSON + SQL predicates | Limited | Rich | SQL |
| Embedding auto-compute | Via computed columns | Manual | Manual | Manual |

pip install -e ".[dev]"
pytest tests/ -v
ruff check . && ruff format --check .

License: Apache 2.0