Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions .github/workflows/build-faiss.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
name: Build FAISS cache image

# Rebuilds the pre-computed FAISS indices and publishes them as a GHCR image.
# Runs on demand, or whenever the documentation upload workflow completes.
on:
  workflow_dispatch:
  workflow_run:
    workflows: ["Upload latest documentation"]
    types: [completed]

permissions:
  contents: read
  packages: write

jobs:
  build-faiss-cache:
    # Skip if triggered by upload workflow that failed
    if: >
      github.event_name == 'workflow_dispatch' ||
      github.event.workflow_run.conclusion == 'success'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4.3.1

      - name: Log in to GHCR
        uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772  # v3.4.0
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Docker rejects image references that contain uppercase characters, and
      # github.repository_owner keeps the owner's original casing. Lower-case
      # the reference once here and export it for every later step.
      - name: Resolve cache image name
        env:
          OWNER: ${{ github.repository_owner }}
        run: |
          echo "CACHE_IMAGE=ghcr.io/${OWNER,,}/orassistant-faiss-cache" >> "$GITHUB_ENV"

      - name: Restore previous faiss_db for incremental build
        run: |
          mkdir -p backend/faiss_db
          # stderr is left visible so a failed pull shows its real cause in the log
          if docker pull "${CACHE_IMAGE}:latest"; then
            docker create --name prev-faiss "${CACHE_IMAGE}:latest"
            docker cp prev-faiss:/ORAssistant-backend/faiss_db/. backend/faiss_db/ || true
            docker rm prev-faiss
            echo "Restored previous faiss_db for incremental build"
          else
            echo "No previous cache image found — building all indices from scratch"
          fi
          # Ensure at least one file exists so COPY ./faiss_db in Dockerfile always succeeds
          touch backend/faiss_db/.keep

      - name: Build FAISS cache image
        run: |
          docker build \
            -f backend/Dockerfile.faiss-cache \
            -t "${CACHE_IMAGE}:latest" \
            -t "${CACHE_IMAGE}:${GITHUB_SHA}" \
            backend/

      - name: Push FAISS cache image
        run: |
          docker push "${CACHE_IMAGE}:latest"
          docker push "${CACHE_IMAGE}:${GITHUB_SHA}"
18 changes: 14 additions & 4 deletions backend/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# Pull pre-built FAISS indices from the cache image (built by the build-faiss CI workflow).
# This avoids rebuilding indices on every PR — only the secret CI rebuilds them.
ARG FAISS_CACHE_IMAGE=ghcr.io/the-openroad-project/orassistant-faiss-cache:latest
FROM ${FAISS_CACHE_IMAGE} AS faiss-cache

FROM ghcr.io/astral-sh/uv:python3.13-bookworm-slim

WORKDIR /ORAssistant-backend
Expand All @@ -22,10 +27,15 @@ COPY . .

RUN uv venv .venv && uv sync --dev && uv run /ORAssistant-backend/src/post_install.py

RUN git clone https://huggingface.co/datasets/The-OpenROAD-Project/ORAssistant_RAG_Dataset && \
mkdir -p data && \
mv ORAssistant_RAG_Dataset/* data/ && \
rm -rf ORAssistant_RAG_Dataset
# Copy pre-built FAISS indices from the cache image.
# HybridRetrieverChain detects faiss_db/<name> at startup and takes the fast load_db()
# path, reducing graph init from ~60 min to a few seconds.
COPY --from=faiss-cache /ORAssistant-backend/faiss_db ./faiss_db

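# Runtime embedding model must match the model used when building the indices
# (BUILD_EMBEDDINGS_TYPE / BUILD_HF_MODEL in Dockerfile.faiss-cache).
# Only override these (docker run -e or docker-compose environment) together with a
# FAISS_CACHE_IMAGE whose indices were built with that same model.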
ENV EMBEDDINGS_TYPE=HF
ENV HF_EMBEDDINGS=thenlper/gte-large

EXPOSE 8000

Expand Down
40 changes: 40 additions & 0 deletions backend/Dockerfile.faiss-cache
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Image whose only purpose is to carry pre-built FAISS indices under
# /ORAssistant-backend/faiss_db so other builds can COPY --from it.
FROM ghcr.io/astral-sh/uv:python3.13-bookworm-slim

WORKDIR /ORAssistant-backend

# Build toolchain plus git, which the dataset clone below needs
# (git-lfs presumably for LFS-tracked dataset files — confirm).
RUN apt-get update && apt-get install -y \
    build-essential \
    gcc \
    git \
    git-lfs \
    wget && \
    git lfs install && \
    rm -rf /var/lib/apt/lists/*

# NOTE(review): the base image is the uv image, so this install looks
# redundant — confirm before removing.
RUN pip install uv

# Copy only what dependency resolution and the build script need, so the
# dependency layer is not invalidated by unrelated changes in the context.
COPY ./pyproject.toml /ORAssistant-backend/pyproject.toml
COPY ./src ./src
COPY ./scripts ./scripts

RUN uv venv .venv && uv sync --dev && uv run /ORAssistant-backend/src/post_install.py

# Restore previous faiss_db for incremental builds.
# The build-faiss CI workflow extracts faiss_db/ from the previous cache image and places
# it in the build context. On first run the directory contains only a .keep sentinel.
COPY ./faiss_db ./faiss_db

# Download the RAG dataset (always refreshed so the hash check in build_faiss.py
# can detect data changes from the latest upload).
# Only the latest snapshot is used and the clone is deleted right away, so a
# shallow clone avoids downloading history that would be thrown away.
RUN git clone --depth 1 https://huggingface.co/datasets/The-OpenROAD-Project/ORAssistant_RAG_Dataset && \
    mkdir -p data && \
    mv ORAssistant_RAG_Dataset/* data/ && \
    rm -rf ORAssistant_RAG_Dataset

# Embedding settings used while building the indices; override with --build-arg.
ARG BUILD_EMBEDDINGS_TYPE=HF
ARG BUILD_HF_MODEL=thenlper/gte-large
# The variables are set inline, so they apply to this build step only and are
# not persisted as ENV in the resulting image.
RUN PYTHONPATH=/ORAssistant-backend \
    EMBEDDINGS_TYPE=${BUILD_EMBEDDINGS_TYPE} \
    HF_EMBEDDINGS=${BUILD_HF_MODEL} \
    FAST_MODE=true \
    uv run python scripts/build_faiss.py
7 changes: 7 additions & 0 deletions backend/Dockerfile.faiss-cache.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Keep local environments, caches and bulky data out of the build context.
.venv
*.egg-info
# mypy writes its cache to .mypy_cache (underscore)
.mypy_cache
# Patterns are rooted at the context directory; ** also covers nested packages
**/__pycache__
data
tests
# faiss_db is intentionally NOT excluded — CI places previous indices here for incremental builds
182 changes: 182 additions & 0 deletions backend/scripts/build_faiss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
"""
Pre-build FAISS indices at Docker image build time with per-index incremental skipping.

Each index has a hash computed from its source file paths + sizes. If the hash matches
the stored manifest and the index directory already exists, that index is skipped.
Only changed or missing indices are rebuilt, so the secret CI only pays for what changed.

Run from /ORAssistant-backend (the WORKDIR in the Dockerfile) so data/ is reachable.
EMBEDDINGS_TYPE and HF_EMBEDDINGS are read from the environment.
"""

import hashlib
import json
import logging
import os
import sys
from pathlib import Path

from src.chains.hybrid_retriever_chain import HybridRetrieverChain

# Emit plain "LEVEL message" lines so the Docker build log stays readable.
logging.basicConfig(format="%(levelname)s %(message)s", level="INFO")

# Index store and the manifest recording each index's source hash; both are
# resolved relative to the current working directory.
FAISS_DB_DIR = Path("faiss_db")
MANIFEST_FILE = FAISS_DB_DIR.joinpath("manifest.json")

# Build settings, each overridable through an environment variable.
embeddings_type = os.getenv("EMBEDDINGS_TYPE", "HF")
hf_embeddings = os.getenv("HF_EMBEDDINGS", "thenlper/gte-large")
fast_mode = os.getenv("FAST_MODE", "true").lower() == "true"
chunk_size = int(os.getenv("CHUNK_SIZE", 4000))

# data/ is looked up relative to the current directory, so stop early when the
# script is started from anywhere else.
if not Path("data").is_dir():
    sys.exit("ERROR: run from backend directory — data/ not found")

# This script handles HuggingFace embeddings only.
if embeddings_type != "HF":
    sys.exit(
        f"ERROR: build_faiss.py only supports EMBEDDINGS_TYPE=HF, got '{embeddings_type}'"
    )

embeddings_config = dict(type=embeddings_type, name=hf_embeddings)

# Root directories of the markdown and HTML parts of the dataset, relative to
# the working directory.
_MD = "./data/markdown"
_HTML = "./data/html"

# Mirrors fastmode_docs_map / markdown_docs_map in retriever_tools.py.
# Each entry must use the same paths and index_name as RetrieverTools.initialize().
#
# Keyed by the fast_mode flag. Every entry has a "name" (also the index's
# directory name under faiss_db/) and any of the optional path lists
# "markdown_docs_path", "html_docs_path" and "other_docs_path".
INDEX_DEFS: dict[bool, list[dict]] = {
    True: [  # fast_mode=True
        {"name": "general", "markdown_docs_path": [f"{_MD}/OR_docs"]},
        {"name": "install", "markdown_docs_path": [f"{_MD}/ORFS_docs/installation"]},
        {"name": "commands", "markdown_docs_path": [f"{_MD}/OR_docs/tools"]},
        {
            "name": "yosys_rtdocs",
            "html_docs_path": [
                # Two adjacent literals: Python joins them into a single path.
                f"{_HTML}/yosys_docs/yosyshq.readthedocs.io"
                "/projects/yosys/en/latest/getting_started"
            ],
        },
        {
            "name": "klayout",
            "html_docs_path": [f"{_HTML}/klayout_docs/www.klayout.de/examples"],
        },
        {"name": "errinfo", "markdown_docs_path": [f"{_MD}/gh_discussions/Bug"]},
    ],
    False: [  # fast_mode=False (full dataset)
        {
            "name": "general",
            "markdown_docs_path": [
                f"{_MD}/OR_docs",
                f"{_MD}/ORFS_docs",
                f"{_MD}/gh_discussions",
                f"{_MD}/manpages/man1",
                f"{_MD}/manpages/man2",
                f"{_MD}/OpenSTA_docs",
            ],
            "html_docs_path": [f"{_HTML}/or_website/"],
            "other_docs_path": ["./data/pdf"],
        },
        {
            "name": "install",
            "markdown_docs_path": [
                f"{_MD}/ORFS_docs/installation",
                f"{_MD}/OR_docs/installation",
                f"{_MD}/gh_discussions/Build",
                f"{_MD}/gh_discussions/Installation",
                f"{_MD}/OpenSTA_docs",
            ],
        },
        {
            "name": "commands",
            "markdown_docs_path": [
                f"{_MD}/OR_docs/tools",
                f"{_MD}/ORFS_docs/general",
                f"{_MD}/gh_discussions/Query",
                f"{_MD}/gh_discussions/Runtime",
                f"{_MD}/gh_discussions/Documentation",
                f"{_MD}/manpages/man1",
                f"{_MD}/manpages/man2",
                f"{_MD}/OpenSTA_docs",
            ],
            "other_docs_path": ["./data/pdf"],
        },
        {"name": "yosys_rtdocs", "html_docs_path": [f"{_HTML}/yosys_docs"]},
        {"name": "klayout", "html_docs_path": [f"{_HTML}/klayout_docs"]},
        {
            "name": "errinfo",
            "markdown_docs_path": [
                f"{_MD}/manpages/man3",
                f"{_MD}/gh_discussions/Bug",
            ],
        },
    ],
}


def _source_paths(index_def: dict) -> list[str]:
paths: list[str] = []
for key in ("markdown_docs_path", "html_docs_path", "other_docs_path"):
paths.extend(index_def.get(key, []))
return paths


def _hash_paths(paths: list[str]) -> str:
"""Stable hash over file path + size for all files under the given directories."""
h = hashlib.sha256()
for base in sorted(p for p in paths if Path(p).exists()):
for f in sorted(Path(base).rglob("*")):
if f.is_file() and not f.name.startswith("."):
h.update(str(f.relative_to(Path(base))).encode())
h.update(str(f.stat().st_size).encode())
return h.hexdigest()


def _load_manifest() -> dict:
    """Load the index-name -> source-hash manifest from MANIFEST_FILE.

    Returns:
        The stored mapping, or an empty dict when the manifest is missing,
        unreadable, not valid JSON, or not a JSON object. An empty manifest
        makes every index count as changed, so the worst case is a full
        rebuild rather than a build that keeps failing on a damaged file.
    """
    if not MANIFEST_FILE.exists():
        return {}
    try:
        manifest = json.loads(MANIFEST_FILE.read_text())
    except (OSError, ValueError) as exc:
        # JSON and text-decoding errors are both ValueError subclasses.
        logging.warning(
            "Ignoring unreadable manifest %s (%s); all indices will be rebuilt",
            MANIFEST_FILE,
            exc,
        )
        return {}
    if not isinstance(manifest, dict):
        logging.warning(
            "Ignoring manifest %s: expected a JSON object; all indices will be rebuilt",
            MANIFEST_FILE,
        )
        return {}
    return manifest


def _save_manifest(manifest: dict) -> None:
    """Write the manifest to MANIFEST_FILE as JSON indented by two spaces.

    FAISS_DB_DIR is created first if it does not exist yet.
    """
    FAISS_DB_DIR.mkdir(exist_ok=True)
    payload = json.dumps(manifest, indent=2)
    MANIFEST_FILE.write_text(payload)


logging.info(
    "Pre-building FAISS indices (fast_mode=%s, model=%s)", fast_mode, hf_embeddings
)

manifest = _load_manifest()
updated = False

# Settings that determine what a built index contains. They are folded into
# every stored fingerprint so that switching the embedding model or chunk size
# forces a rebuild even when the source documents are unchanged; otherwise an
# index restored from a previous build could be reused with a different model.
build_settings = json.dumps(
    {
        "embeddings": embeddings_config,
        "chunk_size": chunk_size,
        "fast_mode": fast_mode,
    },
    sort_keys=True,
)

for index_def in INDEX_DEFS[fast_mode]:
    name: str = index_def["name"]
    paths = _source_paths(index_def)
    # Fingerprint = build settings + source-file hash, NUL-separated.
    current_hash = hashlib.sha256(
        (build_settings + "\0" + _hash_paths(paths)).encode()
    ).hexdigest()
    index_dir = FAISS_DB_DIR / name

    # Skip only when the fingerprint matches AND the index directory is present.
    if manifest.get(name) == current_hash and index_dir.is_dir():
        logging.info("Skipping %s (source data unchanged)", name)
        continue

    logging.info("Building index: %s", name)
    chain = HybridRetrieverChain(
        embeddings_config=embeddings_config,
        reranking_model_name="",
        use_cuda=False,
        index_name=name,
        markdown_docs_path=index_def.get("markdown_docs_path"),
        html_docs_path=index_def.get("html_docs_path"),
        other_docs_path=index_def.get("other_docs_path"),
        chunk_size=chunk_size,
        contextual_rerank=False,
    )
    chain.create_hybrid_retriever()
    manifest[name] = current_hash
    updated = True
    # Persist right away so an interrupted run keeps credit for the indices it
    # already finished.
    _save_manifest(manifest)
    logging.info("Built index: %s", name)

if updated:
    logging.info("FAISS pre-build complete (manifest updated)")
else:
    logging.info("FAISS pre-build complete (all indices up to date, nothing rebuilt)")
Loading