diff --git a/.github/workflows/build-faiss.yml b/.github/workflows/build-faiss.yml
new file mode 100644
index 00000000..fa0f1ef9
--- /dev/null
+++ b/.github/workflows/build-faiss.yml
@@ -0,0 +1,59 @@
+name: Build FAISS cache image
+
+on:
+  workflow_dispatch:
+  workflow_run:
+    workflows: ["Upload latest documentation"]
+    types: [completed]
+
+permissions:
+  contents: read
+  packages: write
+
+env:
+  CACHE_IMAGE: ghcr.io/${{ github.repository_owner }}/orassistant-faiss-cache # owner keeps its original case; lowercased at every use below
+
+jobs:
+  build-faiss-cache:
+    # Skip if triggered by upload workflow that failed
+    if: >
+      github.event_name == 'workflow_dispatch' ||
+      github.event.workflow_run.conclusion == 'success'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+
+      - name: Log in to GHCR
+        uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Restore previous faiss_db for incremental build
+        run: |
+          mkdir -p backend/faiss_db
+          if docker pull "${CACHE_IMAGE,,}:latest" 2>/dev/null; then # Docker rejects uppercase repository names, hence the bash ,, lowercasing
+            docker create --name prev-faiss "${CACHE_IMAGE,,}:latest"
+            docker cp prev-faiss:/ORAssistant-backend/faiss_db/. backend/faiss_db/ || true
+            docker rm prev-faiss
+            echo "Restored previous faiss_db for incremental build"
+          else
+            echo "No previous cache image found — building all indices from scratch"
+          fi
+          # Ensure at least one file exists so COPY ./faiss_db in Dockerfile always succeeds
+          touch backend/faiss_db/.keep
+
+      - name: Build FAISS cache image
+        run: |
+          docker build \
+            -f backend/Dockerfile.faiss-cache \
+            -t "${CACHE_IMAGE,,}:latest" \
+            -t "${CACHE_IMAGE,,}:${{ github.sha }}" \
+            backend/
+
+      - name: Push FAISS cache image
+        run: |
+          docker push "${CACHE_IMAGE,,}:latest"
+          docker push "${CACHE_IMAGE,,}:${{ github.sha }}"
diff --git a/backend/Dockerfile b/backend/Dockerfile
index b8cbb94f..01b00ef0 100644
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@@ -1,3 +1,8 @@
+# Pull pre-built FAISS indices from the cache image (built by the build-faiss CI workflow).
+# This avoids rebuilding indices on every PR — only the secret CI rebuilds them.
+ARG FAISS_CACHE_IMAGE=ghcr.io/the-openroad-project/orassistant-faiss-cache:latest
+FROM ${FAISS_CACHE_IMAGE} AS faiss-cache
+
 FROM ghcr.io/astral-sh/uv:python3.13-bookworm-slim
 
 WORKDIR /ORAssistant-backend
@@ -22,10 +27,15 @@ COPY . .
 
 RUN uv venv .venv && uv sync --dev && uv run /ORAssistant-backend/src/post_install.py
 
-RUN git clone https://huggingface.co/datasets/The-OpenROAD-Project/ORAssistant_RAG_Dataset && \
-    mkdir -p data && \
-    mv ORAssistant_RAG_Dataset/* data/ && \
-    rm -rf ORAssistant_RAG_Dataset
+# Copy pre-built FAISS indices from the cache image.
+# HybridRetrieverChain detects faiss_db/ at startup and takes the fast load_db()
+# path, reducing graph init from ~60 min to a few seconds.
+COPY --from=faiss-cache /ORAssistant-backend/faiss_db ./faiss_db
+
+# Runtime embedding model must match the model used when building the indices.
+# Override via docker run -e or docker-compose environment if using a different model.
+ENV EMBEDDINGS_TYPE=HF
+ENV HF_EMBEDDINGS=thenlper/gte-large
 
 EXPOSE 8000
 
diff --git a/backend/Dockerfile.faiss-cache b/backend/Dockerfile.faiss-cache
new file mode 100644
index 00000000..3929750e
--- /dev/null
+++ b/backend/Dockerfile.faiss-cache
@@ -0,0 +1,40 @@
+FROM ghcr.io/astral-sh/uv:python3.13-bookworm-slim
+
+WORKDIR /ORAssistant-backend
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    gcc \
+    git \
+    git-lfs \
+    wget && \
+    git lfs install && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN pip install uv
+
+COPY ./pyproject.toml /ORAssistant-backend/pyproject.toml
+COPY ./src ./src
+COPY ./scripts ./scripts
+
+RUN uv venv .venv && uv sync --dev && uv run /ORAssistant-backend/src/post_install.py
+
+# Restore the previous faiss_db so build_faiss.py can skip indices whose sources are unchanged.
+# The build-faiss CI workflow extracts faiss_db/ from the previous cache image and places
+# it in the build context (backend/faiss_db). On first run the directory contains only a .keep sentinel.
+COPY ./faiss_db ./faiss_db
+
+# Download the RAG dataset (always refreshed so the hash check in build_faiss.py
+# can detect data changes from the latest upload). NOTE(review): a warm Docker layer cache could reuse the clone layer — presumably CI runners start cold; confirm.
+RUN git clone https://huggingface.co/datasets/The-OpenROAD-Project/ORAssistant_RAG_Dataset && \
+    mkdir -p data && \
+    mv ORAssistant_RAG_Dataset/* data/ && \
+    rm -rf ORAssistant_RAG_Dataset
+
+ARG BUILD_EMBEDDINGS_TYPE=HF
+ARG BUILD_HF_MODEL=thenlper/gte-large
+RUN PYTHONPATH=/ORAssistant-backend \
+    EMBEDDINGS_TYPE=${BUILD_EMBEDDINGS_TYPE} \
+    HF_EMBEDDINGS=${BUILD_HF_MODEL} \
+    FAST_MODE=true \
+    uv run python scripts/build_faiss.py
diff --git a/backend/Dockerfile.faiss-cache.dockerignore b/backend/Dockerfile.faiss-cache.dockerignore
new file mode 100644
index 00000000..66542e52
--- /dev/null
+++ b/backend/Dockerfile.faiss-cache.dockerignore
@@ -0,0 +1,7 @@
+.venv
+*.egg-info
+.mypy_cache
+__pycache__
+data
+tests
+# faiss_db is intentionally NOT excluded — CI places previous indices here for incremental builds
diff --git a/backend/scripts/build_faiss.py b/backend/scripts/build_faiss.py
new file mode 100644
index 00000000..8ea5852d
--- /dev/null
+++ b/backend/scripts/build_faiss.py
@@ -0,0 +1,182 @@
+"""
+Pre-build FAISS indices at Docker image build time with per-index incremental skipping.
+
+Each index has a hash computed from its source file paths + sizes. If the hash matches
+the stored manifest and the index directory already exists, that index is skipped.
+Only changed or missing indices are rebuilt, so the secret CI only pays for what changed (same-size content edits go undetected).
+
+Run from /ORAssistant-backend (the WORKDIR in the Dockerfile) so data/ is reachable.
+EMBEDDINGS_TYPE and HF_EMBEDDINGS are read from the environment.
+"""
+
+import hashlib
+import json
+import logging
+import os
+import sys
+from pathlib import Path
+
+from src.chains.hybrid_retriever_chain import HybridRetrieverChain
+
+logging.basicConfig(level="INFO", format="%(levelname)s %(message)s")
+
+FAISS_DB_DIR = Path("faiss_db")  # relative path: resolved against the current working directory
+MANIFEST_FILE = FAISS_DB_DIR / "manifest.json"  # JSON object keyed by index name
+
+embeddings_type = os.environ.get("EMBEDDINGS_TYPE", "HF")
+hf_embeddings = os.environ.get("HF_EMBEDDINGS", "thenlper/gte-large")
+fast_mode = os.environ.get("FAST_MODE", "true").lower() == "true"  # any value other than "true" selects the full dataset
+chunk_size = int(os.environ.get("CHUNK_SIZE", 4000))  # raises ValueError when CHUNK_SIZE is not an integer string
+
+if not os.path.isdir("data"):  # data/ is looked up relative to the current working directory
+    sys.exit("ERROR: run from backend directory — data/ not found")
+
+if embeddings_type != "HF":  # fail fast: this script only handles HF embeddings
+    sys.exit(
+        f"ERROR: build_faiss.py only supports EMBEDDINGS_TYPE=HF, got '{embeddings_type}'"
+    )
+
+embeddings_config = {"type": embeddings_type, "name": hf_embeddings}  # handed unchanged to HybridRetrieverChain
+
+_MD = "./data/markdown"
+_HTML = "./data/html"
+
+# Mirrors fastmode_docs_map / markdown_docs_map in retriever_tools.py.
+# Each entry must use the same paths and index_name as RetrieverTools.initialize().
+INDEX_DEFS: dict[bool, list[dict]] = {  # keyed by fast_mode; each entry holds an index name plus its source directories
+    True: [  # fast_mode=True
+        {"name": "general", "markdown_docs_path": [f"{_MD}/OR_docs"]},
+        {"name": "install", "markdown_docs_path": [f"{_MD}/ORFS_docs/installation"]},
+        {"name": "commands", "markdown_docs_path": [f"{_MD}/OR_docs/tools"]},
+        {
+            "name": "yosys_rtdocs",
+            "html_docs_path": [
+                f"{_HTML}/yosys_docs/yosyshq.readthedocs.io"
+                "/projects/yosys/en/latest/getting_started"
+            ],
+        },
+        {
+            "name": "klayout",
+            "html_docs_path": [f"{_HTML}/klayout_docs/www.klayout.de/examples"],
+        },
+        {"name": "errinfo", "markdown_docs_path": [f"{_MD}/gh_discussions/Bug"]},
+    ],
+    False: [  # fast_mode=False (full dataset)
+        {
+            "name": "general",
+            "markdown_docs_path": [
+                f"{_MD}/OR_docs",
+                f"{_MD}/ORFS_docs",
+                f"{_MD}/gh_discussions",
+                f"{_MD}/manpages/man1",
+                f"{_MD}/manpages/man2",
+                f"{_MD}/OpenSTA_docs",
+            ],
+            "html_docs_path": [f"{_HTML}/or_website/"],
+            "other_docs_path": ["./data/pdf"],
+        },
+        {
+            "name": "install",
+            "markdown_docs_path": [
+                f"{_MD}/ORFS_docs/installation",
+                f"{_MD}/OR_docs/installation",
+                f"{_MD}/gh_discussions/Build",
+                f"{_MD}/gh_discussions/Installation",
+                f"{_MD}/OpenSTA_docs",
+            ],
+        },
+        {
+            "name": "commands",
+            "markdown_docs_path": [
+                f"{_MD}/OR_docs/tools",
+                f"{_MD}/ORFS_docs/general",
+                f"{_MD}/gh_discussions/Query",
+                f"{_MD}/gh_discussions/Runtime",
+                f"{_MD}/gh_discussions/Documentation",
+                f"{_MD}/manpages/man1",
+                f"{_MD}/manpages/man2",
+                f"{_MD}/OpenSTA_docs",
+            ],
+            "other_docs_path": ["./data/pdf"],
+        },
+        {"name": "yosys_rtdocs", "html_docs_path": [f"{_HTML}/yosys_docs"]},
+        {"name": "klayout", "html_docs_path": [f"{_HTML}/klayout_docs"]},
+        {
+            "name": "errinfo",
+            "markdown_docs_path": [
+                f"{_MD}/manpages/man3",
+                f"{_MD}/gh_discussions/Bug",
+            ],
+        },
+    ],
+}
+
+
+def _source_paths(index_def: dict) -> list[str]:  # every source dir of one entry, in markdown / html / other order
+    paths: list[str] = []
+    for key in ("markdown_docs_path", "html_docs_path", "other_docs_path"):
+        paths.extend(index_def.get(key, []))  # a key absent from the entry contributes nothing
+    return paths
+
+
+def _hash_paths(paths: list[str]) -> str:
+    """Stable hash over base dir + relative path + size of every non-hidden file under the given directories."""
+    h = hashlib.sha256()
+    for base in sorted(p for p in paths if Path(p).exists()):  # source dirs that do not exist are skipped, not an error
+        for f in sorted(Path(base).rglob("*")):  # sorted so the digest does not depend on directory listing order
+            if f.is_file() and not f.name.startswith("."):
+                h.update(f"{base}\0{f.relative_to(Path(base)).as_posix()}\0".encode())  # base dir included; NUL keeps fields from running together
+                h.update(f"{f.stat().st_size}\n".encode())  # size only: an edit that keeps the byte count is not detected
+    return h.hexdigest()
+
+
+def _load_manifest() -> dict:  # returns the stored manifest, or {} when no manifest file exists yet
+    if MANIFEST_FILE.exists():
+        return json.loads(MANIFEST_FILE.read_text())
+    return {}
+
+
+def _save_manifest(manifest: dict) -> None:  # creates faiss_db/ if needed, then rewrites manifest.json
+    FAISS_DB_DIR.mkdir(exist_ok=True)
+    MANIFEST_FILE.write_text(json.dumps(manifest, indent=2))
+
+
+logging.info(
+    "Pre-building FAISS indices (fast_mode=%s, model=%s)", fast_mode, hf_embeddings
+)
+
+manifest = _load_manifest()
+updated = False
+
+for index_def in INDEX_DEFS[fast_mode]:
+    name: str = index_def["name"]
+    paths = _source_paths(index_def)
+    current_hash = f"{hf_embeddings}|chunk={chunk_size}|{_hash_paths(paths)}"  # a model or chunk-size change must also force a rebuild
+    index_dir = FAISS_DB_DIR / name
+
+    if manifest.get(name) == current_hash and index_dir.is_dir():  # skip only when the fingerprint matches AND the index is on disk
+        logging.info("Skipping %s (source data unchanged)", name)
+        continue
+
+    logging.info("Building index: %s", name)
+    chain = HybridRetrieverChain(
+        embeddings_config=embeddings_config,
+        reranking_model_name="",  # presumably no reranker is needed just to build the index — confirm
+        use_cuda=False,
+        index_name=name,
+        markdown_docs_path=index_def.get("markdown_docs_path"),
+        html_docs_path=index_def.get("html_docs_path"),
+        other_docs_path=index_def.get("other_docs_path"),
+        chunk_size=chunk_size,
+        contextual_rerank=False,
+    )
+    chain.create_hybrid_retriever()
+    manifest[name] = current_hash
+    updated = True
+    logging.info("Built index: %s", name)
+
+if updated:  # the manifest is written once, after every index in this run has been handled
+    _save_manifest(manifest)
+    logging.info("FAISS pre-build complete (manifest updated)")
+else:
+    logging.info("FAISS pre-build complete (all indices up to date, nothing rebuilt)")
diff --git a/backend/src/agents/retriever_tools.py b/backend/src/agents/retriever_tools.py
index c2ad3823..21ffa9ea 100644
---
a/backend/src/agents/retriever_tools.py +++ b/backend/src/agents/retriever_tools.py @@ -45,6 +45,7 @@ def initialize( reranking_model_name: str, use_cuda: bool = False, fast_mode: bool = False, + contextual_rerank: bool = True, ) -> None: markdown_docs_map = { "general": [ @@ -97,9 +98,10 @@ def initialize( else markdown_docs_map["general"], other_docs_path=[] if fast_mode else ["./data/pdf"], weights=[0.6, 0.2, 0.2], - contextual_rerank=True, + contextual_rerank=contextual_rerank, search_k=search_k, chunk_size=chunk_size, + index_name="general", ) general_retriever_chain.create_hybrid_retriever() RetrieverTools.general_retriever = general_retriever_chain.retriever @@ -112,9 +114,10 @@ def initialize( if fast_mode else markdown_docs_map["install"], weights=[0.6, 0.2, 0.2], - contextual_rerank=True, + contextual_rerank=contextual_rerank, search_k=search_k, chunk_size=chunk_size, + index_name="install", ) install_retriever_chain.create_hybrid_retriever() RetrieverTools.install_retriever = install_retriever_chain.retriever @@ -128,9 +131,10 @@ def initialize( else markdown_docs_map["commands"], other_docs_path=[] if fast_mode else ["./data/pdf"], weights=[0.6, 0.2, 0.2], - contextual_rerank=True, + contextual_rerank=contextual_rerank, search_k=search_k, chunk_size=chunk_size, + index_name="commands", ) commands_retriever_chain.create_hybrid_retriever() RetrieverTools.commands_retriever = commands_retriever_chain.retriever @@ -143,9 +147,10 @@ def initialize( if fast_mode else ["./data/html/yosys_docs"], weights=[0.6, 0.2, 0.2], - contextual_rerank=True, + contextual_rerank=contextual_rerank, search_k=search_k, chunk_size=chunk_size, + index_name="yosys_rtdocs", ) yosys_rtdocs_retriever_chain.create_hybrid_retriever() RetrieverTools.yosys_rtdocs_retriever = yosys_rtdocs_retriever_chain.retriever @@ -158,9 +163,10 @@ def initialize( if fast_mode else ["./data/html/klayout_docs"], weights=[0.6, 0.2, 0.2], - contextual_rerank=True, + contextual_rerank=contextual_rerank, 
search_k=search_k, chunk_size=chunk_size, + index_name="klayout", ) klayout_retriever_chain.create_hybrid_retriever() RetrieverTools.klayout_retriever = klayout_retriever_chain.retriever @@ -173,9 +179,10 @@ def initialize( if fast_mode else markdown_docs_map["errinfo"], weights=[0.6, 0.2, 0.2], - contextual_rerank=True, + contextual_rerank=contextual_rerank, search_k=search_k, chunk_size=chunk_size, + index_name="errinfo", ) errinfo_retriever_chain.create_hybrid_retriever() RetrieverTools.errinfo_retriever = errinfo_retriever_chain.retriever diff --git a/backend/src/chains/hybrid_retriever_chain.py b/backend/src/chains/hybrid_retriever_chain.py index d96be2d8..44a24be8 100644 --- a/backend/src/chains/hybrid_retriever_chain.py +++ b/backend/src/chains/hybrid_retriever_chain.py @@ -36,6 +36,7 @@ def __init__( weights: list[float] = [0.33, 0.33, 0.33], chunk_size: int = 500, contextual_rerank: bool = False, + index_name: Optional[str] = None, ): super().__init__( llm_model=llm_model, @@ -58,6 +59,7 @@ def __init__( self.chunk_size: int = chunk_size self.contextual_rerank: bool = contextual_rerank + self.index_name: Optional[str] = index_name self.retriever: Any # RunnableParallel compatibility def create_hybrid_retriever(self) -> None: @@ -72,6 +74,7 @@ def create_hybrid_retriever(self) -> None: html_docs_path=self.html_docs_path, chunk_size=self.chunk_size, use_cuda=self.use_cuda, + name=self.index_name, ) if self.vector_db is None: cur_path = os.path.abspath(__file__) diff --git a/backend/src/chains/similarity_retriever_chain.py b/backend/src/chains/similarity_retriever_chain.py index 01547ebf..1c0285b2 100644 --- a/backend/src/chains/similarity_retriever_chain.py +++ b/backend/src/chains/similarity_retriever_chain.py @@ -28,6 +28,7 @@ def __init__( embeddings_config: Optional[dict[str, str]] = None, use_cuda: bool = False, chunk_size: int = 500, + name: Optional[str] = None, ): super().__init__( llm_model=llm_model, @@ -36,7 +37,7 @@ def __init__( ) 
SimilarityRetrieverChain.count += 1 - self.name = f"similarity_INST{SimilarityRetrieverChain.count}" + self.name = name if name is not None else f"similarity_INST{SimilarityRetrieverChain.count}" self.embeddings_config: Optional[dict[str, str]] = embeddings_config self.use_cuda: bool = use_cuda