From ebd2401f257cd1d140710aacc2b42a455ac6ed09 Mon Sep 17 00:00:00 2001
From: Pringled <thomas123@live.nl>
Date: Wed, 20 May 2026 18:58:51 +0200
Subject: [PATCH 01/11] Add content-specific search

---
 pyproject.toml             |  1 +
 src/semble/__init__.py     |  3 +-
 src/semble/cli.py          | 60 +++++++++++++++++++++-----------
 src/semble/index/create.py | 28 ++++++++++++---
 src/semble/index/files.py  | 58 ++++++++++++++++++-------------
 src/semble/index/index.py  | 70 +++++++++++++++++++++++++++-----------
 src/semble/mcp.py          | 20 ++++++-----
 src/semble/search.py       | 54 +++++++++++++++++++++++------
 src/semble/types.py        | 20 ++++++++++-
 tests/test_cli.py          | 20 +++++++++++
 tests/test_files.py        | 63 +++++++++++++++++++++-------------
 tests/test_index.py        | 41 ++++++++++++++++++----
 tests/test_ranking.py      |  6 ----
 tests/test_search.py       | 47 ++++++++++++++++++++++---
 uv.lock                    | 17 ++++++++-
 15 files changed, 376 insertions(+), 132 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index c30236d..a326ab1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ classifiers = [
 dependencies = [
     "model2vec>=0.4.0",
     "vicinity>=0.4.4",
+    "pyversity>=0.1.0",
     "numpy>=1.24.0",
     "bm25s>=0.2.0",
     "pathspec>=0.12",
diff --git a/src/semble/__init__.py b/src/semble/__init__.py
index ef61bdf..136f345 100644
--- a/src/semble/__init__.py
+++ b/src/semble/__init__.py
@@ -1,9 +1,10 @@
 from semble.index import SembleIndex
-from semble.types import Chunk, EmbeddingMatrix, Encoder, IndexStats, SearchResult
+from semble.types import Chunk, ContentType, EmbeddingMatrix, Encoder, IndexStats, SearchResult
 from semble.version import __version__
 
 __all__ = [
     "Chunk",
+    "ContentType",
     "EmbeddingMatrix",
     "Encoder",
     "IndexStats",
diff --git a/src/semble/cli.py b/src/semble/cli.py
index 9ee2df7..8f47998 100644
--- a/src/semble/cli.py
+++ b/src/semble/cli.py
@@ -1,6 +1,7 @@
 import argparse
 import asyncio
 import sys
+import warnings
 from enum import Enum
 from importlib.resources import files
 from importlib.util import find_spec
@@ -10,8 +11,11 @@
 
 from semble.index import SembleIndex
 from semble.stats import format_savings_report
+from semble.types import ContentType
 from semble.utils import _format_results, _is_git_url, _resolve_chunk
 
+_CONTENT_CHOICES = [ct.value for ct in ContentType]
+
 
 class Agent(str, Enum):
     CLAUDE = "claude"
@@ -32,6 +36,23 @@ def _agent_path(agent: Agent) -> Path:
     return Path(base_dir) / "agents" / "semble-search.md"
 
 
+def _add_content_args(p: argparse.ArgumentParser) -> None:
+    """Add --content and the deprecated --include-text-files to a parser or subparser."""
+    p.add_argument(
+        "--content",
+        action="append",
+        default=None,
+        choices=_CONTENT_CHOICES,
+        metavar="TYPE",
+        help="Content type(s) to index: 'code' (default), 'docs', 'all'. Repeatable: --content code --content docs.",
+    )
+    p.add_argument(
+        "--include-text-files",
+        action="store_true",
+        help="Deprecated. Use --content all instead.",
+    )
+
+
 def main() -> None:
     """Entry point for the semble command-line tool."""
     if len(sys.argv) > 1 and sys.argv[1] in _CLI_DISPATCH_ARGS:
@@ -52,18 +73,15 @@ def _mcp_main() -> None:
         help="Local directory or git URL to pre-index at startup (optional).",
     )
     parser.add_argument("--ref", default=None, help="Branch or tag to check out (git URLs only).")
-    parser.add_argument(
-        "--include-text-files",
-        action="store_true",
-        help="Also index non-code text files (.md, .yaml, .json, etc.).",
-    )
+    _add_content_args(parser)
     args = parser.parse_args()
     if any(find_spec(dep) is None for dep in get_package_extras("semble", "mcp")):
         print("MCP dependencies are not installed. Run: pip install 'semble[mcp]'", file=sys.stderr)
         raise SystemExit(1)
     from semble.mcp import serve
 
-    asyncio.run(serve(args.path, ref=args.ref, include_text_files=args.include_text_files))
+    content = _resolve_content(args.content, args.include_text_files)
+    asyncio.run(serve(args.path, ref=args.ref, content=content))
 
 
 def _run_init(*, agent: Agent = _DEFAULT_AGENT, force: bool = False) -> None:
@@ -78,6 +96,18 @@ def _run_init(*, agent: Agent = _DEFAULT_AGENT, force: bool = False) -> None:
     print(f"Created {dest}")
 
 
+def _resolve_content(content_args: list[str] | None, include_text_files: bool) -> list[ContentType]:
+    """Resolve --content values and the deprecated --include-text-files into a list of ContentType."""
+    if include_text_files:
+        warnings.warn(
+            "--include-text-files is deprecated and will be removed in a future version. Use --content all instead.",
+            DeprecationWarning,
+            stacklevel=3,
+        )
+        return [ContentType.ALL]
+    return [ContentType(v) for v in content_args] if content_args else [ContentType.CODE]
+
+
 def _cli_main() -> None:
     parser = argparse.ArgumentParser(prog="semble")
     sub = parser.add_subparsers(dest="command")
@@ -86,22 +116,14 @@ def _cli_main() -> None:
     search_p.add_argument("query", help="Natural language or code query.")
     search_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).")
     search_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).")
-    search_p.add_argument(
-        "--include-text-files",
-        action="store_true",
-        help="Also index non-code text files (.md, .yaml, .json, etc.).",
-    )
+    _add_content_args(search_p)
 
     related_p = sub.add_parser("find-related", help="Find code similar to a specific location.")
     related_p.add_argument("file_path", help="File path as shown in search results.")
     related_p.add_argument("line", type=int, help="Line number (1-indexed).")
     related_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).")
     related_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).")
-    related_p.add_argument(
-        "--include-text-files",
-        action="store_true",
-        help="Also index non-code text files (.md, .yaml, .json, etc.).",
-    )
+    _add_content_args(related_p)
 
     init_p = sub.add_parser("init", help="Write a semble sub-agent file for your coding agent.")
     init_p.add_argument(
@@ -126,11 +148,11 @@ def _cli_main() -> None:
         print(format_savings_report(verbose=args.verbose), end="")
         return
 
-    include_text = args.include_text_files
+    content = _resolve_content(args.content, args.include_text_files)
     index = (
-        SembleIndex.from_git(args.path, include_text_files=include_text)
+        SembleIndex.from_git(args.path, content=content)
         if _is_git_url(args.path)
-        else SembleIndex.from_path(args.path, include_text_files=include_text)
+        else SembleIndex.from_path(args.path, content=content)
     )
 
     if args.command == "search":
diff --git a/src/semble/index/create.py b/src/semble/index/create.py
index 168f8ef..1dd2609 100644
--- a/src/semble/index/create.py
+++ b/src/semble/index/create.py
@@ -1,4 +1,5 @@
 import contextlib
+import warnings
 from collections.abc import Sequence
 from pathlib import Path
 
@@ -11,31 +12,48 @@
 from semble.index.files import detect_language, get_extensions
 from semble.index.sparse import enrich_for_bm25
 from semble.tokens import tokenize
-from semble.types import Chunk, Encoder
+from semble.types import Chunk, ContentType, Encoder
 
 _MAX_FILE_BYTES = 1_000_000  # 1 MB max file size to read and index
+_DEFAULT_CONTENT: frozenset[ContentType] = frozenset({ContentType.CODE})
+_DEPRECATION_MSG = (
+    "include_text_files is deprecated and will be removed in a future version. Use content=ContentType.ALL instead."
+)
+
+
+def _apply_include_text_files(
+    normalized: frozenset[ContentType], include_text_files: bool | None
+) -> frozenset[ContentType]:
+    """Apply the deprecated include_text_files override, emitting a DeprecationWarning."""
+    if include_text_files is None:
+        return normalized
+    warnings.warn(_DEPRECATION_MSG, DeprecationWarning, stacklevel=3)
+    return frozenset({ContentType.ALL}) if include_text_files else _DEFAULT_CONTENT
 
 
 def create_index_from_path(
     path: Path,
     model: Encoder,
     extensions: Sequence[str] | None = None,
-    include_text_files: bool = False,
+    content: frozenset[ContentType] = _DEFAULT_CONTENT,
     display_root: Path | None = None,
+    include_text_files: bool | None = None,
 ) -> tuple[bm25s.BM25, SelectableBasicBackend, list[Chunk]]:
     """Create an index from a resolved directory, optionally storing chunk paths relative to display_root.
 
     :param path: Resolved absolute path to index.
     :param model: The model to use for indexing.
     :param extensions: File extensions to include.
-    :param include_text_files: If True, also index non-code text files (.md, .yaml, .json, etc.).
+    :param content: Content types to index.
     :param display_root: If set, chunk file paths are stored relative to this root.
+    :param include_text_files: Deprecated. Use ``content=ContentType.ALL`` instead.
     :raises ValueError: if no items were found, no index can be created.
     :return: A bm25 index, vicinity index and list of chunks
     """
+    content = _apply_include_text_files(content, include_text_files)
     chunks: list[Chunk] = []
-    extensions = get_extensions(include_text_files, extensions)
-    for file_path in walk_files(path, extensions):
+    resolved_extensions = get_extensions(content, extensions)
+    for file_path in walk_files(path, resolved_extensions):
         language = detect_language(file_path)
         with contextlib.suppress(OSError):
             if file_path.stat().st_size > _MAX_FILE_BYTES:
diff --git a/src/semble/index/files.py b/src/semble/index/files.py
index e79d7c7..10ae819 100644
--- a/src/semble/index/files.py
+++ b/src/semble/index/files.py
@@ -2,6 +2,8 @@
 from collections.abc import Sequence
 from pathlib import Path
 
+from semble.types import ContentType
+
 _EXTENSION_TO_LANGUAGE = {
     ".4th": "forth",
     ".ada": "ada",
@@ -357,8 +359,30 @@
 
 _DOC_LANGUAGES = {
     "asciidoc",
-    "beancount",
     "bibtex",
+    "djot",
+    "doxygen",
+    "html",
+    "javadoc",
+    "jsdoc",
+    "latex",
+    "luadoc",
+    "markdown",
+    "markdown_inline",
+    "mermaid",
+    "norg",
+    "norg_meta",
+    "org",
+    "phpdoc",
+    "po",
+    "rst",
+    "rtf",
+    "vimdoc",
+}
+
+# Everything that is not a programming language — used to derive _CODE_LANGUAGES.
+_NON_CODE_LANGUAGES = _DOC_LANGUAGES | {
+    "beancount",
     "capnp",
     "cedarschema",
     "comment",
@@ -368,8 +392,6 @@
     "desktop",
     "devicetree",
     "diff",
-    "djot",
-    "doxygen",
     "dtd",
     "editorconfig",
     "ebnf",
@@ -384,33 +406,18 @@
     "gpg",
     "hjson",
     "hocon",
-    "html",
     "ini",
-    "javadoc",
-    "jsdoc",
     "json",
     "json5",
     "kdl",
-    "latex",
     "ledger",
-    "luadoc",
-    "markdown",
-    "markdown_inline",
-    "mermaid",
-    "norg",
-    "norg_meta",
-    "org",
     "pem",
     "pgn",
-    "phpdoc",
-    "po",
     "properties",
     "proto",
     "psv",
     "requirements",
     "ron",
-    "rst",
-    "rtf",
     "smithy",
     "ssh_config",
     "textproto",
@@ -420,7 +427,6 @@
     "tsv",
     "turtle",
     "typespec",
-    "vimdoc",
     "wit",
     "xcompose",
     "xml",
@@ -438,7 +444,7 @@ def _inv_mapping(mapping: dict[str, str]) -> dict[str, list[str]]:
 
 
 ALL_LANGUAGES = frozenset(_EXTENSION_TO_LANGUAGE.values())
-_WITHOUT_DOC = ALL_LANGUAGES - _DOC_LANGUAGES
+_CODE_LANGUAGES = ALL_LANGUAGES - _NON_CODE_LANGUAGES
 _LANGUAGE_TO_EXTENSION = _inv_mapping(_EXTENSION_TO_LANGUAGE)
 
 
@@ -447,12 +453,16 @@ def detect_language(file_name: Path) -> str | None:
     return _EXTENSION_TO_LANGUAGE.get(file_name.suffix.lower())
 
 
-def get_extensions(include_text_files: bool, extensions: Sequence[str] | None) -> list[str]:
+def get_extensions(content: frozenset[ContentType], extensions: Sequence[str] | None) -> list[str]:
     """Returns a list of supported file extensions."""
-    if include_text_files:
-        languages = ALL_LANGUAGES
+    if ContentType.ALL in content:
+        languages: frozenset[str] = ALL_LANGUAGES
     else:
-        languages = _WITHOUT_DOC
+        languages = frozenset()
+        if ContentType.CODE in content:
+            languages |= _CODE_LANGUAGES
+        if ContentType.DOCS in content:
+            languages |= _DOC_LANGUAGES
     all_extensions: set[str] = set()
     for language in languages:
         all_extensions.update(_LANGUAGE_TO_EXTENSION.get(language, set()))
diff --git a/src/semble/index/index.py b/src/semble/index/index.py
index 56d51e3..d29b5a2 100644
--- a/src/semble/index/index.py
+++ b/src/semble/index/index.py
@@ -11,11 +11,22 @@
 import numpy.typing as npt
 from bm25s import BM25
 
-from semble.index.create import create_index_from_path
+from semble.index.create import _DEFAULT_CONTENT, _apply_include_text_files, create_index_from_path
 from semble.index.dense import SelectableBasicBackend, load_model
-from semble.search import _search_semantic, search
+from semble.search import DEFAULT_DOCS_DIVERSITY, _search_semantic, search
 from semble.stats import save_search_stats
-from semble.types import CallType, Chunk, Encoder, IndexStats, SearchResult
+from semble.types import (
+    CallType,
+    Chunk,
+    ContentSelection,
+    ContentType,
+    Encoder,
+    IndexStats,
+    SearchResult,
+    normalize_content,
+)
+
+_UNSET = object()
 
 _GIT_CLONE_TIMEOUT = int(os.environ.get("SEMBLE_CLONE_TIMEOUT", 60))
 
@@ -30,6 +41,7 @@ def __init__(
         semantic_index: SelectableBasicBackend,
         chunks: list[Chunk],
         root: Path | None = None,
+        content: frozenset[ContentType] = _DEFAULT_CONTENT,
     ) -> None:
         """Initialize a SembleIndex. Should be created with from_path or from_git.
 
@@ -38,12 +50,14 @@ def __init__(
         :param semantic_index: The semantic index.
         :param chunks: The found chunks.
         :param root: Root directory used to read file sizes for token-savings stats.
+        :param content: Content types used when indexing; sets the default rerank/diversity behaviour of search.
         """
         self.model: Encoder = model
         self.chunks: list[Chunk] = chunks
         self._bm25_index: BM25 = bm25_index
         self._semantic_index: SelectableBasicBackend = semantic_index
         self._root: Path | None = root
+        self._content: frozenset[ContentType] = content
         self._file_sizes: dict[str, int] = self._compute_file_sizes(root) if root else {}
         self._file_mapping, self._language_mapping = self._populate_mapping()
 
@@ -91,18 +105,22 @@ def from_path(
         path: str | Path,
         model: Encoder | None = None,
         extensions: Sequence[str] | None = None,
-        include_text_files: bool = False,
+        content: ContentSelection = ContentType.CODE,
+        include_text_files: bool | None = None,
     ) -> SembleIndex:
         """Create and index a SembleIndex from a directory.
 
         :param path: Root directory to index.
         :param model: Embedding model to use. Defaults to potion-code-16M.
         :param extensions: File extensions to include. Defaults to a standard set of code extensions.
-        :param include_text_files: If True, also index non-code text files (.md, .yaml, .json, etc.).
+        :param content: Content type(s) to index — ``ContentType.CODE`` (default), ``ContentType.DOCS``,
+            ``ContentType.ALL``, or a list of multiple types.
+        :param include_text_files: Deprecated. Use ``content=ContentType.ALL`` instead.
         :return: An indexed SembleIndex. Chunk file paths are relative to ``path``.
         :raises FileNotFoundError: If `path` does not exist.
         :raises NotADirectoryError: If `path` exists but is not a directory.
         """
+        normalized = _apply_include_text_files(normalize_content(content), include_text_files)
         model = model or load_model()
         path = Path(path)
         if not path.exists():
@@ -114,11 +132,11 @@ def from_path(
             path,
             model=model,
             extensions=extensions,
-            include_text_files=include_text_files,
+            content=normalized,
             display_root=path,
         )
 
-        return SembleIndex(model, bm25, vicinity, chunks, root=path)
+        return SembleIndex(model, bm25, vicinity, chunks, root=path, content=normalized)
 
     @classmethod
     def from_git(
@@ -127,7 +145,8 @@ def from_git(
         ref: str | None = None,
         model: Encoder | None = None,
         extensions: Sequence[str] | None = None,
-        include_text_files: bool = False,
+        content: ContentSelection = ContentType.CODE,
+        include_text_files: bool | None = None,
     ) -> SembleIndex:
         """Clone a git repository and index it.
 
@@ -140,10 +159,13 @@ def from_git(
         :param ref: Branch or tag to check out. Defaults to the remote HEAD.
         :param model: Embedding model to use. Defaults to potion-code-16M.
         :param extensions: File extensions to include. Defaults to a standard set of code extensions.
-        :param include_text_files: If True, also index non-code text files (.md, .yaml, .json, etc.).
+        :param content: Content type(s) to index — ``ContentType.CODE`` (default), ``ContentType.DOCS``,
+            ``ContentType.ALL``, or a list of multiple types.
+        :param include_text_files: Deprecated. Use ``content=ContentType.ALL`` instead.
         :return: An indexed SembleIndex. Chunk file paths are repo-relative (e.g. ``src/foo.py``).
         :raises RuntimeError: If git is not on PATH, the clone fails, or times out.
         """
+        normalized = _apply_include_text_files(normalize_content(content), include_text_files)
         with tempfile.TemporaryDirectory() as tmp_dir:
             # `--` prevents `url` from being interpreted as a git option (e.g. `--upload-pack=...`).
             cmd = ["git", "clone", "--depth", "1", *(["--branch", ref] if ref else []), "--", url, tmp_dir]
@@ -163,11 +185,11 @@ def from_git(
                 resolved_path,
                 model=model,
                 extensions=extensions,
-                include_text_files=include_text_files,
+                content=normalized,
                 display_root=resolved_path,
             )
 
-            return SembleIndex(model, bm25, vicinity, chunks, root=resolved_path)
+            return SembleIndex(model, bm25, vicinity, chunks, root=resolved_path, content=normalized)
 
     def find_related(self, source: Chunk | SearchResult, *, top_k: int = 5) -> list[SearchResult]:
         """Return chunks semantically similar to the given chunk or search result.
@@ -202,38 +224,46 @@ def search(
         alpha: float | None = None,
         filter_languages: list[str] | None = None,
         filter_paths: list[str] | None = None,
-        rerank: bool = True,
+        rerank: bool | None = None,
+        diversity: float | None = _UNSET,  # type: ignore[assignment]
     ) -> list[SearchResult]:
         """Search the index and return the top-k most relevant chunks.
 
         :param query: Natural-language or keyword query string.
         :param top_k: Maximum number of results to return.
         :param alpha: Blend weight for hybrid score combination; 1.0 = full semantic
-            weight, 0.0 = full BM25 weight. File-path penalties and diversity reranking
-            are applied regardless. ``None`` auto-detects from query type.
+            weight, 0.0 = full BM25 weight. ``None`` auto-detects from query type.
         :param filter_languages: Optional list of language codes; if set, only chunks in
             these languages are returned.
         :param filter_paths: Optional list of repo-relative file paths; if set, only
             chunks from these files are returned.
-        :param rerank: Whether to rerank the top-k results using custom reranking logic.
+        :param rerank: Apply code-tuned reranking (file boost, identifier boost, path penalties).
+            Defaults to ``True`` when ``ContentType.CODE`` or ``ContentType.ALL`` was indexed, else ``False``.
+        :param diversity: DPP diversity weight in [0, 1]; re-selects results with pyversity after reranking.
+            Defaults to ``DEFAULT_DOCS_DIVERSITY`` when ``ContentType.DOCS`` or ``ContentType.ALL`` was
+            indexed, else ``None``. Pass ``None`` explicitly to disable.
         :return: Ranked list of :class:`SearchResult` objects, best match first.
         """
-        bm25_index, semantic_index = self._bm25_index, self._semantic_index
         if not self.chunks or not query.strip():
             return []
 
-        selector = self._get_selector_vector(filter_languages, filter_paths)
+        has_code = ContentType.CODE in self._content or ContentType.ALL in self._content
+        has_docs = ContentType.DOCS in self._content or ContentType.ALL in self._content
+        resolved_rerank = has_code if rerank is None else rerank
+        resolved_diversity = (DEFAULT_DOCS_DIVERSITY if has_docs else None) if diversity is _UNSET else diversity
 
+        selector = self._get_selector_vector(filter_languages, filter_paths)
         results = search(
             query,
             self.model,
-            semantic_index,
-            bm25_index,
+            self._semantic_index,
+            self._bm25_index,
             self.chunks,
             top_k,
             alpha=alpha,
             selector=selector,
-            rerank=rerank,
+            rerank=resolved_rerank,
+            diversity=resolved_diversity,
         )
         save_search_stats(results, CallType.SEARCH, self._file_sizes)
         return results
diff --git a/src/semble/mcp.py b/src/semble/mcp.py
index a9c533d..ca90897 100644
--- a/src/semble/mcp.py
+++ b/src/semble/mcp.py
@@ -12,7 +12,7 @@
 
 from semble.index import SembleIndex
 from semble.index.dense import load_model
-from semble.types import Encoder
+from semble.types import ContentSelection, ContentType, Encoder, normalize_content
 from semble.utils import _format_results, _is_git_url, _resolve_chunk
 
 logger = logging.getLogger(__name__)
@@ -112,10 +112,14 @@ async def find_related(
     return server
 
 
-async def serve(path: str | None = None, ref: str | None = None, include_text_files: bool = False) -> None:
+async def serve(
+    path: str | None = None,
+    ref: str | None = None,
+    content: ContentSelection = ContentType.CODE,
+) -> None:
     """Start an MCP stdio server, optionally pre-indexing a default source."""
     model = await asyncio.to_thread(load_model)
-    cache = _IndexCache(model=model, include_text_files=include_text_files)
+    cache = _IndexCache(model=model, content=normalize_content(content))
     if path:
         await cache.get(path, ref=ref)
         if not _is_git_url(path):
@@ -128,10 +132,10 @@ async def serve(path: str | None = None, ref: str | None = None, include_text_fi
 class _IndexCache:
     """Cache of indexed repos and local paths for the lifetime of the MCP server process."""
 
-    def __init__(self, model: Encoder, include_text_files: bool = False) -> None:
+    def __init__(self, model: Encoder, content: frozenset[ContentType] = frozenset({ContentType.CODE})) -> None:
         """Initialise an empty cache with a shared embedding model."""
         self._model = model
-        self._include_text_files = include_text_files
+        self._content = content
         self._tasks: OrderedDict[str, asyncio.Task[SembleIndex]] = OrderedDict()  # ordered for LRU eviction
         self._watcher_task: asyncio.Task[None] | None = None
 
@@ -175,14 +179,12 @@ async def get(self, source: str, ref: str | None = None) -> SembleIndex:
                         source,
                         ref=ref,
                         model=self._model,
-                        include_text_files=self._include_text_files,
+                        content=self._content,
                     )
                 )
             else:
                 self._tasks[cache_key] = asyncio.create_task(
-                    asyncio.to_thread(
-                        SembleIndex.from_path, cache_key, model=self._model, include_text_files=self._include_text_files
-                    )
+                    asyncio.to_thread(SembleIndex.from_path, cache_key, model=self._model, content=self._content)
                 )
         task = self._tasks[cache_key]
         try:
diff --git a/src/semble/search.py b/src/semble/search.py
index d0a9d25..7b58b8a 100644
--- a/src/semble/search.py
+++ b/src/semble/search.py
@@ -1,6 +1,7 @@
 import bm25s
 import numpy as np
 import numpy.typing as npt
+from pyversity import Strategy, diversify
 
 from semble.index.dense import SelectableBasicBackend
 from semble.index.sparse import selector_to_mask
@@ -9,6 +10,7 @@
 from semble.types import Chunk, Encoder, SearchResult
 
 _RRF_K = 60
+DEFAULT_DOCS_DIVERSITY = 0.3
 
 
 def _rrf_scores(scores: dict[Chunk, float]) -> dict[Chunk, float]:
@@ -72,6 +74,7 @@ def search(
     alpha: float | None = None,
     selector: npt.NDArray[np.int_] | None = None,
     rerank: bool = True,
+    diversity: float | None = None,
 ) -> list[SearchResult]:
     """Hybrid search: alpha-weighted combination of semantic and BM25 scores.
 
@@ -86,7 +89,9 @@ def search(
     :param top_k: Number of results to return.
     :param alpha: Weight for semantic score (1-alpha goes to BM25). None = auto-detect based on query type.
     :param selector: Optional array of chunk indices to filter results by.
-    :param rerank: Whether to perform reranking. This should be done, and is mainly here for benchmarking.
+    :param rerank: Whether to apply code-tuned reranking (file boost, identifier boost, path penalties).
+    :param diversity: DPP diversity weight in [0, 1]. When set, fetches 2× candidates, reranks,
+        then re-selects with pyversity DPP. None disables diversity.
     :return: List of search results sorted by combined score descending.
     """
     alpha_weight = resolve_alpha(query, alpha)
@@ -104,8 +109,7 @@ def search(
     normalized_semantic = _rrf_scores(semantic_scores)
     normalized_bm25 = _rrf_scores(bm25_scores)
 
-    # Sort by the file path and start line to
-    # counteract randomness introduces by hashing.
+    # Sort to counteract randomness from set hashing. NOTE(review): key is start_line only, not file path.
     all_candidates = sorted(
         {*normalized_semantic, *normalized_bm25},
         key=lambda c: c.start_line,
@@ -116,14 +120,44 @@ def search(
         for chunk in all_candidates
     }
 
+    # Over-fetch before reranking so diversity has candidates to choose from.
+    fetch_k = top_k * 2 if diversity is not None else top_k
+
     if rerank:
-        # Boost files with multiple relevant chunks.
         boost_multi_chunk_files(combined_scores)
-        # Boost queries with specific identifiers in them.
         combined_scores = apply_query_boost(combined_scores, query, chunks)
-        # Rerank the top-k results by applying path-based penalties.
-        ranked = rerank_topk(combined_scores, top_k, penalise_paths=alpha_weight < 1.0)
+        ranked = rerank_topk(combined_scores, fetch_k, penalise_paths=alpha_weight < 1.0)
     else:
-        sorted_by_score = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
-        ranked = sorted_by_score[:top_k]
-    return [SearchResult(chunk=chunk, score=score) for chunk, score in ranked]
+        ranked = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:fetch_k]
+
+    results = [SearchResult(chunk=chunk, score=score) for chunk, score in ranked]
+
+    if diversity is not None:
+        return _diversify(results, top_k, diversity, semantic_index, chunks)
+    return results
+
+
+def _diversify(
+    results: list[SearchResult],
+    top_k: int,
+    diversity_weight: float,
+    semantic_index: SelectableBasicBackend,
+    chunks: list[Chunk],
+) -> list[SearchResult]:
+    """Re-rank results with DPP to improve embedding-space diversity."""
+    if len(results) <= top_k:
+        return results
+    chunk_index = {c: i for i, c in enumerate(chunks)}
+    valid = [r for r in results if r.chunk in chunk_index]
+    if len(valid) <= top_k:
+        return valid
+    indices = np.array([chunk_index[r.chunk] for r in valid])
+    scores = np.array([r.score for r in valid], dtype=np.float32)
+    result = diversify(
+        embeddings=semantic_index.vectors[indices],
+        scores=scores,
+        k=top_k,
+        strategy=Strategy.DPP,
+        diversity=diversity_weight,
+    )
+    return sorted((valid[i] for i in result.indices), key=lambda r: -r.score)
diff --git a/src/semble/types.py b/src/semble/types.py
index d01c774..ca32fac 100644
--- a/src/semble/types.py
+++ b/src/semble/types.py
@@ -1,4 +1,4 @@
-from collections.abc import Sequence
+from collections.abc import Iterable, Sequence
 from dataclasses import dataclass, field
 from enum import Enum
 from typing import Any, Protocol, TypeAlias
@@ -16,6 +16,24 @@ class CallType(str, Enum):
     FIND_RELATED = "find_related"
 
 
+class ContentType(str, Enum):
+    """Content type for indexing and search pipeline selection."""
+
+    CODE = "code"
+    DOCS = "docs"
+    ALL = "all"
+
+
+ContentSelection: TypeAlias = "ContentType | Iterable[ContentType]"
+
+
+def normalize_content(content: ContentSelection) -> frozenset[ContentType]:
+    """Normalize a single ContentType or iterable of ContentType into a frozenset."""
+    if isinstance(content, ContentType):
+        return frozenset({content})
+    return frozenset(content)
+
+
 class Encoder(Protocol):
     """Protocol for embedding models."""
 
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 2c71a72..6f9928e 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -193,6 +193,26 @@ def test_mcp_main_exits_with_message_when_extras_missing(
     assert "pip install 'semble[mcp]'" in capsys.readouterr().err
 
 
+def test_include_text_files_cli_deprecated(
+    monkeypatch: pytest.MonkeyPatch,
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    """--include-text-files on the CLI emits a DeprecationWarning."""
+    import warnings
+
+    chunk = make_chunk("def foo(): pass", "src/foo.py")
+    fake_index = MagicMock()
+    fake_index.search.return_value = [SearchResult(chunk=chunk, score=0.9)]
+    monkeypatch.setattr(sys, "argv", ["semble", "search", "query", "/some/path", "--include-text-files"])
+    with patch("semble.cli.SembleIndex.from_path", return_value=fake_index):
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            _cli_main()
+    assert any(
+        "include-text-files" in str(w.message).lower() for w in caught if issubclass(w.category, DeprecationWarning)
+    )
+
+
 def test_agent_file_tools_are_bash_only() -> None:
     """The agent file must list only Bash and Read — no MCP tools that require schema loading."""
     frontmatter = _CLAUDE_AGENT_FILE.split("---")[1]
diff --git a/tests/test_files.py b/tests/test_files.py
index 71ede4c..acda49c 100644
--- a/tests/test_files.py
+++ b/tests/test_files.py
@@ -1,6 +1,9 @@
 from pathlib import Path
 
-from semble.index.files import _DOC_LANGUAGES, _EXTENSION_TO_LANGUAGE, detect_language, get_extensions
+import pytest
+
+from semble.index.files import _CODE_LANGUAGES, _DOC_LANGUAGES, _NON_CODE_LANGUAGES, detect_language, get_extensions
+from semble.types import ContentType
 
 
 def test_detect_language() -> None:
@@ -10,32 +13,44 @@ def test_detect_language() -> None:
     assert detect_language(Path("c.txt")) is None
 
 
-def test_get_extensions() -> None:
-    """Test the get_extensions function."""
-    all_extensions = get_extensions(True, None)
-    without_doc_extensions = get_extensions(False, None)
-
-    doc_extensions = set(all_extensions) - set(without_doc_extensions)
-
-    for extension in doc_extensions:
-        assert _EXTENSION_TO_LANGUAGE[extension] in _DOC_LANGUAGES
-    for extension in without_doc_extensions:
-        assert _EXTENSION_TO_LANGUAGE[extension] not in _DOC_LANGUAGES
+def test_language_sets_are_consistent() -> None:
+    """Code, doc, and non-code language sets satisfy their mutual invariants."""
+    assert _CODE_LANGUAGES.isdisjoint(_DOC_LANGUAGES)
+    assert _CODE_LANGUAGES.isdisjoint(_NON_CODE_LANGUAGES)
+    assert _DOC_LANGUAGES <= _NON_CODE_LANGUAGES
 
 
-def test_get_extensions_additional() -> None:
-    """Test the get_extensions function."""
-    all_extensions = get_extensions(True, None)
-    all_extensions_extra = get_extensions(True, [".kjs"])
-
-    assert set(all_extensions_extra) == set(all_extensions) | {".kjs"}
+@pytest.mark.parametrize(
+    ("content", "includes", "excludes"),
+    [
+        (frozenset({ContentType.CODE}), [".py"], [".md"]),
+        (frozenset({ContentType.DOCS}), [".md"], [".py"]),
+        (frozenset({ContentType.ALL}), [".py", ".md"], []),
+    ],
+)
+def test_get_extensions(content: frozenset[ContentType], includes: list[str], excludes: list[str]) -> None:
+    """get_extensions returns the right extensions for each content type."""
+    exts = set(get_extensions(content, None))
+    for ext in includes:
+        assert ext in exts
+    for ext in excludes:
+        assert ext not in exts
 
-    all_extensions = get_extensions(False, None)
-    all_extensions_extra = get_extensions(False, [".kjs"])
 
-    assert set(all_extensions_extra) == set(all_extensions) | {".kjs"}
+def test_get_extensions_code_and_docs() -> None:
+    """Code + docs is the union of each individual set."""
+    code = set(get_extensions(frozenset({ContentType.CODE}), None))
+    docs = set(get_extensions(frozenset({ContentType.DOCS}), None))
+    combined = set(get_extensions(frozenset({ContentType.CODE, ContentType.DOCS}), None))
+    assert combined == code | docs
 
-    all_extensions = get_extensions(False, None)
-    all_extensions_extra = get_extensions(False, [".py"])
 
-    assert set(all_extensions_extra) == set(all_extensions)
+def test_get_extensions_additional() -> None:
+    """Extra extensions are appended and existing ones are not duplicated."""
+    base = get_extensions(frozenset({ContentType.ALL}), None)
+    with_extra = get_extensions(frozenset({ContentType.ALL}), [".kjs"])
+    assert set(with_extra) == set(base) | {".kjs"}
+
+    base_code = get_extensions(frozenset({ContentType.CODE}), None)
+    with_existing = get_extensions(frozenset({ContentType.CODE}), [".py"])
+    assert set(with_existing) == set(base_code)
diff --git a/tests/test_index.py b/tests/test_index.py
index 3f90fcb..2a064ae 100644
--- a/tests/test_index.py
+++ b/tests/test_index.py
@@ -1,12 +1,12 @@
 from pathlib import Path
 from typing import Any
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch
 
 import pytest
 
 from semble import SembleIndex
 from semble.index.create import _MAX_FILE_BYTES, create_index_from_path
-from semble.types import Encoder
+from semble.types import ContentType, Encoder
 from tests.conftest import make_chunk
 
 
@@ -17,18 +17,45 @@ def indexed_index(mock_model: Any, tmp_project: Path) -> SembleIndex:
 
 
 @pytest.mark.parametrize(
-    ("include_text_files", "md_in_results"),
-    [(False, False), (True, True)],
+    ("content", "md_in_results"),
+    [
+        (ContentType.CODE, False),
+        (ContentType.DOCS, True),
+        (ContentType.ALL, True),
+        ([ContentType.CODE, ContentType.DOCS], True),
+    ],
 )
 def test_index_markdown_inclusion(
-    mock_model: Encoder, tmp_project: Path, include_text_files: bool, md_in_results: bool
+    mock_model: Encoder, tmp_project: Path, content: ContentType | list[ContentType], md_in_results: bool
 ) -> None:
-    """Markdown files are excluded by default and included when include_text_files=True."""
-    _, _, chunks = create_index_from_path(tmp_project, mock_model, include_text_files=include_text_files)
+    """Markdown files are excluded for code and included for docs/all/code+docs."""
+    from semble.types import normalize_content
+
+    _, _, chunks = create_index_from_path(tmp_project, mock_model, content=normalize_content(content))
     has_md = ".md" in {Path(c.file_path).suffix for c in chunks}
     assert has_md is md_in_results
 
 
+@pytest.mark.parametrize("include_text_files", [True, False])
+def test_include_text_files_deprecated(mock_model: Encoder, tmp_project: Path, include_text_files: bool) -> None:
+    """include_text_files raises DeprecationWarning on create_index_from_path and from_path."""
+    with pytest.warns(DeprecationWarning, match="include_text_files is deprecated"):
+        create_index_from_path(tmp_project, mock_model, include_text_files=include_text_files)
+    with pytest.warns(DeprecationWarning, match="include_text_files is deprecated"):
+        SembleIndex.from_path(tmp_project, model=mock_model, include_text_files=include_text_files)
+
+
+def test_from_git_include_text_files_deprecated(mock_model: Encoder, tmp_project: Path) -> None:
+    """from_git raises DeprecationWarning when include_text_files is passed."""
+    fake_result = MagicMock()
+    fake_result.returncode = 0
+    with patch("subprocess.run", return_value=fake_result):
+        with patch("semble.index.index.create_index_from_path") as mock_create:
+            mock_create.return_value = (MagicMock(), MagicMock(), [make_chunk("x = 1", "f.py")])
+            with pytest.warns(DeprecationWarning, match="include_text_files is deprecated"):
+                SembleIndex.from_git("https://example.com/repo", model=mock_model, include_text_files=True)
+
+
 def test_index_empty_returns_zero_chunks(mock_model: Encoder, tmp_path: Path) -> None:
     """Indexing an empty directory yields zero files and chunks."""
     with pytest.raises(ValueError):
diff --git a/tests/test_ranking.py b/tests/test_ranking.py
index 6a33397..8caf26b 100644
--- a/tests/test_ranking.py
+++ b/tests/test_ranking.py
@@ -148,9 +148,3 @@ def test_boost_multi_chunk_files() -> None:
     scores: dict = {c1: 1.0, c2: 0.8, c3: 1.0}
     boost_multi_chunk_files(scores)
     assert scores[c1] > 1.0
-
-
-def test_boosting_with_empty() -> None:
-    """Test that boosting with empty chunks return None."""
-    boosted = apply_query_boost({}, "query", [])
-    assert boosted == {}
diff --git a/tests/test_search.py b/tests/test_search.py
index 56bd2f1..9bffdc3 100644
--- a/tests/test_search.py
+++ b/tests/test_search.py
@@ -8,9 +8,9 @@
 from vicinity.backends.basic import BasicArgs
 
 from semble.index.dense import SelectableBasicBackend, embed_chunks, load_model
-from semble.search import _search_bm25, _search_semantic, _sort_top_k, search
+from semble.search import DEFAULT_DOCS_DIVERSITY, _diversify, _search_bm25, _search_semantic, _sort_top_k, search
 from semble.tokens import tokenize
-from semble.types import Chunk, Encoder
+from semble.types import Chunk, ContentType, Encoder, SearchResult
 from tests.conftest import make_chunk
 
 
@@ -107,7 +107,7 @@ def test_search_hybrid(
         (lambda q, m, s, b, c, k: search(q, m, s, b, c, k), "login", 4),
     ],
 )
-def test_search_source_labels(
+def test_search_returns_results(
     search_fn: Any,
     query: str,
     top_k: int,
@@ -116,14 +116,14 @@ def test_search_source_labels(
     bm25: bm25s.BM25,
     mock_model: Any,
 ) -> None:
-    """Each result carries a source label matching the search mode used."""
+    """BM25, semantic, and hybrid search all return at least one result for a matching query."""
     results = search_fn(query, mock_model, semantic, bm25, chunks, top_k)
     assert len(results) > 0
 
 
 def test_sort_top_k() -> None:
     """_sort_top_k returns the same indices as np.argsort(-x)[:top_k]."""
-    gen = np.random.default_rng()
+    gen = np.random.default_rng(42)
     x = gen.standard_normal(size=(10000,))
     top_k = 100
     indices = _sort_top_k(x, top_k)
@@ -159,3 +159,40 @@ def test_selectable_basic_backend_rejects_k_below_one(
     """SelectableBasicBackend.query guards against k < 1."""
     with pytest.raises(ValueError, match="k should be >= 1"):
         semantic.query(embeddings[:1], k=0)
+
+
+def test_search_with_diversity(
+    chunks: list[Chunk], semantic: SelectableBasicBackend, bm25: bm25s.BM25, mock_model: Any
+) -> None:
+    """Search with an explicit diversity weight returns exactly top_k results with non-negative scores."""
+    results = search("authenticate", mock_model, semantic, bm25, chunks, top_k=2, diversity=DEFAULT_DOCS_DIVERSITY)
+    assert len(results) == 2
+    assert all(r.score >= 0 for r in results)
+
+
+def test_diversify_fewer_results_than_top_k(chunks: list[Chunk], semantic: SelectableBasicBackend) -> None:
+    """_diversify returns early when results are already within top_k."""
+    results = [SearchResult(chunk=c, score=1.0 - i * 0.1) for i, c in enumerate(chunks[:2])]
+    out = _diversify(results, top_k=10, diversity_weight=DEFAULT_DOCS_DIVERSITY, semantic_index=semantic, chunks=chunks)
+    assert len(out) == 2
+
+
+def test_diversify_filters_unknown_chunks(chunks: list[Chunk], semantic: SelectableBasicBackend) -> None:
+    """_diversify returns early when valid results (chunks in index) fall within top_k."""
+    # 4 results but 3 reference unknown chunks not in the index → valid=1 ≤ top_k=3
+    unknown = [make_chunk(f"x = {i}", f"unknown_{i}.py") for i in range(3)]
+    results = [SearchResult(chunk=c, score=1.0 - i * 0.1) for i, c in enumerate([chunks[0]] + unknown)]
+    out = _diversify(results, top_k=3, diversity_weight=DEFAULT_DOCS_DIVERSITY, semantic_index=semantic, chunks=chunks)
+    assert len(out) == 1  # only the known chunk survives
+
+
+def test_search_content_all_uses_both_pipelines(
+    chunks: list[Chunk], semantic: SelectableBasicBackend, bm25: bm25s.BM25, mock_model: Any
+) -> None:
+    """ContentType.ALL activates both rerank and diversity defaults."""
+    from semble import SembleIndex
+
+    index = SembleIndex(mock_model, bm25, semantic, chunks, content=frozenset({ContentType.ALL}))
+    # rerank and diversity both resolve to True/set — results should come back without error
+    results = index.search("authenticate", top_k=2)
+    assert len(results) > 0
diff --git a/uv.lock b/uv.lock
index 1d492fe..6dac6b6 100644
--- a/uv.lock
+++ b/uv.lock
@@ -10,7 +10,7 @@ resolution-markers = [
 
 [options]
 exclude-newer = "0001-01-01T00:00:00Z" # This has no effect and is included for backwards compatibility when using relative exclude-newer values.
-exclude-newer-span = "P3D"
+exclude-newer-span = "P1W"
 
 [[package]]
 name = "annotated-doc"
@@ -2456,6 +2456,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/f3/a2/43bbc5860b5034e2af4ef99a0e04d726ff329c43e192ef3abaa8d7ecfce5/python_multipart-0.0.28-py3-none-any.whl", hash = "sha256:10faac07eb966c3f48dc415f9dee46c04cb10d58d30a35677db8027c825ed9b6", size = 29438, upload-time = "2026-05-10T11:05:15.052Z" },
 ]
 
+[[package]]
+name = "pyversity"
+version = "0.2.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d5/dc/a5a835b57ce06e21b4355d9e37ebb717455d55645014f1f5bde2fd948615/pyversity-0.2.0.tar.gz", hash = "sha256:48be2735b2471da1fa7497ea045aff25aa071f1176b86a011d4a49c01327ab6d", size = 28000, upload-time = "2026-02-02T08:06:38.563Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/fa/e2/7e928734b14944f164cb243cee7fb14229e208d25df022506aa5a2390bcc/pyversity-0.2.0-py3-none-any.whl", hash = "sha256:c28f6a1a3ccfb97a9439d345e80ba0e60ffd14aa7680f2db0af7a70e90aae3f9", size = 21229, upload-time = "2026-02-02T08:06:37.125Z" },
+]
+
 [[package]]
 name = "pywin32"
 version = "311"
@@ -3124,6 +3137,7 @@ dependencies = [
     { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
     { name = "pathspec" },
+    { name = "pyversity" },
     { name = "tree-sitter" },
     { name = "tree-sitter-language-pack" },
     { name = "vicinity" },
@@ -3168,6 +3182,7 @@ requires-dist = [
     { name = "pydoclint", marker = "extra == 'dev'", specifier = ">=0.5.3" },
     { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" },
     { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=5.0" },
+    { name = "pyversity", specifier = ">=0.1.0" },
     { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.9.0" },
     { name = "sentence-transformers", marker = "extra == 'benchmark'", specifier = ">=3.0" },
     { name = "tiktoken", marker = "extra == 'benchmark'", specifier = ">=0.7" },

From f26f30def07ed66ab4cba0949e70f96e83e5c609 Mon Sep 17 00:00:00 2001
From: Pringled <thomas123@live.nl>
Date: Wed, 20 May 2026 19:11:18 +0200
Subject: [PATCH 02/11] Simplify content normalization and diversity default

---
 src/semble/index/create.py |  3 ---
 src/semble/index/index.py  | 32 ++++++++++++--------------------
 src/semble/mcp.py          |  9 +++++----
 src/semble/types.py        |  9 +++------
 tests/test_index.py        | 14 +++++++++-----
 5 files changed, 29 insertions(+), 38 deletions(-)

diff --git a/src/semble/index/create.py b/src/semble/index/create.py
index 1dd2609..cd23e5d 100644
--- a/src/semble/index/create.py
+++ b/src/semble/index/create.py
@@ -37,7 +37,6 @@ def create_index_from_path(
     extensions: Sequence[str] | None = None,
     content: frozenset[ContentType] = _DEFAULT_CONTENT,
     display_root: Path | None = None,
-    include_text_files: bool | None = None,
 ) -> tuple[bm25s.BM25, SelectableBasicBackend, list[Chunk]]:
     """Create an index from a resolved directory, optionally storing chunk paths relative to display_root.
 
@@ -46,11 +45,9 @@ def create_index_from_path(
     :param extensions: File extensions to include.
     :param content: Content types to index.
     :param display_root: If set, chunk file paths are stored relative to this root.
-    :param include_text_files: Deprecated. Use ``content=ContentType.ALL`` instead.
     :raises ValueError: if no items were found, no index can be created.
     :return: A bm25 index, vicinity index and list of chunks
     """
-    content = _apply_include_text_files(content, include_text_files)
     chunks: list[Chunk] = []
     resolved_extensions = get_extensions(content, extensions)
     for file_path in walk_files(path, resolved_extensions):
diff --git a/src/semble/index/index.py b/src/semble/index/index.py
index d29b5a2..2169fd0 100644
--- a/src/semble/index/index.py
+++ b/src/semble/index/index.py
@@ -15,18 +15,7 @@
 from semble.index.dense import SelectableBasicBackend, load_model
 from semble.search import DEFAULT_DOCS_DIVERSITY, _search_semantic, search
 from semble.stats import save_search_stats
-from semble.types import (
-    CallType,
-    Chunk,
-    ContentSelection,
-    ContentType,
-    Encoder,
-    IndexStats,
-    SearchResult,
-    normalize_content,
-)
-
-_UNSET = object()
+from semble.types import CallType, Chunk, ContentType, Encoder, IndexStats, SearchResult, _normalize_content
 
 _GIT_CLONE_TIMEOUT = int(os.environ.get("SEMBLE_CLONE_TIMEOUT", 60))
 
@@ -105,7 +94,7 @@ def from_path(
         path: str | Path,
         model: Encoder | None = None,
         extensions: Sequence[str] | None = None,
-        content: ContentSelection = ContentType.CODE,
+        content: ContentType | Sequence[ContentType] = ContentType.CODE,
         include_text_files: bool | None = None,
     ) -> SembleIndex:
         """Create and index a SembleIndex from a directory.
@@ -120,7 +109,7 @@ def from_path(
         :raises FileNotFoundError: If `path` does not exist.
         :raises NotADirectoryError: If `path` exists but is not a directory.
         """
-        normalized = _apply_include_text_files(normalize_content(content), include_text_files)
+        normalized = _apply_include_text_files(_normalize_content(content), include_text_files)
         model = model or load_model()
         path = Path(path)
         if not path.exists():
@@ -145,7 +134,7 @@ def from_git(
         ref: str | None = None,
         model: Encoder | None = None,
         extensions: Sequence[str] | None = None,
-        content: ContentSelection = ContentType.CODE,
+        content: ContentType | Sequence[ContentType] = ContentType.CODE,
         include_text_files: bool | None = None,
     ) -> SembleIndex:
         """Clone a git repository and index it.
@@ -165,7 +154,7 @@ def from_git(
         :return: An indexed SembleIndex. Chunk file paths are repo-relative (e.g. ``src/foo.py``).
         :raises RuntimeError: If git is not on PATH, the clone fails, or times out.
         """
-        normalized = _apply_include_text_files(normalize_content(content), include_text_files)
+        normalized = _apply_include_text_files(_normalize_content(content), include_text_files)
         with tempfile.TemporaryDirectory() as tmp_dir:
             # `--` prevents `url` from being interpreted as a git option (e.g. `--upload-pack=...`).
             cmd = ["git", "clone", "--depth", "1", *(["--branch", ref] if ref else []), "--", url, tmp_dir]
@@ -225,7 +214,7 @@ def search(
         filter_languages: list[str] | None = None,
         filter_paths: list[str] | None = None,
         rerank: bool | None = None,
-        diversity: float | None = _UNSET,  # type: ignore[assignment]
+        diversity: float | None = None,
     ) -> list[SearchResult]:
         """Search the index and return the top-k most relevant chunks.
 
@@ -240,8 +229,8 @@ def search(
         :param rerank: Apply code-tuned reranking (file boost, identifier boost, path penalties).
             Defaults to ``True`` when ``ContentType.CODE`` was indexed.
         :param diversity: DPP diversity weight in [0, 1]; re-ranks with pyversity after reranking.
-            Defaults to ``DEFAULT_DOCS_DIVERSITY`` when ``ContentType.DOCS`` was indexed. Pass
-            ``None`` explicitly to disable.
+            ``None`` (default) auto-detects: uses ``DEFAULT_DOCS_DIVERSITY`` when docs were indexed.
+            Pass ``0.0`` to disable diversity even on a docs index.
         :return: Ranked list of :class:`SearchResult` objects, best match first.
         """
         if not self.chunks or not query.strip():
@@ -250,7 +239,10 @@ def search(
         has_code = ContentType.CODE in self._content or ContentType.ALL in self._content
         has_docs = ContentType.DOCS in self._content or ContentType.ALL in self._content
         resolved_rerank = has_code if rerank is None else rerank
-        resolved_diversity = (DEFAULT_DOCS_DIVERSITY if has_docs else None) if diversity is _UNSET else diversity
+        if diversity is None:
+            resolved_diversity = DEFAULT_DOCS_DIVERSITY if has_docs else None
+        else:
+            resolved_diversity = diversity if diversity > 0 else None
 
         selector = self._get_selector_vector(filter_languages, filter_paths)
         results = search(
diff --git a/src/semble/mcp.py b/src/semble/mcp.py
index ca90897..97e1155 100644
--- a/src/semble/mcp.py
+++ b/src/semble/mcp.py
@@ -3,6 +3,7 @@
 import asyncio
 import logging
 from collections import OrderedDict
+from collections.abc import Sequence
 from pathlib import Path
 from typing import Annotated
 
@@ -12,7 +13,7 @@
 
 from semble.index import SembleIndex
 from semble.index.dense import load_model
-from semble.types import ContentSelection, ContentType, Encoder, normalize_content
+from semble.types import ContentType, Encoder
 from semble.utils import _format_results, _is_git_url, _resolve_chunk
 
 logger = logging.getLogger(__name__)
@@ -115,11 +116,11 @@ async def find_related(
 async def serve(
     path: str | None = None,
     ref: str | None = None,
-    content: ContentSelection = ContentType.CODE,
+    content: ContentType | Sequence[ContentType] = ContentType.CODE,
 ) -> None:
     """Start an MCP stdio server, optionally pre-indexing a default source."""
     model = await asyncio.to_thread(load_model)
-    cache = _IndexCache(model=model, content=normalize_content(content))
+    cache = _IndexCache(model=model, content=content)
     if path:
         await cache.get(path, ref=ref)
         if not _is_git_url(path):
@@ -132,7 +133,7 @@ async def serve(
 class _IndexCache:
     """Cache of indexed repos and local paths for the lifetime of the MCP server process."""
 
-    def __init__(self, model: Encoder, content: frozenset[ContentType] = frozenset({ContentType.CODE})) -> None:
+    def __init__(self, model: Encoder, content: ContentType | Sequence[ContentType] = ContentType.CODE) -> None:
         """Initialise an empty cache with a shared embedding model."""
         self._model = model
         self._content = content
diff --git a/src/semble/types.py b/src/semble/types.py
index ca32fac..f805f31 100644
--- a/src/semble/types.py
+++ b/src/semble/types.py
@@ -1,4 +1,4 @@
-from collections.abc import Iterable, Sequence
+from collections.abc import Sequence
 from dataclasses import dataclass, field
 from enum import Enum
 from typing import Any, Protocol, TypeAlias
@@ -24,11 +24,8 @@ class ContentType(str, Enum):
     ALL = "all"
 
 
-ContentSelection: TypeAlias = "ContentType | Iterable[ContentType]"
-
-
-def normalize_content(content: ContentSelection) -> frozenset[ContentType]:
-    """Normalize a single ContentType or iterable of ContentType into a frozenset."""
+def _normalize_content(content: "ContentType | Sequence[ContentType]") -> frozenset[ContentType]:
+    """Normalize a single ContentType or sequence into a frozenset."""
     if isinstance(content, ContentType):
         return frozenset({content})
     return frozenset(content)
diff --git a/tests/test_index.py b/tests/test_index.py
index 2a064ae..01d7fe0 100644
--- a/tests/test_index.py
+++ b/tests/test_index.py
@@ -29,18 +29,16 @@ def test_index_markdown_inclusion(
     mock_model: Encoder, tmp_project: Path, content: ContentType | list[ContentType], md_in_results: bool
 ) -> None:
     """Markdown files are excluded for code and included for docs/all/code+docs."""
-    from semble.types import normalize_content
+    from semble.types import _normalize_content
 
-    _, _, chunks = create_index_from_path(tmp_project, mock_model, content=normalize_content(content))
+    _, _, chunks = create_index_from_path(tmp_project, mock_model, content=_normalize_content(content))
     has_md = ".md" in {Path(c.file_path).suffix for c in chunks}
     assert has_md is md_in_results
 
 
 @pytest.mark.parametrize("include_text_files", [True, False])
 def test_include_text_files_deprecated(mock_model: Encoder, tmp_project: Path, include_text_files: bool) -> None:
-    """include_text_files raises DeprecationWarning on create_index_from_path and from_path."""
-    with pytest.warns(DeprecationWarning, match="include_text_files is deprecated"):
-        create_index_from_path(tmp_project, mock_model, include_text_files=include_text_files)
+    """include_text_files raises DeprecationWarning on from_path."""
     with pytest.warns(DeprecationWarning, match="include_text_files is deprecated"):
         SembleIndex.from_path(tmp_project, model=mock_model, include_text_files=include_text_files)
 
@@ -102,6 +100,12 @@ def test_search_with_filter_paths_does_not_crash(indexed_index: SembleIndex) ->
     assert all(r.chunk.file_path == target_path for r in results)
 
 
+def test_search_explicit_diversity(indexed_index: SembleIndex) -> None:
+    """Explicit diversity values are accepted; 0.0 disables diversity without error."""
+    assert len(indexed_index.search("authenticate", top_k=3, diversity=0.5)) > 0
+    assert len(indexed_index.search("authenticate", top_k=3, diversity=0.0)) > 0
+
+
 def test_search_without_reranking(indexed_index: SembleIndex) -> None:
     """Filtered search works regardless of where the selected chunk lives in the corpus."""
     with patch("semble.search.rerank_topk") as mock:

From 780f84ddec3852278dda60e767e73b03232b3006 Mon Sep 17 00:00:00 2001
From: Pringled <thomas123@live.nl>
Date: Wed, 20 May 2026 19:21:31 +0200
Subject: [PATCH 03/11] Restrict content selection to a single ContentType

---
 src/semble/cli.py          | 14 ++++++--------
 src/semble/index/create.py | 13 +++++--------
 src/semble/index/files.py  | 15 +++++++--------
 src/semble/index/index.py  | 20 ++++++++++----------
 src/semble/mcp.py          |  5 ++---
 src/semble/types.py        |  7 -------
 tests/test_files.py        | 24 ++++++++----------------
 tests/test_index.py        |  9 +++------
 tests/test_search.py       |  2 +-
 9 files changed, 42 insertions(+), 67 deletions(-)

diff --git a/src/semble/cli.py b/src/semble/cli.py
index 8f47998..4700ee2 100644
--- a/src/semble/cli.py
+++ b/src/semble/cli.py
@@ -40,11 +40,9 @@ def _add_content_args(p: argparse.ArgumentParser) -> None:
     """Add --content and deprecated --include-text-files to a subparser."""
     p.add_argument(
         "--content",
-        action="append",
-        default=None,
+        default=ContentType.CODE.value,
         choices=_CONTENT_CHOICES,
-        metavar="TYPE",
-        help="Content type(s) to index: 'code' (default), 'docs', 'all'. Repeatable: --content code --content docs.",
+        help="Content type to index: 'code' (default), 'docs', or 'all'.",
     )
     p.add_argument(
         "--include-text-files",
@@ -96,16 +94,16 @@ def _run_init(*, agent: Agent = _DEFAULT_AGENT, force: bool = False) -> None:
     print(f"Created {dest}")
 
 
-def _resolve_content(content_args: list[str] | None, include_text_files: bool) -> list[ContentType]:
-    """Resolve --content values and the deprecated --include-text-files into a list of ContentType."""
+def _resolve_content(content_arg: str, include_text_files: bool) -> ContentType:
+    """Resolve --content and the deprecated --include-text-files into a ContentType."""
     if include_text_files:
         warnings.warn(
             "--include-text-files is deprecated and will be removed in a future version. Use --content all instead.",
             DeprecationWarning,
             stacklevel=3,
         )
-        return [ContentType.ALL]
-    return [ContentType(v) for v in content_args] if content_args else [ContentType.CODE]
+        return ContentType.ALL
+    return ContentType(content_arg)
 
 
 def _cli_main() -> None:
diff --git a/src/semble/index/create.py b/src/semble/index/create.py
index cd23e5d..3a5a78a 100644
--- a/src/semble/index/create.py
+++ b/src/semble/index/create.py
@@ -15,27 +15,24 @@
 from semble.types import Chunk, ContentType, Encoder
 
 _MAX_FILE_BYTES = 1_000_000  # 1 MB max file size to read and index
-_DEFAULT_CONTENT: frozenset[ContentType] = frozenset({ContentType.CODE})
 _DEPRECATION_MSG = (
     "include_text_files is deprecated and will be removed in a future version. Use content=ContentType.ALL instead."
 )
 
 
-def _apply_include_text_files(
-    normalized: frozenset[ContentType], include_text_files: bool | None
-) -> frozenset[ContentType]:
+def _apply_include_text_files(content: ContentType, include_text_files: bool | None) -> ContentType:
     """Apply the deprecated include_text_files override, emitting a DeprecationWarning."""
     if include_text_files is None:
-        return normalized
+        return content
     warnings.warn(_DEPRECATION_MSG, DeprecationWarning, stacklevel=3)
-    return frozenset({ContentType.ALL}) if include_text_files else _DEFAULT_CONTENT
+    return ContentType.ALL if include_text_files else ContentType.CODE
 
 
 def create_index_from_path(
     path: Path,
     model: Encoder,
     extensions: Sequence[str] | None = None,
-    content: frozenset[ContentType] = _DEFAULT_CONTENT,
+    content: ContentType = ContentType.CODE,
     display_root: Path | None = None,
 ) -> tuple[bm25s.BM25, SelectableBasicBackend, list[Chunk]]:
     """Create an index from a resolved directory, optionally storing chunk paths relative to display_root.
@@ -43,7 +40,7 @@ def create_index_from_path(
     :param path: Resolved absolute path to index.
     :param model: The model to use for indexing.
     :param extensions: File extensions to include.
-    :param content: Content types to index.
+    :param content: Content type to index.
     :param display_root: If set, chunk file paths are stored relative to this root.
     :raises ValueError: if no items were found, no index can be created.
     :return: A bm25 index, vicinity index and list of chunks
diff --git a/src/semble/index/files.py b/src/semble/index/files.py
index 10ae819..b292e95 100644
--- a/src/semble/index/files.py
+++ b/src/semble/index/files.py
@@ -453,16 +453,15 @@ def detect_language(file_name: Path) -> str | None:
     return _EXTENSION_TO_LANGUAGE.get(file_name.suffix.lower())
 
 
-def get_extensions(content: frozenset[ContentType], extensions: Sequence[str] | None) -> list[str]:
+def get_extensions(content: ContentType, extensions: Sequence[str] | None) -> list[str]:
     """Returns a list of supported file extensions."""
-    if ContentType.ALL in content:
-        languages: frozenset[str] = ALL_LANGUAGES
+    languages: set[str] | frozenset[str]
+    if content == ContentType.ALL:
+        languages = ALL_LANGUAGES
+    elif content == ContentType.DOCS:
+        languages = _DOC_LANGUAGES
     else:
-        languages = frozenset()
-        if ContentType.CODE in content:
-            languages |= _CODE_LANGUAGES
-        if ContentType.DOCS in content:
-            languages |= _DOC_LANGUAGES
+        languages = _CODE_LANGUAGES
     all_extensions: set[str] = set()
     for language in languages:
         all_extensions.update(_LANGUAGE_TO_EXTENSION.get(language, set()))
diff --git a/src/semble/index/index.py b/src/semble/index/index.py
index 2169fd0..80789c3 100644
--- a/src/semble/index/index.py
+++ b/src/semble/index/index.py
@@ -11,11 +11,11 @@
 import numpy.typing as npt
 from bm25s import BM25
 
-from semble.index.create import _DEFAULT_CONTENT, _apply_include_text_files, create_index_from_path
+from semble.index.create import _apply_include_text_files, create_index_from_path
 from semble.index.dense import SelectableBasicBackend, load_model
 from semble.search import DEFAULT_DOCS_DIVERSITY, _search_semantic, search
 from semble.stats import save_search_stats
-from semble.types import CallType, Chunk, ContentType, Encoder, IndexStats, SearchResult, _normalize_content
+from semble.types import CallType, Chunk, ContentType, Encoder, IndexStats, SearchResult
 
 _GIT_CLONE_TIMEOUT = int(os.environ.get("SEMBLE_CLONE_TIMEOUT", 60))
 
@@ -30,7 +30,7 @@ def __init__(
         semantic_index: SelectableBasicBackend,
         chunks: list[Chunk],
         root: Path | None = None,
-        content: frozenset[ContentType] = _DEFAULT_CONTENT,
+        content: ContentType = ContentType.CODE,
     ) -> None:
         """Initialize a SembleIndex. Should be created with from_path or from_git.
 
@@ -46,7 +46,7 @@ def __init__(
         self._bm25_index: BM25 = bm25_index
         self._semantic_index: SelectableBasicBackend = semantic_index
         self._root: Path | None = root
-        self._content: frozenset[ContentType] = content
+        self._content: ContentType = content
         self._file_sizes: dict[str, int] = self._compute_file_sizes(root) if root else {}
         self._file_mapping, self._language_mapping = self._populate_mapping()
 
@@ -94,7 +94,7 @@ def from_path(
         path: str | Path,
         model: Encoder | None = None,
         extensions: Sequence[str] | None = None,
-        content: ContentType | Sequence[ContentType] = ContentType.CODE,
+        content: ContentType = ContentType.CODE,
         include_text_files: bool | None = None,
     ) -> SembleIndex:
         """Create and index a SembleIndex from a directory.
@@ -109,7 +109,7 @@ def from_path(
         :raises FileNotFoundError: If `path` does not exist.
         :raises NotADirectoryError: If `path` exists but is not a directory.
         """
-        normalized = _apply_include_text_files(_normalize_content(content), include_text_files)
+        normalized = _apply_include_text_files(content, include_text_files)
         model = model or load_model()
         path = Path(path)
         if not path.exists():
@@ -134,7 +134,7 @@ def from_git(
         ref: str | None = None,
         model: Encoder | None = None,
         extensions: Sequence[str] | None = None,
-        content: ContentType | Sequence[ContentType] = ContentType.CODE,
+        content: ContentType = ContentType.CODE,
         include_text_files: bool | None = None,
     ) -> SembleIndex:
         """Clone a git repository and index it.
@@ -154,7 +154,7 @@ def from_git(
         :return: An indexed SembleIndex. Chunk file paths are repo-relative (e.g. ``src/foo.py``).
         :raises RuntimeError: If git is not on PATH, the clone fails, or times out.
         """
-        normalized = _apply_include_text_files(_normalize_content(content), include_text_files)
+        normalized = _apply_include_text_files(content, include_text_files)
         with tempfile.TemporaryDirectory() as tmp_dir:
             # `--` prevents `url` from being interpreted as a git option (e.g. `--upload-pack=...`).
             cmd = ["git", "clone", "--depth", "1", *(["--branch", ref] if ref else []), "--", url, tmp_dir]
@@ -236,8 +236,8 @@ def search(
         if not self.chunks or not query.strip():
             return []
 
-        has_code = ContentType.CODE in self._content or ContentType.ALL in self._content
-        has_docs = ContentType.DOCS in self._content or ContentType.ALL in self._content
+        has_code = self._content in (ContentType.CODE, ContentType.ALL)
+        has_docs = self._content in (ContentType.DOCS, ContentType.ALL)
         resolved_rerank = has_code if rerank is None else rerank
         if diversity is None:
             resolved_diversity = DEFAULT_DOCS_DIVERSITY if has_docs else None
diff --git a/src/semble/mcp.py b/src/semble/mcp.py
index 97e1155..b6f8d14 100644
--- a/src/semble/mcp.py
+++ b/src/semble/mcp.py
@@ -3,7 +3,6 @@
 import asyncio
 import logging
 from collections import OrderedDict
-from collections.abc import Sequence
 from pathlib import Path
 from typing import Annotated
 
@@ -116,7 +115,7 @@ async def find_related(
 async def serve(
     path: str | None = None,
     ref: str | None = None,
-    content: ContentType | Sequence[ContentType] = ContentType.CODE,
+    content: ContentType = ContentType.CODE,
 ) -> None:
     """Start an MCP stdio server, optionally pre-indexing a default source."""
     model = await asyncio.to_thread(load_model)
@@ -133,7 +132,7 @@ async def serve(
 class _IndexCache:
     """Cache of indexed repos and local paths for the lifetime of the MCP server process."""
 
-    def __init__(self, model: Encoder, content: ContentType | Sequence[ContentType] = ContentType.CODE) -> None:
+    def __init__(self, model: Encoder, content: ContentType = ContentType.CODE) -> None:
         """Initialise an empty cache with a shared embedding model."""
         self._model = model
         self._content = content
diff --git a/src/semble/types.py b/src/semble/types.py
index f805f31..1da4c7f 100644
--- a/src/semble/types.py
+++ b/src/semble/types.py
@@ -24,13 +24,6 @@ class ContentType(str, Enum):
     ALL = "all"
 
 
-def _normalize_content(content: "ContentType | Sequence[ContentType]") -> frozenset[ContentType]:
-    """Normalize a single ContentType or sequence into a frozenset."""
-    if isinstance(content, ContentType):
-        return frozenset({content})
-    return frozenset(content)
-
-
 class Encoder(Protocol):
     """Protocol for embedding models."""
 
diff --git a/tests/test_files.py b/tests/test_files.py
index acda49c..1813908 100644
--- a/tests/test_files.py
+++ b/tests/test_files.py
@@ -23,12 +23,12 @@ def test_language_sets_are_consistent() -> None:
 @pytest.mark.parametrize(
     ("content", "includes", "excludes"),
     [
-        (frozenset({ContentType.CODE}), [".py"], [".md"]),
-        (frozenset({ContentType.DOCS}), [".md"], [".py"]),
-        (frozenset({ContentType.ALL}), [".py", ".md"], []),
+        (ContentType.CODE, [".py"], [".md"]),
+        (ContentType.DOCS, [".md"], [".py"]),
+        (ContentType.ALL, [".py", ".md"], []),
     ],
 )
-def test_get_extensions(content: frozenset[ContentType], includes: list[str], excludes: list[str]) -> None:
+def test_get_extensions(content: ContentType, includes: list[str], excludes: list[str]) -> None:
     """get_extensions returns the right extensions for each content type."""
     exts = set(get_extensions(content, None))
     for ext in includes:
@@ -37,20 +37,12 @@ def test_get_extensions(content: frozenset[ContentType], includes: list[str], ex
         assert ext not in exts
 
 
-def test_get_extensions_code_and_docs() -> None:
-    """Code + docs is the union of each individual set."""
-    code = set(get_extensions(frozenset({ContentType.CODE}), None))
-    docs = set(get_extensions(frozenset({ContentType.DOCS}), None))
-    combined = set(get_extensions(frozenset({ContentType.CODE, ContentType.DOCS}), None))
-    assert combined == code | docs
-
-
 def test_get_extensions_additional() -> None:
     """Extra extensions are appended and existing ones are not duplicated."""
-    base = get_extensions(frozenset({ContentType.ALL}), None)
-    with_extra = get_extensions(frozenset({ContentType.ALL}), [".kjs"])
+    base = get_extensions(ContentType.ALL, None)
+    with_extra = get_extensions(ContentType.ALL, [".kjs"])
     assert set(with_extra) == set(base) | {".kjs"}
 
-    base_code = get_extensions(frozenset({ContentType.CODE}), None)
-    with_existing = get_extensions(frozenset({ContentType.CODE}), [".py"])
+    base_code = get_extensions(ContentType.CODE, None)
+    with_existing = get_extensions(ContentType.CODE, [".py"])
     assert set(with_existing) == set(base_code)
diff --git a/tests/test_index.py b/tests/test_index.py
index 01d7fe0..f9435c1 100644
--- a/tests/test_index.py
+++ b/tests/test_index.py
@@ -22,16 +22,13 @@ def indexed_index(mock_model: Any, tmp_project: Path) -> SembleIndex:
         (ContentType.CODE, False),
         (ContentType.DOCS, True),
         (ContentType.ALL, True),
-        ([ContentType.CODE, ContentType.DOCS], True),
     ],
 )
 def test_index_markdown_inclusion(
-    mock_model: Encoder, tmp_project: Path, content: ContentType | list[ContentType], md_in_results: bool
+    mock_model: Encoder, tmp_project: Path, content: ContentType, md_in_results: bool
 ) -> None:
-    """Markdown files are excluded for code and included for docs/all/code+docs."""
-    from semble.types import _normalize_content
-
-    _, _, chunks = create_index_from_path(tmp_project, mock_model, content=_normalize_content(content))
+    """Markdown files are excluded for code and included for docs/all."""
+    _, _, chunks = create_index_from_path(tmp_project, mock_model, content=content)
     has_md = ".md" in {Path(c.file_path).suffix for c in chunks}
     assert has_md is md_in_results
 
diff --git a/tests/test_search.py b/tests/test_search.py
index 9bffdc3..698c465 100644
--- a/tests/test_search.py
+++ b/tests/test_search.py
@@ -192,7 +192,7 @@ def test_search_content_all_uses_both_pipelines(
     """ContentType.ALL activates both rerank and diversity defaults."""
     from semble import SembleIndex
 
-    index = SembleIndex(mock_model, bm25, semantic, chunks, content=frozenset({ContentType.ALL}))
+    index = SembleIndex(mock_model, bm25, semantic, chunks, content=ContentType.ALL)
     # rerank and diversity both resolve to True/set — results should come back without error
     results = index.search("authenticate", top_k=2)
     assert len(results) > 0

From 737eb1e7cbd4fdcc67ee6243e482139f69075c69 Mon Sep 17 00:00:00 2001
From: Pringled <thomas123@live.nl>
Date: Wed, 20 May 2026 19:30:59 +0200
Subject: [PATCH 04/11] Clean machine

---
 src/semble/index/create.py | 12 ------------
 src/semble/index/index.py  | 34 +++++++++++++++++++++++-----------
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/src/semble/index/create.py b/src/semble/index/create.py
index 3a5a78a..55ed253 100644
--- a/src/semble/index/create.py
+++ b/src/semble/index/create.py
@@ -1,5 +1,4 @@
 import contextlib
-import warnings
 from collections.abc import Sequence
 from pathlib import Path
 
@@ -15,17 +14,6 @@
 from semble.types import Chunk, ContentType, Encoder
 
 _MAX_FILE_BYTES = 1_000_000  # 1 MB max file size to read and index
-_DEPRECATION_MSG = (
-    "include_text_files is deprecated and will be removed in a future version. Use content=ContentType.ALL instead."
-)
-
-
-def _apply_include_text_files(content: ContentType, include_text_files: bool | None) -> ContentType:
-    """Apply the deprecated include_text_files override, emitting a DeprecationWarning."""
-    if include_text_files is None:
-        return content
-    warnings.warn(_DEPRECATION_MSG, DeprecationWarning, stacklevel=3)
-    return ContentType.ALL if include_text_files else ContentType.CODE
 
 
 def create_index_from_path(
diff --git a/src/semble/index/index.py b/src/semble/index/index.py
index 80789c3..73e2f2f 100644
--- a/src/semble/index/index.py
+++ b/src/semble/index/index.py
@@ -3,6 +3,7 @@
 import os
 import subprocess
 import tempfile
+import warnings
 from collections import defaultdict
 from collections.abc import Sequence
 from pathlib import Path
@@ -11,13 +12,24 @@
 import numpy.typing as npt
 from bm25s import BM25
 
-from semble.index.create import _apply_include_text_files, create_index_from_path
+from semble.index.create import create_index_from_path
 from semble.index.dense import SelectableBasicBackend, load_model
 from semble.search import DEFAULT_DOCS_DIVERSITY, _search_semantic, search
 from semble.stats import save_search_stats
 from semble.types import CallType, Chunk, ContentType, Encoder, IndexStats, SearchResult
 
 _GIT_CLONE_TIMEOUT = int(os.environ.get("SEMBLE_CLONE_TIMEOUT", 60))
+_INCLUDE_TEXT_FILES_DEPRECATION_MSG = (
+    "include_text_files is deprecated and will be removed in a future version. Use content=ContentType.ALL instead."
+)
+
+
+def _apply_include_text_files(content: ContentType, include_text_files: bool | None) -> ContentType:
+    """Apply the deprecated include_text_files override, emitting a DeprecationWarning."""
+    if include_text_files is None:
+        return content
+    warnings.warn(_INCLUDE_TEXT_FILES_DEPRECATION_MSG, DeprecationWarning, stacklevel=3)
+    return ContentType.ALL if include_text_files else ContentType.CODE
 
 
 class SembleIndex:
@@ -39,7 +51,7 @@ def __init__(
         :param semantic_index: The semantic index.
         :param chunks: The found chunks.
         :param root: Root directory used to read file sizes for token-savings stats.
-        :param content: Content types used when indexing; controls the search pipeline.
+        :param content: Content type used when indexing; controls the search pipeline.
         """
         self.model: Encoder = model
         self.chunks: list[Chunk] = chunks
@@ -102,10 +114,10 @@ def from_path(
         :param path: Root directory to index.
         :param model: Embedding model to use. Defaults to potion-code-16M.
         :param extensions: File extensions to include. Defaults to a standard set of code extensions.
-        :param content: Content type(s) to index — ``ContentType.CODE`` (default), ``ContentType.DOCS``,
-            ``ContentType.ALL``, or a list of multiple types.
-        :param include_text_files: Deprecated. Use ``content=ContentType.ALL`` instead.
-        :return: An indexed SembleIndex. Chunk file paths are relative to ``path``.
+        :param content: Content type to index: ContentType.CODE (default), ContentType.DOCS,
+            or ContentType.ALL.
+        :param include_text_files: Deprecated. Use content=ContentType.ALL instead.
+        :return: An indexed SembleIndex. Chunk file paths are relative to path.
         :raises FileNotFoundError: If `path` does not exist.
         :raises NotADirectoryError: If `path` exists but is not a directory.
         """
@@ -141,16 +153,16 @@ def from_git(
 
         The repository is cloned into a temporary directory that is removed once
         indexing finishes. Chunk content is preserved in-memory, but
-        ``chunk.file_path`` will not point to a readable file after this call
+        chunk.file_path will not point to a readable file after this call
         returns — it is a repo-relative label, not a filesystem path.
 
         :param url: URL of the git repository to clone (any git provider).
         :param ref: Branch or tag to check out. Defaults to the remote HEAD.
         :param model: Embedding model to use. Defaults to potion-code-16M.
         :param extensions: File extensions to include. Defaults to a standard set of code extensions.
-        :param content: Content type(s) to index — ``ContentType.CODE`` (default), ``ContentType.DOCS``,
-            ``ContentType.ALL``, or a list of multiple types.
-        :param include_text_files: Deprecated. Use ``content=ContentType.ALL`` instead.
+        :param content: Content type to index: ContentType.CODE (default), ContentType.DOCS,
+            or ContentType.ALL.
+        :param include_text_files: Deprecated. Use content=ContentType.ALL instead.
         :return: An indexed SembleIndex. Chunk file paths are repo-relative (e.g. ``src/foo.py``).
         :raises RuntimeError: If git is not on PATH, the clone fails, or times out.
         """
@@ -227,7 +239,7 @@ def search(
         :param filter_paths: Optional list of repo-relative file paths; if set, only
             chunks from these files are returned.
         :param rerank: Apply code-tuned reranking (file boost, identifier boost, path penalties).
-            Defaults to ``True`` when ``ContentType.CODE`` was indexed.
+            Defaults to ``True`` when ContentType.CODE was indexed.
         :param diversity: DPP diversity weight in [0, 1]; re-ranks with pyversity after reranking.
             ``None`` (default) auto-detects: uses ``DEFAULT_DOCS_DIVERSITY`` when docs were indexed.
             Pass ``0.0`` to disable diversity even on a docs index.

From 0325dd89126965a05e86b13e20bce8b428d44dcc Mon Sep 17 00:00:00 2001
From: Pringled <thomas123@live.nl>
Date: Thu, 21 May 2026 14:47:57 +0200
Subject: [PATCH 05/11] Drop pyversity, add docs

---
 README.md                     | 48 ++++++++++++++++++++++++++++-------
 pyproject.toml                |  1 -
 src/semble/agents/claude.md   | 14 +++++++---
 src/semble/agents/copilot.md  | 14 +++++++---
 src/semble/agents/cursor.md   | 14 +++++++---
 src/semble/agents/gemini.md   | 14 +++++++---
 src/semble/agents/kiro.md     | 14 +++++++---
 src/semble/agents/opencode.md | 14 +++++++---
 src/semble/cli.py             |  2 +-
 src/semble/index/index.py     | 14 ++--------
 src/semble/search.py          | 44 +++-----------------------------
 tests/test_index.py           |  6 -----
 tests/test_search.py          | 34 +++----------------------
 uv.lock                       | 15 -----------
 14 files changed, 115 insertions(+), 133 deletions(-)

diff --git a/README.md b/README.md
index d8b4341..9b14509 100644
--- a/README.md
+++ b/README.md
@@ -63,6 +63,13 @@ semble search "save_pretrained" ./my-project
 semble search "save model to disk" ./my-project --top-k 10
 ​```
 
+Use `--content docs` to search documentation and prose (markdown, rst, etc.) instead of code, or `--content all` to search everything:
+
+​```bash
+semble search "deployment guide" ./my-project --content docs
+semble search "authentication" ./my-project --content all
+​```
+
 Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result):
 
 ​```bash
@@ -76,9 +83,10 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac
 ### Workflow
 
 1. Start with `semble search` to find relevant chunks.
-2. Inspect full files only when the returned chunk is not enough context.
-3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
-4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
+2. Use `--content docs` when looking for documentation, READMEs, or prose files.
+3. Inspect full files only when the returned chunk is not enough context.
+4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
+5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
 ```
 
 </details>
@@ -287,6 +295,8 @@ Add to `~/.config/zed/settings.json` (or `.zed/settings.json` in your project):
 | `search` | Search a codebase with a natural-language or code query. Pass `repo` as a local directory path or an https:// git URL. |
 | `find_related` | Given a file path and line number, return chunks semantically similar to the code at that location. |
 
+By default the MCP server indexes only code files. To index documentation and prose as well, append `--content all` to the server command; to index documentation and prose instead of code, append `--content docs`. For example, in Claude Code: `claude mcp add semble -s user -- uvx --from "semble[mcp]" semble --content all`.
+
 
 <a id="bash-agentsmd"></a>
 
@@ -307,6 +317,13 @@ semble search "save_pretrained" ./my-project
 semble search "save model to disk" ./my-project --top-k 10
 ​```
 
+Use `--content docs` to search documentation and prose (markdown, rst, etc.) instead of code, or `--content all` to search everything:
+
+​```bash
+semble search "deployment guide" ./my-project --content docs
+semble search "authentication" ./my-project --content all
+​```
+
 Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result):
 
 ​```bash
@@ -320,9 +337,10 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac
 ## Workflow
 
 1. Start with `semble search` to find relevant chunks.
-2. Inspect full files only when the returned chunk is not enough context.
-3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
-4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
+2. Use `--content docs` when looking for documentation, READMEs, or prose files.
+3. Inspect full files only when the returned chunk is not enough context.
+4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
+5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
 ```
 
 ### Sub-agent setup
@@ -357,11 +375,17 @@ semble search "save model to disk" https://github.com/MinishLab/model2vec
 # Limit results
 semble search "save model to disk" ./my-project --top-k 10
 
+# Search docs and prose (markdown, rst, etc.) instead of code
+semble search "deployment guide" ./my-project --content docs
+
+# Search everything (code and docs)
+semble search "authentication" ./my-project --content all
+
 # Find code similar to a known location
 semble find-related src/auth.py 42 ./my-project
 ```
 
-`path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place.
+`--content` accepts `code` (default), `docs`, or `all`. `path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place.
 
 <details>
 <summary>Savings</summary>
@@ -395,11 +419,17 @@ Stats are stored in `~/.semble/savings.jsonl`.
 Semble can also be used as a Python library for programmatic access, useful when building custom tooling or integrating search directly into your own code.
 
 ```python
-from semble import SembleIndex
+from semble import ContentType, SembleIndex
 
-# Index a local directory
+# Index a local directory (code only, the default)
 index = SembleIndex.from_path("./my-project")
 
+# Index docs and prose (markdown, rst, etc.)
+index = SembleIndex.from_path("./my-project", content=ContentType.DOCS)
+
+# Index everything — code and docs
+index = SembleIndex.from_path("./my-project", content=ContentType.ALL)
+
 # Index a remote git repository
 index = SembleIndex.from_git("https://github.com/MinishLab/model2vec")
 
diff --git a/pyproject.toml b/pyproject.toml
index 022f032..a35cc46 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,6 @@ classifiers = [
 dependencies = [
     "model2vec>=0.4.0",
     "vicinity>=0.4.4",
-    "pyversity>=0.1.0",
     "numpy>=1.24.0",
     "bm25s>=0.2.0",
     "pathspec>=0.12",
diff --git a/src/semble/agents/claude.md b/src/semble/agents/claude.md
index 515d60e..82d3fd0 100644
--- a/src/semble/agents/claude.md
+++ b/src/semble/agents/claude.md
@@ -12,6 +12,13 @@ semble search "save_pretrained" ./my-project
 semble search "save model to disk" ./my-project --top-k 10
 ```
 
+Use `--content docs` to search documentation and prose (markdown, rst, etc.) instead of code, or `--content all` to search everything:
+
+```bash
+semble search "deployment guide" ./my-project --content docs
+semble search "authentication" ./my-project --content all
+```
+
 Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result):
 
 ```bash
@@ -25,6 +32,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac
 ## Workflow
 
 1. Start with `semble search` to find relevant chunks.
-2. Inspect full files only when the returned chunk is not enough context.
-3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
-4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
+2. Use `--content docs` when looking for documentation, READMEs, or prose files.
+3. Inspect full files only when the returned chunk is not enough context.
+4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
+5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
diff --git a/src/semble/agents/copilot.md b/src/semble/agents/copilot.md
index 515d60e..82d3fd0 100644
--- a/src/semble/agents/copilot.md
+++ b/src/semble/agents/copilot.md
@@ -12,6 +12,13 @@ semble search "save_pretrained" ./my-project
 semble search "save model to disk" ./my-project --top-k 10
 ```
 
+Use `--content docs` to search documentation and prose (markdown, rst, etc.) instead of code, or `--content all` to search everything:
+
+```bash
+semble search "deployment guide" ./my-project --content docs
+semble search "authentication" ./my-project --content all
+```
+
 Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result):
 
 ```bash
@@ -25,6 +32,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac
 ## Workflow
 
 1. Start with `semble search` to find relevant chunks.
-2. Inspect full files only when the returned chunk is not enough context.
-3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
-4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
+2. Use `--content docs` when looking for documentation, READMEs, or prose files.
+3. Inspect full files only when the returned chunk is not enough context.
+4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
+5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
diff --git a/src/semble/agents/cursor.md b/src/semble/agents/cursor.md
index 160aac4..62c9fb5 100644
--- a/src/semble/agents/cursor.md
+++ b/src/semble/agents/cursor.md
@@ -11,6 +11,13 @@ semble search "save_pretrained" ./my-project
 semble search "save model to disk" ./my-project --top-k 10
 ```
 
+Use `--content docs` to search documentation and prose (markdown, rst, etc.) instead of code, or `--content all` to search everything:
+
+```bash
+semble search "deployment guide" ./my-project --content docs
+semble search "authentication" ./my-project --content all
+```
+
 Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result):
 
 ```bash
@@ -24,6 +31,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac
 ## Workflow
 
 1. Start with `semble search` to find relevant chunks.
-2. Inspect full files only when the returned chunk is not enough context.
-3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
-4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
+2. Use `--content docs` when looking for documentation, READMEs, or prose files.
+3. Inspect full files only when the returned chunk is not enough context.
+4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
+5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
diff --git a/src/semble/agents/gemini.md b/src/semble/agents/gemini.md
index 1ea5440..359f69d 100644
--- a/src/semble/agents/gemini.md
+++ b/src/semble/agents/gemini.md
@@ -14,6 +14,13 @@ semble search "save_pretrained" ./my-project
 semble search "save model to disk" ./my-project --top-k 10
 ```
 
+Use `--content docs` to search documentation and prose (markdown, rst, etc.) instead of code, or `--content all` to search everything:
+
+```bash
+semble search "deployment guide" ./my-project --content docs
+semble search "authentication" ./my-project --content all
+```
+
 Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result):
 
 ```bash
@@ -27,6 +34,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac
 ## Workflow
 
 1. Start with `semble search` to find relevant chunks.
-2. Inspect full files only when the returned chunk is not enough context.
-3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
-4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
+2. Use `--content docs` when looking for documentation, READMEs, or prose files.
+3. Inspect full files only when the returned chunk is not enough context.
+4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
+5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
diff --git a/src/semble/agents/kiro.md b/src/semble/agents/kiro.md
index 5177ec5..48381d8 100644
--- a/src/semble/agents/kiro.md
+++ b/src/semble/agents/kiro.md
@@ -14,6 +14,13 @@ semble search "save_pretrained" ./my-project
 semble search "save model to disk" ./my-project --top-k 10
 ```
 
+Use `--content docs` to search documentation and prose (markdown, rst, etc.) instead of code, or `--content all` to search everything:
+
+```bash
+semble search "deployment guide" ./my-project --content docs
+semble search "authentication" ./my-project --content all
+```
+
 Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result):
 
 ```bash
@@ -27,6 +34,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac
 ## Workflow
 
 1. Start with `semble search` to find relevant chunks.
-2. Inspect full files only when the returned chunk is not enough context.
-3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
-4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
+2. Use `--content docs` when looking for documentation, READMEs, or prose files.
+3. Inspect full files only when the returned chunk is not enough context.
+4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
+5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
diff --git a/src/semble/agents/opencode.md b/src/semble/agents/opencode.md
index 2c51867..ea9561b 100644
--- a/src/semble/agents/opencode.md
+++ b/src/semble/agents/opencode.md
@@ -15,6 +15,13 @@ semble search "save_pretrained" ./my-project
 semble search "save model to disk" ./my-project --top-k 10
 ```
 
+Use `--content docs` to search documentation and prose (markdown, rst, etc.) instead of code, or `--content all` to search everything:
+
+```bash
+semble search "deployment guide" ./my-project --content docs
+semble search "authentication" ./my-project --content all
+```
+
 Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result):
 
 ```bash
@@ -28,6 +35,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac
 ## Workflow
 
 1. Start with `semble search` to find relevant chunks.
-2. Inspect full files only when the returned chunk is not enough context.
-3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
-4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
+2. Use `--content docs` when looking for documentation, READMEs, or prose files.
+3. Inspect full files only when the returned chunk is not enough context.
+4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
+5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
diff --git a/src/semble/cli.py b/src/semble/cli.py
index 3f2b9b7..5f3fd10 100644
--- a/src/semble/cli.py
+++ b/src/semble/cli.py
@@ -100,7 +100,7 @@ def _resolve_content(content_arg: str, include_text_files: bool) -> ContentType:
         warnings.warn(
             "--include-text-files is deprecated and will be removed in a future version. Use --content all instead.",
             DeprecationWarning,
-            stacklevel=3,
+            stacklevel=2,
         )
         return ContentType.ALL
     return ContentType(content_arg)
diff --git a/src/semble/index/index.py b/src/semble/index/index.py
index 73e2f2f..e574567 100644
--- a/src/semble/index/index.py
+++ b/src/semble/index/index.py
@@ -14,7 +14,7 @@
 
 from semble.index.create import create_index_from_path
 from semble.index.dense import SelectableBasicBackend, load_model
-from semble.search import DEFAULT_DOCS_DIVERSITY, _search_semantic, search
+from semble.search import _search_semantic, search
 from semble.stats import save_search_stats
 from semble.types import CallType, Chunk, ContentType, Encoder, IndexStats, SearchResult
 
@@ -28,7 +28,7 @@ def _apply_include_text_files(content: ContentType, include_text_files: bool | N
     """Apply the deprecated include_text_files override, emitting a DeprecationWarning."""
     if include_text_files is None:
         return content
-    warnings.warn(_INCLUDE_TEXT_FILES_DEPRECATION_MSG, DeprecationWarning, stacklevel=3)
+    warnings.warn(_INCLUDE_TEXT_FILES_DEPRECATION_MSG, DeprecationWarning, stacklevel=2)
     return ContentType.ALL if include_text_files else ContentType.CODE
 
 
@@ -226,7 +226,6 @@ def search(
         filter_languages: list[str] | None = None,
         filter_paths: list[str] | None = None,
         rerank: bool | None = None,
-        diversity: float | None = None,
     ) -> list[SearchResult]:
         """Search the index and return the top-k most relevant chunks.
 
@@ -240,21 +239,13 @@ def search(
             chunks from these files are returned.
         :param rerank: Apply code-tuned reranking (file boost, identifier boost, path penalties).
             Defaults to ``True`` when ContentType.CODE was indexed.
-        :param diversity: DPP diversity weight in [0, 1]; re-ranks with pyversity after reranking.
-            ``None`` (default) auto-detects: uses ``DEFAULT_DOCS_DIVERSITY`` when docs were indexed.
-            Pass ``0.0`` to disable diversity even on a docs index.
         :return: Ranked list of :class:`SearchResult` objects, best match first.
         """
         if not self.chunks or not query.strip():
             return []
 
         has_code = self._content in (ContentType.CODE, ContentType.ALL)
-        has_docs = self._content in (ContentType.DOCS, ContentType.ALL)
         resolved_rerank = has_code if rerank is None else rerank
-        if diversity is None:
-            resolved_diversity = DEFAULT_DOCS_DIVERSITY if has_docs else None
-        else:
-            resolved_diversity = diversity if diversity > 0 else None
 
         selector = self._get_selector_vector(filter_languages, filter_paths)
         results = search(
@@ -267,7 +258,6 @@ def search(
             alpha=alpha,
             selector=selector,
             rerank=resolved_rerank,
-            diversity=resolved_diversity,
         )
         save_search_stats(results, CallType.SEARCH, self._file_sizes)
         return results
diff --git a/src/semble/search.py b/src/semble/search.py
index 7b58b8a..3556bf8 100644
--- a/src/semble/search.py
+++ b/src/semble/search.py
@@ -1,7 +1,6 @@
 import bm25s
 import numpy as np
 import numpy.typing as npt
-from pyversity import Strategy, diversify
 
 from semble.index.dense import SelectableBasicBackend
 from semble.index.sparse import selector_to_mask
@@ -10,7 +9,6 @@
 from semble.types import Chunk, Encoder, SearchResult
 
 _RRF_K = 60
-DEFAULT_DOCS_DIVERSITY = 0.3
 
 
 def _rrf_scores(scores: dict[Chunk, float]) -> dict[Chunk, float]:
@@ -74,7 +72,6 @@ def search(
     alpha: float | None = None,
     selector: npt.NDArray[np.int_] | None = None,
     rerank: bool = True,
-    diversity: float | None = None,
 ) -> list[SearchResult]:
     """Hybrid search: alpha-weighted combination of semantic and BM25 scores.
 
@@ -90,8 +87,6 @@ def search(
     :param alpha: Weight for semantic score (1-alpha goes to BM25). None = auto-detect based on query type.
     :param selector: Optional array of chunk indices to filter results by.
     :param rerank: Whether to apply code-tuned reranking (file boost, identifier boost, path penalties).
-    :param diversity: DPP diversity weight in [0, 1]. When set, fetches 2× candidates, reranks,
-        then re-selects with pyversity DPP. None disables diversity.
     :return: List of search results sorted by combined score descending.
     """
     alpha_weight = resolve_alpha(query, alpha)
@@ -120,44 +115,11 @@ def search(
         for chunk in all_candidates
     }
 
-    # Over-fetch before reranking so diversity has candidates to choose from.
-    fetch_k = top_k * 2 if diversity is not None else top_k
-
     if rerank:
         boost_multi_chunk_files(combined_scores)
         combined_scores = apply_query_boost(combined_scores, query, chunks)
-        ranked = rerank_topk(combined_scores, fetch_k, penalise_paths=alpha_weight < 1.0)
+        ranked = rerank_topk(combined_scores, top_k, penalise_paths=alpha_weight < 1.0)
     else:
-        ranked = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:fetch_k]
-
-    results = [SearchResult(chunk=chunk, score=score) for chunk, score in ranked]
-
-    if diversity is not None:
-        return _diversify(results, top_k, diversity, semantic_index, chunks)
-    return results
+        ranked = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
 
-
-def _diversify(
-    results: list[SearchResult],
-    top_k: int,
-    diversity_weight: float,
-    semantic_index: SelectableBasicBackend,
-    chunks: list[Chunk],
-) -> list[SearchResult]:
-    """Re-rank results with DPP to improve embedding-space diversity."""
-    if len(results) <= top_k:
-        return results
-    chunk_index = {c: i for i, c in enumerate(chunks)}
-    valid = [r for r in results if r.chunk in chunk_index]
-    if len(valid) <= top_k:
-        return valid
-    indices = np.array([chunk_index[r.chunk] for r in valid])
-    scores = np.array([r.score for r in valid], dtype=np.float32)
-    result = diversify(
-        embeddings=semantic_index.vectors[indices],
-        scores=scores,
-        k=top_k,
-        strategy=Strategy.DPP,
-        diversity=diversity_weight,
-    )
-    return sorted((valid[i] for i in result.indices), key=lambda r: -r.score)
+    return [SearchResult(chunk=chunk, score=score) for chunk, score in ranked]
diff --git a/tests/test_index.py b/tests/test_index.py
index f9435c1..83ea57e 100644
--- a/tests/test_index.py
+++ b/tests/test_index.py
@@ -97,12 +97,6 @@ def test_search_with_filter_paths_does_not_crash(indexed_index: SembleIndex) ->
     assert all(r.chunk.file_path == target_path for r in results)
 
 
-def test_search_explicit_diversity(indexed_index: SembleIndex) -> None:
-    """Explicit diversity values are accepted; 0.0 disables diversity without error."""
-    assert len(indexed_index.search("authenticate", top_k=3, diversity=0.5)) > 0
-    assert len(indexed_index.search("authenticate", top_k=3, diversity=0.0)) > 0
-
-
 def test_search_without_reranking(indexed_index: SembleIndex) -> None:
     """Filtered search works regardless of where the selected chunk lives in the corpus."""
     with patch("semble.search.rerank_topk") as mock:
diff --git a/tests/test_search.py b/tests/test_search.py
index 89b5df0..46449e9 100644
--- a/tests/test_search.py
+++ b/tests/test_search.py
@@ -8,9 +8,9 @@
 from vicinity.backends.basic import BasicArgs
 
 from semble.index.dense import SelectableBasicBackend, embed_chunks, load_model
-from semble.search import DEFAULT_DOCS_DIVERSITY, _diversify, _search_bm25, _search_semantic, _sort_top_k, search
+from semble.search import _search_bm25, _search_semantic, _sort_top_k, search
 from semble.tokens import tokenize
-from semble.types import Chunk, ContentType, Encoder, SearchResult
+from semble.types import Chunk, ContentType, Encoder
 from tests.conftest import make_chunk
 
 
@@ -161,38 +161,12 @@ def test_selectable_basic_backend_rejects_k_below_one(
         semantic.query(embeddings[:1], k=0)
 
 
-def test_search_with_diversity(
+def test_search_content_all_uses_code_reranking(
     chunks: list[Chunk], semantic: SelectableBasicBackend, bm25: bm25s.BM25, mock_model: Any
 ) -> None:
-    """Search with diversity set runs DPP and returns top_k results."""
-    results = search("authenticate", mock_model, semantic, bm25, chunks, top_k=2, diversity=DEFAULT_DOCS_DIVERSITY)
-    assert len(results) == 2
-    assert all(r.score >= 0 for r in results)
-
-
-def test_diversify_fewer_results_than_top_k(chunks: list[Chunk], semantic: SelectableBasicBackend) -> None:
-    """_diversify returns early when results are already within top_k."""
-    results = [SearchResult(chunk=c, score=1.0 - i * 0.1) for i, c in enumerate(chunks[:2])]
-    out = _diversify(results, top_k=10, diversity_weight=DEFAULT_DOCS_DIVERSITY, semantic_index=semantic, chunks=chunks)
-    assert len(out) == 2
-
-
-def test_diversify_filters_unknown_chunks(chunks: list[Chunk], semantic: SelectableBasicBackend) -> None:
-    """_diversify returns early when valid results (chunks in index) fall within top_k."""
-    # 4 results but 3 reference unknown chunks not in the index → valid=1 ≤ top_k=3
-    unknown = [make_chunk(f"x = {i}", f"unknown_{i}.py") for i in range(3)]
-    results = [SearchResult(chunk=c, score=1.0 - i * 0.1) for i, c in enumerate([chunks[0]] + unknown)]
-    out = _diversify(results, top_k=3, diversity_weight=DEFAULT_DOCS_DIVERSITY, semantic_index=semantic, chunks=chunks)
-    assert len(out) == 1  # only the known chunk survives
-
-
-def test_search_content_all_uses_both_pipelines(
-    chunks: list[Chunk], semantic: SelectableBasicBackend, bm25: bm25s.BM25, mock_model: Any
-) -> None:
-    """ContentType.ALL activates both rerank and diversity defaults."""
+    """ContentType.ALL activates code reranking (has_code=True)."""
     from semble import SembleIndex
 
     index = SembleIndex(mock_model, bm25, semantic, chunks, content=ContentType.ALL)
-    # rerank and diversity both resolve to True/set — results should come back without error
     results = index.search("authenticate", top_k=2)
     assert len(results) > 0
diff --git a/uv.lock b/uv.lock
index fbbfb35..a45dbf4 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2456,19 +2456,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/f3/a2/43bbc5860b5034e2af4ef99a0e04d726ff329c43e192ef3abaa8d7ecfce5/python_multipart-0.0.28-py3-none-any.whl", hash = "sha256:10faac07eb966c3f48dc415f9dee46c04cb10d58d30a35677db8027c825ed9b6", size = 29438, upload-time = "2026-05-10T11:05:15.052Z" },
 ]
 
-[[package]]
-name = "pyversity"
-version = "0.2.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
-    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/d5/dc/a5a835b57ce06e21b4355d9e37ebb717455d55645014f1f5bde2fd948615/pyversity-0.2.0.tar.gz", hash = "sha256:48be2735b2471da1fa7497ea045aff25aa071f1176b86a011d4a49c01327ab6d", size = 28000, upload-time = "2026-02-02T08:06:38.563Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/fa/e2/7e928734b14944f164cb243cee7fb14229e208d25df022506aa5a2390bcc/pyversity-0.2.0-py3-none-any.whl", hash = "sha256:c28f6a1a3ccfb97a9439d345e80ba0e60ffd14aa7680f2db0af7a70e90aae3f9", size = 21229, upload-time = "2026-02-02T08:06:37.125Z" },
-]
-
 [[package]]
 name = "pywin32"
 version = "311"
@@ -3137,7 +3124,6 @@ dependencies = [
     { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
     { name = "pathspec" },
-    { name = "pyversity" },
     { name = "tree-sitter" },
     { name = "tree-sitter-language-pack" },
     { name = "vicinity" },
@@ -3182,7 +3168,6 @@ requires-dist = [
     { name = "pydoclint", marker = "extra == 'dev'", specifier = ">=0.5.3" },
     { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" },
     { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=5.0" },
-    { name = "pyversity", specifier = ">=0.1.0" },
     { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.9.0" },
     { name = "sentence-transformers", marker = "extra == 'benchmark'", specifier = ">=3.0" },
     { name = "tiktoken", marker = "extra == 'benchmark'", specifier = ">=0.7" },

From a0bad8999bf32054b99049124fd637971420fd97 Mon Sep 17 00:00:00 2001
From: Pringled <thomas123@live.nl>
Date: Thu, 21 May 2026 14:52:52 +0200
Subject: [PATCH 06/11] Rework languages

---
 src/semble/index/files.py |  5 ++---
 src/semble/index/index.py | 10 ++++++----
 tests/test_files.py       |  8 ++++----
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/semble/index/files.py b/src/semble/index/files.py
index b292e95..a9a5492 100644
--- a/src/semble/index/files.py
+++ b/src/semble/index/files.py
@@ -380,8 +380,7 @@
     "vimdoc",
 }
 
-# Everything that is not a programming language — used to derive _CODE_LANGUAGES.
-_NON_CODE_LANGUAGES = _DOC_LANGUAGES | {
+_CONFIG_LANGUAGES = {
     "beancount",
     "capnp",
     "cedarschema",
@@ -444,7 +443,7 @@ def _inv_mapping(mapping: dict[str, str]) -> dict[str, list[str]]:
 
 
 ALL_LANGUAGES = frozenset(_EXTENSION_TO_LANGUAGE.values())
-_CODE_LANGUAGES = ALL_LANGUAGES - _NON_CODE_LANGUAGES
+_CODE_LANGUAGES = ALL_LANGUAGES - _DOC_LANGUAGES - _CONFIG_LANGUAGES
 _LANGUAGE_TO_EXTENSION = _inv_mapping(_EXTENSION_TO_LANGUAGE)
 
 
diff --git a/src/semble/index/index.py b/src/semble/index/index.py
index e574567..d070c88 100644
--- a/src/semble/index/index.py
+++ b/src/semble/index/index.py
@@ -19,16 +19,18 @@
 from semble.types import CallType, Chunk, ContentType, Encoder, IndexStats, SearchResult
 
 _GIT_CLONE_TIMEOUT = int(os.environ.get("SEMBLE_CLONE_TIMEOUT", 60))
-_INCLUDE_TEXT_FILES_DEPRECATION_MSG = (
-    "include_text_files is deprecated and will be removed in a future version. Use content=ContentType.ALL instead."
-)
 
 
 def _apply_include_text_files(content: ContentType, include_text_files: bool | None) -> ContentType:
     """Apply the deprecated include_text_files override, emitting a DeprecationWarning."""
     if include_text_files is None:
         return content
-    warnings.warn(_INCLUDE_TEXT_FILES_DEPRECATION_MSG, DeprecationWarning, stacklevel=2)
+    warnings.warn(
+        "include_text_files is deprecated and will be removed in a future version."
+        " Use content=ContentType.ALL instead.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
     return ContentType.ALL if include_text_files else ContentType.CODE
 
 
diff --git a/tests/test_files.py b/tests/test_files.py
index 1813908..bcb747f 100644
--- a/tests/test_files.py
+++ b/tests/test_files.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from semble.index.files import _CODE_LANGUAGES, _DOC_LANGUAGES, _NON_CODE_LANGUAGES, detect_language, get_extensions
+from semble.index.files import _CODE_LANGUAGES, _CONFIG_LANGUAGES, _DOC_LANGUAGES, detect_language, get_extensions
 from semble.types import ContentType
 
 
@@ -14,10 +14,10 @@ def test_detect_language() -> None:
 
 
 def test_language_sets_are_consistent() -> None:
-    """Code, doc, and non-code language sets satisfy their mutual invariants."""
+    """Code, doc, and config language sets are mutually disjoint."""
     assert _CODE_LANGUAGES.isdisjoint(_DOC_LANGUAGES)
-    assert _CODE_LANGUAGES.isdisjoint(_NON_CODE_LANGUAGES)
-    assert _DOC_LANGUAGES <= _NON_CODE_LANGUAGES
+    assert _CODE_LANGUAGES.isdisjoint(_CONFIG_LANGUAGES)
+    assert _DOC_LANGUAGES.isdisjoint(_CONFIG_LANGUAGES)
 
 
 @pytest.mark.parametrize(

From b345683b48a42681f782ec3b4ec83ce39623b28f Mon Sep 17 00:00:00 2001
From: Pringled <thomas123@live.nl>
Date: Thu, 21 May 2026 14:57:55 +0200
Subject: [PATCH 07/11] Simplify search logic

---
 src/semble/index/index.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/semble/index/index.py b/src/semble/index/index.py
index d070c88..81d37cf 100644
--- a/src/semble/index/index.py
+++ b/src/semble/index/index.py
@@ -246,8 +246,7 @@ def search(
         if not self.chunks or not query.strip():
             return []
 
-        has_code = self._content in (ContentType.CODE, ContentType.ALL)
-        resolved_rerank = has_code if rerank is None else rerank
+        resolved_rerank = (self._content != ContentType.DOCS) if rerank is None else rerank
 
         selector = self._get_selector_vector(filter_languages, filter_paths)
         results = search(

From 38f97dc0d9f9bab702a0c447f25d6fd9048f6247 Mon Sep 17 00:00:00 2001
From: Pringled <thomas123@live.nl>
Date: Thu, 21 May 2026 15:04:45 +0200
Subject: [PATCH 08/11] Drop sphinx style params in docstrings

---
 src/semble/index/index.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/semble/index/index.py b/src/semble/index/index.py
index 81d37cf..91d0771 100644
--- a/src/semble/index/index.py
+++ b/src/semble/index/index.py
@@ -165,7 +165,7 @@ def from_git(
         :param content: Content type to index: ContentType.CODE (default), ContentType.DOCS,
             or ContentType.ALL.
         :param include_text_files: Deprecated. Use content=ContentType.ALL instead.
-        :return: An indexed SembleIndex. Chunk file paths are repo-relative (e.g. ``src/foo.py``).
+        :return: An indexed SembleIndex. Chunk file paths are repo-relative (e.g. src/foo.py).
         :raises RuntimeError: If git is not on PATH, the clone fails, or times out.
         """
         normalized = _apply_include_text_files(content, include_text_files)
@@ -234,14 +234,14 @@ def search(
         :param query: Natural-language or keyword query string.
         :param top_k: Maximum number of results to return.
         :param alpha: Blend weight for hybrid score combination; 1.0 = full semantic
-            weight, 0.0 = full BM25 weight. ``None`` auto-detects from query type.
+            weight, 0.0 = full BM25 weight. None auto-detects from query type.
         :param filter_languages: Optional list of language codes; if set, only chunks in
             these languages are returned.
         :param filter_paths: Optional list of repo-relative file paths; if set, only
             chunks from these files are returned.
         :param rerank: Apply code-tuned reranking (file boost, identifier boost, path penalties).
-            Defaults to ``True`` when ContentType.CODE was indexed.
-        :return: Ranked list of :class:`SearchResult` objects, best match first.
+            Defaults to True unless only ContentType.DOCS was indexed.
+        :return: Ranked list of SearchResult objects, best match first.
         """
         if not self.chunks or not query.strip():
             return []

From 55823b294cbc4757e15a7ce2004d8194a4159345 Mon Sep 17 00:00:00 2001
From: Pringled <thomas123@live.nl>
Date: Thu, 21 May 2026 15:28:25 +0200
Subject: [PATCH 09/11] Update tests

---
 tests/test_index.py   | 21 +++++++++++++++++++++
 tests/test_ranking.py |  5 +++++
 tests/test_search.py  | 13 +------------
 3 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/tests/test_index.py b/tests/test_index.py
index 83ea57e..5abf90e 100644
--- a/tests/test_index.py
+++ b/tests/test_index.py
@@ -107,6 +107,27 @@ def test_search_without_reranking(indexed_index: SembleIndex) -> None:
         mock.assert_called()
 
 
+@pytest.mark.parametrize(
+    ("content", "expect_rerank"),
+    [
+        (ContentType.CODE, True),
+        (ContentType.ALL, True),
+        (ContentType.DOCS, False),
+    ],
+)
+def test_search_rerank_default_by_content_type(
+    mock_model: Encoder, tmp_project: Path, content: ContentType, expect_rerank: bool
+) -> None:
+    """Reranking is on by default for code/all content, off for docs-only."""
+    index = SembleIndex.from_path(tmp_project, model=mock_model, content=content)
+    with patch("semble.search.rerank_topk") as mock:
+        index.search("function", top_k=3)
+        if expect_rerank:
+            mock.assert_called()
+        else:
+            mock.assert_not_called()
+
+
 @pytest.mark.parametrize("query", ["", "   ", "\n\n"])
 def test_search_empty_query_returns_empty(indexed_index: SembleIndex, query: str) -> None:
     """Empty / whitespace-only queries return [] across all modes."""
diff --git a/tests/test_ranking.py b/tests/test_ranking.py
index 8caf26b..f510b79 100644
--- a/tests/test_ranking.py
+++ b/tests/test_ranking.py
@@ -148,3 +148,8 @@ def test_boost_multi_chunk_files() -> None:
     scores: dict = {c1: 1.0, c2: 0.8, c3: 1.0}
     boost_multi_chunk_files(scores)
     assert scores[c1] > 1.0
+
+
+def test_boosting_with_empty() -> None:
+    """apply_query_boost with empty inputs returns an empty dict."""
+    assert apply_query_boost({}, "query", []) == {}
diff --git a/tests/test_search.py b/tests/test_search.py
index 46449e9..1bfaa7c 100644
--- a/tests/test_search.py
+++ b/tests/test_search.py
@@ -10,7 +10,7 @@
 from semble.index.dense import SelectableBasicBackend, embed_chunks, load_model
 from semble.search import _search_bm25, _search_semantic, _sort_top_k, search
 from semble.tokens import tokenize
-from semble.types import Chunk, ContentType, Encoder
+from semble.types import Chunk, Encoder
 from tests.conftest import make_chunk
 
 
@@ -159,14 +159,3 @@ def test_selectable_basic_backend_rejects_k_below_one(
     """SelectableBasicBackend.query guards against k < 1."""
     with pytest.raises(ValueError, match="k should be >= 1"):
         semantic.query(embeddings[:1], k=0)
-
-
-def test_search_content_all_uses_code_reranking(
-    chunks: list[Chunk], semantic: SelectableBasicBackend, bm25: bm25s.BM25, mock_model: Any
-) -> None:
-    """ContentType.ALL activates code reranking (has_code=True)."""
-    from semble import SembleIndex
-
-    index = SembleIndex(mock_model, bm25, semantic, chunks, content=ContentType.ALL)
-    results = index.search("authenticate", top_k=2)
-    assert len(results) > 0

From c993df77c5af35cc42de04ae6a2c30813a2097f5 Mon Sep 17 00:00:00 2001
From: Pringled <thomas123@live.nl>
Date: Thu, 21 May 2026 15:31:27 +0200
Subject: [PATCH 10/11] Update tests

---
 tests/test_ranking.py | 5 +++--
 tests/test_search.py  | 6 +++---
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/tests/test_ranking.py b/tests/test_ranking.py
index f510b79..6a33397 100644
--- a/tests/test_ranking.py
+++ b/tests/test_ranking.py
@@ -151,5 +151,6 @@ def test_boost_multi_chunk_files() -> None:
 
 
 def test_boosting_with_empty() -> None:
-    """apply_query_boost with empty inputs returns an empty dict."""
-    assert apply_query_boost({}, "query", []) == {}
+    """Test that boosting with empty inputs returns an empty dict."""
+    boosted = apply_query_boost({}, "query", [])
+    assert boosted == {}
diff --git a/tests/test_search.py b/tests/test_search.py
index 1bfaa7c..2f40fa6 100644
--- a/tests/test_search.py
+++ b/tests/test_search.py
@@ -107,7 +107,7 @@ def test_search_hybrid(
         (lambda q, m, s, b, c, k: search(q, m, s, b, c, k), "login", 4),
     ],
 )
-def test_search_returns_results(
+def test_search_source_labels(
     search_fn: Any,
     query: str,
     top_k: int,
@@ -116,14 +116,14 @@ def test_search_returns_results(
     bm25: bm25s.BM25,
     mock_model: Any,
 ) -> None:
-    """BM25, semantic, and hybrid search all return at least one result for a matching query."""
+    """Each search mode returns at least one result for a matching query."""
     results = search_fn(query, mock_model, semantic, bm25, chunks, top_k)
     assert len(results) > 0
 
 
 def test_sort_top_k() -> None:
     """_sort_top_k returns the same indices as np.argsort(-x)[:top_k]."""
-    gen = np.random.default_rng(42)
+    gen = np.random.default_rng()
     x = gen.standard_normal(size=(10000,))
     top_k = 100
     indices = _sort_top_k(x, top_k)

From 93b1abf84e34d837fb123c71821b15a1b5e50efa Mon Sep 17 00:00:00 2001
From: Pringled <thomas123@live.nl>
Date: Thu, 21 May 2026 15:37:03 +0200
Subject: [PATCH 11/11] Update docstrings

---
 src/semble/search.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/semble/search.py b/src/semble/search.py
index 3556bf8..5b88229 100644
--- a/src/semble/search.py
+++ b/src/semble/search.py
@@ -86,7 +86,7 @@ def search(
     :param top_k: Number of results to return.
     :param alpha: Weight for semantic score (1-alpha goes to BM25). None = auto-detect based on query type.
     :param selector: Optional array of chunk indices to filter results by.
-    :param rerank: Whether to apply code-tuned reranking (file boost, identifier boost, path penalties).
+    :param rerank: Whether to perform code-tuned reranking. Defaults to True; callers disable it for docs search.
     :return: List of search results sorted by combined score descending.
     """
     alpha_weight = resolve_alpha(query, alpha)
@@ -104,7 +104,8 @@ def search(
     normalized_semantic = _rrf_scores(semantic_scores)
     normalized_bm25 = _rrf_scores(bm25_scores)
 
-    # Sort by the file path and start line to counteract randomness from hashing.
+    # Sort by the file path and start line to
+    # counteract randomness introduced by hashing.
     all_candidates = sorted(
         {*normalized_semantic, *normalized_bm25},
         key=lambda c: c.start_line,
@@ -116,10 +117,13 @@ def search(
     }
 
     if rerank:
+        # Boost files with multiple relevant chunks.
         boost_multi_chunk_files(combined_scores)
+        # Boost queries with specific identifiers in them.
         combined_scores = apply_query_boost(combined_scores, query, chunks)
+        # Rerank the top-k results by applying path-based penalties.
         ranked = rerank_topk(combined_scores, top_k, penalise_paths=alpha_weight < 1.0)
     else:
-        ranked = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
-
+        sorted_by_score = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
+        ranked = sorted_by_score[:top_k]
     return [SearchResult(chunk=chunk, score=score) for chunk, score in ranked]