From ebd2401f257cd1d140710aacc2b42a455ac6ed09 Mon Sep 17 00:00:00 2001 From: Pringled Date: Wed, 20 May 2026 18:58:51 +0200 Subject: [PATCH 01/11] Add content-specific search --- pyproject.toml | 1 + src/semble/__init__.py | 3 +- src/semble/cli.py | 60 +++++++++++++++++++++----------- src/semble/index/create.py | 28 ++++++++++++--- src/semble/index/files.py | 58 ++++++++++++++++++------------- src/semble/index/index.py | 70 +++++++++++++++++++++++++++----------- src/semble/mcp.py | 20 ++++++----- src/semble/search.py | 54 +++++++++++++++++++++++------ src/semble/types.py | 20 ++++++++++- tests/test_cli.py | 20 +++++++++++ tests/test_files.py | 63 +++++++++++++++++++++------------- tests/test_index.py | 41 ++++++++++++++++++---- tests/test_ranking.py | 6 ---- tests/test_search.py | 47 ++++++++++++++++++++++--- uv.lock | 17 ++++++++- 15 files changed, 376 insertions(+), 132 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c30236d..a326ab1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ classifiers = [ dependencies = [ "model2vec>=0.4.0", "vicinity>=0.4.4", + "pyversity>=0.1.0", "numpy>=1.24.0", "bm25s>=0.2.0", "pathspec>=0.12", diff --git a/src/semble/__init__.py b/src/semble/__init__.py index ef61bdf..136f345 100644 --- a/src/semble/__init__.py +++ b/src/semble/__init__.py @@ -1,9 +1,10 @@ from semble.index import SembleIndex -from semble.types import Chunk, EmbeddingMatrix, Encoder, IndexStats, SearchResult +from semble.types import Chunk, ContentType, EmbeddingMatrix, Encoder, IndexStats, SearchResult from semble.version import __version__ __all__ = [ "Chunk", + "ContentType", "EmbeddingMatrix", "Encoder", "IndexStats", diff --git a/src/semble/cli.py b/src/semble/cli.py index 9ee2df7..8f47998 100644 --- a/src/semble/cli.py +++ b/src/semble/cli.py @@ -1,6 +1,7 @@ import argparse import asyncio import sys +import warnings from enum import Enum from importlib.resources import files from importlib.util import find_spec @@ -10,8 +11,11 
@@ from semble.index import SembleIndex from semble.stats import format_savings_report +from semble.types import ContentType from semble.utils import _format_results, _is_git_url, _resolve_chunk +_CONTENT_CHOICES = [ct.value for ct in ContentType] + class Agent(str, Enum): CLAUDE = "claude" @@ -32,6 +36,23 @@ def _agent_path(agent: Agent) -> Path: return Path(base_dir) / "agents" / "semble-search.md" +def _add_content_args(p: argparse.ArgumentParser) -> None: + """Add --content and deprecated --include-text-files to a subparser.""" + p.add_argument( + "--content", + action="append", + default=None, + choices=_CONTENT_CHOICES, + metavar="TYPE", + help="Content type(s) to index: 'code' (default), 'docs', 'all'. Repeatable: --content code --content docs.", + ) + p.add_argument( + "--include-text-files", + action="store_true", + help="Deprecated. Use --content all instead.", + ) + + def main() -> None: """Entry point for the semble command-line tool.""" if len(sys.argv) > 1 and sys.argv[1] in _CLI_DISPATCH_ARGS: @@ -52,18 +73,15 @@ def _mcp_main() -> None: help="Local directory or git URL to pre-index at startup (optional).", ) parser.add_argument("--ref", default=None, help="Branch or tag to check out (git URLs only).") - parser.add_argument( - "--include-text-files", - action="store_true", - help="Also index non-code text files (.md, .yaml, .json, etc.).", - ) + _add_content_args(parser) args = parser.parse_args() if any(find_spec(dep) is None for dep in get_package_extras("semble", "mcp")): print("MCP dependencies are not installed. 
Run: pip install 'semble[mcp]'", file=sys.stderr) raise SystemExit(1) from semble.mcp import serve - asyncio.run(serve(args.path, ref=args.ref, include_text_files=args.include_text_files)) + content = _resolve_content(args.content, args.include_text_files) + asyncio.run(serve(args.path, ref=args.ref, content=content)) def _run_init(*, agent: Agent = _DEFAULT_AGENT, force: bool = False) -> None: @@ -78,6 +96,18 @@ def _run_init(*, agent: Agent = _DEFAULT_AGENT, force: bool = False) -> None: print(f"Created {dest}") +def _resolve_content(content_args: list[str] | None, include_text_files: bool) -> list[ContentType]: + """Resolve --content values and the deprecated --include-text-files into a list of ContentType.""" + if include_text_files: + warnings.warn( + "--include-text-files is deprecated and will be removed in a future version. Use --content all instead.", + DeprecationWarning, + stacklevel=3, + ) + return [ContentType.ALL] + return [ContentType(v) for v in content_args] if content_args else [ContentType.CODE] + + def _cli_main() -> None: parser = argparse.ArgumentParser(prog="semble") sub = parser.add_subparsers(dest="command") @@ -86,22 +116,14 @@ def _cli_main() -> None: search_p.add_argument("query", help="Natural language or code query.") search_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") search_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).") - search_p.add_argument( - "--include-text-files", - action="store_true", - help="Also index non-code text files (.md, .yaml, .json, etc.).", - ) + _add_content_args(search_p) related_p = sub.add_parser("find-related", help="Find code similar to a specific location.") related_p.add_argument("file_path", help="File path as shown in search results.") related_p.add_argument("line", type=int, help="Line number (1-indexed).") related_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: 
current directory).") related_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).") - related_p.add_argument( - "--include-text-files", - action="store_true", - help="Also index non-code text files (.md, .yaml, .json, etc.).", - ) + _add_content_args(related_p) init_p = sub.add_parser("init", help="Write a semble sub-agent file for your coding agent.") init_p.add_argument( @@ -126,11 +148,11 @@ def _cli_main() -> None: print(format_savings_report(verbose=args.verbose), end="") return - include_text = args.include_text_files + content = _resolve_content(args.content, args.include_text_files) index = ( - SembleIndex.from_git(args.path, include_text_files=include_text) + SembleIndex.from_git(args.path, content=content) if _is_git_url(args.path) - else SembleIndex.from_path(args.path, include_text_files=include_text) + else SembleIndex.from_path(args.path, content=content) ) if args.command == "search": diff --git a/src/semble/index/create.py b/src/semble/index/create.py index 168f8ef..1dd2609 100644 --- a/src/semble/index/create.py +++ b/src/semble/index/create.py @@ -1,4 +1,5 @@ import contextlib +import warnings from collections.abc import Sequence from pathlib import Path @@ -11,31 +12,48 @@ from semble.index.files import detect_language, get_extensions from semble.index.sparse import enrich_for_bm25 from semble.tokens import tokenize -from semble.types import Chunk, Encoder +from semble.types import Chunk, ContentType, Encoder _MAX_FILE_BYTES = 1_000_000 # 1 MB max file size to read and index +_DEFAULT_CONTENT: frozenset[ContentType] = frozenset({ContentType.CODE}) +_DEPRECATION_MSG = ( + "include_text_files is deprecated and will be removed in a future version. Use content=ContentType.ALL instead." 
+) + + +def _apply_include_text_files( + normalized: frozenset[ContentType], include_text_files: bool | None +) -> frozenset[ContentType]: + """Apply the deprecated include_text_files override, emitting a DeprecationWarning.""" + if include_text_files is None: + return normalized + warnings.warn(_DEPRECATION_MSG, DeprecationWarning, stacklevel=3) + return frozenset({ContentType.ALL}) if include_text_files else _DEFAULT_CONTENT def create_index_from_path( path: Path, model: Encoder, extensions: Sequence[str] | None = None, - include_text_files: bool = False, + content: frozenset[ContentType] = _DEFAULT_CONTENT, display_root: Path | None = None, + include_text_files: bool | None = None, ) -> tuple[bm25s.BM25, SelectableBasicBackend, list[Chunk]]: """Create an index from a resolved directory, optionally storing chunk paths relative to display_root. :param path: Resolved absolute path to index. :param model: The model to use for indexing. :param extensions: File extensions to include. - :param include_text_files: If True, also index non-code text files (.md, .yaml, .json, etc.). + :param content: Content types to index. :param display_root: If set, chunk file paths are stored relative to this root. + :param include_text_files: Deprecated. Use ``content=ContentType.ALL`` instead. :raises ValueError: if no items were found, no index can be created. 
:return: A bm25 index, vicinity index and list of chunks """ + content = _apply_include_text_files(content, include_text_files) chunks: list[Chunk] = [] - extensions = get_extensions(include_text_files, extensions) - for file_path in walk_files(path, extensions): + resolved_extensions = get_extensions(content, extensions) + for file_path in walk_files(path, resolved_extensions): language = detect_language(file_path) with contextlib.suppress(OSError): if file_path.stat().st_size > _MAX_FILE_BYTES: diff --git a/src/semble/index/files.py b/src/semble/index/files.py index e79d7c7..10ae819 100644 --- a/src/semble/index/files.py +++ b/src/semble/index/files.py @@ -2,6 +2,8 @@ from collections.abc import Sequence from pathlib import Path +from semble.types import ContentType + _EXTENSION_TO_LANGUAGE = { ".4th": "forth", ".ada": "ada", @@ -357,8 +359,30 @@ _DOC_LANGUAGES = { "asciidoc", - "beancount", "bibtex", + "djot", + "doxygen", + "html", + "javadoc", + "jsdoc", + "latex", + "luadoc", + "markdown", + "markdown_inline", + "mermaid", + "norg", + "norg_meta", + "org", + "phpdoc", + "po", + "rst", + "rtf", + "vimdoc", +} + +# Everything that is not a programming language — used to derive _CODE_LANGUAGES. 
+_NON_CODE_LANGUAGES = _DOC_LANGUAGES | { + "beancount", "capnp", "cedarschema", "comment", @@ -368,8 +392,6 @@ "desktop", "devicetree", "diff", - "djot", - "doxygen", "dtd", "editorconfig", "ebnf", @@ -384,33 +406,18 @@ "gpg", "hjson", "hocon", - "html", "ini", - "javadoc", - "jsdoc", "json", "json5", "kdl", - "latex", "ledger", - "luadoc", - "markdown", - "markdown_inline", - "mermaid", - "norg", - "norg_meta", - "org", "pem", "pgn", - "phpdoc", - "po", "properties", "proto", "psv", "requirements", "ron", - "rst", - "rtf", "smithy", "ssh_config", "textproto", @@ -420,7 +427,6 @@ "tsv", "turtle", "typespec", - "vimdoc", "wit", "xcompose", "xml", @@ -438,7 +444,7 @@ def _inv_mapping(mapping: dict[str, str]) -> dict[str, list[str]]: ALL_LANGUAGES = frozenset(_EXTENSION_TO_LANGUAGE.values()) -_WITHOUT_DOC = ALL_LANGUAGES - _DOC_LANGUAGES +_CODE_LANGUAGES = ALL_LANGUAGES - _NON_CODE_LANGUAGES _LANGUAGE_TO_EXTENSION = _inv_mapping(_EXTENSION_TO_LANGUAGE) @@ -447,12 +453,16 @@ def detect_language(file_name: Path) -> str | None: return _EXTENSION_TO_LANGUAGE.get(file_name.suffix.lower()) -def get_extensions(include_text_files: bool, extensions: Sequence[str] | None) -> list[str]: +def get_extensions(content: frozenset[ContentType], extensions: Sequence[str] | None) -> list[str]: """Returns a list of supported file extensions.""" - if include_text_files: - languages = ALL_LANGUAGES + if ContentType.ALL in content: + languages: frozenset[str] = ALL_LANGUAGES else: - languages = _WITHOUT_DOC + languages = frozenset() + if ContentType.CODE in content: + languages |= _CODE_LANGUAGES + if ContentType.DOCS in content: + languages |= _DOC_LANGUAGES all_extensions: set[str] = set() for language in languages: all_extensions.update(_LANGUAGE_TO_EXTENSION.get(language, set())) diff --git a/src/semble/index/index.py b/src/semble/index/index.py index 56d51e3..d29b5a2 100644 --- a/src/semble/index/index.py +++ b/src/semble/index/index.py @@ -11,11 +11,22 @@ import numpy.typing as npt 
from bm25s import BM25 -from semble.index.create import create_index_from_path +from semble.index.create import _DEFAULT_CONTENT, _apply_include_text_files, create_index_from_path from semble.index.dense import SelectableBasicBackend, load_model -from semble.search import _search_semantic, search +from semble.search import DEFAULT_DOCS_DIVERSITY, _search_semantic, search from semble.stats import save_search_stats -from semble.types import CallType, Chunk, Encoder, IndexStats, SearchResult +from semble.types import ( + CallType, + Chunk, + ContentSelection, + ContentType, + Encoder, + IndexStats, + SearchResult, + normalize_content, +) + +_UNSET = object() _GIT_CLONE_TIMEOUT = int(os.environ.get("SEMBLE_CLONE_TIMEOUT", 60)) @@ -30,6 +41,7 @@ def __init__( semantic_index: SelectableBasicBackend, chunks: list[Chunk], root: Path | None = None, + content: frozenset[ContentType] = _DEFAULT_CONTENT, ) -> None: """Initialize a SembleIndex. Should be created with from_path or from_git. @@ -38,12 +50,14 @@ def __init__( :param semantic_index: The semantic index. :param chunks: The found chunks. :param root: Root directory used to read file sizes for token-savings stats. + :param content: Content types used when indexing; controls the search pipeline. """ self.model: Encoder = model self.chunks: list[Chunk] = chunks self._bm25_index: BM25 = bm25_index self._semantic_index: SelectableBasicBackend = semantic_index self._root: Path | None = root + self._content: frozenset[ContentType] = content self._file_sizes: dict[str, int] = self._compute_file_sizes(root) if root else {} self._file_mapping, self._language_mapping = self._populate_mapping() @@ -91,18 +105,22 @@ def from_path( path: str | Path, model: Encoder | None = None, extensions: Sequence[str] | None = None, - include_text_files: bool = False, + content: ContentSelection = ContentType.CODE, + include_text_files: bool | None = None, ) -> SembleIndex: """Create and index a SembleIndex from a directory. 
:param path: Root directory to index. :param model: Embedding model to use. Defaults to potion-code-16M. :param extensions: File extensions to include. Defaults to a standard set of code extensions. - :param include_text_files: If True, also index non-code text files (.md, .yaml, .json, etc.). + :param content: Content type(s) to index — ``ContentType.CODE`` (default), ``ContentType.DOCS``, + ``ContentType.ALL``, or a list of multiple types. + :param include_text_files: Deprecated. Use ``content=ContentType.ALL`` instead. :return: An indexed SembleIndex. Chunk file paths are relative to ``path``. :raises FileNotFoundError: If `path` does not exist. :raises NotADirectoryError: If `path` exists but is not a directory. """ + normalized = _apply_include_text_files(normalize_content(content), include_text_files) model = model or load_model() path = Path(path) if not path.exists(): @@ -114,11 +132,11 @@ def from_path( path, model=model, extensions=extensions, - include_text_files=include_text_files, + content=normalized, display_root=path, ) - return SembleIndex(model, bm25, vicinity, chunks, root=path) + return SembleIndex(model, bm25, vicinity, chunks, root=path, content=normalized) @classmethod def from_git( @@ -127,7 +145,8 @@ def from_git( ref: str | None = None, model: Encoder | None = None, extensions: Sequence[str] | None = None, - include_text_files: bool = False, + content: ContentSelection = ContentType.CODE, + include_text_files: bool | None = None, ) -> SembleIndex: """Clone a git repository and index it. @@ -140,10 +159,13 @@ def from_git( :param ref: Branch or tag to check out. Defaults to the remote HEAD. :param model: Embedding model to use. Defaults to potion-code-16M. :param extensions: File extensions to include. Defaults to a standard set of code extensions. - :param include_text_files: If True, also index non-code text files (.md, .yaml, .json, etc.). 
+ :param content: Content type(s) to index — ``ContentType.CODE`` (default), ``ContentType.DOCS``, + ``ContentType.ALL``, or a list of multiple types. + :param include_text_files: Deprecated. Use ``content=ContentType.ALL`` instead. :return: An indexed SembleIndex. Chunk file paths are repo-relative (e.g. ``src/foo.py``). :raises RuntimeError: If git is not on PATH, the clone fails, or times out. """ + normalized = _apply_include_text_files(normalize_content(content), include_text_files) with tempfile.TemporaryDirectory() as tmp_dir: # `--` prevents `url` from being interpreted as a git option (e.g. `--upload-pack=...`). cmd = ["git", "clone", "--depth", "1", *(["--branch", ref] if ref else []), "--", url, tmp_dir] @@ -163,11 +185,11 @@ def from_git( resolved_path, model=model, extensions=extensions, - include_text_files=include_text_files, + content=normalized, display_root=resolved_path, ) - return SembleIndex(model, bm25, vicinity, chunks, root=resolved_path) + return SembleIndex(model, bm25, vicinity, chunks, root=resolved_path, content=normalized) def find_related(self, source: Chunk | SearchResult, *, top_k: int = 5) -> list[SearchResult]: """Return chunks semantically similar to the given chunk or search result. @@ -202,38 +224,46 @@ def search( alpha: float | None = None, filter_languages: list[str] | None = None, filter_paths: list[str] | None = None, - rerank: bool = True, + rerank: bool | None = None, + diversity: float | None = _UNSET, # type: ignore[assignment] ) -> list[SearchResult]: """Search the index and return the top-k most relevant chunks. :param query: Natural-language or keyword query string. :param top_k: Maximum number of results to return. :param alpha: Blend weight for hybrid score combination; 1.0 = full semantic - weight, 0.0 = full BM25 weight. File-path penalties and diversity reranking - are applied regardless. ``None`` auto-detects from query type. + weight, 0.0 = full BM25 weight. ``None`` auto-detects from query type. 
:param filter_languages: Optional list of language codes; if set, only chunks in these languages are returned. :param filter_paths: Optional list of repo-relative file paths; if set, only chunks from these files are returned. - :param rerank: Whether to rerank the top-k results using custom reranking logic. + :param rerank: Apply code-tuned reranking (file boost, identifier boost, path penalties). + Defaults to ``True`` when ``ContentType.CODE`` was indexed. + :param diversity: DPP diversity weight in [0, 1]; re-ranks with pyversity after reranking. + Defaults to ``DEFAULT_DOCS_DIVERSITY`` when ``ContentType.DOCS`` was indexed. Pass + ``None`` explicitly to disable. :return: Ranked list of :class:`SearchResult` objects, best match first. """ - bm25_index, semantic_index = self._bm25_index, self._semantic_index if not self.chunks or not query.strip(): return [] - selector = self._get_selector_vector(filter_languages, filter_paths) + has_code = ContentType.CODE in self._content or ContentType.ALL in self._content + has_docs = ContentType.DOCS in self._content or ContentType.ALL in self._content + resolved_rerank = has_code if rerank is None else rerank + resolved_diversity = (DEFAULT_DOCS_DIVERSITY if has_docs else None) if diversity is _UNSET else diversity + selector = self._get_selector_vector(filter_languages, filter_paths) results = search( query, self.model, - semantic_index, - bm25_index, + self._semantic_index, + self._bm25_index, self.chunks, top_k, alpha=alpha, selector=selector, - rerank=rerank, + rerank=resolved_rerank, + diversity=resolved_diversity, ) save_search_stats(results, CallType.SEARCH, self._file_sizes) return results diff --git a/src/semble/mcp.py b/src/semble/mcp.py index a9c533d..ca90897 100644 --- a/src/semble/mcp.py +++ b/src/semble/mcp.py @@ -12,7 +12,7 @@ from semble.index import SembleIndex from semble.index.dense import load_model -from semble.types import Encoder +from semble.types import ContentSelection, ContentType, Encoder, 
normalize_content from semble.utils import _format_results, _is_git_url, _resolve_chunk logger = logging.getLogger(__name__) @@ -112,10 +112,14 @@ async def find_related( return server -async def serve(path: str | None = None, ref: str | None = None, include_text_files: bool = False) -> None: +async def serve( + path: str | None = None, + ref: str | None = None, + content: ContentSelection = ContentType.CODE, +) -> None: """Start an MCP stdio server, optionally pre-indexing a default source.""" model = await asyncio.to_thread(load_model) - cache = _IndexCache(model=model, include_text_files=include_text_files) + cache = _IndexCache(model=model, content=normalize_content(content)) if path: await cache.get(path, ref=ref) if not _is_git_url(path): @@ -128,10 +132,10 @@ async def serve(path: str | None = None, ref: str | None = None, include_text_fi class _IndexCache: """Cache of indexed repos and local paths for the lifetime of the MCP server process.""" - def __init__(self, model: Encoder, include_text_files: bool = False) -> None: + def __init__(self, model: Encoder, content: frozenset[ContentType] = frozenset({ContentType.CODE})) -> None: """Initialise an empty cache with a shared embedding model.""" self._model = model - self._include_text_files = include_text_files + self._content = content self._tasks: OrderedDict[str, asyncio.Task[SembleIndex]] = OrderedDict() # ordered for LRU eviction self._watcher_task: asyncio.Task[None] | None = None @@ -175,14 +179,12 @@ async def get(self, source: str, ref: str | None = None) -> SembleIndex: source, ref=ref, model=self._model, - include_text_files=self._include_text_files, + content=self._content, ) ) else: self._tasks[cache_key] = asyncio.create_task( - asyncio.to_thread( - SembleIndex.from_path, cache_key, model=self._model, include_text_files=self._include_text_files - ) + asyncio.to_thread(SembleIndex.from_path, cache_key, model=self._model, content=self._content) ) task = self._tasks[cache_key] try: diff --git 
a/src/semble/search.py b/src/semble/search.py index d0a9d25..7b58b8a 100644 --- a/src/semble/search.py +++ b/src/semble/search.py @@ -1,6 +1,7 @@ import bm25s import numpy as np import numpy.typing as npt +from pyversity import Strategy, diversify from semble.index.dense import SelectableBasicBackend from semble.index.sparse import selector_to_mask @@ -9,6 +10,7 @@ from semble.types import Chunk, Encoder, SearchResult _RRF_K = 60 +DEFAULT_DOCS_DIVERSITY = 0.3 def _rrf_scores(scores: dict[Chunk, float]) -> dict[Chunk, float]: @@ -72,6 +74,7 @@ def search( alpha: float | None = None, selector: npt.NDArray[np.int_] | None = None, rerank: bool = True, + diversity: float | None = None, ) -> list[SearchResult]: """Hybrid search: alpha-weighted combination of semantic and BM25 scores. @@ -86,7 +89,9 @@ def search( :param top_k: Number of results to return. :param alpha: Weight for semantic score (1-alpha goes to BM25). None = auto-detect based on query type. :param selector: Optional array of chunk indices to filter results by. - :param rerank: Whether to perform reranking. This should be done, and is mainly here for benchmarking. + :param rerank: Whether to apply code-tuned reranking (file boost, identifier boost, path penalties). + :param diversity: DPP diversity weight in [0, 1]. When set, fetches 2× candidates, reranks, + then re-selects with pyversity DPP. None disables diversity. :return: List of search results sorted by combined score descending. """ alpha_weight = resolve_alpha(query, alpha) @@ -104,8 +109,7 @@ def search( normalized_semantic = _rrf_scores(semantic_scores) normalized_bm25 = _rrf_scores(bm25_scores) - # Sort by the file path and start line to - # counteract randomness introduces by hashing. + # Sort by the file path and start line to counteract randomness from hashing. 
all_candidates = sorted( {*normalized_semantic, *normalized_bm25}, key=lambda c: c.start_line, @@ -116,14 +120,44 @@ def search( for chunk in all_candidates } + # Over-fetch before reranking so diversity has candidates to choose from. + fetch_k = top_k * 2 if diversity is not None else top_k + if rerank: - # Boost files with multiple relevant chunks. boost_multi_chunk_files(combined_scores) - # Boost queries with specific identifiers in them. combined_scores = apply_query_boost(combined_scores, query, chunks) - # Rerank the top-k results by applying path-based penalties. - ranked = rerank_topk(combined_scores, top_k, penalise_paths=alpha_weight < 1.0) + ranked = rerank_topk(combined_scores, fetch_k, penalise_paths=alpha_weight < 1.0) else: - sorted_by_score = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True) - ranked = sorted_by_score[:top_k] - return [SearchResult(chunk=chunk, score=score) for chunk, score in ranked] + ranked = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:fetch_k] + + results = [SearchResult(chunk=chunk, score=score) for chunk, score in ranked] + + if diversity is not None: + return _diversify(results, top_k, diversity, semantic_index, chunks) + return results + + +def _diversify( + results: list[SearchResult], + top_k: int, + diversity_weight: float, + semantic_index: SelectableBasicBackend, + chunks: list[Chunk], +) -> list[SearchResult]: + """Re-rank results with DPP to improve embedding-space diversity.""" + if len(results) <= top_k: + return results + chunk_index = {c: i for i, c in enumerate(chunks)} + valid = [r for r in results if r.chunk in chunk_index] + if len(valid) <= top_k: + return valid + indices = np.array([chunk_index[r.chunk] for r in valid]) + scores = np.array([r.score for r in valid], dtype=np.float32) + result = diversify( + embeddings=semantic_index.vectors[indices], + scores=scores, + k=top_k, + strategy=Strategy.DPP, + diversity=diversity_weight, + ) + return sorted((valid[i] for i in 
result.indices), key=lambda r: -r.score) diff --git a/src/semble/types.py b/src/semble/types.py index d01c774..ca32fac 100644 --- a/src/semble/types.py +++ b/src/semble/types.py @@ -1,4 +1,4 @@ -from collections.abc import Sequence +from collections.abc import Iterable, Sequence from dataclasses import dataclass, field from enum import Enum from typing import Any, Protocol, TypeAlias @@ -16,6 +16,24 @@ class CallType(str, Enum): FIND_RELATED = "find_related" +class ContentType(str, Enum): + """Content type for indexing and search pipeline selection.""" + + CODE = "code" + DOCS = "docs" + ALL = "all" + + +ContentSelection: TypeAlias = "ContentType | Iterable[ContentType]" + + +def normalize_content(content: ContentSelection) -> frozenset[ContentType]: + """Normalize a single ContentType or iterable of ContentType into a frozenset.""" + if isinstance(content, ContentType): + return frozenset({content}) + return frozenset(content) + + class Encoder(Protocol): """Protocol for embedding models.""" diff --git a/tests/test_cli.py b/tests/test_cli.py index 2c71a72..6f9928e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -193,6 +193,26 @@ def test_mcp_main_exits_with_message_when_extras_missing( assert "pip install 'semble[mcp]'" in capsys.readouterr().err +def test_include_text_files_cli_deprecated( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """--include-text-files on CLI raises DeprecationWarning.""" + import warnings + + chunk = make_chunk("def foo(): pass", "src/foo.py") + fake_index = MagicMock() + fake_index.search.return_value = [SearchResult(chunk=chunk, score=0.9)] + monkeypatch.setattr(sys, "argv", ["semble", "search", "query", "/some/path", "--include-text-files"]) + with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + _cli_main() + assert any( + "include-text-files" in str(w.message).lower() for w in 
caught if issubclass(w.category, DeprecationWarning) + ) + + def test_agent_file_tools_are_bash_only() -> None: """The agent file must list only Bash and Read — no MCP tools that require schema loading.""" frontmatter = _CLAUDE_AGENT_FILE.split("---")[1] diff --git a/tests/test_files.py b/tests/test_files.py index 71ede4c..acda49c 100644 --- a/tests/test_files.py +++ b/tests/test_files.py @@ -1,6 +1,9 @@ from pathlib import Path -from semble.index.files import _DOC_LANGUAGES, _EXTENSION_TO_LANGUAGE, detect_language, get_extensions +import pytest + +from semble.index.files import _CODE_LANGUAGES, _DOC_LANGUAGES, _NON_CODE_LANGUAGES, detect_language, get_extensions +from semble.types import ContentType def test_detect_language() -> None: @@ -10,32 +13,44 @@ def test_detect_language() -> None: assert detect_language(Path("c.txt")) is None -def test_get_extensions() -> None: - """Test the get_extensions function.""" - all_extensions = get_extensions(True, None) - without_doc_extensions = get_extensions(False, None) - - doc_extensions = set(all_extensions) - set(without_doc_extensions) - - for extension in doc_extensions: - assert _EXTENSION_TO_LANGUAGE[extension] in _DOC_LANGUAGES - for extension in without_doc_extensions: - assert _EXTENSION_TO_LANGUAGE[extension] not in _DOC_LANGUAGES +def test_language_sets_are_consistent() -> None: + """Code, doc, and non-code language sets satisfy their mutual invariants.""" + assert _CODE_LANGUAGES.isdisjoint(_DOC_LANGUAGES) + assert _CODE_LANGUAGES.isdisjoint(_NON_CODE_LANGUAGES) + assert _DOC_LANGUAGES <= _NON_CODE_LANGUAGES -def test_get_extensions_additional() -> None: - """Test the get_extensions function.""" - all_extensions = get_extensions(True, None) - all_extensions_extra = get_extensions(True, [".kjs"]) - - assert set(all_extensions_extra) == set(all_extensions) | {".kjs"} +@pytest.mark.parametrize( + ("content", "includes", "excludes"), + [ + (frozenset({ContentType.CODE}), [".py"], [".md"]), + 
(frozenset({ContentType.DOCS}), [".md"], [".py"]), + (frozenset({ContentType.ALL}), [".py", ".md"], []), + ], +) +def test_get_extensions(content: frozenset[ContentType], includes: list[str], excludes: list[str]) -> None: + """get_extensions returns the right extensions for each content type.""" + exts = set(get_extensions(content, None)) + for ext in includes: + assert ext in exts + for ext in excludes: + assert ext not in exts - all_extensions = get_extensions(False, None) - all_extensions_extra = get_extensions(False, [".kjs"]) - assert set(all_extensions_extra) == set(all_extensions) | {".kjs"} +def test_get_extensions_code_and_docs() -> None: + """Code + docs is the union of each individual set.""" + code = set(get_extensions(frozenset({ContentType.CODE}), None)) + docs = set(get_extensions(frozenset({ContentType.DOCS}), None)) + combined = set(get_extensions(frozenset({ContentType.CODE, ContentType.DOCS}), None)) + assert combined == code | docs - all_extensions = get_extensions(False, None) - all_extensions_extra = get_extensions(False, [".py"]) - assert set(all_extensions_extra) == set(all_extensions) +def test_get_extensions_additional() -> None: + """Extra extensions are appended and existing ones are not duplicated.""" + base = get_extensions(frozenset({ContentType.ALL}), None) + with_extra = get_extensions(frozenset({ContentType.ALL}), [".kjs"]) + assert set(with_extra) == set(base) | {".kjs"} + + base_code = get_extensions(frozenset({ContentType.CODE}), None) + with_existing = get_extensions(frozenset({ContentType.CODE}), [".py"]) + assert set(with_existing) == set(base_code) diff --git a/tests/test_index.py b/tests/test_index.py index 3f90fcb..2a064ae 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -1,12 +1,12 @@ from pathlib import Path from typing import Any -from unittest.mock import patch +from unittest.mock import MagicMock, patch import pytest from semble import SembleIndex from semble.index.create import _MAX_FILE_BYTES, 
create_index_from_path -from semble.types import Encoder +from semble.types import ContentType, Encoder from tests.conftest import make_chunk @@ -17,18 +17,45 @@ def indexed_index(mock_model: Any, tmp_project: Path) -> SembleIndex: @pytest.mark.parametrize( - ("include_text_files", "md_in_results"), - [(False, False), (True, True)], + ("content", "md_in_results"), + [ + (ContentType.CODE, False), + (ContentType.DOCS, True), + (ContentType.ALL, True), + ([ContentType.CODE, ContentType.DOCS], True), + ], ) def test_index_markdown_inclusion( - mock_model: Encoder, tmp_project: Path, include_text_files: bool, md_in_results: bool + mock_model: Encoder, tmp_project: Path, content: ContentType | list[ContentType], md_in_results: bool ) -> None: - """Markdown files are excluded by default and included when include_text_files=True.""" - _, _, chunks = create_index_from_path(tmp_project, mock_model, include_text_files=include_text_files) + """Markdown files are excluded for code and included for docs/all/code+docs.""" + from semble.types import normalize_content + + _, _, chunks = create_index_from_path(tmp_project, mock_model, content=normalize_content(content)) has_md = ".md" in {Path(c.file_path).suffix for c in chunks} assert has_md is md_in_results +@pytest.mark.parametrize("include_text_files", [True, False]) +def test_include_text_files_deprecated(mock_model: Encoder, tmp_project: Path, include_text_files: bool) -> None: + """include_text_files raises DeprecationWarning on create_index_from_path and from_path.""" + with pytest.warns(DeprecationWarning, match="include_text_files is deprecated"): + create_index_from_path(tmp_project, mock_model, include_text_files=include_text_files) + with pytest.warns(DeprecationWarning, match="include_text_files is deprecated"): + SembleIndex.from_path(tmp_project, model=mock_model, include_text_files=include_text_files) + + +def test_from_git_include_text_files_deprecated(mock_model: Encoder, tmp_project: Path) -> None: + 
"""from_git raises DeprecationWarning when include_text_files is passed.""" + fake_result = MagicMock() + fake_result.returncode = 0 + with patch("subprocess.run", return_value=fake_result): + with patch("semble.index.index.create_index_from_path") as mock_create: + mock_create.return_value = (MagicMock(), MagicMock(), [make_chunk("x = 1", "f.py")]) + with pytest.warns(DeprecationWarning, match="include_text_files is deprecated"): + SembleIndex.from_git("https://example.com/repo", model=mock_model, include_text_files=True) + + def test_index_empty_returns_zero_chunks(mock_model: Encoder, tmp_path: Path) -> None: """Indexing an empty directory yields zero files and chunks.""" with pytest.raises(ValueError): diff --git a/tests/test_ranking.py b/tests/test_ranking.py index 6a33397..8caf26b 100644 --- a/tests/test_ranking.py +++ b/tests/test_ranking.py @@ -148,9 +148,3 @@ def test_boost_multi_chunk_files() -> None: scores: dict = {c1: 1.0, c2: 0.8, c3: 1.0} boost_multi_chunk_files(scores) assert scores[c1] > 1.0 - - -def test_boosting_with_empty() -> None: - """Test that boosting with empty chunks return None.""" - boosted = apply_query_boost({}, "query", []) - assert boosted == {} diff --git a/tests/test_search.py b/tests/test_search.py index 56bd2f1..9bffdc3 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -8,9 +8,9 @@ from vicinity.backends.basic import BasicArgs from semble.index.dense import SelectableBasicBackend, embed_chunks, load_model -from semble.search import _search_bm25, _search_semantic, _sort_top_k, search +from semble.search import DEFAULT_DOCS_DIVERSITY, _diversify, _search_bm25, _search_semantic, _sort_top_k, search from semble.tokens import tokenize -from semble.types import Chunk, Encoder +from semble.types import Chunk, ContentType, Encoder, SearchResult from tests.conftest import make_chunk @@ -107,7 +107,7 @@ def test_search_hybrid( (lambda q, m, s, b, c, k: search(q, m, s, b, c, k), "login", 4), ], ) -def 
test_search_source_labels( +def test_search_returns_results( search_fn: Any, query: str, top_k: int, @@ -116,14 +116,14 @@ def test_search_source_labels( bm25: bm25s.BM25, mock_model: Any, ) -> None: - """Each result carries a source label matching the search mode used.""" + """BM25, semantic, and hybrid search all return at least one result for a matching query.""" results = search_fn(query, mock_model, semantic, bm25, chunks, top_k) assert len(results) > 0 def test_sort_top_k() -> None: """_sort_top_k returns the same indices as np.argsort(-x)[:top_k].""" - gen = np.random.default_rng() + gen = np.random.default_rng(42) x = gen.standard_normal(size=(10000,)) top_k = 100 indices = _sort_top_k(x, top_k) @@ -159,3 +159,40 @@ def test_selectable_basic_backend_rejects_k_below_one( """SelectableBasicBackend.query guards against k < 1.""" with pytest.raises(ValueError, match="k should be >= 1"): semantic.query(embeddings[:1], k=0) + + +def test_search_with_diversity( + chunks: list[Chunk], semantic: SelectableBasicBackend, bm25: bm25s.BM25, mock_model: Any +) -> None: + """Search with diversity set runs DPP and returns top_k results.""" + results = search("authenticate", mock_model, semantic, bm25, chunks, top_k=2, diversity=DEFAULT_DOCS_DIVERSITY) + assert len(results) == 2 + assert all(r.score >= 0 for r in results) + + +def test_diversify_fewer_results_than_top_k(chunks: list[Chunk], semantic: SelectableBasicBackend) -> None: + """_diversify returns early when results are already within top_k.""" + results = [SearchResult(chunk=c, score=1.0 - i * 0.1) for i, c in enumerate(chunks[:2])] + out = _diversify(results, top_k=10, diversity_weight=DEFAULT_DOCS_DIVERSITY, semantic_index=semantic, chunks=chunks) + assert len(out) == 2 + + +def test_diversify_filters_unknown_chunks(chunks: list[Chunk], semantic: SelectableBasicBackend) -> None: + """_diversify returns early when valid results (chunks in index) fall within top_k.""" + # 4 results but 3 reference unknown chunks 
not in the index → valid=1 ≤ top_k=3 + unknown = [make_chunk(f"x = {i}", f"unknown_{i}.py") for i in range(3)] + results = [SearchResult(chunk=c, score=1.0 - i * 0.1) for i, c in enumerate([chunks[0]] + unknown)] + out = _diversify(results, top_k=3, diversity_weight=DEFAULT_DOCS_DIVERSITY, semantic_index=semantic, chunks=chunks) + assert len(out) == 1 # only the known chunk survives + + +def test_search_content_all_uses_both_pipelines( + chunks: list[Chunk], semantic: SelectableBasicBackend, bm25: bm25s.BM25, mock_model: Any +) -> None: + """ContentType.ALL activates both rerank and diversity defaults.""" + from semble import SembleIndex + + index = SembleIndex(mock_model, bm25, semantic, chunks, content=frozenset({ContentType.ALL})) + # rerank and diversity both resolve to True/set — results should come back without error + results = index.search("authenticate", top_k=2) + assert len(results) > 0 diff --git a/uv.lock b/uv.lock index 1d492fe..6dac6b6 100644 --- a/uv.lock +++ b/uv.lock @@ -10,7 +10,7 @@ resolution-markers = [ [options] exclude-newer = "0001-01-01T00:00:00Z" # This has no effect and is included for backwards compatibility when using relative exclude-newer values. 
-exclude-newer-span = "P3D" +exclude-newer-span = "P1W" [[package]] name = "annotated-doc" @@ -2456,6 +2456,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f3/a2/43bbc5860b5034e2af4ef99a0e04d726ff329c43e192ef3abaa8d7ecfce5/python_multipart-0.0.28-py3-none-any.whl", hash = "sha256:10faac07eb966c3f48dc415f9dee46c04cb10d58d30a35677db8027c825ed9b6", size = 29438, upload-time = "2026-05-10T11:05:15.052Z" }, ] +[[package]] +name = "pyversity" +version = "0.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d5/dc/a5a835b57ce06e21b4355d9e37ebb717455d55645014f1f5bde2fd948615/pyversity-0.2.0.tar.gz", hash = "sha256:48be2735b2471da1fa7497ea045aff25aa071f1176b86a011d4a49c01327ab6d", size = 28000, upload-time = "2026-02-02T08:06:38.563Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/e2/7e928734b14944f164cb243cee7fb14229e208d25df022506aa5a2390bcc/pyversity-0.2.0-py3-none-any.whl", hash = "sha256:c28f6a1a3ccfb97a9439d345e80ba0e60ffd14aa7680f2db0af7a70e90aae3f9", size = 21229, upload-time = "2026-02-02T08:06:37.125Z" }, +] + [[package]] name = "pywin32" version = "311" @@ -3124,6 +3137,7 @@ dependencies = [ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "pathspec" }, + { name = "pyversity" }, { name = "tree-sitter" }, { name = "tree-sitter-language-pack" }, { name = "vicinity" }, @@ -3168,6 +3182,7 @@ requires-dist = [ { name = "pydoclint", marker = "extra == 'dev'", specifier 
= ">=0.5.3" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" }, { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=5.0" }, + { name = "pyversity", specifier = ">=0.1.0" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.9.0" }, { name = "sentence-transformers", marker = "extra == 'benchmark'", specifier = ">=3.0" }, { name = "tiktoken", marker = "extra == 'benchmark'", specifier = ">=0.7" }, From f26f30def07ed66ab4cba0949e70f96e83e5c609 Mon Sep 17 00:00:00 2001 From: Pringled Date: Wed, 20 May 2026 19:11:18 +0200 Subject: [PATCH 02/11] Add content-specific search --- src/semble/index/create.py | 3 --- src/semble/index/index.py | 32 ++++++++++++-------------------- src/semble/mcp.py | 9 +++++---- src/semble/types.py | 9 +++------ tests/test_index.py | 14 +++++++++----- 5 files changed, 29 insertions(+), 38 deletions(-) diff --git a/src/semble/index/create.py b/src/semble/index/create.py index 1dd2609..cd23e5d 100644 --- a/src/semble/index/create.py +++ b/src/semble/index/create.py @@ -37,7 +37,6 @@ def create_index_from_path( extensions: Sequence[str] | None = None, content: frozenset[ContentType] = _DEFAULT_CONTENT, display_root: Path | None = None, - include_text_files: bool | None = None, ) -> tuple[bm25s.BM25, SelectableBasicBackend, list[Chunk]]: """Create an index from a resolved directory, optionally storing chunk paths relative to display_root. @@ -46,11 +45,9 @@ def create_index_from_path( :param extensions: File extensions to include. :param content: Content types to index. :param display_root: If set, chunk file paths are stored relative to this root. - :param include_text_files: Deprecated. Use ``content=ContentType.ALL`` instead. :raises ValueError: if no items were found, no index can be created. 
:return: A bm25 index, vicinity index and list of chunks """ - content = _apply_include_text_files(content, include_text_files) chunks: list[Chunk] = [] resolved_extensions = get_extensions(content, extensions) for file_path in walk_files(path, resolved_extensions): diff --git a/src/semble/index/index.py b/src/semble/index/index.py index d29b5a2..2169fd0 100644 --- a/src/semble/index/index.py +++ b/src/semble/index/index.py @@ -15,18 +15,7 @@ from semble.index.dense import SelectableBasicBackend, load_model from semble.search import DEFAULT_DOCS_DIVERSITY, _search_semantic, search from semble.stats import save_search_stats -from semble.types import ( - CallType, - Chunk, - ContentSelection, - ContentType, - Encoder, - IndexStats, - SearchResult, - normalize_content, -) - -_UNSET = object() +from semble.types import CallType, Chunk, ContentType, Encoder, IndexStats, SearchResult, _normalize_content _GIT_CLONE_TIMEOUT = int(os.environ.get("SEMBLE_CLONE_TIMEOUT", 60)) @@ -105,7 +94,7 @@ def from_path( path: str | Path, model: Encoder | None = None, extensions: Sequence[str] | None = None, - content: ContentSelection = ContentType.CODE, + content: ContentType | Sequence[ContentType] = ContentType.CODE, include_text_files: bool | None = None, ) -> SembleIndex: """Create and index a SembleIndex from a directory. @@ -120,7 +109,7 @@ def from_path( :raises FileNotFoundError: If `path` does not exist. :raises NotADirectoryError: If `path` exists but is not a directory. 
""" - normalized = _apply_include_text_files(normalize_content(content), include_text_files) + normalized = _apply_include_text_files(_normalize_content(content), include_text_files) model = model or load_model() path = Path(path) if not path.exists(): @@ -145,7 +134,7 @@ def from_git( ref: str | None = None, model: Encoder | None = None, extensions: Sequence[str] | None = None, - content: ContentSelection = ContentType.CODE, + content: ContentType | Sequence[ContentType] = ContentType.CODE, include_text_files: bool | None = None, ) -> SembleIndex: """Clone a git repository and index it. @@ -165,7 +154,7 @@ def from_git( :return: An indexed SembleIndex. Chunk file paths are repo-relative (e.g. ``src/foo.py``). :raises RuntimeError: If git is not on PATH, the clone fails, or times out. """ - normalized = _apply_include_text_files(normalize_content(content), include_text_files) + normalized = _apply_include_text_files(_normalize_content(content), include_text_files) with tempfile.TemporaryDirectory() as tmp_dir: # `--` prevents `url` from being interpreted as a git option (e.g. `--upload-pack=...`). cmd = ["git", "clone", "--depth", "1", *(["--branch", ref] if ref else []), "--", url, tmp_dir] @@ -225,7 +214,7 @@ def search( filter_languages: list[str] | None = None, filter_paths: list[str] | None = None, rerank: bool | None = None, - diversity: float | None = _UNSET, # type: ignore[assignment] + diversity: float | None = None, ) -> list[SearchResult]: """Search the index and return the top-k most relevant chunks. @@ -240,8 +229,8 @@ def search( :param rerank: Apply code-tuned reranking (file boost, identifier boost, path penalties). Defaults to ``True`` when ``ContentType.CODE`` was indexed. :param diversity: DPP diversity weight in [0, 1]; re-ranks with pyversity after reranking. - Defaults to ``DEFAULT_DOCS_DIVERSITY`` when ``ContentType.DOCS`` was indexed. Pass - ``None`` explicitly to disable. 
+ ``None`` (default) auto-detects: uses ``DEFAULT_DOCS_DIVERSITY`` when docs were indexed. + Pass ``0.0`` to disable diversity even on a docs index. :return: Ranked list of :class:`SearchResult` objects, best match first. """ if not self.chunks or not query.strip(): @@ -250,7 +239,10 @@ def search( has_code = ContentType.CODE in self._content or ContentType.ALL in self._content has_docs = ContentType.DOCS in self._content or ContentType.ALL in self._content resolved_rerank = has_code if rerank is None else rerank - resolved_diversity = (DEFAULT_DOCS_DIVERSITY if has_docs else None) if diversity is _UNSET else diversity + if diversity is None: + resolved_diversity = DEFAULT_DOCS_DIVERSITY if has_docs else None + else: + resolved_diversity = diversity if diversity > 0 else None selector = self._get_selector_vector(filter_languages, filter_paths) results = search( diff --git a/src/semble/mcp.py b/src/semble/mcp.py index ca90897..97e1155 100644 --- a/src/semble/mcp.py +++ b/src/semble/mcp.py @@ -3,6 +3,7 @@ import asyncio import logging from collections import OrderedDict +from collections.abc import Sequence from pathlib import Path from typing import Annotated @@ -12,7 +13,7 @@ from semble.index import SembleIndex from semble.index.dense import load_model -from semble.types import ContentSelection, ContentType, Encoder, normalize_content +from semble.types import ContentType, Encoder from semble.utils import _format_results, _is_git_url, _resolve_chunk logger = logging.getLogger(__name__) @@ -115,11 +116,11 @@ async def find_related( async def serve( path: str | None = None, ref: str | None = None, - content: ContentSelection = ContentType.CODE, + content: ContentType | Sequence[ContentType] = ContentType.CODE, ) -> None: """Start an MCP stdio server, optionally pre-indexing a default source.""" model = await asyncio.to_thread(load_model) - cache = _IndexCache(model=model, content=normalize_content(content)) + cache = _IndexCache(model=model, content=content) if 
path: await cache.get(path, ref=ref) if not _is_git_url(path): @@ -132,7 +133,7 @@ async def serve( class _IndexCache: """Cache of indexed repos and local paths for the lifetime of the MCP server process.""" - def __init__(self, model: Encoder, content: frozenset[ContentType] = frozenset({ContentType.CODE})) -> None: + def __init__(self, model: Encoder, content: ContentType | Sequence[ContentType] = ContentType.CODE) -> None: """Initialise an empty cache with a shared embedding model.""" self._model = model self._content = content diff --git a/src/semble/types.py b/src/semble/types.py index ca32fac..f805f31 100644 --- a/src/semble/types.py +++ b/src/semble/types.py @@ -1,4 +1,4 @@ -from collections.abc import Iterable, Sequence +from collections.abc import Sequence from dataclasses import dataclass, field from enum import Enum from typing import Any, Protocol, TypeAlias @@ -24,11 +24,8 @@ class ContentType(str, Enum): ALL = "all" -ContentSelection: TypeAlias = "ContentType | Iterable[ContentType]" - - -def normalize_content(content: ContentSelection) -> frozenset[ContentType]: - """Normalize a single ContentType or iterable of ContentType into a frozenset.""" +def _normalize_content(content: "ContentType | Sequence[ContentType]") -> frozenset[ContentType]: + """Normalize a single ContentType or sequence into a frozenset.""" if isinstance(content, ContentType): return frozenset({content}) return frozenset(content) diff --git a/tests/test_index.py b/tests/test_index.py index 2a064ae..01d7fe0 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -29,18 +29,16 @@ def test_index_markdown_inclusion( mock_model: Encoder, tmp_project: Path, content: ContentType | list[ContentType], md_in_results: bool ) -> None: """Markdown files are excluded for code and included for docs/all/code+docs.""" - from semble.types import normalize_content + from semble.types import _normalize_content - _, _, chunks = create_index_from_path(tmp_project, mock_model, 
content=normalize_content(content)) + _, _, chunks = create_index_from_path(tmp_project, mock_model, content=_normalize_content(content)) has_md = ".md" in {Path(c.file_path).suffix for c in chunks} assert has_md is md_in_results @pytest.mark.parametrize("include_text_files", [True, False]) def test_include_text_files_deprecated(mock_model: Encoder, tmp_project: Path, include_text_files: bool) -> None: - """include_text_files raises DeprecationWarning on create_index_from_path and from_path.""" - with pytest.warns(DeprecationWarning, match="include_text_files is deprecated"): - create_index_from_path(tmp_project, mock_model, include_text_files=include_text_files) + """include_text_files raises DeprecationWarning on from_path.""" with pytest.warns(DeprecationWarning, match="include_text_files is deprecated"): SembleIndex.from_path(tmp_project, model=mock_model, include_text_files=include_text_files) @@ -102,6 +100,12 @@ def test_search_with_filter_paths_does_not_crash(indexed_index: SembleIndex) -> assert all(r.chunk.file_path == target_path for r in results) +def test_search_explicit_diversity(indexed_index: SembleIndex) -> None: + """Explicit diversity values are accepted; 0.0 disables diversity without error.""" + assert len(indexed_index.search("authenticate", top_k=3, diversity=0.5)) > 0 + assert len(indexed_index.search("authenticate", top_k=3, diversity=0.0)) > 0 + + def test_search_without_reranking(indexed_index: SembleIndex) -> None: """Filtered search works regardless of where the selected chunk lives in the corpus.""" with patch("semble.search.rerank_topk") as mock: From 780f84ddec3852278dda60e767e73b03232b3006 Mon Sep 17 00:00:00 2001 From: Pringled Date: Wed, 20 May 2026 19:21:31 +0200 Subject: [PATCH 03/11] Add content-specific search --- src/semble/cli.py | 14 ++++++-------- src/semble/index/create.py | 13 +++++-------- src/semble/index/files.py | 15 +++++++-------- src/semble/index/index.py | 20 ++++++++++---------- src/semble/mcp.py | 5 ++--- 
src/semble/types.py | 7 ------- tests/test_files.py | 24 ++++++++---------------- tests/test_index.py | 9 +++------ tests/test_search.py | 2 +- 9 files changed, 42 insertions(+), 67 deletions(-) diff --git a/src/semble/cli.py b/src/semble/cli.py index 8f47998..4700ee2 100644 --- a/src/semble/cli.py +++ b/src/semble/cli.py @@ -40,11 +40,9 @@ def _add_content_args(p: argparse.ArgumentParser) -> None: """Add --content and deprecated --include-text-files to a subparser.""" p.add_argument( "--content", - action="append", - default=None, + default=ContentType.CODE.value, choices=_CONTENT_CHOICES, - metavar="TYPE", - help="Content type(s) to index: 'code' (default), 'docs', 'all'. Repeatable: --content code --content docs.", + help="Content type to index: 'code' (default), 'docs', or 'all'.", ) p.add_argument( "--include-text-files", @@ -96,16 +94,16 @@ def _run_init(*, agent: Agent = _DEFAULT_AGENT, force: bool = False) -> None: print(f"Created {dest}") -def _resolve_content(content_args: list[str] | None, include_text_files: bool) -> list[ContentType]: - """Resolve --content values and the deprecated --include-text-files into a list of ContentType.""" +def _resolve_content(content_arg: str, include_text_files: bool) -> ContentType: + """Resolve --content and the deprecated --include-text-files into a ContentType.""" if include_text_files: warnings.warn( "--include-text-files is deprecated and will be removed in a future version. 
Use --content all instead.", DeprecationWarning, stacklevel=3, ) - return [ContentType.ALL] - return [ContentType(v) for v in content_args] if content_args else [ContentType.CODE] + return ContentType.ALL + return ContentType(content_arg) def _cli_main() -> None: diff --git a/src/semble/index/create.py b/src/semble/index/create.py index cd23e5d..3a5a78a 100644 --- a/src/semble/index/create.py +++ b/src/semble/index/create.py @@ -15,27 +15,24 @@ from semble.types import Chunk, ContentType, Encoder _MAX_FILE_BYTES = 1_000_000 # 1 MB max file size to read and index -_DEFAULT_CONTENT: frozenset[ContentType] = frozenset({ContentType.CODE}) _DEPRECATION_MSG = ( "include_text_files is deprecated and will be removed in a future version. Use content=ContentType.ALL instead." ) -def _apply_include_text_files( - normalized: frozenset[ContentType], include_text_files: bool | None -) -> frozenset[ContentType]: +def _apply_include_text_files(content: ContentType, include_text_files: bool | None) -> ContentType: """Apply the deprecated include_text_files override, emitting a DeprecationWarning.""" if include_text_files is None: - return normalized + return content warnings.warn(_DEPRECATION_MSG, DeprecationWarning, stacklevel=3) - return frozenset({ContentType.ALL}) if include_text_files else _DEFAULT_CONTENT + return ContentType.ALL if include_text_files else ContentType.CODE def create_index_from_path( path: Path, model: Encoder, extensions: Sequence[str] | None = None, - content: frozenset[ContentType] = _DEFAULT_CONTENT, + content: ContentType = ContentType.CODE, display_root: Path | None = None, ) -> tuple[bm25s.BM25, SelectableBasicBackend, list[Chunk]]: """Create an index from a resolved directory, optionally storing chunk paths relative to display_root. @@ -43,7 +40,7 @@ def create_index_from_path( :param path: Resolved absolute path to index. :param model: The model to use for indexing. :param extensions: File extensions to include. 
- :param content: Content types to index. + :param content: Content type to index. :param display_root: If set, chunk file paths are stored relative to this root. :raises ValueError: if no items were found, no index can be created. :return: A bm25 index, vicinity index and list of chunks diff --git a/src/semble/index/files.py b/src/semble/index/files.py index 10ae819..b292e95 100644 --- a/src/semble/index/files.py +++ b/src/semble/index/files.py @@ -453,16 +453,15 @@ def detect_language(file_name: Path) -> str | None: return _EXTENSION_TO_LANGUAGE.get(file_name.suffix.lower()) -def get_extensions(content: frozenset[ContentType], extensions: Sequence[str] | None) -> list[str]: +def get_extensions(content: ContentType, extensions: Sequence[str] | None) -> list[str]: """Returns a list of supported file extensions.""" - if ContentType.ALL in content: - languages: frozenset[str] = ALL_LANGUAGES + languages: set[str] | frozenset[str] + if content == ContentType.ALL: + languages = ALL_LANGUAGES + elif content == ContentType.DOCS: + languages = _DOC_LANGUAGES else: - languages = frozenset() - if ContentType.CODE in content: - languages |= _CODE_LANGUAGES - if ContentType.DOCS in content: - languages |= _DOC_LANGUAGES + languages = _CODE_LANGUAGES all_extensions: set[str] = set() for language in languages: all_extensions.update(_LANGUAGE_TO_EXTENSION.get(language, set())) diff --git a/src/semble/index/index.py b/src/semble/index/index.py index 2169fd0..80789c3 100644 --- a/src/semble/index/index.py +++ b/src/semble/index/index.py @@ -11,11 +11,11 @@ import numpy.typing as npt from bm25s import BM25 -from semble.index.create import _DEFAULT_CONTENT, _apply_include_text_files, create_index_from_path +from semble.index.create import _apply_include_text_files, create_index_from_path from semble.index.dense import SelectableBasicBackend, load_model from semble.search import DEFAULT_DOCS_DIVERSITY, _search_semantic, search from semble.stats import save_search_stats -from 
semble.types import CallType, Chunk, ContentType, Encoder, IndexStats, SearchResult, _normalize_content +from semble.types import CallType, Chunk, ContentType, Encoder, IndexStats, SearchResult _GIT_CLONE_TIMEOUT = int(os.environ.get("SEMBLE_CLONE_TIMEOUT", 60)) @@ -30,7 +30,7 @@ def __init__( semantic_index: SelectableBasicBackend, chunks: list[Chunk], root: Path | None = None, - content: frozenset[ContentType] = _DEFAULT_CONTENT, + content: ContentType = ContentType.CODE, ) -> None: """Initialize a SembleIndex. Should be created with from_path or from_git. @@ -46,7 +46,7 @@ def __init__( self._bm25_index: BM25 = bm25_index self._semantic_index: SelectableBasicBackend = semantic_index self._root: Path | None = root - self._content: frozenset[ContentType] = content + self._content: ContentType = content self._file_sizes: dict[str, int] = self._compute_file_sizes(root) if root else {} self._file_mapping, self._language_mapping = self._populate_mapping() @@ -94,7 +94,7 @@ def from_path( path: str | Path, model: Encoder | None = None, extensions: Sequence[str] | None = None, - content: ContentType | Sequence[ContentType] = ContentType.CODE, + content: ContentType = ContentType.CODE, include_text_files: bool | None = None, ) -> SembleIndex: """Create and index a SembleIndex from a directory. @@ -109,7 +109,7 @@ def from_path( :raises FileNotFoundError: If `path` does not exist. :raises NotADirectoryError: If `path` exists but is not a directory. 
""" - normalized = _apply_include_text_files(_normalize_content(content), include_text_files) + normalized = _apply_include_text_files(content, include_text_files) model = model or load_model() path = Path(path) if not path.exists(): @@ -134,7 +134,7 @@ def from_git( ref: str | None = None, model: Encoder | None = None, extensions: Sequence[str] | None = None, - content: ContentType | Sequence[ContentType] = ContentType.CODE, + content: ContentType = ContentType.CODE, include_text_files: bool | None = None, ) -> SembleIndex: """Clone a git repository and index it. @@ -154,7 +154,7 @@ def from_git( :return: An indexed SembleIndex. Chunk file paths are repo-relative (e.g. ``src/foo.py``). :raises RuntimeError: If git is not on PATH, the clone fails, or times out. """ - normalized = _apply_include_text_files(_normalize_content(content), include_text_files) + normalized = _apply_include_text_files(content, include_text_files) with tempfile.TemporaryDirectory() as tmp_dir: # `--` prevents `url` from being interpreted as a git option (e.g. `--upload-pack=...`). 
cmd = ["git", "clone", "--depth", "1", *(["--branch", ref] if ref else []), "--", url, tmp_dir] @@ -236,8 +236,8 @@ def search( if not self.chunks or not query.strip(): return [] - has_code = ContentType.CODE in self._content or ContentType.ALL in self._content - has_docs = ContentType.DOCS in self._content or ContentType.ALL in self._content + has_code = self._content in (ContentType.CODE, ContentType.ALL) + has_docs = self._content in (ContentType.DOCS, ContentType.ALL) resolved_rerank = has_code if rerank is None else rerank if diversity is None: resolved_diversity = DEFAULT_DOCS_DIVERSITY if has_docs else None diff --git a/src/semble/mcp.py b/src/semble/mcp.py index 97e1155..b6f8d14 100644 --- a/src/semble/mcp.py +++ b/src/semble/mcp.py @@ -3,7 +3,6 @@ import asyncio import logging from collections import OrderedDict -from collections.abc import Sequence from pathlib import Path from typing import Annotated @@ -116,7 +115,7 @@ async def find_related( async def serve( path: str | None = None, ref: str | None = None, - content: ContentType | Sequence[ContentType] = ContentType.CODE, + content: ContentType = ContentType.CODE, ) -> None: """Start an MCP stdio server, optionally pre-indexing a default source.""" model = await asyncio.to_thread(load_model) @@ -133,7 +132,7 @@ async def serve( class _IndexCache: """Cache of indexed repos and local paths for the lifetime of the MCP server process.""" - def __init__(self, model: Encoder, content: ContentType | Sequence[ContentType] = ContentType.CODE) -> None: + def __init__(self, model: Encoder, content: ContentType = ContentType.CODE) -> None: """Initialise an empty cache with a shared embedding model.""" self._model = model self._content = content diff --git a/src/semble/types.py b/src/semble/types.py index f805f31..1da4c7f 100644 --- a/src/semble/types.py +++ b/src/semble/types.py @@ -24,13 +24,6 @@ class ContentType(str, Enum): ALL = "all" -def _normalize_content(content: "ContentType | Sequence[ContentType]") -> 
frozenset[ContentType]: - """Normalize a single ContentType or sequence into a frozenset.""" - if isinstance(content, ContentType): - return frozenset({content}) - return frozenset(content) - - class Encoder(Protocol): """Protocol for embedding models.""" diff --git a/tests/test_files.py b/tests/test_files.py index acda49c..1813908 100644 --- a/tests/test_files.py +++ b/tests/test_files.py @@ -23,12 +23,12 @@ def test_language_sets_are_consistent() -> None: @pytest.mark.parametrize( ("content", "includes", "excludes"), [ - (frozenset({ContentType.CODE}), [".py"], [".md"]), - (frozenset({ContentType.DOCS}), [".md"], [".py"]), - (frozenset({ContentType.ALL}), [".py", ".md"], []), + (ContentType.CODE, [".py"], [".md"]), + (ContentType.DOCS, [".md"], [".py"]), + (ContentType.ALL, [".py", ".md"], []), ], ) -def test_get_extensions(content: frozenset[ContentType], includes: list[str], excludes: list[str]) -> None: +def test_get_extensions(content: ContentType, includes: list[str], excludes: list[str]) -> None: """get_extensions returns the right extensions for each content type.""" exts = set(get_extensions(content, None)) for ext in includes: @@ -37,20 +37,12 @@ def test_get_extensions(content: frozenset[ContentType], includes: list[str], ex assert ext not in exts -def test_get_extensions_code_and_docs() -> None: - """Code + docs is the union of each individual set.""" - code = set(get_extensions(frozenset({ContentType.CODE}), None)) - docs = set(get_extensions(frozenset({ContentType.DOCS}), None)) - combined = set(get_extensions(frozenset({ContentType.CODE, ContentType.DOCS}), None)) - assert combined == code | docs - - def test_get_extensions_additional() -> None: """Extra extensions are appended and existing ones are not duplicated.""" - base = get_extensions(frozenset({ContentType.ALL}), None) - with_extra = get_extensions(frozenset({ContentType.ALL}), [".kjs"]) + base = get_extensions(ContentType.ALL, None) + with_extra = get_extensions(ContentType.ALL, [".kjs"]) 
assert set(with_extra) == set(base) | {".kjs"} - base_code = get_extensions(frozenset({ContentType.CODE}), None) - with_existing = get_extensions(frozenset({ContentType.CODE}), [".py"]) + base_code = get_extensions(ContentType.CODE, None) + with_existing = get_extensions(ContentType.CODE, [".py"]) assert set(with_existing) == set(base_code) diff --git a/tests/test_index.py b/tests/test_index.py index 01d7fe0..f9435c1 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -22,16 +22,13 @@ def indexed_index(mock_model: Any, tmp_project: Path) -> SembleIndex: (ContentType.CODE, False), (ContentType.DOCS, True), (ContentType.ALL, True), - ([ContentType.CODE, ContentType.DOCS], True), ], ) def test_index_markdown_inclusion( - mock_model: Encoder, tmp_project: Path, content: ContentType | list[ContentType], md_in_results: bool + mock_model: Encoder, tmp_project: Path, content: ContentType, md_in_results: bool ) -> None: - """Markdown files are excluded for code and included for docs/all/code+docs.""" - from semble.types import _normalize_content - - _, _, chunks = create_index_from_path(tmp_project, mock_model, content=_normalize_content(content)) + """Markdown files are excluded for code and included for docs/all.""" + _, _, chunks = create_index_from_path(tmp_project, mock_model, content=content) has_md = ".md" in {Path(c.file_path).suffix for c in chunks} assert has_md is md_in_results diff --git a/tests/test_search.py b/tests/test_search.py index 9bffdc3..698c465 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -192,7 +192,7 @@ def test_search_content_all_uses_both_pipelines( """ContentType.ALL activates both rerank and diversity defaults.""" from semble import SembleIndex - index = SembleIndex(mock_model, bm25, semantic, chunks, content=frozenset({ContentType.ALL})) + index = SembleIndex(mock_model, bm25, semantic, chunks, content=ContentType.ALL) # rerank and diversity both resolve to True/set — results should come back without error results = 
index.search("authenticate", top_k=2) assert len(results) > 0 From 737eb1e7cbd4fdcc67ee6243e482139f69075c69 Mon Sep 17 00:00:00 2001 From: Pringled Date: Wed, 20 May 2026 19:30:59 +0200 Subject: [PATCH 04/11] Clean machine --- src/semble/index/create.py | 12 ------------ src/semble/index/index.py | 34 +++++++++++++++++++++++----------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/semble/index/create.py b/src/semble/index/create.py index 3a5a78a..55ed253 100644 --- a/src/semble/index/create.py +++ b/src/semble/index/create.py @@ -1,5 +1,4 @@ import contextlib -import warnings from collections.abc import Sequence from pathlib import Path @@ -15,17 +14,6 @@ from semble.types import Chunk, ContentType, Encoder _MAX_FILE_BYTES = 1_000_000 # 1 MB max file size to read and index -_DEPRECATION_MSG = ( - "include_text_files is deprecated and will be removed in a future version. Use content=ContentType.ALL instead." -) - - -def _apply_include_text_files(content: ContentType, include_text_files: bool | None) -> ContentType: - """Apply the deprecated include_text_files override, emitting a DeprecationWarning.""" - if include_text_files is None: - return content - warnings.warn(_DEPRECATION_MSG, DeprecationWarning, stacklevel=3) - return ContentType.ALL if include_text_files else ContentType.CODE def create_index_from_path( diff --git a/src/semble/index/index.py b/src/semble/index/index.py index 80789c3..73e2f2f 100644 --- a/src/semble/index/index.py +++ b/src/semble/index/index.py @@ -3,6 +3,7 @@ import os import subprocess import tempfile +import warnings from collections import defaultdict from collections.abc import Sequence from pathlib import Path @@ -11,13 +12,24 @@ import numpy.typing as npt from bm25s import BM25 -from semble.index.create import _apply_include_text_files, create_index_from_path +from semble.index.create import create_index_from_path from semble.index.dense import SelectableBasicBackend, load_model from semble.search import 
DEFAULT_DOCS_DIVERSITY, _search_semantic, search from semble.stats import save_search_stats from semble.types import CallType, Chunk, ContentType, Encoder, IndexStats, SearchResult _GIT_CLONE_TIMEOUT = int(os.environ.get("SEMBLE_CLONE_TIMEOUT", 60)) +_INCLUDE_TEXT_FILES_DEPRECATION_MSG = ( + "include_text_files is deprecated and will be removed in a future version. Use content=ContentType.ALL instead." +) + + +def _apply_include_text_files(content: ContentType, include_text_files: bool | None) -> ContentType: + """Apply the deprecated include_text_files override, emitting a DeprecationWarning.""" + if include_text_files is None: + return content + warnings.warn(_INCLUDE_TEXT_FILES_DEPRECATION_MSG, DeprecationWarning, stacklevel=3) + return ContentType.ALL if include_text_files else ContentType.CODE class SembleIndex: @@ -39,7 +51,7 @@ def __init__( :param semantic_index: The semantic index. :param chunks: The found chunks. :param root: Root directory used to read file sizes for token-savings stats. - :param content: Content types used when indexing; controls the search pipeline. + :param content: Content type used when indexing; controls the search pipeline. """ self.model: Encoder = model self.chunks: list[Chunk] = chunks @@ -102,10 +114,10 @@ def from_path( :param path: Root directory to index. :param model: Embedding model to use. Defaults to potion-code-16M. :param extensions: File extensions to include. Defaults to a standard set of code extensions. - :param content: Content type(s) to index — ``ContentType.CODE`` (default), ``ContentType.DOCS``, - ``ContentType.ALL``, or a list of multiple types. - :param include_text_files: Deprecated. Use ``content=ContentType.ALL`` instead. - :return: An indexed SembleIndex. Chunk file paths are relative to ``path``. + :param content: Content type to index: ContentType.CODE (default), ContentType.DOCS, + or ContentType.ALL. + :param include_text_files: Deprecated. Use content=ContentType.ALL instead. 
+ :return: An indexed SembleIndex. Chunk file paths are relative to path. :raises FileNotFoundError: If `path` does not exist. :raises NotADirectoryError: If `path` exists but is not a directory. """ @@ -141,16 +153,16 @@ def from_git( The repository is cloned into a temporary directory that is removed once indexing finishes. Chunk content is preserved in-memory, but - ``chunk.file_path`` will not point to a readable file after this call + chunk.file_path will not point to a readable file after this call returns — it is a repo-relative label, not a filesystem path. :param url: URL of the git repository to clone (any git provider). :param ref: Branch or tag to check out. Defaults to the remote HEAD. :param model: Embedding model to use. Defaults to potion-code-16M. :param extensions: File extensions to include. Defaults to a standard set of code extensions. - :param content: Content type(s) to index — ``ContentType.CODE`` (default), ``ContentType.DOCS``, - ``ContentType.ALL``, or a list of multiple types. - :param include_text_files: Deprecated. Use ``content=ContentType.ALL`` instead. + :param content: Content type to index: ContentType.CODE (default), ContentType.DOCS, + or ContentType.ALL. + :param include_text_files: Deprecated. Use content=ContentType.ALL instead. :return: An indexed SembleIndex. Chunk file paths are repo-relative (e.g. ``src/foo.py``). :raises RuntimeError: If git is not on PATH, the clone fails, or times out. """ @@ -227,7 +239,7 @@ def search( :param filter_paths: Optional list of repo-relative file paths; if set, only chunks from these files are returned. :param rerank: Apply code-tuned reranking (file boost, identifier boost, path penalties). - Defaults to ``True`` when ``ContentType.CODE`` was indexed. + Defaults to ``True`` when ContentType.CODE was indexed. :param diversity: DPP diversity weight in [0, 1]; re-ranks with pyversity after reranking. ``None`` (default) auto-detects: uses ``DEFAULT_DOCS_DIVERSITY`` when docs were indexed. 
Pass ``0.0`` to disable diversity even on a docs index. From 0325dd89126965a05e86b13e20bce8b428d44dcc Mon Sep 17 00:00:00 2001 From: Pringled Date: Thu, 21 May 2026 14:47:57 +0200 Subject: [PATCH 05/11] Drop pyversity, add docs --- README.md | 48 ++++++++++++++++++++++++++++------- pyproject.toml | 1 - src/semble/agents/claude.md | 14 +++++++--- src/semble/agents/copilot.md | 14 +++++++--- src/semble/agents/cursor.md | 14 +++++++--- src/semble/agents/gemini.md | 14 +++++++--- src/semble/agents/kiro.md | 14 +++++++--- src/semble/agents/opencode.md | 14 +++++++--- src/semble/cli.py | 2 +- src/semble/index/index.py | 14 ++-------- src/semble/search.py | 44 +++----------------------------- tests/test_index.py | 6 ----- tests/test_search.py | 34 +++---------------------- uv.lock | 15 ----------- 14 files changed, 115 insertions(+), 133 deletions(-) diff --git a/README.md b/README.md index d8b4341..9b14509 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,13 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ​``` +Use `--content docs` to search documentation and prose (markdown, rst, etc.) instead of code, or `--content all` to search everything: + +​```bash +semble search "deployment guide" ./my-project --content docs +semble search "authentication" ./my-project --content all +​``` + Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result): ​```bash @@ -76,9 +83,10 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. -2. Inspect full files only when the returned chunk is not enough context. -3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +2. 
Use `--content docs` when looking for documentation, READMEs, or prose files. +3. Inspect full files only when the returned chunk is not enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. ``` @@ -287,6 +295,8 @@ Add to `~/.config/zed/settings.json` (or `.zed/settings.json` in your project): | `search` | Search a codebase with a natural-language or code query. Pass `repo` as a local directory path or an https:// git URL. | | `find_related` | Given a file path and line number, return chunks semantically similar to the code at that location. | +By default the MCP server indexes only code files. To also index documentation and prose, append `--content all` (or `--content docs`) to the server command. For example, in Claude Code: `claude mcp add semble -s user -- uvx --from "semble[mcp]" semble --content all`. + @@ -307,6 +317,13 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ​``` +Use `--content docs` to search documentation and prose (markdown, rst, etc.) instead of code, or `--content all` to search everything: + +​```bash +semble search "deployment guide" ./my-project --content docs +semble search "authentication" ./my-project --content all +​``` + Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result): ​```bash @@ -320,9 +337,10 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ## Workflow 1. Start with `semble search` to find relevant chunks. -2. Inspect full files only when the returned chunk is not enough context. -3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -4. 
Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +2. Use `--content docs` when looking for documentation, READMEs, or prose files. +3. Inspect full files only when the returned chunk is not enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. ``` ### Sub-agent setup @@ -357,11 +375,17 @@ semble search "save model to disk" https://github.com/MinishLab/model2vec # Limit results semble search "save model to disk" ./my-project --top-k 10 +# Search docs and prose (markdown, rst, etc.) instead of code +semble search "deployment guide" ./my-project --content docs + +# Search everything (code and docs) +semble search "authentication" ./my-project --content all + # Find code similar to a known location semble find-related src/auth.py 42 ./my-project ``` -`path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. +`--content` accepts `code` (default), `docs`, or `all`. `path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place.
Savings @@ -395,11 +419,17 @@ Stats are stored in `~/.semble/savings.jsonl`. Semble can also be used as a Python library for programmatic access, useful when building custom tooling or integrating search directly into your own code. ```python -from semble import SembleIndex +from semble import ContentType, SembleIndex -# Index a local directory +# Index a local directory (code only, the default) index = SembleIndex.from_path("./my-project") +# Index docs and prose (markdown, rst, etc.) +index = SembleIndex.from_path("./my-project", content=ContentType.DOCS) + +# Index everything — code and docs +index = SembleIndex.from_path("./my-project", content=ContentType.ALL) + # Index a remote git repository index = SembleIndex.from_git("https://github.com/MinishLab/model2vec") diff --git a/pyproject.toml b/pyproject.toml index 022f032..a35cc46 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,6 @@ classifiers = [ dependencies = [ "model2vec>=0.4.0", "vicinity>=0.4.4", - "pyversity>=0.1.0", "numpy>=1.24.0", "bm25s>=0.2.0", "pathspec>=0.12", diff --git a/src/semble/agents/claude.md b/src/semble/agents/claude.md index 515d60e..82d3fd0 100644 --- a/src/semble/agents/claude.md +++ b/src/semble/agents/claude.md @@ -12,6 +12,13 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ``` +Use `--content docs` to search documentation and prose (markdown, rst, etc.) instead of code, or `--content all` to search everything: + +```bash +semble search "deployment guide" ./my-project --content docs +semble search "authentication" ./my-project --content all +``` + Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result): ```bash @@ -25,6 +32,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ## Workflow 1. Start with `semble search` to find relevant chunks. -2. 
Inspect full files only when the returned chunk is not enough context. -3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +2. Use `--content docs` when looking for documentation, READMEs, or prose files. +3. Inspect full files only when the returned chunk is not enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. diff --git a/src/semble/agents/copilot.md b/src/semble/agents/copilot.md index 515d60e..82d3fd0 100644 --- a/src/semble/agents/copilot.md +++ b/src/semble/agents/copilot.md @@ -12,6 +12,13 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ``` +Use `--content docs` to search documentation and prose (markdown, rst, etc.) instead of code, or `--content all` to search everything: + +```bash +semble search "deployment guide" ./my-project --content docs +semble search "authentication" ./my-project --content all +``` + Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result): ```bash @@ -25,6 +32,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ## Workflow 1. Start with `semble search` to find relevant chunks. -2. Inspect full files only when the returned chunk is not enough context. -3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +2. Use `--content docs` when looking for documentation, READMEs, or prose files. +3. 
Inspect full files only when the returned chunk is not enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. diff --git a/src/semble/agents/cursor.md b/src/semble/agents/cursor.md index 160aac4..62c9fb5 100644 --- a/src/semble/agents/cursor.md +++ b/src/semble/agents/cursor.md @@ -11,6 +11,13 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ``` +Use `--content docs` to search documentation and prose (markdown, rst, etc.) instead of code, or `--content all` to search everything: + +```bash +semble search "deployment guide" ./my-project --content docs +semble search "authentication" ./my-project --content all +``` + Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result): ```bash @@ -24,6 +31,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ## Workflow 1. Start with `semble search` to find relevant chunks. -2. Inspect full files only when the returned chunk is not enough context. -3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +2. Use `--content docs` when looking for documentation, READMEs, or prose files. +3. Inspect full files only when the returned chunk is not enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. 
diff --git a/src/semble/agents/gemini.md b/src/semble/agents/gemini.md index 1ea5440..359f69d 100644 --- a/src/semble/agents/gemini.md +++ b/src/semble/agents/gemini.md @@ -14,6 +14,13 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ``` +Use `--content docs` to search documentation and prose (markdown, rst, etc.) instead of code, or `--content all` to search everything: + +```bash +semble search "deployment guide" ./my-project --content docs +semble search "authentication" ./my-project --content all +``` + Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result): ```bash @@ -27,6 +34,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ## Workflow 1. Start with `semble search` to find relevant chunks. -2. Inspect full files only when the returned chunk is not enough context. -3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +2. Use `--content docs` when looking for documentation, READMEs, or prose files. +3. Inspect full files only when the returned chunk is not enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. diff --git a/src/semble/agents/kiro.md b/src/semble/agents/kiro.md index 5177ec5..48381d8 100644 --- a/src/semble/agents/kiro.md +++ b/src/semble/agents/kiro.md @@ -14,6 +14,13 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ``` +Use `--content docs` to search documentation and prose (markdown, rst, etc.) 
instead of code, or `--content all` to search everything: + +```bash +semble search "deployment guide" ./my-project --content docs +semble search "authentication" ./my-project --content all +``` + Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result): ```bash @@ -27,6 +34,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ## Workflow 1. Start with `semble search` to find relevant chunks. -2. Inspect full files only when the returned chunk is not enough context. -3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +2. Use `--content docs` when looking for documentation, READMEs, or prose files. +3. Inspect full files only when the returned chunk is not enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. diff --git a/src/semble/agents/opencode.md b/src/semble/agents/opencode.md index 2c51867..ea9561b 100644 --- a/src/semble/agents/opencode.md +++ b/src/semble/agents/opencode.md @@ -15,6 +15,13 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ``` +Use `--content docs` to search documentation and prose (markdown, rst, etc.) 
instead of code, or `--content all` to search everything: + +```bash +semble search "deployment guide" ./my-project --content docs +semble search "authentication" ./my-project --content all +``` + Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result): ```bash @@ -28,6 +35,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ## Workflow 1. Start with `semble search` to find relevant chunks. -2. Inspect full files only when the returned chunk is not enough context. -3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +2. Use `--content docs` when looking for documentation, READMEs, or prose files. +3. Inspect full files only when the returned chunk is not enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. diff --git a/src/semble/cli.py b/src/semble/cli.py index 3f2b9b7..5f3fd10 100644 --- a/src/semble/cli.py +++ b/src/semble/cli.py @@ -100,7 +100,7 @@ def _resolve_content(content_arg: str, include_text_files: bool) -> ContentType: warnings.warn( "--include-text-files is deprecated and will be removed in a future version. 
Use --content all instead.", DeprecationWarning, - stacklevel=3, + stacklevel=2, ) return ContentType.ALL return ContentType(content_arg) diff --git a/src/semble/index/index.py b/src/semble/index/index.py index 73e2f2f..e574567 100644 --- a/src/semble/index/index.py +++ b/src/semble/index/index.py @@ -14,7 +14,7 @@ from semble.index.create import create_index_from_path from semble.index.dense import SelectableBasicBackend, load_model -from semble.search import DEFAULT_DOCS_DIVERSITY, _search_semantic, search +from semble.search import _search_semantic, search from semble.stats import save_search_stats from semble.types import CallType, Chunk, ContentType, Encoder, IndexStats, SearchResult @@ -28,7 +28,7 @@ def _apply_include_text_files(content: ContentType, include_text_files: bool | N """Apply the deprecated include_text_files override, emitting a DeprecationWarning.""" if include_text_files is None: return content - warnings.warn(_INCLUDE_TEXT_FILES_DEPRECATION_MSG, DeprecationWarning, stacklevel=3) + warnings.warn(_INCLUDE_TEXT_FILES_DEPRECATION_MSG, DeprecationWarning, stacklevel=2) return ContentType.ALL if include_text_files else ContentType.CODE @@ -226,7 +226,6 @@ def search( filter_languages: list[str] | None = None, filter_paths: list[str] | None = None, rerank: bool | None = None, - diversity: float | None = None, ) -> list[SearchResult]: """Search the index and return the top-k most relevant chunks. @@ -240,21 +239,13 @@ def search( chunks from these files are returned. :param rerank: Apply code-tuned reranking (file boost, identifier boost, path penalties). Defaults to ``True`` when ContentType.CODE was indexed. - :param diversity: DPP diversity weight in [0, 1]; re-ranks with pyversity after reranking. - ``None`` (default) auto-detects: uses ``DEFAULT_DOCS_DIVERSITY`` when docs were indexed. - Pass ``0.0`` to disable diversity even on a docs index. :return: Ranked list of :class:`SearchResult` objects, best match first. 
""" if not self.chunks or not query.strip(): return [] has_code = self._content in (ContentType.CODE, ContentType.ALL) - has_docs = self._content in (ContentType.DOCS, ContentType.ALL) resolved_rerank = has_code if rerank is None else rerank - if diversity is None: - resolved_diversity = DEFAULT_DOCS_DIVERSITY if has_docs else None - else: - resolved_diversity = diversity if diversity > 0 else None selector = self._get_selector_vector(filter_languages, filter_paths) results = search( @@ -267,7 +258,6 @@ def search( alpha=alpha, selector=selector, rerank=resolved_rerank, - diversity=resolved_diversity, ) save_search_stats(results, CallType.SEARCH, self._file_sizes) return results diff --git a/src/semble/search.py b/src/semble/search.py index 7b58b8a..3556bf8 100644 --- a/src/semble/search.py +++ b/src/semble/search.py @@ -1,7 +1,6 @@ import bm25s import numpy as np import numpy.typing as npt -from pyversity import Strategy, diversify from semble.index.dense import SelectableBasicBackend from semble.index.sparse import selector_to_mask @@ -10,7 +9,6 @@ from semble.types import Chunk, Encoder, SearchResult _RRF_K = 60 -DEFAULT_DOCS_DIVERSITY = 0.3 def _rrf_scores(scores: dict[Chunk, float]) -> dict[Chunk, float]: @@ -74,7 +72,6 @@ def search( alpha: float | None = None, selector: npt.NDArray[np.int_] | None = None, rerank: bool = True, - diversity: float | None = None, ) -> list[SearchResult]: """Hybrid search: alpha-weighted combination of semantic and BM25 scores. @@ -90,8 +87,6 @@ def search( :param alpha: Weight for semantic score (1-alpha goes to BM25). None = auto-detect based on query type. :param selector: Optional array of chunk indices to filter results by. :param rerank: Whether to apply code-tuned reranking (file boost, identifier boost, path penalties). - :param diversity: DPP diversity weight in [0, 1]. When set, fetches 2× candidates, reranks, - then re-selects with pyversity DPP. None disables diversity. 
:return: List of search results sorted by combined score descending. """ alpha_weight = resolve_alpha(query, alpha) @@ -120,44 +115,11 @@ def search( for chunk in all_candidates } - # Over-fetch before reranking so diversity has candidates to choose from. - fetch_k = top_k * 2 if diversity is not None else top_k - if rerank: boost_multi_chunk_files(combined_scores) combined_scores = apply_query_boost(combined_scores, query, chunks) - ranked = rerank_topk(combined_scores, fetch_k, penalise_paths=alpha_weight < 1.0) + ranked = rerank_topk(combined_scores, top_k, penalise_paths=alpha_weight < 1.0) else: - ranked = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:fetch_k] - - results = [SearchResult(chunk=chunk, score=score) for chunk, score in ranked] - - if diversity is not None: - return _diversify(results, top_k, diversity, semantic_index, chunks) - return results + ranked = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:top_k] - -def _diversify( - results: list[SearchResult], - top_k: int, - diversity_weight: float, - semantic_index: SelectableBasicBackend, - chunks: list[Chunk], -) -> list[SearchResult]: - """Re-rank results with DPP to improve embedding-space diversity.""" - if len(results) <= top_k: - return results - chunk_index = {c: i for i, c in enumerate(chunks)} - valid = [r for r in results if r.chunk in chunk_index] - if len(valid) <= top_k: - return valid - indices = np.array([chunk_index[r.chunk] for r in valid]) - scores = np.array([r.score for r in valid], dtype=np.float32) - result = diversify( - embeddings=semantic_index.vectors[indices], - scores=scores, - k=top_k, - strategy=Strategy.DPP, - diversity=diversity_weight, - ) - return sorted((valid[i] for i in result.indices), key=lambda r: -r.score) + return [SearchResult(chunk=chunk, score=score) for chunk, score in ranked] diff --git a/tests/test_index.py b/tests/test_index.py index f9435c1..83ea57e 100644 --- a/tests/test_index.py +++ b/tests/test_index.py 
@@ -97,12 +97,6 @@ def test_search_with_filter_paths_does_not_crash(indexed_index: SembleIndex) -> assert all(r.chunk.file_path == target_path for r in results) -def test_search_explicit_diversity(indexed_index: SembleIndex) -> None: - """Explicit diversity values are accepted; 0.0 disables diversity without error.""" - assert len(indexed_index.search("authenticate", top_k=3, diversity=0.5)) > 0 - assert len(indexed_index.search("authenticate", top_k=3, diversity=0.0)) > 0 - - def test_search_without_reranking(indexed_index: SembleIndex) -> None: """Filtered search works regardless of where the selected chunk lives in the corpus.""" with patch("semble.search.rerank_topk") as mock: diff --git a/tests/test_search.py b/tests/test_search.py index 89b5df0..46449e9 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -8,9 +8,9 @@ from vicinity.backends.basic import BasicArgs from semble.index.dense import SelectableBasicBackend, embed_chunks, load_model -from semble.search import DEFAULT_DOCS_DIVERSITY, _diversify, _search_bm25, _search_semantic, _sort_top_k, search +from semble.search import _search_bm25, _search_semantic, _sort_top_k, search from semble.tokens import tokenize -from semble.types import Chunk, ContentType, Encoder, SearchResult +from semble.types import Chunk, ContentType, Encoder from tests.conftest import make_chunk @@ -161,38 +161,12 @@ def test_selectable_basic_backend_rejects_k_below_one( semantic.query(embeddings[:1], k=0) -def test_search_with_diversity( +def test_search_content_all_uses_code_reranking( chunks: list[Chunk], semantic: SelectableBasicBackend, bm25: bm25s.BM25, mock_model: Any ) -> None: - """Search with diversity set runs DPP and returns top_k results.""" - results = search("authenticate", mock_model, semantic, bm25, chunks, top_k=2, diversity=DEFAULT_DOCS_DIVERSITY) - assert len(results) == 2 - assert all(r.score >= 0 for r in results) - - -def test_diversify_fewer_results_than_top_k(chunks: list[Chunk], semantic: 
SelectableBasicBackend) -> None: - """_diversify returns early when results are already within top_k.""" - results = [SearchResult(chunk=c, score=1.0 - i * 0.1) for i, c in enumerate(chunks[:2])] - out = _diversify(results, top_k=10, diversity_weight=DEFAULT_DOCS_DIVERSITY, semantic_index=semantic, chunks=chunks) - assert len(out) == 2 - - -def test_diversify_filters_unknown_chunks(chunks: list[Chunk], semantic: SelectableBasicBackend) -> None: - """_diversify returns early when valid results (chunks in index) fall within top_k.""" - # 4 results but 3 reference unknown chunks not in the index → valid=1 ≤ top_k=3 - unknown = [make_chunk(f"x = {i}", f"unknown_{i}.py") for i in range(3)] - results = [SearchResult(chunk=c, score=1.0 - i * 0.1) for i, c in enumerate([chunks[0]] + unknown)] - out = _diversify(results, top_k=3, diversity_weight=DEFAULT_DOCS_DIVERSITY, semantic_index=semantic, chunks=chunks) - assert len(out) == 1 # only the known chunk survives - - -def test_search_content_all_uses_both_pipelines( - chunks: list[Chunk], semantic: SelectableBasicBackend, bm25: bm25s.BM25, mock_model: Any -) -> None: - """ContentType.ALL activates both rerank and diversity defaults.""" + """ContentType.ALL activates code reranking (has_code=True).""" from semble import SembleIndex index = SembleIndex(mock_model, bm25, semantic, chunks, content=ContentType.ALL) - # rerank and diversity both resolve to True/set — results should come back without error results = index.search("authenticate", top_k=2) assert len(results) > 0 diff --git a/uv.lock b/uv.lock index fbbfb35..a45dbf4 100644 --- a/uv.lock +++ b/uv.lock @@ -2456,19 +2456,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f3/a2/43bbc5860b5034e2af4ef99a0e04d726ff329c43e192ef3abaa8d7ecfce5/python_multipart-0.0.28-py3-none-any.whl", hash = "sha256:10faac07eb966c3f48dc415f9dee46c04cb10d58d30a35677db8027c825ed9b6", size = 29438, upload-time = "2026-05-10T11:05:15.052Z" }, ] -[[package]] -name = "pyversity" 
-version = "0.2.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/d5/dc/a5a835b57ce06e21b4355d9e37ebb717455d55645014f1f5bde2fd948615/pyversity-0.2.0.tar.gz", hash = "sha256:48be2735b2471da1fa7497ea045aff25aa071f1176b86a011d4a49c01327ab6d", size = 28000, upload-time = "2026-02-02T08:06:38.563Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/fa/e2/7e928734b14944f164cb243cee7fb14229e208d25df022506aa5a2390bcc/pyversity-0.2.0-py3-none-any.whl", hash = "sha256:c28f6a1a3ccfb97a9439d345e80ba0e60ffd14aa7680f2db0af7a70e90aae3f9", size = 21229, upload-time = "2026-02-02T08:06:37.125Z" }, -] - [[package]] name = "pywin32" version = "311" @@ -3137,7 +3124,6 @@ dependencies = [ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "pathspec" }, - { name = "pyversity" }, { name = "tree-sitter" }, { name = "tree-sitter-language-pack" }, { name = "vicinity" }, @@ -3182,7 +3168,6 @@ requires-dist = [ { name = "pydoclint", marker = "extra == 'dev'", specifier = ">=0.5.3" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" }, { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=5.0" }, - { name = "pyversity", specifier = ">=0.1.0" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.9.0" }, { name = "sentence-transformers", marker = "extra == 'benchmark'", specifier = ">=3.0" }, { name = "tiktoken", marker = "extra == 'benchmark'", specifier = ">=0.7" }, From 
a0bad8999bf32054b99049124fd637971420fd97 Mon Sep 17 00:00:00 2001 From: Pringled Date: Thu, 21 May 2026 14:52:52 +0200 Subject: [PATCH 06/11] Rework languages --- src/semble/index/files.py | 5 ++--- src/semble/index/index.py | 10 ++++++---- tests/test_files.py | 8 ++++---- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/semble/index/files.py b/src/semble/index/files.py index b292e95..a9a5492 100644 --- a/src/semble/index/files.py +++ b/src/semble/index/files.py @@ -380,8 +380,7 @@ "vimdoc", } -# Everything that is not a programming language — used to derive _CODE_LANGUAGES. -_NON_CODE_LANGUAGES = _DOC_LANGUAGES | { +_CONFIG_LANGUAGES = { "beancount", "capnp", "cedarschema", @@ -444,7 +443,7 @@ def _inv_mapping(mapping: dict[str, str]) -> dict[str, list[str]]: ALL_LANGUAGES = frozenset(_EXTENSION_TO_LANGUAGE.values()) -_CODE_LANGUAGES = ALL_LANGUAGES - _NON_CODE_LANGUAGES +_CODE_LANGUAGES = ALL_LANGUAGES - _DOC_LANGUAGES - _CONFIG_LANGUAGES _LANGUAGE_TO_EXTENSION = _inv_mapping(_EXTENSION_TO_LANGUAGE) diff --git a/src/semble/index/index.py b/src/semble/index/index.py index e574567..d070c88 100644 --- a/src/semble/index/index.py +++ b/src/semble/index/index.py @@ -19,16 +19,18 @@ from semble.types import CallType, Chunk, ContentType, Encoder, IndexStats, SearchResult _GIT_CLONE_TIMEOUT = int(os.environ.get("SEMBLE_CLONE_TIMEOUT", 60)) -_INCLUDE_TEXT_FILES_DEPRECATION_MSG = ( - "include_text_files is deprecated and will be removed in a future version. Use content=ContentType.ALL instead." -) def _apply_include_text_files(content: ContentType, include_text_files: bool | None) -> ContentType: """Apply the deprecated include_text_files override, emitting a DeprecationWarning.""" if include_text_files is None: return content - warnings.warn(_INCLUDE_TEXT_FILES_DEPRECATION_MSG, DeprecationWarning, stacklevel=2) + warnings.warn( + "include_text_files is deprecated and will be removed in a future version." 
+ " Use content=ContentType.ALL instead.", + DeprecationWarning, + stacklevel=2, + ) return ContentType.ALL if include_text_files else ContentType.CODE diff --git a/tests/test_files.py b/tests/test_files.py index 1813908..bcb747f 100644 --- a/tests/test_files.py +++ b/tests/test_files.py @@ -2,7 +2,7 @@ import pytest -from semble.index.files import _CODE_LANGUAGES, _DOC_LANGUAGES, _NON_CODE_LANGUAGES, detect_language, get_extensions +from semble.index.files import _CODE_LANGUAGES, _CONFIG_LANGUAGES, _DOC_LANGUAGES, detect_language, get_extensions from semble.types import ContentType @@ -14,10 +14,10 @@ def test_detect_language() -> None: def test_language_sets_are_consistent() -> None: - """Code, doc, and non-code language sets satisfy their mutual invariants.""" + """Code, doc, and config language sets are mutually disjoint.""" assert _CODE_LANGUAGES.isdisjoint(_DOC_LANGUAGES) - assert _CODE_LANGUAGES.isdisjoint(_NON_CODE_LANGUAGES) - assert _DOC_LANGUAGES <= _NON_CODE_LANGUAGES + assert _CODE_LANGUAGES.isdisjoint(_CONFIG_LANGUAGES) + assert _DOC_LANGUAGES.isdisjoint(_CONFIG_LANGUAGES) @pytest.mark.parametrize( From b345683b48a42681f782ec3b4ec83ce39623b28f Mon Sep 17 00:00:00 2001 From: Pringled Date: Thu, 21 May 2026 14:57:55 +0200 Subject: [PATCH 07/11] Simplify search logic --- src/semble/index/index.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/semble/index/index.py b/src/semble/index/index.py index d070c88..81d37cf 100644 --- a/src/semble/index/index.py +++ b/src/semble/index/index.py @@ -246,8 +246,7 @@ def search( if not self.chunks or not query.strip(): return [] - has_code = self._content in (ContentType.CODE, ContentType.ALL) - resolved_rerank = has_code if rerank is None else rerank + resolved_rerank = (self._content != ContentType.DOCS) if rerank is None else rerank selector = self._get_selector_vector(filter_languages, filter_paths) results = search( From 38f97dc0d9f9bab702a0c447f25d6fd9048f6247 Mon Sep 17 00:00:00 2001 
From: Pringled Date: Thu, 21 May 2026 15:04:45 +0200 Subject: [PATCH 08/11] Drop sphinx style params in docstrings --- src/semble/index/index.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/semble/index/index.py b/src/semble/index/index.py index 81d37cf..91d0771 100644 --- a/src/semble/index/index.py +++ b/src/semble/index/index.py @@ -165,7 +165,7 @@ def from_git( :param content: Content type to index: ContentType.CODE (default), ContentType.DOCS, or ContentType.ALL. :param include_text_files: Deprecated. Use content=ContentType.ALL instead. - :return: An indexed SembleIndex. Chunk file paths are repo-relative (e.g. ``src/foo.py``). + :return: An indexed SembleIndex. Chunk file paths are repo-relative (e.g. src/foo.py). :raises RuntimeError: If git is not on PATH, the clone fails, or times out. """ normalized = _apply_include_text_files(content, include_text_files) @@ -234,14 +234,14 @@ def search( :param query: Natural-language or keyword query string. :param top_k: Maximum number of results to return. :param alpha: Blend weight for hybrid score combination; 1.0 = full semantic - weight, 0.0 = full BM25 weight. ``None`` auto-detects from query type. + weight, 0.0 = full BM25 weight. None auto-detects from query type. :param filter_languages: Optional list of language codes; if set, only chunks in these languages are returned. :param filter_paths: Optional list of repo-relative file paths; if set, only chunks from these files are returned. :param rerank: Apply code-tuned reranking (file boost, identifier boost, path penalties). - Defaults to ``True`` when ContentType.CODE was indexed. - :return: Ranked list of :class:`SearchResult` objects, best match first. + Defaults to True when ContentType.CODE was indexed. + :return: Ranked list of SearchResult objects, best match first. 
""" if not self.chunks or not query.strip(): return [] From 55823b294cbc4757e15a7ce2004d8194a4159345 Mon Sep 17 00:00:00 2001 From: Pringled Date: Thu, 21 May 2026 15:28:25 +0200 Subject: [PATCH 09/11] Update tests --- tests/test_index.py | 21 +++++++++++++++++++++ tests/test_ranking.py | 5 +++++ tests/test_search.py | 13 +------------ 3 files changed, 27 insertions(+), 12 deletions(-) diff --git a/tests/test_index.py b/tests/test_index.py index 83ea57e..5abf90e 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -107,6 +107,27 @@ def test_search_without_reranking(indexed_index: SembleIndex) -> None: mock.assert_called() +@pytest.mark.parametrize( + ("content", "expect_rerank"), + [ + (ContentType.CODE, True), + (ContentType.ALL, True), + (ContentType.DOCS, False), + ], +) +def test_search_rerank_default_by_content_type( + mock_model: Encoder, tmp_project: Path, content: ContentType, expect_rerank: bool +) -> None: + """Reranking is on by default for code/all content, off for docs-only.""" + index = SembleIndex.from_path(tmp_project, model=mock_model, content=content) + with patch("semble.search.rerank_topk") as mock: + index.search("function", top_k=3) + if expect_rerank: + mock.assert_called() + else: + mock.assert_not_called() + + @pytest.mark.parametrize("query", ["", " ", "\n\n"]) def test_search_empty_query_returns_empty(indexed_index: SembleIndex, query: str) -> None: """Empty / whitespace-only queries return [] across all modes.""" diff --git a/tests/test_ranking.py b/tests/test_ranking.py index 8caf26b..f510b79 100644 --- a/tests/test_ranking.py +++ b/tests/test_ranking.py @@ -148,3 +148,8 @@ def test_boost_multi_chunk_files() -> None: scores: dict = {c1: 1.0, c2: 0.8, c3: 1.0} boost_multi_chunk_files(scores) assert scores[c1] > 1.0 + + +def test_boosting_with_empty() -> None: + """apply_query_boost with empty inputs returns an empty dict.""" + assert apply_query_boost({}, "query", []) == {} diff --git a/tests/test_search.py 
b/tests/test_search.py index 46449e9..1bfaa7c 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -10,7 +10,7 @@ from semble.index.dense import SelectableBasicBackend, embed_chunks, load_model from semble.search import _search_bm25, _search_semantic, _sort_top_k, search from semble.tokens import tokenize -from semble.types import Chunk, ContentType, Encoder +from semble.types import Chunk, Encoder from tests.conftest import make_chunk @@ -159,14 +159,3 @@ def test_selectable_basic_backend_rejects_k_below_one( """SelectableBasicBackend.query guards against k < 1.""" with pytest.raises(ValueError, match="k should be >= 1"): semantic.query(embeddings[:1], k=0) - - -def test_search_content_all_uses_code_reranking( - chunks: list[Chunk], semantic: SelectableBasicBackend, bm25: bm25s.BM25, mock_model: Any -) -> None: - """ContentType.ALL activates code reranking (has_code=True).""" - from semble import SembleIndex - - index = SembleIndex(mock_model, bm25, semantic, chunks, content=ContentType.ALL) - results = index.search("authenticate", top_k=2) - assert len(results) > 0 From c993df77c5af35cc42de04ae6a2c30813a2097f5 Mon Sep 17 00:00:00 2001 From: Pringled Date: Thu, 21 May 2026 15:31:27 +0200 Subject: [PATCH 10/11] Update tests --- tests/test_ranking.py | 5 +++-- tests/test_search.py | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/test_ranking.py b/tests/test_ranking.py index f510b79..6a33397 100644 --- a/tests/test_ranking.py +++ b/tests/test_ranking.py @@ -151,5 +151,6 @@ def test_boost_multi_chunk_files() -> None: def test_boosting_with_empty() -> None: - """apply_query_boost with empty inputs returns an empty dict.""" - assert apply_query_boost({}, "query", []) == {} + """Test that boosting with empty inputs returns an empty dict.""" + boosted = apply_query_boost({}, "query", []) + assert boosted == {} diff --git a/tests/test_search.py b/tests/test_search.py index 1bfaa7c..2f40fa6 100644 --- a/tests/test_search.py +++
b/tests/test_search.py @@ -107,7 +107,7 @@ def test_search_hybrid( (lambda q, m, s, b, c, k: search(q, m, s, b, c, k), "login", 4), ], ) -def test_search_returns_results( +def test_search_source_labels( search_fn: Any, query: str, top_k: int, @@ -116,14 +116,14 @@ def test_search_returns_results( bm25: bm25s.BM25, mock_model: Any, ) -> None: - """BM25, semantic, and hybrid search all return at least one result for a matching query.""" + """Each result carries a source label matching the search mode used.""" results = search_fn(query, mock_model, semantic, bm25, chunks, top_k) assert len(results) > 0 def test_sort_top_k() -> None: """_sort_top_k returns the same indices as np.argsort(-x)[:top_k].""" - gen = np.random.default_rng(42) + gen = np.random.default_rng() x = gen.standard_normal(size=(10000,)) top_k = 100 indices = _sort_top_k(x, top_k) From 93b1abf84e34d837fb123c71821b15a1b5e50efa Mon Sep 17 00:00:00 2001 From: Pringled Date: Thu, 21 May 2026 15:37:03 +0200 Subject: [PATCH 11/11] Update docstrings --- src/semble/search.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/semble/search.py b/src/semble/search.py index 3556bf8..5b88229 100644 --- a/src/semble/search.py +++ b/src/semble/search.py @@ -86,7 +86,7 @@ def search( :param top_k: Number of results to return. :param alpha: Weight for semantic score (1-alpha goes to BM25). None = auto-detect based on query type. :param selector: Optional array of chunk indices to filter results by. - :param rerank: Whether to apply code-tuned reranking (file boost, identifier boost, path penalties). + :param rerank: Whether to perform code-tuned reranking. On by default for code search, off for docs search. :return: List of search results sorted by combined score descending. 
""" alpha_weight = resolve_alpha(query, alpha) @@ -104,7 +104,8 @@ def search( normalized_semantic = _rrf_scores(semantic_scores) normalized_bm25 = _rrf_scores(bm25_scores) - # Sort by the file path and start line to counteract randomness from hashing. + # Sort by the file path and start line to + # counteract randomness introduced by hashing. all_candidates = sorted( {*normalized_semantic, *normalized_bm25}, key=lambda c: c.start_line, @@ -116,10 +117,13 @@ def search( } if rerank: + # Boost files with multiple relevant chunks. boost_multi_chunk_files(combined_scores) + # Boost queries with specific identifiers in them. combined_scores = apply_query_boost(combined_scores, query, chunks) + # Rerank the top-k results by applying path-based penalties. ranked = rerank_topk(combined_scores, top_k, penalise_paths=alpha_weight < 1.0) else: - ranked = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:top_k] - + sorted_by_score = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True) + ranked = sorted_by_score[:top_k] return [SearchResult(chunk=chunk, score=score) for chunk, score in ranked]