diff --git a/README.md b/README.md index d8b4341..9b14509 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,13 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ​``` +Use `--content docs` to search documentation and prose (markdown, rst, etc.) instead of code, or `--content all` to search everything: + +​```bash +semble search "deployment guide" ./my-project --content docs +semble search "authentication" ./my-project --content all +​``` + Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result): ​```bash @@ -76,9 +83,10 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ### Workflow 1. Start with `semble search` to find relevant chunks. -2. Inspect full files only when the returned chunk is not enough context. -3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +2. Use `--content docs` when looking for documentation, READMEs, or prose files. +3. Inspect full files only when the returned chunk is not enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. ``` @@ -287,6 +295,8 @@ Add to `~/.config/zed/settings.json` (or `.zed/settings.json` in your project): | `search` | Search a codebase with a natural-language or code query. Pass `repo` as a local directory path or an https:// git URL. | | `find_related` | Given a file path and line number, return chunks semantically similar to the code at that location. | +By default the MCP server indexes only code files. 
To also index documentation and prose, append `--content all` to the server command (or `--content docs` to index only documentation and prose). For example, in Claude Code: `claude mcp add semble -s user -- uvx --from "semble[mcp]" semble --content all`. + @@ -307,6 +317,13 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ​``` +Use `--content docs` to search documentation and prose (markdown, rst, etc.) instead of code, or `--content all` to search everything: + +​```bash +semble search "deployment guide" ./my-project --content docs +semble search "authentication" ./my-project --content all +​``` + Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result): ​```bash @@ -320,9 +337,10 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ## Workflow 1. Start with `semble search` to find relevant chunks. -2. Inspect full files only when the returned chunk is not enough context. -3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +2. Use `--content docs` when looking for documentation, READMEs, or prose files. +3. Inspect full files only when the returned chunk is not enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. ``` ### Sub-agent setup @@ -357,11 +375,17 @@ semble search "save model to disk" https://github.com/MinishLab/model2vec # Limit results semble search "save model to disk" ./my-project --top-k 10 +# Search docs and prose (markdown, rst, etc.) 
instead of code +semble search "deployment guide" ./my-project --content docs + +# Search everything (code and docs) +semble search "authentication" ./my-project --content all + # Find code similar to a known location semble find-related src/auth.py 42 ./my-project ``` -`path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place. +`--content` accepts `code` (default), `docs`, or `all`. `path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place.
Savings @@ -395,11 +419,17 @@ Stats are stored in `~/.semble/savings.jsonl`. Semble can also be used as a Python library for programmatic access, useful when building custom tooling or integrating search directly into your own code. ```python -from semble import SembleIndex +from semble import ContentType, SembleIndex -# Index a local directory +# Index a local directory (code only, the default) index = SembleIndex.from_path("./my-project") +# Index docs and prose (markdown, rst, etc.) +index = SembleIndex.from_path("./my-project", content=ContentType.DOCS) + +# Index everything — code and docs +index = SembleIndex.from_path("./my-project", content=ContentType.ALL) + # Index a remote git repository index = SembleIndex.from_git("https://github.com/MinishLab/model2vec") diff --git a/src/semble/__init__.py b/src/semble/__init__.py index ef61bdf..136f345 100644 --- a/src/semble/__init__.py +++ b/src/semble/__init__.py @@ -1,9 +1,10 @@ from semble.index import SembleIndex -from semble.types import Chunk, EmbeddingMatrix, Encoder, IndexStats, SearchResult +from semble.types import Chunk, ContentType, EmbeddingMatrix, Encoder, IndexStats, SearchResult from semble.version import __version__ __all__ = [ "Chunk", + "ContentType", "EmbeddingMatrix", "Encoder", "IndexStats", diff --git a/src/semble/agents/claude.md b/src/semble/agents/claude.md index 515d60e..82d3fd0 100644 --- a/src/semble/agents/claude.md +++ b/src/semble/agents/claude.md @@ -12,6 +12,13 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ``` +Use `--content docs` to search documentation and prose (markdown, rst, etc.) 
instead of code, or `--content all` to search everything: + +```bash +semble search "deployment guide" ./my-project --content docs +semble search "authentication" ./my-project --content all +``` + Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result): ```bash @@ -25,6 +32,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ## Workflow 1. Start with `semble search` to find relevant chunks. -2. Inspect full files only when the returned chunk is not enough context. -3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +2. Use `--content docs` when looking for documentation, READMEs, or prose files. +3. Inspect full files only when the returned chunk is not enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. diff --git a/src/semble/agents/copilot.md b/src/semble/agents/copilot.md index 515d60e..82d3fd0 100644 --- a/src/semble/agents/copilot.md +++ b/src/semble/agents/copilot.md @@ -12,6 +12,13 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ``` +Use `--content docs` to search documentation and prose (markdown, rst, etc.) 
instead of code, or `--content all` to search everything: + +```bash +semble search "deployment guide" ./my-project --content docs +semble search "authentication" ./my-project --content all +``` + Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result): ```bash @@ -25,6 +32,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ## Workflow 1. Start with `semble search` to find relevant chunks. -2. Inspect full files only when the returned chunk is not enough context. -3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +2. Use `--content docs` when looking for documentation, READMEs, or prose files. +3. Inspect full files only when the returned chunk is not enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. diff --git a/src/semble/agents/cursor.md b/src/semble/agents/cursor.md index 160aac4..62c9fb5 100644 --- a/src/semble/agents/cursor.md +++ b/src/semble/agents/cursor.md @@ -11,6 +11,13 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ``` +Use `--content docs` to search documentation and prose (markdown, rst, etc.) 
instead of code, or `--content all` to search everything: + +```bash +semble search "deployment guide" ./my-project --content docs +semble search "authentication" ./my-project --content all +``` + Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result): ```bash @@ -24,6 +31,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ## Workflow 1. Start with `semble search` to find relevant chunks. -2. Inspect full files only when the returned chunk is not enough context. -3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +2. Use `--content docs` when looking for documentation, READMEs, or prose files. +3. Inspect full files only when the returned chunk is not enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. diff --git a/src/semble/agents/gemini.md b/src/semble/agents/gemini.md index 1ea5440..359f69d 100644 --- a/src/semble/agents/gemini.md +++ b/src/semble/agents/gemini.md @@ -14,6 +14,13 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ``` +Use `--content docs` to search documentation and prose (markdown, rst, etc.) 
instead of code, or `--content all` to search everything: + +```bash +semble search "deployment guide" ./my-project --content docs +semble search "authentication" ./my-project --content all +``` + Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result): ```bash @@ -27,6 +34,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ## Workflow 1. Start with `semble search` to find relevant chunks. -2. Inspect full files only when the returned chunk is not enough context. -3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +2. Use `--content docs` when looking for documentation, READMEs, or prose files. +3. Inspect full files only when the returned chunk is not enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. diff --git a/src/semble/agents/kiro.md b/src/semble/agents/kiro.md index 5177ec5..48381d8 100644 --- a/src/semble/agents/kiro.md +++ b/src/semble/agents/kiro.md @@ -14,6 +14,13 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ``` +Use `--content docs` to search documentation and prose (markdown, rst, etc.) 
instead of code, or `--content all` to search everything: + +```bash +semble search "deployment guide" ./my-project --content docs +semble search "authentication" ./my-project --content all +``` + Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result): ```bash @@ -27,6 +34,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ## Workflow 1. Start with `semble search` to find relevant chunks. -2. Inspect full files only when the returned chunk is not enough context. -3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +2. Use `--content docs` when looking for documentation, READMEs, or prose files. +3. Inspect full files only when the returned chunk is not enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. diff --git a/src/semble/agents/opencode.md b/src/semble/agents/opencode.md index 2c51867..ea9561b 100644 --- a/src/semble/agents/opencode.md +++ b/src/semble/agents/opencode.md @@ -15,6 +15,13 @@ semble search "save_pretrained" ./my-project semble search "save model to disk" ./my-project --top-k 10 ``` +Use `--content docs` to search documentation and prose (markdown, rst, etc.) 
instead of code, or `--content all` to search everything: + +```bash +semble search "deployment guide" ./my-project --content docs +semble search "authentication" ./my-project --content all +``` + Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result): ```bash @@ -28,6 +35,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac ## Workflow 1. Start with `semble search` to find relevant chunks. -2. Inspect full files only when the returned chunk is not enough context. -3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. -4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. +2. Use `--content docs` when looking for documentation, READMEs, or prose files. +3. Inspect full files only when the returned chunk is not enough context. +4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations. +5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. 
diff --git a/src/semble/cli.py b/src/semble/cli.py index d7c831f..5f3fd10 100644 --- a/src/semble/cli.py +++ b/src/semble/cli.py @@ -1,6 +1,7 @@ import argparse import asyncio import sys +import warnings from enum import Enum from importlib.resources import files from importlib.util import find_spec @@ -10,8 +11,11 @@ from semble.index import SembleIndex from semble.stats import format_savings_report +from semble.types import ContentType from semble.utils import _format_results, _is_git_url, _resolve_chunk +_CONTENT_CHOICES = [ct.value for ct in ContentType] + class Agent(str, Enum): CLAUDE = "claude" @@ -32,6 +36,21 @@ def _agent_path(agent: Agent) -> Path: return Path(base_dir) / "agents" / "semble-search.md" +def _add_content_args(p: argparse.ArgumentParser) -> None: + """Add --content and deprecated --include-text-files to a subparser.""" + p.add_argument( + "--content", + default=ContentType.CODE.value, + choices=_CONTENT_CHOICES, + help="Content type to index: 'code' (default), 'docs', or 'all'.", + ) + p.add_argument( + "--include-text-files", + action="store_true", + help="Deprecated. Use --content all instead.", + ) + + def main() -> None: """Entry point for the semble command-line tool.""" if len(sys.argv) > 1 and sys.argv[1] in _CLI_DISPATCH_ARGS: @@ -52,18 +71,15 @@ def _mcp_main() -> None: help="Local directory or git URL to pre-index at startup (optional).", ) parser.add_argument("--ref", default=None, help="Branch or tag to check out (git URLs only).") - parser.add_argument( - "--include-text-files", - action="store_true", - help="Also index non-code text files (.md, .yaml, .json, etc.).", - ) + _add_content_args(parser) args = parser.parse_args() if any(find_spec(dep) is None for dep in get_package_extras("semble", "mcp")): print("MCP dependencies are not installed. 
Run: pip install 'semble[mcp]'", file=sys.stderr) raise SystemExit(1) from semble.mcp import serve - asyncio.run(serve(args.path, ref=args.ref, include_text_files=args.include_text_files)) + content = _resolve_content(args.content, args.include_text_files) + asyncio.run(serve(args.path, ref=args.ref, content=content)) def _run_init(*, agent: Agent = _DEFAULT_AGENT, force: bool = False) -> None: @@ -78,6 +94,18 @@ def _run_init(*, agent: Agent = _DEFAULT_AGENT, force: bool = False) -> None: print(f"Created {dest}") +def _resolve_content(content_arg: str, include_text_files: bool) -> ContentType: + """Resolve --content and the deprecated --include-text-files into a ContentType.""" + if include_text_files: + warnings.warn( + "--include-text-files is deprecated and will be removed in a future version. Use --content all instead.", + DeprecationWarning, + stacklevel=2, + ) + return ContentType.ALL + return ContentType(content_arg) + + def _cli_main() -> None: parser = argparse.ArgumentParser(prog="semble") sub = parser.add_subparsers(dest="command") @@ -86,22 +114,14 @@ def _cli_main() -> None: search_p.add_argument("query", help="Natural language or code query.") search_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") search_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).") - search_p.add_argument( - "--include-text-files", - action="store_true", - help="Also index non-code text files (.md, .yaml, .json, etc.).", - ) + _add_content_args(search_p) related_p = sub.add_parser("find-related", help="Find code similar to a specific location.") related_p.add_argument("file_path", help="File path as shown in search results.") related_p.add_argument("line", type=int, help="Line number (1-indexed).") related_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).") related_p.add_argument("-k", "--top-k", type=int, default=5, 
help="Number of results (default: 5).") - related_p.add_argument( - "--include-text-files", - action="store_true", - help="Also index non-code text files (.md, .yaml, .json, etc.).", - ) + _add_content_args(related_p) init_p = sub.add_parser("init", help="Write a semble sub-agent file for your coding agent.") init_p.add_argument( @@ -126,11 +146,11 @@ def _cli_main() -> None: print(format_savings_report(verbose=args.verbose), end="") return - include_text = args.include_text_files + content = _resolve_content(args.content, args.include_text_files) index = ( - SembleIndex.from_git(args.path, include_text_files=include_text) + SembleIndex.from_git(args.path, content=content) if _is_git_url(args.path) - else SembleIndex.from_path(args.path, include_text_files=include_text) + else SembleIndex.from_path(args.path, content=content) ) if args.command == "search": diff --git a/src/semble/index/create.py b/src/semble/index/create.py index 168f8ef..55ed253 100644 --- a/src/semble/index/create.py +++ b/src/semble/index/create.py @@ -11,7 +11,7 @@ from semble.index.files import detect_language, get_extensions from semble.index.sparse import enrich_for_bm25 from semble.tokens import tokenize -from semble.types import Chunk, Encoder +from semble.types import Chunk, ContentType, Encoder _MAX_FILE_BYTES = 1_000_000 # 1 MB max file size to read and index @@ -20,7 +20,7 @@ def create_index_from_path( path: Path, model: Encoder, extensions: Sequence[str] | None = None, - include_text_files: bool = False, + content: ContentType = ContentType.CODE, display_root: Path | None = None, ) -> tuple[bm25s.BM25, SelectableBasicBackend, list[Chunk]]: """Create an index from a resolved directory, optionally storing chunk paths relative to display_root. @@ -28,14 +28,14 @@ def create_index_from_path( :param path: Resolved absolute path to index. :param model: The model to use for indexing. :param extensions: File extensions to include. 
- :param include_text_files: If True, also index non-code text files (.md, .yaml, .json, etc.). + :param content: Content type to index. :param display_root: If set, chunk file paths are stored relative to this root. :raises ValueError: if no items were found, no index can be created. :return: A bm25 index, vicinity index and list of chunks """ chunks: list[Chunk] = [] - extensions = get_extensions(include_text_files, extensions) - for file_path in walk_files(path, extensions): + resolved_extensions = get_extensions(content, extensions) + for file_path in walk_files(path, resolved_extensions): language = detect_language(file_path) with contextlib.suppress(OSError): if file_path.stat().st_size > _MAX_FILE_BYTES: diff --git a/src/semble/index/files.py b/src/semble/index/files.py index e79d7c7..a9a5492 100644 --- a/src/semble/index/files.py +++ b/src/semble/index/files.py @@ -2,6 +2,8 @@ from collections.abc import Sequence from pathlib import Path +from semble.types import ContentType + _EXTENSION_TO_LANGUAGE = { ".4th": "forth", ".ada": "ada", @@ -357,8 +359,29 @@ _DOC_LANGUAGES = { "asciidoc", - "beancount", "bibtex", + "djot", + "doxygen", + "html", + "javadoc", + "jsdoc", + "latex", + "luadoc", + "markdown", + "markdown_inline", + "mermaid", + "norg", + "norg_meta", + "org", + "phpdoc", + "po", + "rst", + "rtf", + "vimdoc", +} + +_CONFIG_LANGUAGES = { + "beancount", "capnp", "cedarschema", "comment", @@ -368,8 +391,6 @@ "desktop", "devicetree", "diff", - "djot", - "doxygen", "dtd", "editorconfig", "ebnf", @@ -384,33 +405,18 @@ "gpg", "hjson", "hocon", - "html", "ini", - "javadoc", - "jsdoc", "json", "json5", "kdl", - "latex", "ledger", - "luadoc", - "markdown", - "markdown_inline", - "mermaid", - "norg", - "norg_meta", - "org", "pem", "pgn", - "phpdoc", - "po", "properties", "proto", "psv", "requirements", "ron", - "rst", - "rtf", "smithy", "ssh_config", "textproto", @@ -420,7 +426,6 @@ "tsv", "turtle", "typespec", - "vimdoc", "wit", "xcompose", "xml", @@ -438,7 
+443,7 @@ def _inv_mapping(mapping: dict[str, str]) -> dict[str, list[str]]: ALL_LANGUAGES = frozenset(_EXTENSION_TO_LANGUAGE.values()) -_WITHOUT_DOC = ALL_LANGUAGES - _DOC_LANGUAGES +_CODE_LANGUAGES = ALL_LANGUAGES - _DOC_LANGUAGES - _CONFIG_LANGUAGES _LANGUAGE_TO_EXTENSION = _inv_mapping(_EXTENSION_TO_LANGUAGE) @@ -447,12 +452,15 @@ def detect_language(file_name: Path) -> str | None: return _EXTENSION_TO_LANGUAGE.get(file_name.suffix.lower()) -def get_extensions(include_text_files: bool, extensions: Sequence[str] | None) -> list[str]: +def get_extensions(content: ContentType, extensions: Sequence[str] | None) -> list[str]: """Returns a list of supported file extensions.""" - if include_text_files: + languages: set[str] | frozenset[str] + if content == ContentType.ALL: languages = ALL_LANGUAGES + elif content == ContentType.DOCS: + languages = _DOC_LANGUAGES else: - languages = _WITHOUT_DOC + languages = _CODE_LANGUAGES all_extensions: set[str] = set() for language in languages: all_extensions.update(_LANGUAGE_TO_EXTENSION.get(language, set())) diff --git a/src/semble/index/index.py b/src/semble/index/index.py index 56d51e3..91d0771 100644 --- a/src/semble/index/index.py +++ b/src/semble/index/index.py @@ -3,6 +3,7 @@ import os import subprocess import tempfile +import warnings from collections import defaultdict from collections.abc import Sequence from pathlib import Path @@ -15,11 +16,24 @@ from semble.index.dense import SelectableBasicBackend, load_model from semble.search import _search_semantic, search from semble.stats import save_search_stats -from semble.types import CallType, Chunk, Encoder, IndexStats, SearchResult +from semble.types import CallType, Chunk, ContentType, Encoder, IndexStats, SearchResult _GIT_CLONE_TIMEOUT = int(os.environ.get("SEMBLE_CLONE_TIMEOUT", 60)) +def _apply_include_text_files(content: ContentType, include_text_files: bool | None) -> ContentType: + """Apply the deprecated include_text_files override, emitting a 
DeprecationWarning.""" + if include_text_files is None: + return content + warnings.warn( + "include_text_files is deprecated and will be removed in a future version." + " Use content=ContentType.ALL instead.", + DeprecationWarning, + stacklevel=2, + ) + return ContentType.ALL if include_text_files else ContentType.CODE + + class SembleIndex: """Fast local code index with hybrid search.""" @@ -30,6 +44,7 @@ def __init__( semantic_index: SelectableBasicBackend, chunks: list[Chunk], root: Path | None = None, + content: ContentType = ContentType.CODE, ) -> None: """Initialize a SembleIndex. Should be created with from_path or from_git. @@ -38,12 +53,14 @@ def __init__( :param semantic_index: The semantic index. :param chunks: The found chunks. :param root: Root directory used to read file sizes for token-savings stats. + :param content: Content type used when indexing; controls the search pipeline. """ self.model: Encoder = model self.chunks: list[Chunk] = chunks self._bm25_index: BM25 = bm25_index self._semantic_index: SelectableBasicBackend = semantic_index self._root: Path | None = root + self._content: ContentType = content self._file_sizes: dict[str, int] = self._compute_file_sizes(root) if root else {} self._file_mapping, self._language_mapping = self._populate_mapping() @@ -91,18 +108,22 @@ def from_path( path: str | Path, model: Encoder | None = None, extensions: Sequence[str] | None = None, - include_text_files: bool = False, + content: ContentType = ContentType.CODE, + include_text_files: bool | None = None, ) -> SembleIndex: """Create and index a SembleIndex from a directory. :param path: Root directory to index. :param model: Embedding model to use. Defaults to potion-code-16M. :param extensions: File extensions to include. Defaults to a standard set of code extensions. - :param include_text_files: If True, also index non-code text files (.md, .yaml, .json, etc.). - :return: An indexed SembleIndex. Chunk file paths are relative to ``path``. 
+ :param content: Content type to index: ContentType.CODE (default), ContentType.DOCS, + or ContentType.ALL. + :param include_text_files: Deprecated. Use content=ContentType.ALL instead. + :return: An indexed SembleIndex. Chunk file paths are relative to path. :raises FileNotFoundError: If `path` does not exist. :raises NotADirectoryError: If `path` exists but is not a directory. """ + normalized = _apply_include_text_files(content, include_text_files) model = model or load_model() path = Path(path) if not path.exists(): @@ -114,11 +135,11 @@ def from_path( path, model=model, extensions=extensions, - include_text_files=include_text_files, + content=normalized, display_root=path, ) - return SembleIndex(model, bm25, vicinity, chunks, root=path) + return SembleIndex(model, bm25, vicinity, chunks, root=path, content=normalized) @classmethod def from_git( @@ -127,23 +148,27 @@ def from_git( ref: str | None = None, model: Encoder | None = None, extensions: Sequence[str] | None = None, - include_text_files: bool = False, + content: ContentType = ContentType.CODE, + include_text_files: bool | None = None, ) -> SembleIndex: """Clone a git repository and index it. The repository is cloned into a temporary directory that is removed once indexing finishes. Chunk content is preserved in-memory, but - ``chunk.file_path`` will not point to a readable file after this call + chunk.file_path will not point to a readable file after this call returns — it is a repo-relative label, not a filesystem path. :param url: URL of the git repository to clone (any git provider). :param ref: Branch or tag to check out. Defaults to the remote HEAD. :param model: Embedding model to use. Defaults to potion-code-16M. :param extensions: File extensions to include. Defaults to a standard set of code extensions. - :param include_text_files: If True, also index non-code text files (.md, .yaml, .json, etc.). - :return: An indexed SembleIndex. Chunk file paths are repo-relative (e.g. ``src/foo.py``). 
+ :param content: Content type to index: ContentType.CODE (default), ContentType.DOCS, + or ContentType.ALL. + :param include_text_files: Deprecated. Use content=ContentType.ALL instead. + :return: An indexed SembleIndex. Chunk file paths are repo-relative (e.g. src/foo.py). :raises RuntimeError: If git is not on PATH, the clone fails, or times out. """ + normalized = _apply_include_text_files(content, include_text_files) with tempfile.TemporaryDirectory() as tmp_dir: # `--` prevents `url` from being interpreted as a git option (e.g. `--upload-pack=...`). cmd = ["git", "clone", "--depth", "1", *(["--branch", ref] if ref else []), "--", url, tmp_dir] @@ -163,11 +188,11 @@ def from_git( resolved_path, model=model, extensions=extensions, - include_text_files=include_text_files, + content=normalized, display_root=resolved_path, ) - return SembleIndex(model, bm25, vicinity, chunks, root=resolved_path) + return SembleIndex(model, bm25, vicinity, chunks, root=resolved_path, content=normalized) def find_related(self, source: Chunk | SearchResult, *, top_k: int = 5) -> list[SearchResult]: """Return chunks semantically similar to the given chunk or search result. @@ -202,38 +227,38 @@ def search( alpha: float | None = None, filter_languages: list[str] | None = None, filter_paths: list[str] | None = None, - rerank: bool = True, + rerank: bool | None = None, ) -> list[SearchResult]: """Search the index and return the top-k most relevant chunks. :param query: Natural-language or keyword query string. :param top_k: Maximum number of results to return. :param alpha: Blend weight for hybrid score combination; 1.0 = full semantic - weight, 0.0 = full BM25 weight. File-path penalties and diversity reranking - are applied regardless. ``None`` auto-detects from query type. + weight, 0.0 = full BM25 weight. None auto-detects from query type. :param filter_languages: Optional list of language codes; if set, only chunks in these languages are returned. 
:param filter_paths: Optional list of repo-relative file paths; if set, only chunks from these files are returned. - :param rerank: Whether to rerank the top-k results using custom reranking logic. - :return: Ranked list of :class:`SearchResult` objects, best match first. + :param rerank: Apply code-tuned reranking (file boost, identifier boost, path penalties). + When None (the default), reranking is enabled unless the index was built with ContentType.DOCS. + :return: Ranked list of SearchResult objects, best match first. """ - bm25_index, semantic_index = self._bm25_index, self._semantic_index if not self.chunks or not query.strip(): return [] - selector = self._get_selector_vector(filter_languages, filter_paths) + resolved_rerank = (self._content != ContentType.DOCS) if rerank is None else rerank + selector = self._get_selector_vector(filter_languages, filter_paths) results = search( query, self.model, - semantic_index, - bm25_index, + self._semantic_index, + self._bm25_index, self.chunks, top_k, alpha=alpha, selector=selector, - rerank=rerank, + rerank=resolved_rerank, ) save_search_stats(results, CallType.SEARCH, self._file_sizes) return results diff --git a/src/semble/mcp.py b/src/semble/mcp.py index 5993c6c..b19b762 100644 --- a/src/semble/mcp.py +++ b/src/semble/mcp.py @@ -12,7 +12,7 @@ from semble.index import SembleIndex from semble.index.dense import load_model -from semble.types import Encoder +from semble.types import ContentType, Encoder from semble.utils import _format_results, _is_git_url, _resolve_chunk logger = logging.getLogger(__name__) @@ -112,9 +112,13 @@ async def find_related( return server -async def serve(path: str | None = None, ref: str | None = None, include_text_files: bool = False) -> None: +async def serve( + path: str | None = None, + ref: str | None = None, + content: ContentType = ContentType.CODE, +) -> None: """Start an MCP stdio server, optionally pre-indexing a default source.""" - cache = _IndexCache(include_text_files=include_text_files) + cache = 
_IndexCache(content=content) async def _load_and_prewarm() -> None: """Pre-load the model and optionally pre-index the default source in parallel with starting the server.""" @@ -146,14 +150,14 @@ async def _load_and_prewarm() -> None: class _IndexCache: """Cache of indexed repos and local paths for the lifetime of the MCP server process.""" - def __init__(self, model: Encoder | None = None, include_text_files: bool = False) -> None: + def __init__(self, model: Encoder | None = None, content: ContentType = ContentType.CODE) -> None: """Initialise an empty cache.""" self._model: Encoder | None = model self._model_error: BaseException | None = None self._model_ready = asyncio.Event() if model is not None: self._model_ready.set() - self._include_text_files = include_text_files + self._content = content self._tasks: OrderedDict[str, asyncio.Task[SembleIndex]] = OrderedDict() # ordered for LRU eviction self._watcher_task: asyncio.Task[None] | None = None @@ -206,7 +210,7 @@ async def get(self, source: str, ref: str | None = None) -> SembleIndex: source, ref=ref, model=model, - include_text_files=self._include_text_files, + content=self._content, ) ) else: @@ -215,7 +219,7 @@ async def get(self, source: str, ref: str | None = None) -> SembleIndex: SembleIndex.from_path, cache_key, model=model, - include_text_files=self._include_text_files, + content=self._content, ) ) self._tasks.move_to_end(cache_key) diff --git a/src/semble/search.py b/src/semble/search.py index d0a9d25..5b88229 100644 --- a/src/semble/search.py +++ b/src/semble/search.py @@ -86,7 +86,7 @@ def search( :param top_k: Number of results to return. :param alpha: Weight for semantic score (1-alpha goes to BM25). None = auto-detect based on query type. :param selector: Optional array of chunk indices to filter results by. - :param rerank: Whether to perform reranking. This should be done, and is mainly here for benchmarking. + :param rerank: Whether to perform code-tuned reranking. 
On by default for code search, off for docs search. :return: List of search results sorted by combined score descending. """ alpha_weight = resolve_alpha(query, alpha) diff --git a/src/semble/types.py b/src/semble/types.py index d01c774..1da4c7f 100644 --- a/src/semble/types.py +++ b/src/semble/types.py @@ -16,6 +16,14 @@ class CallType(str, Enum): FIND_RELATED = "find_related" +class ContentType(str, Enum): + """Content type for indexing and search pipeline selection.""" + + CODE = "code" + DOCS = "docs" + ALL = "all" + + class Encoder(Protocol): """Protocol for embedding models.""" diff --git a/tests/test_cli.py b/tests/test_cli.py index 28d19cd..c9caa11 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -195,6 +195,26 @@ def test_mcp_main_exits_with_message_when_extras_missing( assert "pip install 'semble[mcp]'" in capsys.readouterr().err +def test_include_text_files_cli_deprecated( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], +) -> None: + """--include-text-files on CLI raises DeprecationWarning.""" + import warnings + + chunk = make_chunk("def foo(): pass", "src/foo.py") + fake_index = MagicMock() + fake_index.search.return_value = [SearchResult(chunk=chunk, score=0.9)] + monkeypatch.setattr(sys, "argv", ["semble", "search", "query", "/some/path", "--include-text-files"]) + with patch("semble.cli.SembleIndex.from_path", return_value=fake_index): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + _cli_main() + assert any( + "include-text-files" in str(w.message).lower() for w in caught if issubclass(w.category, DeprecationWarning) + ) + + def test_agent_file_tools_are_bash_only() -> None: """The agent file must list only Bash and Read — no MCP tools that require schema loading.""" frontmatter = files("semble").joinpath("agents/claude.md").read_text(encoding="utf-8").split("---")[1] diff --git a/tests/test_files.py b/tests/test_files.py index 71ede4c..bcb747f 100644 --- 
a/tests/test_files.py +++ b/tests/test_files.py @@ -1,6 +1,9 @@ from pathlib import Path -from semble.index.files import _DOC_LANGUAGES, _EXTENSION_TO_LANGUAGE, detect_language, get_extensions +import pytest + +from semble.index.files import _CODE_LANGUAGES, _CONFIG_LANGUAGES, _DOC_LANGUAGES, detect_language, get_extensions +from semble.types import ContentType def test_detect_language() -> None: @@ -10,32 +13,36 @@ def test_detect_language() -> None: assert detect_language(Path("c.txt")) is None -def test_get_extensions() -> None: - """Test the get_extensions function.""" - all_extensions = get_extensions(True, None) - without_doc_extensions = get_extensions(False, None) +def test_language_sets_are_consistent() -> None: + """Code, doc, and config language sets are mutually disjoint.""" + assert _CODE_LANGUAGES.isdisjoint(_DOC_LANGUAGES) + assert _CODE_LANGUAGES.isdisjoint(_CONFIG_LANGUAGES) + assert _DOC_LANGUAGES.isdisjoint(_CONFIG_LANGUAGES) - doc_extensions = set(all_extensions) - set(without_doc_extensions) - for extension in doc_extensions: - assert _EXTENSION_TO_LANGUAGE[extension] in _DOC_LANGUAGES - for extension in without_doc_extensions: - assert _EXTENSION_TO_LANGUAGE[extension] not in _DOC_LANGUAGES +@pytest.mark.parametrize( + ("content", "includes", "excludes"), + [ + (ContentType.CODE, [".py"], [".md"]), + (ContentType.DOCS, [".md"], [".py"]), + (ContentType.ALL, [".py", ".md"], []), + ], +) +def test_get_extensions(content: ContentType, includes: list[str], excludes: list[str]) -> None: + """get_extensions returns the right extensions for each content type.""" + exts = set(get_extensions(content, None)) + for ext in includes: + assert ext in exts + for ext in excludes: + assert ext not in exts def test_get_extensions_additional() -> None: - """Test the get_extensions function.""" - all_extensions = get_extensions(True, None) - all_extensions_extra = get_extensions(True, [".kjs"]) - - assert set(all_extensions_extra) == set(all_extensions) | 
{".kjs"} - - all_extensions = get_extensions(False, None) - all_extensions_extra = get_extensions(False, [".kjs"]) - - assert set(all_extensions_extra) == set(all_extensions) | {".kjs"} - - all_extensions = get_extensions(False, None) - all_extensions_extra = get_extensions(False, [".py"]) - - assert set(all_extensions_extra) == set(all_extensions) + """Extra extensions are appended and existing ones are not duplicated.""" + base = get_extensions(ContentType.ALL, None) + with_extra = get_extensions(ContentType.ALL, [".kjs"]) + assert set(with_extra) == set(base) | {".kjs"} + + base_code = get_extensions(ContentType.CODE, None) + with_existing = get_extensions(ContentType.CODE, [".py"]) + assert set(with_existing) == set(base_code) diff --git a/tests/test_index.py b/tests/test_index.py index 3f90fcb..5abf90e 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -1,12 +1,12 @@ from pathlib import Path from typing import Any -from unittest.mock import patch +from unittest.mock import MagicMock, patch import pytest from semble import SembleIndex from semble.index.create import _MAX_FILE_BYTES, create_index_from_path -from semble.types import Encoder +from semble.types import ContentType, Encoder from tests.conftest import make_chunk @@ -17,18 +17,40 @@ def indexed_index(mock_model: Any, tmp_project: Path) -> SembleIndex: @pytest.mark.parametrize( - ("include_text_files", "md_in_results"), - [(False, False), (True, True)], + ("content", "md_in_results"), + [ + (ContentType.CODE, False), + (ContentType.DOCS, True), + (ContentType.ALL, True), + ], ) def test_index_markdown_inclusion( - mock_model: Encoder, tmp_project: Path, include_text_files: bool, md_in_results: bool + mock_model: Encoder, tmp_project: Path, content: ContentType, md_in_results: bool ) -> None: - """Markdown files are excluded by default and included when include_text_files=True.""" - _, _, chunks = create_index_from_path(tmp_project, mock_model, include_text_files=include_text_files) + 
"""Markdown files are excluded for code and included for docs/all.""" + _, _, chunks = create_index_from_path(tmp_project, mock_model, content=content) has_md = ".md" in {Path(c.file_path).suffix for c in chunks} assert has_md is md_in_results +@pytest.mark.parametrize("include_text_files", [True, False]) +def test_include_text_files_deprecated(mock_model: Encoder, tmp_project: Path, include_text_files: bool) -> None: + """include_text_files raises DeprecationWarning on from_path.""" + with pytest.warns(DeprecationWarning, match="include_text_files is deprecated"): + SembleIndex.from_path(tmp_project, model=mock_model, include_text_files=include_text_files) + + +def test_from_git_include_text_files_deprecated(mock_model: Encoder, tmp_project: Path) -> None: + """from_git raises DeprecationWarning when include_text_files is passed.""" + fake_result = MagicMock() + fake_result.returncode = 0 + with patch("subprocess.run", return_value=fake_result): + with patch("semble.index.index.create_index_from_path") as mock_create: + mock_create.return_value = (MagicMock(), MagicMock(), [make_chunk("x = 1", "f.py")]) + with pytest.warns(DeprecationWarning, match="include_text_files is deprecated"): + SembleIndex.from_git("https://example.com/repo", model=mock_model, include_text_files=True) + + def test_index_empty_returns_zero_chunks(mock_model: Encoder, tmp_path: Path) -> None: """Indexing an empty directory yields zero files and chunks.""" with pytest.raises(ValueError): @@ -85,6 +107,27 @@ def test_search_without_reranking(indexed_index: SembleIndex) -> None: mock.assert_called() +@pytest.mark.parametrize( + ("content", "expect_rerank"), + [ + (ContentType.CODE, True), + (ContentType.ALL, True), + (ContentType.DOCS, False), + ], +) +def test_search_rerank_default_by_content_type( + mock_model: Encoder, tmp_project: Path, content: ContentType, expect_rerank: bool +) -> None: + """Reranking is on by default for code/all content, off for docs-only.""" + index = 
SembleIndex.from_path(tmp_project, model=mock_model, content=content) + with patch("semble.search.rerank_topk") as mock: + index.search("function", top_k=3) + if expect_rerank: + mock.assert_called() + else: + mock.assert_not_called() + + @pytest.mark.parametrize("query", ["", " ", "\n\n"]) def test_search_empty_query_returns_empty(indexed_index: SembleIndex, query: str) -> None: """Empty / whitespace-only queries return [] across all modes."""