diff --git a/README.md b/README.md
index d8b4341..9b14509 100644
--- a/README.md
+++ b/README.md
@@ -63,6 +63,13 @@ semble search "save_pretrained" ./my-project
semble search "save model to disk" ./my-project --top-k 10
```
+Use `--content docs` to search documentation and prose (markdown, rst, etc.) instead of code, or `--content all` to search everything:
+
+```bash
+semble search "deployment guide" ./my-project --content docs
+semble search "authentication" ./my-project --content all
+```
+
Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result):
```bash
@@ -76,9 +83,10 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac
### Workflow
1. Start with `semble search` to find relevant chunks.
-2. Inspect full files only when the returned chunk is not enough context.
-3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
-4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
+2. Use `--content docs` when looking for documentation, READMEs, or prose files.
+3. Inspect full files only when the returned chunk is not enough context.
+4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
+5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
```
@@ -287,6 +295,8 @@ Add to `~/.config/zed/settings.json` (or `.zed/settings.json` in your project):
| `search` | Search a codebase with a natural-language or code query. Pass `repo` as a local directory path or an https:// git URL. |
| `find_related` | Given a file path and line number, return chunks semantically similar to the code at that location. |
+By default the MCP server indexes only code files. To also index documentation and prose, append `--content all` to the server command (or use `--content docs` to index documentation and prose instead of code). For example, in Claude Code: `claude mcp add semble -s user -- uvx --from "semble[mcp]" semble --content all`.
+
@@ -307,6 +317,13 @@ semble search "save_pretrained" ./my-project
semble search "save model to disk" ./my-project --top-k 10
```
+Use `--content docs` to search documentation and prose (markdown, rst, etc.) instead of code, or `--content all` to search everything:
+
+```bash
+semble search "deployment guide" ./my-project --content docs
+semble search "authentication" ./my-project --content all
+```
+
Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result):
```bash
@@ -320,9 +337,10 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac
## Workflow
1. Start with `semble search` to find relevant chunks.
-2. Inspect full files only when the returned chunk is not enough context.
-3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
-4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
+2. Use `--content docs` when looking for documentation, READMEs, or prose files.
+3. Inspect full files only when the returned chunk is not enough context.
+4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
+5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
```
### Sub-agent setup
@@ -357,11 +375,17 @@ semble search "save model to disk" https://github.com/MinishLab/model2vec
# Limit results
semble search "save model to disk" ./my-project --top-k 10
+# Search docs and prose (markdown, rst, etc.) instead of code
+semble search "deployment guide" ./my-project --content docs
+
+# Search everything (code, docs, and config/data files)
+semble search "authentication" ./my-project --content all
+
# Find code similar to a known location
semble find-related src/auth.py 42 ./my-project
```
-`path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place.
+`--content` accepts `code` (default), `docs`, or `all`. `path` defaults to the current directory when omitted; git URLs are accepted. If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its place.
Savings
@@ -395,11 +419,17 @@ Stats are stored in `~/.semble/savings.jsonl`.
Semble can also be used as a Python library for programmatic access, useful when building custom tooling or integrating search directly into your own code.
```python
-from semble import SembleIndex
+from semble import ContentType, SembleIndex
-# Index a local directory
+# Index a local directory (code only, the default)
index = SembleIndex.from_path("./my-project")
+# Index docs and prose (markdown, rst, etc.)
+index = SembleIndex.from_path("./my-project", content=ContentType.DOCS)
+
+# Index everything — code, docs, and config/data files
+index = SembleIndex.from_path("./my-project", content=ContentType.ALL)
+
# Index a remote git repository
index = SembleIndex.from_git("https://github.com/MinishLab/model2vec")
diff --git a/src/semble/__init__.py b/src/semble/__init__.py
index ef61bdf..136f345 100644
--- a/src/semble/__init__.py
+++ b/src/semble/__init__.py
@@ -1,9 +1,10 @@
from semble.index import SembleIndex
-from semble.types import Chunk, EmbeddingMatrix, Encoder, IndexStats, SearchResult
+from semble.types import Chunk, ContentType, EmbeddingMatrix, Encoder, IndexStats, SearchResult
from semble.version import __version__
__all__ = [
"Chunk",
+ "ContentType",
"EmbeddingMatrix",
"Encoder",
"IndexStats",
diff --git a/src/semble/agents/claude.md b/src/semble/agents/claude.md
index 515d60e..82d3fd0 100644
--- a/src/semble/agents/claude.md
+++ b/src/semble/agents/claude.md
@@ -12,6 +12,13 @@ semble search "save_pretrained" ./my-project
semble search "save model to disk" ./my-project --top-k 10
```
+Use `--content docs` to search documentation and prose (markdown, rst, etc.) instead of code, or `--content all` to search everything:
+
+```bash
+semble search "deployment guide" ./my-project --content docs
+semble search "authentication" ./my-project --content all
+```
+
Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result):
```bash
@@ -25,6 +32,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac
## Workflow
1. Start with `semble search` to find relevant chunks.
-2. Inspect full files only when the returned chunk is not enough context.
-3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
-4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
+2. Use `--content docs` when looking for documentation, READMEs, or prose files.
+3. Inspect full files only when the returned chunk is not enough context.
+4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
+5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
diff --git a/src/semble/agents/copilot.md b/src/semble/agents/copilot.md
index 515d60e..82d3fd0 100644
--- a/src/semble/agents/copilot.md
+++ b/src/semble/agents/copilot.md
@@ -12,6 +12,13 @@ semble search "save_pretrained" ./my-project
semble search "save model to disk" ./my-project --top-k 10
```
+Use `--content docs` to search documentation and prose (markdown, rst, etc.) instead of code, or `--content all` to search everything:
+
+```bash
+semble search "deployment guide" ./my-project --content docs
+semble search "authentication" ./my-project --content all
+```
+
Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result):
```bash
@@ -25,6 +32,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac
## Workflow
1. Start with `semble search` to find relevant chunks.
-2. Inspect full files only when the returned chunk is not enough context.
-3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
-4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
+2. Use `--content docs` when looking for documentation, READMEs, or prose files.
+3. Inspect full files only when the returned chunk is not enough context.
+4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
+5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
diff --git a/src/semble/agents/cursor.md b/src/semble/agents/cursor.md
index 160aac4..62c9fb5 100644
--- a/src/semble/agents/cursor.md
+++ b/src/semble/agents/cursor.md
@@ -11,6 +11,13 @@ semble search "save_pretrained" ./my-project
semble search "save model to disk" ./my-project --top-k 10
```
+Use `--content docs` to search documentation and prose (markdown, rst, etc.) instead of code, or `--content all` to search everything:
+
+```bash
+semble search "deployment guide" ./my-project --content docs
+semble search "authentication" ./my-project --content all
+```
+
Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result):
```bash
@@ -24,6 +31,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac
## Workflow
1. Start with `semble search` to find relevant chunks.
-2. Inspect full files only when the returned chunk is not enough context.
-3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
-4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
+2. Use `--content docs` when looking for documentation, READMEs, or prose files.
+3. Inspect full files only when the returned chunk is not enough context.
+4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
+5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
diff --git a/src/semble/agents/gemini.md b/src/semble/agents/gemini.md
index 1ea5440..359f69d 100644
--- a/src/semble/agents/gemini.md
+++ b/src/semble/agents/gemini.md
@@ -14,6 +14,13 @@ semble search "save_pretrained" ./my-project
semble search "save model to disk" ./my-project --top-k 10
```
+Use `--content docs` to search documentation and prose (markdown, rst, etc.) instead of code, or `--content all` to search everything:
+
+```bash
+semble search "deployment guide" ./my-project --content docs
+semble search "authentication" ./my-project --content all
+```
+
Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result):
```bash
@@ -27,6 +34,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac
## Workflow
1. Start with `semble search` to find relevant chunks.
-2. Inspect full files only when the returned chunk is not enough context.
-3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
-4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
+2. Use `--content docs` when looking for documentation, READMEs, or prose files.
+3. Inspect full files only when the returned chunk is not enough context.
+4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
+5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
diff --git a/src/semble/agents/kiro.md b/src/semble/agents/kiro.md
index 5177ec5..48381d8 100644
--- a/src/semble/agents/kiro.md
+++ b/src/semble/agents/kiro.md
@@ -14,6 +14,13 @@ semble search "save_pretrained" ./my-project
semble search "save model to disk" ./my-project --top-k 10
```
+Use `--content docs` to search documentation and prose (markdown, rst, etc.) instead of code, or `--content all` to search everything:
+
+```bash
+semble search "deployment guide" ./my-project --content docs
+semble search "authentication" ./my-project --content all
+```
+
Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result):
```bash
@@ -27,6 +34,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac
## Workflow
1. Start with `semble search` to find relevant chunks.
-2. Inspect full files only when the returned chunk is not enough context.
-3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
-4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
+2. Use `--content docs` when looking for documentation, READMEs, or prose files.
+3. Inspect full files only when the returned chunk is not enough context.
+4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
+5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
diff --git a/src/semble/agents/opencode.md b/src/semble/agents/opencode.md
index 2c51867..ea9561b 100644
--- a/src/semble/agents/opencode.md
+++ b/src/semble/agents/opencode.md
@@ -15,6 +15,13 @@ semble search "save_pretrained" ./my-project
semble search "save model to disk" ./my-project --top-k 10
```
+Use `--content docs` to search documentation and prose (markdown, rst, etc.) instead of code, or `--content all` to search everything:
+
+```bash
+semble search "deployment guide" ./my-project --content docs
+semble search "authentication" ./my-project --content all
+```
+
Use `semble find-related` to discover code similar to a known location (pass `file_path` and `line` from a prior search result):
```bash
@@ -28,6 +35,7 @@ If `semble` is not on `$PATH`, use `uvx --from "semble[mcp]" semble` in its plac
## Workflow
1. Start with `semble search` to find relevant chunks.
-2. Inspect full files only when the returned chunk is not enough context.
-3. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
-4. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
+2. Use `--content docs` when looking for documentation, READMEs, or prose files.
+3. Inspect full files only when the returned chunk is not enough context.
+4. Optionally use `semble find-related` with a promising result's `file_path` and `line` to discover related implementations.
+5. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string.
diff --git a/src/semble/cli.py b/src/semble/cli.py
index d7c831f..5f3fd10 100644
--- a/src/semble/cli.py
+++ b/src/semble/cli.py
@@ -1,6 +1,7 @@
import argparse
import asyncio
import sys
+import warnings
from enum import Enum
from importlib.resources import files
from importlib.util import find_spec
@@ -10,8 +11,11 @@
from semble.index import SembleIndex
from semble.stats import format_savings_report
+from semble.types import ContentType
from semble.utils import _format_results, _is_git_url, _resolve_chunk
+_CONTENT_CHOICES = [ct.value for ct in ContentType]
+
class Agent(str, Enum):
CLAUDE = "claude"
@@ -32,6 +36,21 @@ def _agent_path(agent: Agent) -> Path:
return Path(base_dir) / "agents" / "semble-search.md"
+def _add_content_args(p: argparse.ArgumentParser) -> None:
+ """Add --content and the deprecated --include-text-files flag to an argument parser."""
+ p.add_argument(
+ "--content",
+ default=ContentType.CODE.value,
+ choices=_CONTENT_CHOICES,
+ help="Content type to index: 'code' (default), 'docs', or 'all'.",
+ )
+ p.add_argument(
+ "--include-text-files",
+ action="store_true",
+ help="Deprecated. Use --content all instead.",
+ )
+
+
def main() -> None:
"""Entry point for the semble command-line tool."""
if len(sys.argv) > 1 and sys.argv[1] in _CLI_DISPATCH_ARGS:
@@ -52,18 +71,15 @@ def _mcp_main() -> None:
help="Local directory or git URL to pre-index at startup (optional).",
)
parser.add_argument("--ref", default=None, help="Branch or tag to check out (git URLs only).")
- parser.add_argument(
- "--include-text-files",
- action="store_true",
- help="Also index non-code text files (.md, .yaml, .json, etc.).",
- )
+ _add_content_args(parser)
args = parser.parse_args()
if any(find_spec(dep) is None for dep in get_package_extras("semble", "mcp")):
print("MCP dependencies are not installed. Run: pip install 'semble[mcp]'", file=sys.stderr)
raise SystemExit(1)
from semble.mcp import serve
- asyncio.run(serve(args.path, ref=args.ref, include_text_files=args.include_text_files))
+ content = _resolve_content(args.content, args.include_text_files)
+ asyncio.run(serve(args.path, ref=args.ref, content=content))
def _run_init(*, agent: Agent = _DEFAULT_AGENT, force: bool = False) -> None:
@@ -78,6 +94,18 @@ def _run_init(*, agent: Agent = _DEFAULT_AGENT, force: bool = False) -> None:
print(f"Created {dest}")
+def _resolve_content(content_arg: str, include_text_files: bool) -> ContentType:
+ """Resolve --content and the deprecated --include-text-files into a ContentType."""
+ if include_text_files:
+ warnings.warn(
+ "--include-text-files is deprecated and will be removed in a future version. Use --content all instead.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ return ContentType.ALL
+ return ContentType(content_arg)
+
+
def _cli_main() -> None:
parser = argparse.ArgumentParser(prog="semble")
sub = parser.add_subparsers(dest="command")
@@ -86,22 +114,14 @@ def _cli_main() -> None:
search_p.add_argument("query", help="Natural language or code query.")
search_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).")
search_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).")
- search_p.add_argument(
- "--include-text-files",
- action="store_true",
- help="Also index non-code text files (.md, .yaml, .json, etc.).",
- )
+ _add_content_args(search_p)
related_p = sub.add_parser("find-related", help="Find code similar to a specific location.")
related_p.add_argument("file_path", help="File path as shown in search results.")
related_p.add_argument("line", type=int, help="Line number (1-indexed).")
related_p.add_argument("path", nargs="?", default=".", help="Local path or git URL (default: current directory).")
related_p.add_argument("-k", "--top-k", type=int, default=5, help="Number of results (default: 5).")
- related_p.add_argument(
- "--include-text-files",
- action="store_true",
- help="Also index non-code text files (.md, .yaml, .json, etc.).",
- )
+ _add_content_args(related_p)
init_p = sub.add_parser("init", help="Write a semble sub-agent file for your coding agent.")
init_p.add_argument(
@@ -126,11 +146,11 @@ def _cli_main() -> None:
print(format_savings_report(verbose=args.verbose), end="")
return
- include_text = args.include_text_files
+ content = _resolve_content(args.content, args.include_text_files)
index = (
- SembleIndex.from_git(args.path, include_text_files=include_text)
+ SembleIndex.from_git(args.path, content=content)
if _is_git_url(args.path)
- else SembleIndex.from_path(args.path, include_text_files=include_text)
+ else SembleIndex.from_path(args.path, content=content)
)
if args.command == "search":
diff --git a/src/semble/index/create.py b/src/semble/index/create.py
index 168f8ef..55ed253 100644
--- a/src/semble/index/create.py
+++ b/src/semble/index/create.py
@@ -11,7 +11,7 @@
from semble.index.files import detect_language, get_extensions
from semble.index.sparse import enrich_for_bm25
from semble.tokens import tokenize
-from semble.types import Chunk, Encoder
+from semble.types import Chunk, ContentType, Encoder
_MAX_FILE_BYTES = 1_000_000 # 1 MB max file size to read and index
@@ -20,7 +20,7 @@ def create_index_from_path(
path: Path,
model: Encoder,
extensions: Sequence[str] | None = None,
- include_text_files: bool = False,
+ content: ContentType = ContentType.CODE,
display_root: Path | None = None,
) -> tuple[bm25s.BM25, SelectableBasicBackend, list[Chunk]]:
"""Create an index from a resolved directory, optionally storing chunk paths relative to display_root.
@@ -28,14 +28,14 @@ def create_index_from_path(
:param path: Resolved absolute path to index.
:param model: The model to use for indexing.
:param extensions: File extensions to include.
- :param include_text_files: If True, also index non-code text files (.md, .yaml, .json, etc.).
+ :param content: Content type to index.
:param display_root: If set, chunk file paths are stored relative to this root.
:raises ValueError: if no items were found, no index can be created.
:return: A bm25 index, vicinity index and list of chunks
"""
chunks: list[Chunk] = []
- extensions = get_extensions(include_text_files, extensions)
- for file_path in walk_files(path, extensions):
+ resolved_extensions = get_extensions(content, extensions)
+ for file_path in walk_files(path, resolved_extensions):
language = detect_language(file_path)
with contextlib.suppress(OSError):
if file_path.stat().st_size > _MAX_FILE_BYTES:
diff --git a/src/semble/index/files.py b/src/semble/index/files.py
index e79d7c7..a9a5492 100644
--- a/src/semble/index/files.py
+++ b/src/semble/index/files.py
@@ -2,6 +2,8 @@
from collections.abc import Sequence
from pathlib import Path
+from semble.types import ContentType
+
_EXTENSION_TO_LANGUAGE = {
".4th": "forth",
".ada": "ada",
@@ -357,8 +359,29 @@
_DOC_LANGUAGES = {
"asciidoc",
- "beancount",
"bibtex",
+ "djot",
+ "doxygen",
+ "html",
+ "javadoc",
+ "jsdoc",
+ "latex",
+ "luadoc",
+ "markdown",
+ "markdown_inline",
+ "mermaid",
+ "norg",
+ "norg_meta",
+ "org",
+ "phpdoc",
+ "po",
+ "rst",
+ "rtf",
+ "vimdoc",
+}
+
+_CONFIG_LANGUAGES = {
+ "beancount",
"capnp",
"cedarschema",
"comment",
@@ -368,8 +391,6 @@
"desktop",
"devicetree",
"diff",
- "djot",
- "doxygen",
"dtd",
"editorconfig",
"ebnf",
@@ -384,33 +405,18 @@
"gpg",
"hjson",
"hocon",
- "html",
"ini",
- "javadoc",
- "jsdoc",
"json",
"json5",
"kdl",
- "latex",
"ledger",
- "luadoc",
- "markdown",
- "markdown_inline",
- "mermaid",
- "norg",
- "norg_meta",
- "org",
"pem",
"pgn",
- "phpdoc",
- "po",
"properties",
"proto",
"psv",
"requirements",
"ron",
- "rst",
- "rtf",
"smithy",
"ssh_config",
"textproto",
@@ -420,7 +426,6 @@
"tsv",
"turtle",
"typespec",
- "vimdoc",
"wit",
"xcompose",
"xml",
@@ -438,7 +443,7 @@ def _inv_mapping(mapping: dict[str, str]) -> dict[str, list[str]]:
ALL_LANGUAGES = frozenset(_EXTENSION_TO_LANGUAGE.values())
-_WITHOUT_DOC = ALL_LANGUAGES - _DOC_LANGUAGES
+_CODE_LANGUAGES = ALL_LANGUAGES - _DOC_LANGUAGES - _CONFIG_LANGUAGES
_LANGUAGE_TO_EXTENSION = _inv_mapping(_EXTENSION_TO_LANGUAGE)
@@ -447,12 +452,15 @@ def detect_language(file_name: Path) -> str | None:
return _EXTENSION_TO_LANGUAGE.get(file_name.suffix.lower())
-def get_extensions(include_text_files: bool, extensions: Sequence[str] | None) -> list[str]:
+def get_extensions(content: ContentType, extensions: Sequence[str] | None) -> list[str]:
"""Returns a list of supported file extensions."""
- if include_text_files:
+ languages: set[str] | frozenset[str]
+ if content == ContentType.ALL:
languages = ALL_LANGUAGES
+ elif content == ContentType.DOCS:
+ languages = _DOC_LANGUAGES
else:
- languages = _WITHOUT_DOC
+ languages = _CODE_LANGUAGES
all_extensions: set[str] = set()
for language in languages:
all_extensions.update(_LANGUAGE_TO_EXTENSION.get(language, set()))
diff --git a/src/semble/index/index.py b/src/semble/index/index.py
index 56d51e3..91d0771 100644
--- a/src/semble/index/index.py
+++ b/src/semble/index/index.py
@@ -3,6 +3,7 @@
import os
import subprocess
import tempfile
+import warnings
from collections import defaultdict
from collections.abc import Sequence
from pathlib import Path
@@ -15,11 +16,24 @@
from semble.index.dense import SelectableBasicBackend, load_model
from semble.search import _search_semantic, search
from semble.stats import save_search_stats
-from semble.types import CallType, Chunk, Encoder, IndexStats, SearchResult
+from semble.types import CallType, Chunk, ContentType, Encoder, IndexStats, SearchResult
_GIT_CLONE_TIMEOUT = int(os.environ.get("SEMBLE_CLONE_TIMEOUT", 60))
+def _apply_include_text_files(content: ContentType, include_text_files: bool | None) -> ContentType:
+ """Apply the deprecated include_text_files override, emitting a DeprecationWarning."""
+ if include_text_files is None:
+ return content
+ warnings.warn(
+ "include_text_files is deprecated and will be removed in a future version."
+ " Use content=ContentType.ALL instead.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ return ContentType.ALL if include_text_files else ContentType.CODE
+
+
class SembleIndex:
"""Fast local code index with hybrid search."""
@@ -30,6 +44,7 @@ def __init__(
semantic_index: SelectableBasicBackend,
chunks: list[Chunk],
root: Path | None = None,
+ content: ContentType = ContentType.CODE,
) -> None:
"""Initialize a SembleIndex. Should be created with from_path or from_git.
@@ -38,12 +53,14 @@ def __init__(
:param semantic_index: The semantic index.
:param chunks: The found chunks.
:param root: Root directory used to read file sizes for token-savings stats.
+ :param content: Content type used when indexing; search() uses it to choose the default for rerank.
"""
self.model: Encoder = model
self.chunks: list[Chunk] = chunks
self._bm25_index: BM25 = bm25_index
self._semantic_index: SelectableBasicBackend = semantic_index
self._root: Path | None = root
+ self._content: ContentType = content
self._file_sizes: dict[str, int] = self._compute_file_sizes(root) if root else {}
self._file_mapping, self._language_mapping = self._populate_mapping()
@@ -91,18 +108,22 @@ def from_path(
path: str | Path,
model: Encoder | None = None,
extensions: Sequence[str] | None = None,
- include_text_files: bool = False,
+ content: ContentType = ContentType.CODE,
+ include_text_files: bool | None = None,
) -> SembleIndex:
"""Create and index a SembleIndex from a directory.
:param path: Root directory to index.
:param model: Embedding model to use. Defaults to potion-code-16M.
:param extensions: File extensions to include. Defaults to a standard set of code extensions.
- :param include_text_files: If True, also index non-code text files (.md, .yaml, .json, etc.).
- :return: An indexed SembleIndex. Chunk file paths are relative to ``path``.
+ :param content: Content type to index: ContentType.CODE (default), ContentType.DOCS,
+ or ContentType.ALL.
+ :param include_text_files: Deprecated; if not None it overrides content (True -> ALL, False -> CODE).
+ :return: An indexed SembleIndex. Chunk file paths are relative to path.
:raises FileNotFoundError: If `path` does not exist.
:raises NotADirectoryError: If `path` exists but is not a directory.
"""
+ normalized = _apply_include_text_files(content, include_text_files)
model = model or load_model()
path = Path(path)
if not path.exists():
@@ -114,11 +135,11 @@ def from_path(
path,
model=model,
extensions=extensions,
- include_text_files=include_text_files,
+ content=normalized,
display_root=path,
)
- return SembleIndex(model, bm25, vicinity, chunks, root=path)
+ return SembleIndex(model, bm25, vicinity, chunks, root=path, content=normalized)
@classmethod
def from_git(
@@ -127,23 +148,27 @@ def from_git(
ref: str | None = None,
model: Encoder | None = None,
extensions: Sequence[str] | None = None,
- include_text_files: bool = False,
+ content: ContentType = ContentType.CODE,
+ include_text_files: bool | None = None,
) -> SembleIndex:
"""Clone a git repository and index it.
The repository is cloned into a temporary directory that is removed once
indexing finishes. Chunk content is preserved in-memory, but
- ``chunk.file_path`` will not point to a readable file after this call
+ chunk.file_path will not point to a readable file after this call
returns — it is a repo-relative label, not a filesystem path.
:param url: URL of the git repository to clone (any git provider).
:param ref: Branch or tag to check out. Defaults to the remote HEAD.
:param model: Embedding model to use. Defaults to potion-code-16M.
:param extensions: File extensions to include. Defaults to a standard set of code extensions.
- :param include_text_files: If True, also index non-code text files (.md, .yaml, .json, etc.).
- :return: An indexed SembleIndex. Chunk file paths are repo-relative (e.g. ``src/foo.py``).
+ :param content: Content type to index: ContentType.CODE (default), ContentType.DOCS,
+ or ContentType.ALL.
+ :param include_text_files: Deprecated; if not None it overrides content (True -> ALL, False -> CODE).
+ :return: An indexed SembleIndex. Chunk file paths are repo-relative (e.g. src/foo.py).
:raises RuntimeError: If git is not on PATH, the clone fails, or times out.
"""
+ normalized = _apply_include_text_files(content, include_text_files)
with tempfile.TemporaryDirectory() as tmp_dir:
# `--` prevents `url` from being interpreted as a git option (e.g. `--upload-pack=...`).
cmd = ["git", "clone", "--depth", "1", *(["--branch", ref] if ref else []), "--", url, tmp_dir]
@@ -163,11 +188,11 @@ def from_git(
resolved_path,
model=model,
extensions=extensions,
- include_text_files=include_text_files,
+ content=normalized,
display_root=resolved_path,
)
- return SembleIndex(model, bm25, vicinity, chunks, root=resolved_path)
+ return SembleIndex(model, bm25, vicinity, chunks, root=resolved_path, content=normalized)
def find_related(self, source: Chunk | SearchResult, *, top_k: int = 5) -> list[SearchResult]:
"""Return chunks semantically similar to the given chunk or search result.
@@ -202,38 +227,38 @@ def search(
alpha: float | None = None,
filter_languages: list[str] | None = None,
filter_paths: list[str] | None = None,
- rerank: bool = True,
+ rerank: bool | None = None,
) -> list[SearchResult]:
"""Search the index and return the top-k most relevant chunks.
:param query: Natural-language or keyword query string.
:param top_k: Maximum number of results to return.
:param alpha: Blend weight for hybrid score combination; 1.0 = full semantic
- weight, 0.0 = full BM25 weight. File-path penalties and diversity reranking
- are applied regardless. ``None`` auto-detects from query type.
+ weight, 0.0 = full BM25 weight. None auto-detects from query type.
:param filter_languages: Optional list of language codes; if set, only chunks in
these languages are returned.
:param filter_paths: Optional list of repo-relative file paths; if set, only
chunks from these files are returned.
- :param rerank: Whether to rerank the top-k results using custom reranking logic.
- :return: Ranked list of :class:`SearchResult` objects, best match first.
+ :param rerank: Apply code-tuned reranking (file boost, identifier boost, path penalties).
+ Defaults to True unless the index was built with ContentType.DOCS.
+ :return: Ranked list of SearchResult objects, best match first.
"""
- bm25_index, semantic_index = self._bm25_index, self._semantic_index
if not self.chunks or not query.strip():
return []
- selector = self._get_selector_vector(filter_languages, filter_paths)
+ resolved_rerank = (self._content != ContentType.DOCS) if rerank is None else rerank
+ selector = self._get_selector_vector(filter_languages, filter_paths)
results = search(
query,
self.model,
- semantic_index,
- bm25_index,
+ self._semantic_index,
+ self._bm25_index,
self.chunks,
top_k,
alpha=alpha,
selector=selector,
- rerank=rerank,
+ rerank=resolved_rerank,
)
save_search_stats(results, CallType.SEARCH, self._file_sizes)
return results
diff --git a/src/semble/mcp.py b/src/semble/mcp.py
index 5993c6c..b19b762 100644
--- a/src/semble/mcp.py
+++ b/src/semble/mcp.py
@@ -12,7 +12,7 @@
from semble.index import SembleIndex
from semble.index.dense import load_model
-from semble.types import Encoder
+from semble.types import ContentType, Encoder
from semble.utils import _format_results, _is_git_url, _resolve_chunk
logger = logging.getLogger(__name__)
@@ -112,9 +112,13 @@ async def find_related(
return server
-async def serve(path: str | None = None, ref: str | None = None, include_text_files: bool = False) -> None:
+async def serve(
+ path: str | None = None,
+ ref: str | None = None,
+ content: ContentType = ContentType.CODE,
+) -> None:
"""Start an MCP stdio server, optionally pre-indexing a default source."""
- cache = _IndexCache(include_text_files=include_text_files)
+ cache = _IndexCache(content=content)
async def _load_and_prewarm() -> None:
"""Pre-load the model and optionally pre-index the default source in parallel with starting the server."""
@@ -146,14 +150,14 @@ async def _load_and_prewarm() -> None:
class _IndexCache:
"""Cache of indexed repos and local paths for the lifetime of the MCP server process."""
- def __init__(self, model: Encoder | None = None, include_text_files: bool = False) -> None:
+ def __init__(self, model: Encoder | None = None, content: ContentType = ContentType.CODE) -> None:
"""Initialise an empty cache."""
self._model: Encoder | None = model
self._model_error: BaseException | None = None
self._model_ready = asyncio.Event()
if model is not None:
self._model_ready.set()
- self._include_text_files = include_text_files
+ self._content = content
self._tasks: OrderedDict[str, asyncio.Task[SembleIndex]] = OrderedDict() # ordered for LRU eviction
self._watcher_task: asyncio.Task[None] | None = None
@@ -206,7 +210,7 @@ async def get(self, source: str, ref: str | None = None) -> SembleIndex:
source,
ref=ref,
model=model,
- include_text_files=self._include_text_files,
+ content=self._content,
)
)
else:
@@ -215,7 +219,7 @@ async def get(self, source: str, ref: str | None = None) -> SembleIndex:
SembleIndex.from_path,
cache_key,
model=model,
- include_text_files=self._include_text_files,
+ content=self._content,
)
)
self._tasks.move_to_end(cache_key)
diff --git a/src/semble/search.py b/src/semble/search.py
index d0a9d25..5b88229 100644
--- a/src/semble/search.py
+++ b/src/semble/search.py
@@ -86,7 +86,7 @@ def search(
:param top_k: Number of results to return.
:param alpha: Weight for semantic score (1-alpha goes to BM25). None = auto-detect based on query type.
:param selector: Optional array of chunk indices to filter results by.
- :param rerank: Whether to perform reranking. This should be done, and is mainly here for benchmarking.
+ :param rerank: Whether to perform code-tuned reranking. On by default for code search, off for docs search.
:return: List of search results sorted by combined score descending.
"""
alpha_weight = resolve_alpha(query, alpha)
diff --git a/src/semble/types.py b/src/semble/types.py
index d01c774..1da4c7f 100644
--- a/src/semble/types.py
+++ b/src/semble/types.py
@@ -16,6 +16,14 @@ class CallType(str, Enum):
FIND_RELATED = "find_related"
+class ContentType(str, Enum):
+ """Content type for indexing and search pipeline selection."""
+
+ CODE = "code"
+ DOCS = "docs"
+ ALL = "all"
+
+
class Encoder(Protocol):
"""Protocol for embedding models."""
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 28d19cd..c9caa11 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -195,6 +195,26 @@ def test_mcp_main_exits_with_message_when_extras_missing(
assert "pip install 'semble[mcp]'" in capsys.readouterr().err
+def test_include_text_files_cli_deprecated(
+ monkeypatch: pytest.MonkeyPatch,
+ capsys: pytest.CaptureFixture[str],
+) -> None:
+ """--include-text-files on the CLI emits a DeprecationWarning."""
+ import warnings
+
+ chunk = make_chunk("def foo(): pass", "src/foo.py")
+ fake_index = MagicMock()
+ fake_index.search.return_value = [SearchResult(chunk=chunk, score=0.9)]
+ monkeypatch.setattr(sys, "argv", ["semble", "search", "query", "/some/path", "--include-text-files"])
+ with patch("semble.cli.SembleIndex.from_path", return_value=fake_index):
+ with warnings.catch_warnings(record=True) as caught:
+ warnings.simplefilter("always")
+ _cli_main()
+ assert any(
+ "include-text-files" in str(w.message).lower() for w in caught if issubclass(w.category, DeprecationWarning)
+ )
+
+
def test_agent_file_tools_are_bash_only() -> None:
"""The agent file must list only Bash and Read — no MCP tools that require schema loading."""
frontmatter = files("semble").joinpath("agents/claude.md").read_text(encoding="utf-8").split("---")[1]
diff --git a/tests/test_files.py b/tests/test_files.py
index 71ede4c..bcb747f 100644
--- a/tests/test_files.py
+++ b/tests/test_files.py
@@ -1,6 +1,9 @@
from pathlib import Path
-from semble.index.files import _DOC_LANGUAGES, _EXTENSION_TO_LANGUAGE, detect_language, get_extensions
+import pytest
+
+from semble.index.files import _CODE_LANGUAGES, _CONFIG_LANGUAGES, _DOC_LANGUAGES, detect_language, get_extensions
+from semble.types import ContentType
def test_detect_language() -> None:
@@ -10,32 +13,36 @@ def test_detect_language() -> None:
assert detect_language(Path("c.txt")) is None
-def test_get_extensions() -> None:
- """Test the get_extensions function."""
- all_extensions = get_extensions(True, None)
- without_doc_extensions = get_extensions(False, None)
+def test_language_sets_are_consistent() -> None:
+ """Code, doc, and config language sets are mutually disjoint."""
+ assert _CODE_LANGUAGES.isdisjoint(_DOC_LANGUAGES)
+ assert _CODE_LANGUAGES.isdisjoint(_CONFIG_LANGUAGES)
+ assert _DOC_LANGUAGES.isdisjoint(_CONFIG_LANGUAGES)
- doc_extensions = set(all_extensions) - set(without_doc_extensions)
- for extension in doc_extensions:
- assert _EXTENSION_TO_LANGUAGE[extension] in _DOC_LANGUAGES
- for extension in without_doc_extensions:
- assert _EXTENSION_TO_LANGUAGE[extension] not in _DOC_LANGUAGES
+@pytest.mark.parametrize(
+ ("content", "includes", "excludes"),
+ [
+ (ContentType.CODE, [".py"], [".md"]),
+ (ContentType.DOCS, [".md"], [".py"]),
+ (ContentType.ALL, [".py", ".md"], []),
+ ],
+)
+def test_get_extensions(content: ContentType, includes: list[str], excludes: list[str]) -> None:
+ """get_extensions returns the right extensions for each content type."""
+ exts = set(get_extensions(content, None))
+ for ext in includes:
+ assert ext in exts
+ for ext in excludes:
+ assert ext not in exts
def test_get_extensions_additional() -> None:
- """Test the get_extensions function."""
- all_extensions = get_extensions(True, None)
- all_extensions_extra = get_extensions(True, [".kjs"])
-
- assert set(all_extensions_extra) == set(all_extensions) | {".kjs"}
-
- all_extensions = get_extensions(False, None)
- all_extensions_extra = get_extensions(False, [".kjs"])
-
- assert set(all_extensions_extra) == set(all_extensions) | {".kjs"}
-
- all_extensions = get_extensions(False, None)
- all_extensions_extra = get_extensions(False, [".py"])
-
- assert set(all_extensions_extra) == set(all_extensions)
+ """Extra extensions are added to the result; already-known ones leave the set unchanged."""
+ base = get_extensions(ContentType.ALL, None)
+ with_extra = get_extensions(ContentType.ALL, [".kjs"])
+ assert set(with_extra) == set(base) | {".kjs"}
+
+ base_code = get_extensions(ContentType.CODE, None)
+ with_existing = get_extensions(ContentType.CODE, [".py"])
+ assert set(with_existing) == set(base_code)
diff --git a/tests/test_index.py b/tests/test_index.py
index 3f90fcb..5abf90e 100644
--- a/tests/test_index.py
+++ b/tests/test_index.py
@@ -1,12 +1,12 @@
from pathlib import Path
from typing import Any
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch
import pytest
from semble import SembleIndex
from semble.index.create import _MAX_FILE_BYTES, create_index_from_path
-from semble.types import Encoder
+from semble.types import ContentType, Encoder
from tests.conftest import make_chunk
@@ -17,18 +17,40 @@ def indexed_index(mock_model: Any, tmp_project: Path) -> SembleIndex:
@pytest.mark.parametrize(
- ("include_text_files", "md_in_results"),
- [(False, False), (True, True)],
+ ("content", "md_in_results"),
+ [
+ (ContentType.CODE, False),
+ (ContentType.DOCS, True),
+ (ContentType.ALL, True),
+ ],
)
def test_index_markdown_inclusion(
- mock_model: Encoder, tmp_project: Path, include_text_files: bool, md_in_results: bool
+ mock_model: Encoder, tmp_project: Path, content: ContentType, md_in_results: bool
) -> None:
- """Markdown files are excluded by default and included when include_text_files=True."""
- _, _, chunks = create_index_from_path(tmp_project, mock_model, include_text_files=include_text_files)
+ """Markdown files are excluded for code and included for docs/all."""
+ _, _, chunks = create_index_from_path(tmp_project, mock_model, content=content)
has_md = ".md" in {Path(c.file_path).suffix for c in chunks}
assert has_md is md_in_results
+@pytest.mark.parametrize("include_text_files", [True, False])
+def test_include_text_files_deprecated(mock_model: Encoder, tmp_project: Path, include_text_files: bool) -> None:
+ """Passing include_text_files to from_path emits a DeprecationWarning."""
+ with pytest.warns(DeprecationWarning, match="include_text_files is deprecated"):
+ SembleIndex.from_path(tmp_project, model=mock_model, include_text_files=include_text_files)
+
+
+def test_from_git_include_text_files_deprecated(mock_model: Encoder, tmp_project: Path) -> None:
+ """from_git emits a DeprecationWarning when include_text_files is passed."""
+ fake_result = MagicMock()
+ fake_result.returncode = 0
+ with patch("subprocess.run", return_value=fake_result):
+ with patch("semble.index.index.create_index_from_path") as mock_create:
+ mock_create.return_value = (MagicMock(), MagicMock(), [make_chunk("x = 1", "f.py")])
+ with pytest.warns(DeprecationWarning, match="include_text_files is deprecated"):
+ SembleIndex.from_git("https://example.com/repo", model=mock_model, include_text_files=True)
+
+
def test_index_empty_returns_zero_chunks(mock_model: Encoder, tmp_path: Path) -> None:
"""Indexing an empty directory yields zero files and chunks."""
with pytest.raises(ValueError):
@@ -85,6 +107,27 @@ def test_search_without_reranking(indexed_index: SembleIndex) -> None:
mock.assert_called()
+@pytest.mark.parametrize(
+ ("content", "expect_rerank"),
+ [
+ (ContentType.CODE, True),
+ (ContentType.ALL, True),
+ (ContentType.DOCS, False),
+ ],
+)
+def test_search_rerank_default_by_content_type(
+ mock_model: Encoder, tmp_project: Path, content: ContentType, expect_rerank: bool
+) -> None:
+ """Reranking is on by default for code/all content, off for docs-only."""
+ index = SembleIndex.from_path(tmp_project, model=mock_model, content=content)
+ with patch("semble.search.rerank_topk") as mock:
+ index.search("function", top_k=3)
+ if expect_rerank:
+ mock.assert_called()
+ else:
+ mock.assert_not_called()
+
+
@pytest.mark.parametrize("query", ["", " ", "\n\n"])
def test_search_empty_query_returns_empty(indexed_index: SembleIndex, query: str) -> None:
"""Empty / whitespace-only queries return [] across all modes."""