From bdbbf07714babb4eb1c4bf94a8fd93f4572fa4f0 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Mon, 2 Mar 2026 12:22:17 +0100 Subject: [PATCH 1/2] docs: Add coding agent docs, `SKILL.md` --- docs/conf.py | 3 +- docs/guides/coding-agents/SKILL.md | 118 +++++++++++++++++++++ docs/guides/coding-agents/coding-agents.md | 65 ++++++++++++ 3 files changed, 184 insertions(+), 2 deletions(-) create mode 100644 docs/guides/coding-agents/SKILL.md create mode 100644 docs/guides/coding-agents/coding-agents.md diff --git a/docs/conf.py b/docs/conf.py index 73dc6117..462171aa 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -22,7 +22,6 @@ _mod = importlib.import_module("dataframely") - project = "dataframely" copyright = f"{datetime.date.today().year}, QuantCo, Inc" author = "QuantCo, Inc." @@ -71,7 +70,7 @@ maximum_signature_line_length = 88 # source files -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "**/SKILL.md"] source_suffix = { ".rst": "restructuredtext", ".txt": "markdown", diff --git a/docs/guides/coding-agents/SKILL.md b/docs/guides/coding-agents/SKILL.md new file mode 100644 index 00000000..99df3758 --- /dev/null +++ b/docs/guides/coding-agents/SKILL.md @@ -0,0 +1,118 @@ +--- +name: dataframely +description: A declarative, Polars-native data frame validation library. Use when implementing data processing logic in polars. +license: BSD-3-Clause +--- + +# Dataframely skill + +`dataframely` provides `dy.Schema` and `dy.Collection` to document and enforce the structure of single or multiple related data frames. + +## `dy.Schema` example + +A `dy.Schema` describes the structure of a single dataframe. 
+ +```python +class HouseSchema(dy.Schema): + """A schema for a dataframe describing houses.""" + + street = dy.String(primary_key=True) + number = dy.UInt16(primary_key=True) + # Number of rooms + rooms = dy.UInt8() + # Area in square meters + area = dy.UInt16() +``` + +## `dy.Collection` example + +A `dy.Collection` describes a set of related dataframes, each described by a `dy.Schema`. Dataframes in a collection should share at least a subset of their primary key. + +```python +class MyStreetSchema(dy.Schema): + """A schema for a dataframe describing streets.""" + + # Shared primary key component with HouseSchema + street = dy.String(primary_key=True) + city = dy.String() + + +class MyCollection(dy.Collection): + """A collection of related dataframes.""" + + houses: dy.LazyFrame[HouseSchema] + streets: dy.LazyFrame[MyStreetSchema] +``` + +# Usage conventions + +## Use clear interfaces + +Structure data processing code with clear interfaces documented using `dataframely` type hints: + +```python +def preprocess(raw: dy.LazyFrame[MyRawSchema]) -> dy.DataFrame[MyPreprocessedSchema]: + # Internal dataframes do not require schemas + df: pl.LazyFrame = ... + return MyPreprocessedSchema.validate(df, cast=True) +``` + +Use schemas for all input, output, and intermediate dataframes. Schemas may be omitted for short-lived temporary dataframes and private helper functions (prefixed with `_`). + +## `filter` vs `validate` + +Both `.validate` and `.filter` enforce the schema at runtime. Pass `cast=True` for safe type-casting. + +- **`Schema.validate`** — raises on failure. Use when failures are unexpected (e.g. transforming already-validated data). +- **`Schema.filter`** — returns valid rows plus a `FailureInfo` describing filtered-out rows. Use when failures are possible and should be handled gracefully (e.g. logging and skipping invalid rows). + +## Testing + +Every data transformation must have unit tests. Test each branch of the transformation logic. 
Do not test properties already guaranteed by the schema. + +### Test structure + +1. Create synthetic input data +2. Define the expected output +3. Execute the transformation +4. Compare using `assert_frame_equal` from `polars.testing` (or `diffly.testing` if installed) + +```python +from polars.testing import assert_frame_equal + + +def test_grouped_sum(): + df = pl.DataFrame({ + "col1": [1, 2, 3], + "col2": ["a", "a", "b"], + }).pipe(MyInputSchema.validate, cast=True) + + expected = pl.DataFrame({ + "col1": ["a", "b"], + "col2": [3, 3], + }) + + result = my_code(df) + + assert_frame_equal(expected, result) +``` + +### Generating synthetic input data + +For complex schemas where only some columns are relevant to the test, use `dataframely`'s synthetic data generation: + +```python +# Random data meeting all schema constraints +random_data = MyInputSchema.sample(num_rows=100) +``` + +Use fully random data for property tests where exact contents don't matter. Use overrides to pin specific columns while randomly sampling the rest: + +```python +random_data_with_overrides = HouseSchema.sample( + num_rows=5, + overrides={ + "street": ["Main St.", "Main St.", "Main St.", "Second St.", "Second St."], + } +) +``` diff --git a/docs/guides/coding-agents/coding-agents.md b/docs/guides/coding-agents/coding-agents.md new file mode 100644 index 00000000..0358a218 --- /dev/null +++ b/docs/guides/coding-agents/coding-agents.md @@ -0,0 +1,65 @@ +# Using `dataframely` with coding agents + +Coding agents are particularly powerful when two criteria are met: + +1. The agent can know all required information and does not need to guess. +2. The results of the agent's work can be easily verified. + +`dataframely` helps you fulfill these criteria. 
+ +To help your coding agent write good `dataframely` code, we provide a +`dataframely` [skill](https://raw.githubusercontent.com/Quantco/dataframely/refs/heads/main/docs/guides/coding-agents/SKILL.md) +following the +[`agentskills.io` spec](https://agentskills.io/specification). You can install +it by placing it where your agent can find it. For example, if you are using `claude`: + +```bash +mkdir -p .claude/skills/dataframely/ +curl -o .claude/skills/dataframely/SKILL.md https://raw.githubusercontent.com/Quantco/dataframely/refs/heads/main/docs/guides/coding-agents/SKILL.md +``` + +Refer to the documentation of your coding agent for instructions on how to add custom skills. + +## Tell the agent about your data with `dataframely` schemas + +`dataframely` schemas provide a clear format for documenting dataframe structure and contents, which helps coding +agents understand your code base. We recommend structuring your data processing code using clear interfaces that are +documented using +`dataframely` type hints. This streamlines your coding agent's ability to find the right schema at the right time. + +For example: + +```python +def preprocess(raw: dy.LazyFrame[MyRawSchema]) -> dy.DataFrame[MyPreprocessedSchema]: + ... +``` + +gives a coding agent much more information than the schema-less alternative: + +```python +def preprocess(raw: pl.LazyFrame) -> pl.DataFrame: + ... +``` + +This convention also makes your code more readable and maintainable for human developers. + +If there is additional domain information that is not natively expressed through the structure of the schema, +we recommend documenting this as comments on the definition of the schema columns. One common example would be the +semantic meanings of enum values referring to conventions in the data: + +```python +class HospitalStaySchema(dy.Schema): + # Reason for admission to the hospital + # N = Emergency + # V = Transfer from another hospital + # ... 
+ admission_reason = dy.Enum(["N", "V", ...]) +``` + +## Verifying results + +`dataframely` supports you and your coding agent in writing unit tests for individual pieces of logic. One significant +bottleneck is the generation of appropriate test data. Check +out [our documentation on synthetic data generation](./features/data-generation.md) to see how `dataframely` can help +you generate realistic test data that meets the constraints of your schema. We recommend requiring your coding agent to +write tests using this functionality to verify its work. From 228019ac13c8659f55e9d8dc99dd052b041d024f Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Mon, 2 Mar 2026 12:44:21 +0100 Subject: [PATCH 2/2] add llms.txt --- docs/conf.py | 1 + .../{coding-agents => }/coding-agents.md | 0 docs/guides/index.md | 1 + pixi.lock | 41 +++++++++++++++++++ pixi.toml | 2 + 5 files changed, 45 insertions(+) rename docs/guides/{coding-agents => }/coding-agents.md (100%) diff --git a/docs/conf.py b/docs/conf.py index 462171aa..596bd891 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -41,6 +41,7 @@ "sphinx_copybutton", "sphinx_design", "sphinx_toolbox.more_autodoc.overloads", + "sphinx_llms_txt", ] ## sphinx diff --git a/docs/guides/coding-agents/coding-agents.md b/docs/guides/coding-agents.md similarity index 100% rename from docs/guides/coding-agents/coding-agents.md rename to docs/guides/coding-agents.md diff --git a/docs/guides/index.md b/docs/guides/index.md index d0e20ebc..538b63ed 100644 --- a/docs/guides/index.md +++ b/docs/guides/index.md @@ -7,6 +7,7 @@ quickstart examples/index features/index +coding-agents development migration/index faq diff --git a/pixi.lock b/pixi.lock index 06db5af4..7f29ace5 100644 --- a/pixi.lock +++ b/pixi.lock @@ -3,6 +3,8 @@ environments: build: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: 
https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -448,6 +450,8 @@ environments: default: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -1992,6 +1996,8 @@ environments: default-polars-minimal: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -3536,6 +3542,8 @@ environments: docs: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -3673,6 +3681,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-jinja2-compat-0.4.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-tabs-3.4.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-toolbox-4.1.2-pyhd8ed1ab_0.conda @@ -3835,6 +3844,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-jinja2-compat-0.4.1-pyhd8ed1ab_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-tabs-3.4.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-toolbox-4.1.2-pyhd8ed1ab_0.conda @@ -3991,6 +4001,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-jinja2-compat-0.4.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-tabs-3.4.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-toolbox-4.1.2-pyhd8ed1ab_0.conda @@ -4148,6 +4159,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-jinja2-compat-0.4.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-tabs-3.4.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-toolbox-4.1.2-pyhd8ed1ab_0.conda @@ -4301,6 +4313,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda - conda: 
https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-jinja2-compat-0.4.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-tabs-3.4.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-toolbox-4.1.2-pyhd8ed1ab_0.conda @@ -4337,6 +4350,8 @@ environments: lint: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -4663,6 +4678,8 @@ environments: nightly: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -5669,6 +5686,8 @@ environments: polars-minimal: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -6349,6 +6368,8 @@ environments: py310: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -7044,6 +7065,8 @@ environments: py311: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -7720,6 +7743,8 @@ environments: py312: 
channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -8396,6 +8421,8 @@ environments: py313: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -9066,6 +9093,8 @@ environments: py314: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -9746,6 +9775,8 @@ environments: py314-optionals: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -29295,6 +29326,16 @@ packages: license_family: MIT size: 12320 timestamp: 1754550385132 +- conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda + sha256: d57d93accf0fd40769eff17b84b30b5980b877240a393e3e83495f33eb282784 + md5: 6b170f1a7d5c1729073c354b2d0ac32d + depends: + - python >=3.10 + - sphinx + license: MIT + license_family: MIT + size: 25685 + timestamp: 1765935234507 - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda sha256: 3d2e0d961b38f66ea3e7decd04917bf69104b6683dae778e4d3ef5291c04b861 md5: bfc047865de18ef2657bd8a95d7b8b49 diff --git a/pixi.toml b/pixi.toml index 855b3bc3..9a164c12 100644 --- a/pixi.toml +++ b/pixi.toml @@ -36,6 +36,8 @@ sphinx = ">=8.2" sphinx-copybutton = "*" sphinx-design = "*" sphinx-toolbox = "*" +sphinx-llms-txt = "*" + [feature.docs.tasks] docs = { cmd = "rm -rf _build && find . 
-name _gen -type d -exec rm -rf \"{}\" + && sphinx-build -M html . _build --fail-on-warning", cwd = "docs", depends-on = "postinstall" } readthedocs = { cmd = "rm -rf $READTHEDOCS_OUTPUT/html && cp -r docs/_build/html $READTHEDOCS_OUTPUT/html", depends-on = "docs" }