diff --git a/.gitignore b/.gitignore index 10ac8e45..654b686d 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,10 @@ ENV/ env.bak/ venv.bak/ +# Benchmarks (new pytest-benchmark suite) +.benchmarks/ + +# Benchmarks (old Snakemake suite in benchmark/) benchmark/*.pdf benchmark/benchmarks benchmark/.snakemake diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 00000000..22ac73ce --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,94 @@ +# Internal Performance Benchmarks + +Measures linopy's own performance (build time, LP write speed, memory usage) across problem sizes using [pytest-benchmark](https://pytest-benchmark.readthedocs.io/) and [pytest-memray](https://pytest-memray.readthedocs.io/). Use these to check whether a code change introduces a regression or improvement. + +> **Note:** The `benchmark/` directory (singular) contains *external* benchmarks comparing linopy against other modeling frameworks. This directory (`benchmarks/`) is for *internal* performance tracking only. + +## Setup + +```bash +pip install -e ".[benchmarks]" +``` + +## Running benchmarks + +```bash +# Quick smoke test (small sizes only) +pytest benchmarks/ --quick + +# Full timing benchmarks +pytest benchmarks/test_build.py benchmarks/test_lp_write.py benchmarks/test_matrices.py + +# Run a specific model +pytest benchmarks/test_build.py -k basic +``` + +## Comparing timing between branches + +```bash +# Save baseline results on master +git checkout master +pytest benchmarks/test_build.py --benchmark-save=master + +# Switch to feature branch and compare +git checkout my-feature +pytest benchmarks/test_build.py --benchmark-save=my-feature --benchmark-compare=0001_master + +# Compare saved results without re-running +pytest-benchmark compare 0001_master 0002_my-feature --columns=median,iqr +``` + +Results are stored in `.benchmarks/` (gitignored). 
+ +## Memory benchmarks + +`memory.py` runs each test in a separate process with pytest-memray to get accurate per-test peak memory (including C/numpy allocations). Results are saved as JSON and can be compared across branches. + +By default, only the build phase (`test_build.py`) is measured. Unlike timing benchmarks where `benchmark()` isolates the measured function, memray tracks all allocations within a test — including model construction in setup. This means LP write and matrix tests would report build + phase memory combined, making the phase-specific contribution impossible to isolate. Since model construction dominates memory usage, measuring build alone gives the most actionable numbers. + +```bash +# Save baseline on master +git checkout master +python benchmarks/memory.py save master + +# Save feature branch +git checkout my-feature +python benchmarks/memory.py save my-feature + +# Compare +python benchmarks/memory.py compare master my-feature + +# Quick mode (smaller sizes, faster) +python benchmarks/memory.py save master --quick + +# Measure a specific phase (includes build overhead) +python benchmarks/memory.py save master --test-path benchmarks/test_lp_write.py +``` + +Results are stored in `.benchmarks/memory/` (gitignored). Requires Linux or macOS (memray is not available on Windows). + +> **Note:** Small tests (~5 MiB) are near the import-overhead floor and may show noise of ~1 MiB between runs. Focus on larger tests for meaningful memory comparisons. Do not combine `--memray` with timing benchmarks — memray adds ~2x overhead that invalidates timing results. 
+ +## Models + +| Model | Description | Sizes | +|-------|-------------|-------| +| `basic` | Dense N*N model, 2*N^2 vars/cons | 10 — 1600 | +| `knapsack` | N binary variables, 1 constraint | 100 — 1M | +| `expression_arithmetic` | Broadcasting, scaling, summation across dims | 10 — 1000 | +| `sparse_network` | Ring network with mismatched bus/line coords | 10 — 1000 | +| `pypsa_scigrid` | Real power system (requires `pypsa`) | 10 — 200 snapshots | + +## Phases + +| Phase | File | What it measures | +|-------|------|------------------| +| Build | `test_build.py` | Model construction (add_variables, add_constraints, add_objective) | +| LP write | `test_lp_write.py` | Writing the model to an LP file | +| Matrices | `test_matrices.py` | Generating sparse matrices (A, b, c, bounds) from the model | + +## Adding a new model + +1. Create `benchmarks/models/my_model.py` with a `build_my_model(n)` function and a `SIZES` list +2. Add parametrized tests in the relevant `test_*.py` files +3. Add a quick threshold in `conftest.py` diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py new file mode 100644 index 00000000..6bf202cc --- /dev/null +++ b/benchmarks/__init__.py @@ -0,0 +1 @@ +"""Linopy benchmark suite — run with ``pytest benchmarks/`` (use ``--quick`` for smaller sizes).""" diff --git a/benchmarks/conftest.py b/benchmarks/conftest.py new file mode 100644 index 00000000..6f9a9467 --- /dev/null +++ b/benchmarks/conftest.py @@ -0,0 +1,30 @@ +"""Benchmark configuration and shared fixtures.""" + +from __future__ import annotations + +import pytest + +QUICK_THRESHOLD = { + "basic": 100, + "knapsack": 10_000, + "pypsa_scigrid": 50, + "expression_arithmetic": 100, + "sparse_network": 100, +} + + +def pytest_addoption(parser): + parser.addoption( + "--quick", + action="store_true", + default=False, + help="Use smaller problem sizes for quick benchmarking", + ) + + +def skip_if_quick(request, model: str, size: int): + """Skip large sizes when --quick is passed.""" + if 
request.config.getoption("--quick"): + threshold = QUICK_THRESHOLD.get(model, float("inf")) + if size > threshold: + pytest.skip(f"--quick: skipping {model} size {size}") diff --git a/benchmarks/memory.py b/benchmarks/memory.py new file mode 100644 index 00000000..20af4b8a --- /dev/null +++ b/benchmarks/memory.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python +""" +Measure and compare peak memory using pytest-memray. + +Usage: + # Save a baseline (on master) + python benchmarks/memory.py save master + + # Save current branch + python benchmarks/memory.py save my-feature + + # Compare two saved runs + python benchmarks/memory.py compare master my-feature + + # Quick mode (smaller sizes) + python benchmarks/memory.py save master --quick + +Results are stored in .benchmarks/memory/. +""" + +from __future__ import annotations + +import argparse +import json +import platform +import re +import subprocess +import sys +from pathlib import Path + +if platform.system() == "Windows": + raise RuntimeError( + "memory.py requires pytest-memray which is not available on Windows. " + "Run memory benchmarks on Linux or macOS." + ) + +RESULTS_DIR = Path(".benchmarks/memory") +MEMORY_RE = re.compile( + r"Allocation results for (.+?) at the high watermark\s+" + r"📦 Total memory allocated: ([\d.]+)(MiB|KiB|GiB|B)", +) +# Only the build phase is measured by default. Unlike timing benchmarks (where +# pytest-benchmark isolates the measured function), memray tracks all allocations +# within a test — including model construction in setup. This means LP write and +# matrix tests would report build + phase memory combined, making the phase-specific +# contribution hard to isolate. Since model construction dominates memory usage, +# measuring build alone gives the most accurate and actionable numbers. 
+DEFAULT_TEST_PATHS = [ + "benchmarks/test_build.py", +] + + +def _to_mib(value: float, unit: str) -> float: + factors = {"B": 1 / 1048576, "KiB": 1 / 1024, "MiB": 1, "GiB": 1024} + return value * factors[unit] + + +def _collect_test_ids(test_paths: list[str], quick: bool) -> list[str]: + """Collect test IDs without running them.""" + cmd = [ + sys.executable, + "-m", + "pytest", + *test_paths, + "--collect-only", + "-q", + ] + if quick: + cmd.append("--quick") + result = subprocess.run(cmd, capture_output=True, text=True) + return [ + line.strip() + for line in result.stdout.splitlines() + if "::" in line and not line.startswith(("=", "-", " ")) + ] + + +def save(label: str, quick: bool = False, test_paths: list[str] | None = None) -> Path: + """Run each benchmark in a separate process for accurate memory measurement.""" + if test_paths is None: + test_paths = DEFAULT_TEST_PATHS + test_ids = _collect_test_ids(test_paths, quick) + if not test_ids: + print("No tests collected.", file=sys.stderr) + sys.exit(1) + + print(f"Running {len(test_ids)} tests (each in a separate process)...") + entries = {} + for i, test_id in enumerate(test_ids, 1): + short = test_id.split("::")[-1] + print(f" [{i}/{len(test_ids)}] {short}...", end=" ", flush=True) + + cmd = [ + sys.executable, + "-m", + "pytest", + test_id, + "--memray", + "--benchmark-disable", + "-v", + "--tb=short", + "-q", + ] + result = subprocess.run(cmd, capture_output=True, text=True) + output = result.stdout + result.stderr + + match = MEMORY_RE.search(output) + if match: + value = float(match.group(2)) + unit = match.group(3) + mib = round(_to_mib(value, unit), 3) + entries[test_id] = mib + print(f"{mib:.1f} MiB") + elif "SKIPPED" in output or "skipped" in output: + print("skipped") + else: + print( + "WARNING: no memray data (pytest-memray output format may have changed)", + file=sys.stderr, + ) + + if not entries: + print("No memray results found. 
Is pytest-memray installed?", file=sys.stderr) + sys.exit(1) + + RESULTS_DIR.mkdir(parents=True, exist_ok=True) + out_path = RESULTS_DIR / f"{label}.json" + out_path.write_text(json.dumps({"label": label, "peak_mib": entries}, indent=2)) + print(f"\nSaved {len(entries)} results to {out_path}") + return out_path + + +def compare(label_a: str, label_b: str) -> None: + """Compare two saved memory results.""" + path_a = RESULTS_DIR / f"{label_a}.json" + path_b = RESULTS_DIR / f"{label_b}.json" + for p in (path_a, path_b): + if not p.exists(): + print(f"Not found: {p}. Run 'save {p.stem}' first.", file=sys.stderr) + sys.exit(1) + + data_a = json.loads(path_a.read_text())["peak_mib"] + data_b = json.loads(path_b.read_text())["peak_mib"] + + all_tests = sorted(set(data_a) | set(data_b)) + + print(f"\n{'Test':<60} {label_a:>10} {label_b:>10} {'Change':>10}") + print("-" * 94) + + for test in all_tests: + a = data_a.get(test) + b = data_b.get(test) + a_str = f"{a:.1f}" if a is not None else "—" + b_str = f"{b:.1f}" if b is not None else "—" + if a is not None and b is not None and a > 0: + pct = (b - a) / a * 100 + change = f"{pct:+.1f}%" + else: + change = "—" + # Shorten test name for readability + short = test.split("::")[-1] if "::" in test else test + print(f"{short:<60} {a_str:>10} {b_str:>10} {change:>10}") + + print() + + +def main(): + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + sub = parser.add_subparsers(dest="cmd", required=True) + + p_save = sub.add_parser("save", help="Run benchmarks and save memory results") + p_save.add_argument( + "label", help="Label for this run (e.g. 
'master', 'my-feature')" + ) + p_save.add_argument( + "--quick", action="store_true", help="Use smaller problem sizes" + ) + p_save.add_argument( + "--test-path", + nargs="+", + default=None, + help="Test file(s) to run (default: build phase only)", + ) + + p_cmp = sub.add_parser("compare", help="Compare two saved runs") + p_cmp.add_argument("label_a", help="First run label (baseline)") + p_cmp.add_argument("label_b", help="Second run label") + + args = parser.parse_args() + if args.cmd == "save": + save(args.label, quick=args.quick, test_paths=args.test_path) + elif args.cmd == "compare": + compare(args.label_a, args.label_b) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/models/__init__.py b/benchmarks/models/__init__.py new file mode 100644 index 00000000..fcff9caf --- /dev/null +++ b/benchmarks/models/__init__.py @@ -0,0 +1,21 @@ +"""Model builders for benchmarks.""" + +from benchmarks.models.basic import SIZES as BASIC_SIZES +from benchmarks.models.basic import build_basic +from benchmarks.models.expression_arithmetic import SIZES as EXPR_SIZES +from benchmarks.models.expression_arithmetic import build_expression_arithmetic +from benchmarks.models.knapsack import SIZES as KNAPSACK_SIZES +from benchmarks.models.knapsack import build_knapsack +from benchmarks.models.sparse_network import SIZES as SPARSE_SIZES +from benchmarks.models.sparse_network import build_sparse_network + +__all__ = [ + "BASIC_SIZES", + "EXPR_SIZES", + "KNAPSACK_SIZES", + "SPARSE_SIZES", + "build_basic", + "build_expression_arithmetic", + "build_knapsack", + "build_sparse_network", +] diff --git a/benchmarks/models/basic.py b/benchmarks/models/basic.py new file mode 100644 index 00000000..2aea49d9 --- /dev/null +++ b/benchmarks/models/basic.py @@ -0,0 +1,18 @@ +"""Basic benchmark model: 2*N^2 variables and constraints.""" + +from __future__ import annotations + +import linopy + +SIZES = [10, 50, 100, 250, 500, 1000, 1600] + + +def build_basic(n: int) -> linopy.Model: + """Build 
a basic N*N model with 2*N^2 vars and 2*N^2 constraints.""" + m = linopy.Model() + x = m.add_variables(coords=[range(n), range(n)], dims=["i", "j"], name="x") + y = m.add_variables(coords=[range(n), range(n)], dims=["i", "j"], name="y") + m.add_constraints(x + y <= 10, name="upper") + m.add_constraints(x - y >= -5, name="lower") + m.add_objective(x.sum() + 2 * y.sum()) + return m diff --git a/benchmarks/models/expression_arithmetic.py b/benchmarks/models/expression_arithmetic.py new file mode 100644 index 00000000..339c651d --- /dev/null +++ b/benchmarks/models/expression_arithmetic.py @@ -0,0 +1,30 @@ +"""Expression arithmetic benchmark: stress-tests +, *, sum, broadcasting.""" + +from __future__ import annotations + +import numpy as np + +import linopy + +SIZES = [10, 50, 100, 250, 500, 1000] + + +def build_expression_arithmetic(n: int) -> linopy.Model: + """Build a model that exercises expression arithmetic heavily.""" + m = linopy.Model() + + # Variables on different dimensions to trigger broadcasting + x = m.add_variables(coords=[range(n), range(n)], dims=["i", "j"], name="x") + y = m.add_variables(coords=[range(n)], dims=["i"], name="y") + z = m.add_variables(coords=[range(n)], dims=["j"], name="z") + + # Expression arithmetic: broadcasting y (dim i) and z (dim j) against x (dim i,j) + coeffs = np.linspace(-1, 1, n * n).reshape(n, n) + expr1 = x * coeffs + y - z + expr2 = 2 * x - 3 * y + z + combined = expr1 + expr2 + + m.add_constraints(combined <= 100, name="combined") + m.add_constraints(expr1.sum("j") >= -10, name="row_sum") + m.add_objective(combined.sum()) + return m diff --git a/benchmarks/models/knapsack.py b/benchmarks/models/knapsack.py new file mode 100644 index 00000000..83ce7394 --- /dev/null +++ b/benchmarks/models/knapsack.py @@ -0,0 +1,23 @@ +"""Knapsack benchmark model: N binary variables, 1 constraint.""" + +from __future__ import annotations + +import numpy as np + +import linopy + +SIZES = [100, 1_000, 10_000, 100_000, 1_000_000] + + +def 
build_knapsack(n: int) -> linopy.Model: + """Build a knapsack model with N items.""" + rng = np.random.default_rng(42) + weights = rng.integers(1, 100, size=n) + values = rng.integers(1, 100, size=n) + capacity = int(weights.sum() * 0.5) + + m = linopy.Model() + x = m.add_variables(coords=[range(n)], dims=["item"], binary=True, name="x") + m.add_constraints((x * weights).sum() <= capacity, name="capacity") + m.add_objective(-(x * values).sum()) + return m diff --git a/benchmarks/models/pypsa_scigrid.py b/benchmarks/models/pypsa_scigrid.py new file mode 100644 index 00000000..2fcce217 --- /dev/null +++ b/benchmarks/models/pypsa_scigrid.py @@ -0,0 +1,20 @@ +"""PyPSA SciGrid-DE benchmark model.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import linopy + +SIZES = [10, 50, 100, 200] + + +def build_pypsa_scigrid(snapshots: int = 100) -> linopy.Model: + """Build PyPSA SciGrid model. Requires pypsa to be installed.""" + import pypsa + + n = pypsa.examples.scigrid_de() + n.set_snapshots(n.snapshots[:snapshots]) + n.optimize.create_model() + return n.model diff --git a/benchmarks/models/sparse_network.py b/benchmarks/models/sparse_network.py new file mode 100644 index 00000000..afc6be06 --- /dev/null +++ b/benchmarks/models/sparse_network.py @@ -0,0 +1,50 @@ +"""Sparse network benchmark: variables on mismatched coordinate subsets.""" + +from __future__ import annotations + +import numpy as np +import pandas as pd +import xarray as xr + +import linopy + +SIZES = [10, 50, 100, 250, 500, 1000] + + +def build_sparse_network(n_buses: int) -> linopy.Model: + """Build a ring network model with mismatched bus/line coordinate subsets.""" + rng = np.random.default_rng(42) + n_lines = n_buses # ring topology + n_time = min(n_buses, 24) + + buses = pd.RangeIndex(n_buses, name="bus") + lines = pd.RangeIndex(n_lines, name="line") + time = pd.RangeIndex(n_time, name="time") + + # Ring topology: line i connects bus i -> bus (i+1) % n + 
bus_from = np.arange(n_lines) + bus_to = (bus_from + 1) % n_buses + + m = linopy.Model() + + # Bus-level variables (bus × time) + gen = m.add_variables(lower=0, coords=[buses, time], name="gen") + + # Line-level variables (line × time) + flow = m.add_variables(lower=-100, upper=100, coords=[lines, time], name="flow") + + # Incidence matrix (bus × line): +1 for incoming, -1 for outgoing + incidence = np.zeros((n_buses, n_lines)) + incidence[bus_to, np.arange(n_lines)] = 1 # incoming + incidence[bus_from, np.arange(n_lines)] = -1 # outgoing + incidence_da = xr.DataArray(incidence, coords=[buses, lines]) + + # Vectorized flow balance: gen - demand + incidence @ flow == 0 + demand = xr.DataArray( + rng.uniform(10, 100, size=(n_buses, n_time)), coords=[buses, time] + ) + net_flow = (flow * incidence_da).sum("line") + m.add_constraints(gen + net_flow == demand, name="balance") + + m.add_objective(gen.sum()) + return m diff --git a/benchmarks/test_build.py b/benchmarks/test_build.py new file mode 100644 index 00000000..f657715e --- /dev/null +++ b/benchmarks/test_build.py @@ -0,0 +1,53 @@ +"""Benchmarks for model construction speed.""" + +from __future__ import annotations + +import pytest + +from benchmarks.conftest import skip_if_quick +from benchmarks.models import ( + BASIC_SIZES, + EXPR_SIZES, + KNAPSACK_SIZES, + SPARSE_SIZES, + build_basic, + build_expression_arithmetic, + build_knapsack, + build_sparse_network, +) +from benchmarks.models.pypsa_scigrid import SIZES as PYPSA_SIZES + + +@pytest.mark.parametrize("n", BASIC_SIZES, ids=[f"n={n}" for n in BASIC_SIZES]) +def test_build_basic(benchmark, n, request): + skip_if_quick(request, "basic", n) + benchmark(build_basic, n) + + +@pytest.mark.parametrize("n", KNAPSACK_SIZES, ids=[f"n={n}" for n in KNAPSACK_SIZES]) +def test_build_knapsack(benchmark, n, request): + skip_if_quick(request, "knapsack", n) + benchmark(build_knapsack, n) + + +@pytest.mark.parametrize("n", EXPR_SIZES, ids=[f"n={n}" for n in EXPR_SIZES]) +def 
test_build_expression_arithmetic(benchmark, n, request): + skip_if_quick(request, "expression_arithmetic", n) + benchmark(build_expression_arithmetic, n) + + +@pytest.mark.parametrize("n", SPARSE_SIZES, ids=[f"n={n}" for n in SPARSE_SIZES]) +def test_build_sparse_network(benchmark, n, request): + skip_if_quick(request, "sparse_network", n) + benchmark(build_sparse_network, n) + + +@pytest.mark.parametrize( + "snapshots", PYPSA_SIZES, ids=[f"snapshots={s}" for s in PYPSA_SIZES] +) +def test_build_pypsa_scigrid(benchmark, snapshots, request): + pytest.importorskip("pypsa") + skip_if_quick(request, "pypsa_scigrid", snapshots) + from benchmarks.models.pypsa_scigrid import build_pypsa_scigrid + + benchmark(build_pypsa_scigrid, snapshots) diff --git a/benchmarks/test_lp_write.py b/benchmarks/test_lp_write.py new file mode 100644 index 00000000..6442ccd6 --- /dev/null +++ b/benchmarks/test_lp_write.py @@ -0,0 +1,63 @@ +"""Benchmarks for LP file writing speed.""" + +from __future__ import annotations + +import pytest + +from benchmarks.conftest import skip_if_quick +from benchmarks.models import ( + BASIC_SIZES, + EXPR_SIZES, + KNAPSACK_SIZES, + SPARSE_SIZES, + build_basic, + build_expression_arithmetic, + build_knapsack, + build_sparse_network, +) +from benchmarks.models.pypsa_scigrid import SIZES as PYPSA_SIZES + + +@pytest.mark.parametrize("n", BASIC_SIZES, ids=[f"n={n}" for n in BASIC_SIZES]) +def test_lp_write_basic(benchmark, n, request, tmp_path): + skip_if_quick(request, "basic", n) + m = build_basic(n) + lp_file = tmp_path / "model.lp" + benchmark(m.to_file, lp_file, progress=False) + + +@pytest.mark.parametrize("n", KNAPSACK_SIZES, ids=[f"n={n}" for n in KNAPSACK_SIZES]) +def test_lp_write_knapsack(benchmark, n, request, tmp_path): + skip_if_quick(request, "knapsack", n) + m = build_knapsack(n) + lp_file = tmp_path / "model.lp" + benchmark(m.to_file, lp_file, progress=False) + + +@pytest.mark.parametrize("n", EXPR_SIZES, ids=[f"n={n}" for n in EXPR_SIZES]) +def 
test_lp_write_expression_arithmetic(benchmark, n, request, tmp_path): + skip_if_quick(request, "expression_arithmetic", n) + m = build_expression_arithmetic(n) + lp_file = tmp_path / "model.lp" + benchmark(m.to_file, lp_file, progress=False) + + +@pytest.mark.parametrize("n", SPARSE_SIZES, ids=[f"n={n}" for n in SPARSE_SIZES]) +def test_lp_write_sparse_network(benchmark, n, request, tmp_path): + skip_if_quick(request, "sparse_network", n) + m = build_sparse_network(n) + lp_file = tmp_path / "model.lp" + benchmark(m.to_file, lp_file, progress=False) + + +@pytest.mark.parametrize( + "snapshots", PYPSA_SIZES, ids=[f"snapshots={s}" for s in PYPSA_SIZES] +) +def test_lp_write_pypsa_scigrid(benchmark, snapshots, request, tmp_path): + pytest.importorskip("pypsa") + skip_if_quick(request, "pypsa_scigrid", snapshots) + from benchmarks.models.pypsa_scigrid import build_pypsa_scigrid + + m = build_pypsa_scigrid(snapshots) + lp_file = tmp_path / "model.lp" + benchmark(m.to_file, lp_file, progress=False) diff --git a/benchmarks/test_matrices.py b/benchmarks/test_matrices.py new file mode 100644 index 00000000..03c6ee63 --- /dev/null +++ b/benchmarks/test_matrices.py @@ -0,0 +1,49 @@ +"""Benchmarks for matrix generation (model -> sparse matrices).""" + +from __future__ import annotations + +import pytest + +from benchmarks.conftest import skip_if_quick +from benchmarks.models import ( + BASIC_SIZES, + EXPR_SIZES, + SPARSE_SIZES, + build_basic, + build_expression_arithmetic, + build_sparse_network, +) + + +def _access_matrices(m): + """Access all matrix properties to force computation.""" + m.matrices.clean_cached_properties() + _ = m.matrices.A + _ = m.matrices.b + _ = m.matrices.c + _ = m.matrices.lb + _ = m.matrices.ub + _ = m.matrices.sense + _ = m.matrices.vlabels + _ = m.matrices.clabels + + +@pytest.mark.parametrize("n", BASIC_SIZES, ids=[f"n={n}" for n in BASIC_SIZES]) +def test_matrices_basic(benchmark, n, request): + skip_if_quick(request, "basic", n) + m = 
build_basic(n) + benchmark(_access_matrices, m) + + +@pytest.mark.parametrize("n", EXPR_SIZES, ids=[f"n={n}" for n in EXPR_SIZES]) +def test_matrices_expression_arithmetic(benchmark, n, request): + skip_if_quick(request, "expression_arithmetic", n) + m = build_expression_arithmetic(n) + benchmark(_access_matrices, m) + + +@pytest.mark.parametrize("n", SPARSE_SIZES, ids=[f"n={n}" for n in SPARSE_SIZES]) +def test_matrices_sparse_network(benchmark, n, request): + skip_if_quick(request, "sparse_network", n) + m = build_sparse_network(n) + benchmark(_access_matrices, m) diff --git a/codecov.yml b/codecov.yml index 69cb7601..74a549c1 100644 --- a/codecov.yml +++ b/codecov.yml @@ -1 +1,4 @@ comment: false + +ignore: + - "benchmarks/**" diff --git a/doc/contributing.rst b/doc/contributing.rst index 02162694..120683cb 100644 --- a/doc/contributing.rst +++ b/doc/contributing.rst @@ -68,6 +68,30 @@ GPU tests are automatically detected based on solver capabilities - no manual ma See the :doc:`gpu-acceleration` guide for more information about GPU solver setup and usage. +Performance Benchmarks +====================== + +When working on performance-sensitive code, use the internal benchmark suite in ``benchmarks/`` to check for regressions. + +.. code-block:: bash + + # Install benchmark dependencies + pip install -e ".[benchmarks]" + + # Quick timing benchmarks + pytest benchmarks/ --quick + + # Compare timing between branches + pytest benchmarks/test_build.py --benchmark-save=master + pytest benchmarks/test_build.py --benchmark-save=my-feature --benchmark-compare=0001_master + + # Compare peak memory between branches + python benchmarks/memory.py save master --quick + python benchmarks/memory.py save my-feature --quick + python benchmarks/memory.py compare master my-feature + +See ``benchmarks/README.md`` for full details on models, phases, and usage. 
+ Contributing examples ===================== diff --git a/pyproject.toml b/pyproject.toml index 14a53a22..2d6b5302 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,6 +76,10 @@ dev = [ "gurobipy", "highspy", ] +benchmarks = [ + "pytest-benchmark", + "pytest-memray", +] solvers = [ "gurobipy", "highspy>=1.5.0; python_version < '3.12'", @@ -102,7 +106,7 @@ version_scheme = "no-guess-dev" [tool.pytest.ini_options] testpaths = ["test"] -norecursedirs = ["dev-scripts", "doc", "examples", "benchmark"] +norecursedirs = ["dev-scripts", "doc", "examples", "benchmark", "benchmarks"] markers = [ "gpu: marks tests as requiring GPU hardware (deselect with '-m \"not gpu\"')", ] @@ -115,7 +119,7 @@ omit = ["test/*"] exclude_also = ["if TYPE_CHECKING:"] [tool.mypy] -exclude = ['dev/*', 'examples/*', 'benchmark/*', 'doc/*'] +exclude = ['dev/*', 'examples/*', 'benchmark/*', 'benchmarks/*', 'doc/*'] ignore_missing_imports = true no_implicit_optional = true warn_unused_ignores = true