#!/usr/bin/env python
"""
Generate a Datasheet for a ClimateVision training dataset.

Usage:
    python scripts/generate_datasheet.py \\
        --manifest data/manifests/sentinel2-deforestation.yaml \\
        --output-dir outputs/datasheets/

Runs inside the release CI pipeline so every dataset version published
ships with a Gebru-style datasheet alongside its model cards.
"""

from __future__ import annotations

import argparse
import logging
import sys
from pathlib import Path

from climatevision.governance.datasheet import generate

logger = logging.getLogger("generate_datasheet")


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line options; *argv* defaults to ``sys.argv[1:]``."""
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--manifest",
        type=Path,
        required=True,
        help="Dataset manifest (yaml/json)",
    )
    cli.add_argument(
        "--output-dir",
        type=Path,
        default=None,
        help="Where to write the datasheet",
    )
    cli.add_argument("--name", default=None, help="Override dataset name")
    cli.add_argument("--version", default=None, help="Override dataset version")
    cli.add_argument("-v", "--verbose", action="store_true")
    return cli.parse_args(argv)


def main(argv: list[str] | None = None) -> int:
    """Entry point: build the datasheet and print each artifact path."""
    args = parse_args(argv)

    # -v/--verbose switches root logging from INFO to DEBUG.
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )

    written = generate(
        manifest=args.manifest,
        output_dir=args.output_dir,
        name=args.name,
        version=args.version,
    )
    for label, path in written.items():
        print(f"{label}: {path}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
a/src/climatevision/governance/__init__.py +++ b/src/climatevision/governance/__init__.py @@ -6,6 +6,7 @@ - Regional bias and fairness auditing - Anomaly detection for inference inputs/outputs - Model audit trails and version tracking +- Datasheets for training datasets (Gebru et al., 2018) """ from .explainability import ( @@ -42,6 +43,13 @@ check_fairness_gate, SUPPORTED_REGIONS, ) +from .datasheet import ( + Datasheet, + build_datasheet, + generate as generate_datasheet, + render_markdown as render_datasheet_markdown, + write_datasheet, +) __all__ = [ # Explainability @@ -73,4 +81,10 @@ "RegionMetrics", "check_fairness_gate", "SUPPORTED_REGIONS", + # Datasheet + "Datasheet", + "build_datasheet", + "generate_datasheet", + "render_datasheet_markdown", + "write_datasheet", ] diff --git a/src/climatevision/governance/datasheet.py b/src/climatevision/governance/datasheet.py new file mode 100644 index 0000000..b4cc05d --- /dev/null +++ b/src/climatevision/governance/datasheet.py @@ -0,0 +1,215 @@ +""" +Datasheets for the datasets that train ClimateVision models. + +Companion to the Mitchell-style model cards in ``governance.model_card``: +where a model card describes the *model*, a datasheet describes the +*dataset* the model was trained on (Gebru et al., 2018, "Datasheets for +Datasets"). The two artifacts answer different questions and both need +to ship with a release. + +The module mirrors the model_card public surface (``build``, ``render``, +``write``, ``generate``) so contributors only have to learn one pattern, +and the release CI pipeline can call them in sequence. + +Sections covered: + +- Motivation +- Composition +- Collection process +- Preprocessing, cleaning, labeling +- Uses (intended and inappropriate) +- Distribution +- Maintenance + +Every section is a free-form ``dict`` of question -> answer so the schema +can grow without code changes; ``REQUIRED_QUESTIONS`` enforces the bare +minimum a release datasheet must answer. 
"""
Datasheets for the datasets that train ClimateVision models.

Companion to the Mitchell-style model cards in ``governance.model_card``:
where a model card describes the *model*, a datasheet describes the
*dataset* the model was trained on (Gebru et al., 2018, "Datasheets for
Datasets"). The two artifacts answer different questions and both need
to ship with a release.

The module mirrors the model_card public surface (``build``, ``render``,
``write``, ``generate``) so contributors only have to learn one pattern,
and the release CI pipeline can call them in sequence.

Sections covered:

- Motivation
- Composition
- Collection process
- Preprocessing, cleaning, labeling
- Uses (intended and inappropriate)
- Distribution
- Maintenance

Every section is a free-form ``dict`` of question -> answer so the schema
can grow without code changes; ``REQUIRED_QUESTIONS`` enforces the bare
minimum a release datasheet must answer.
"""

from __future__ import annotations

import json
import logging
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional, Union

logger = logging.getLogger(__name__)

# Repository root is three levels above src/climatevision/governance/.
# Guard the lookup: when the module is installed at a shallower path
# (e.g. site-packages), parents[3] raises IndexError at import time;
# fall back to the current working directory in that case.
try:
    _PROJECT_ROOT = Path(__file__).resolve().parents[3]
except IndexError:  # pragma: no cover - depends on install layout
    _PROJECT_ROOT = Path.cwd()
_DEFAULT_OUTPUT_DIR = _PROJECT_ROOT / "outputs" / "datasheets"

# Bare minimum a release datasheet must answer: section name -> keys that
# must be present and non-empty in that section.
REQUIRED_QUESTIONS = {
    "motivation": ("purpose", "creators"),
    "composition": ("instances", "labels", "splits"),
    "collection_process": ("source", "timeframe"),
    "uses": ("intended_uses", "inappropriate_uses"),
}


@dataclass
class Datasheet:
    """Structured datasheet for a single training dataset.

    Each section attribute is a free-form question -> answer mapping;
    only the keys listed in ``REQUIRED_QUESTIONS`` are enforced.
    """

    name: str
    version: str
    motivation: dict
    composition: dict
    collection_process: dict
    preprocessing: dict
    uses: dict
    distribution: dict
    maintenance: dict
    # UTC ISO-8601 timestamp captured when the object is created.
    generated_at: str = field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )

    def to_dict(self) -> dict:
        """Return a JSON-serializable dict of every field."""
        return {
            "name": self.name,
            "version": self.version,
            "motivation": self.motivation,
            "composition": self.composition,
            "collection_process": self.collection_process,
            "preprocessing": self.preprocessing,
            "uses": self.uses,
            "distribution": self.distribution,
            "maintenance": self.maintenance,
            "generated_at": self.generated_at,
        }


# Conservative fallback answers applied when a manifest omits
# uses.inappropriate_uses; a manifest-supplied list overrides these.
_DEFAULT_INAPPROPRIATE_USES = [
    "Training models for real-time legal enforcement against individual landowners.",
    "Land-rights or sovereignty disputes without on-the-ground verification.",
    "Generative model training where label provenance is required to be human-verified.",
]

# Default maintenance section applied when the manifest has none.
_DEFAULT_MAINTENANCE = {
    "owner": "ClimateVision Governance ",
    "update_cadence": "Reviewed each minor release; refreshed when source providers change.",
    "deprecation_policy": (
        "Versions are retained for two minor releases after supersession; "
        "models trained on deprecated versions are flagged in their model cards."
    ),
}


def _coerce_config(config: Union[dict, str, Path]) -> dict:
    """Return *config* as a dict, loading YAML/JSON from disk when given a path.

    File type is chosen by suffix: ``.yml``/``.yaml`` parse as YAML,
    anything else as JSON.

    Raises:
        RuntimeError: a YAML path was given but PyYAML is not installed.
        TypeError: the file parsed to something other than a mapping
            (e.g. an empty YAML file parses to ``None``).
    """
    if isinstance(config, dict):
        return config
    path = Path(config)
    text = path.read_text(encoding="utf-8")
    if path.suffix in {".yml", ".yaml"}:
        try:
            import yaml
        except ImportError as exc:  # pragma: no cover - import guard
            raise RuntimeError("PyYAML is required to load YAML configs") from exc
        loaded = yaml.safe_load(text)
    else:
        loaded = json.loads(text)
    if not isinstance(loaded, dict):
        raise TypeError(
            f"manifest {path} must contain a mapping, got {type(loaded).__name__}"
        )
    return loaded


def _validate(datasheet: "Datasheet") -> None:
    """Raise ValueError listing every REQUIRED_QUESTIONS answer that is absent or empty."""
    missing: list[str] = []
    for section_name, required_keys in REQUIRED_QUESTIONS.items():
        section = getattr(datasheet, section_name)
        for key in required_keys:
            # None, empty string, empty list and empty dict all count as unanswered.
            if key not in section or section[key] in (None, "", [], {}):
                missing.append(f"{section_name}.{key}")
    if missing:
        raise ValueError(f"datasheet missing required answers: {missing}")


def build_datasheet(
    manifest: Union[dict, str, Path],
    *,
    name: Optional[str] = None,
    version: Optional[str] = None,
) -> Datasheet:
    """Build a Datasheet from a structured dataset manifest.

    Args:
        manifest: Manifest dict, or path to a YAML/JSON manifest file.
        name: Optional override for the manifest's dataset name.
        version: Optional override for the manifest's version string.

    Returns:
        A validated ``Datasheet``.

    Raises:
        ValueError: the manifest does not answer every required question.
    """
    m = _coerce_config(manifest)

    resolved_name = name or m.get("name") or "climatevision-dataset"
    resolved_version = version or m.get("version") or "0.0.0"

    # Copy before mutating so a caller-supplied dict is never modified.
    uses = dict(m.get("uses", {}))
    uses.setdefault("inappropriate_uses", list(_DEFAULT_INAPPROPRIATE_USES))

    sheet = Datasheet(
        name=resolved_name,
        version=resolved_version,
        motivation=dict(m.get("motivation", {})),
        composition=dict(m.get("composition", {})),
        collection_process=dict(m.get("collection_process", {})),
        preprocessing=dict(m.get("preprocessing", {})),
        uses=uses,
        distribution=dict(m.get("distribution", {})),
        maintenance=dict(m.get("maintenance", _DEFAULT_MAINTENANCE)),
    )
    _validate(sheet)
    return sheet


def _render_section(title: str, body: dict) -> list[str]:
    """Render one section as Markdown lines.

    Lists become bullet lists under an ``###`` heading, nested dicts become
    fenced JSON blocks, and scalars become ``- **Key**: value`` bullets.
    """
    if not body:
        return [f"## {title}", "_Not documented._", ""]
    lines = [f"## {title}"]
    for key, value in body.items():
        pretty_key = key.replace("_", " ").title()
        if isinstance(value, list):
            lines.append(f"### {pretty_key}")
            lines.extend(f"- {item}" for item in value)
        elif isinstance(value, dict):
            lines.append(f"### {pretty_key}")
            lines.append(f"```json\n{json.dumps(value, indent=2)}\n```")
        else:
            lines.append(f"- **{pretty_key}**: {value}")
    lines.append("")
    return lines


def render_markdown(sheet: Datasheet) -> str:
    """Render the datasheet as a Gebru-style Markdown document."""
    sections = [
        f"# Datasheet: {sheet.name} ({sheet.version})",
        f"_Generated {sheet.generated_at}_",
        "",
        "_Format: Gebru et al., 2018, \"Datasheets for Datasets\"._",
        "",
    ]
    sections += _render_section("Motivation", sheet.motivation)
    sections += _render_section("Composition", sheet.composition)
    sections += _render_section("Collection Process", sheet.collection_process)
    sections += _render_section("Preprocessing, Cleaning, Labeling", sheet.preprocessing)
    sections += _render_section("Uses", sheet.uses)
    sections += _render_section("Distribution", sheet.distribution)
    sections += _render_section("Maintenance", sheet.maintenance)
    return "\n".join(sections) + "\n"


def write_datasheet(
    sheet: Datasheet,
    output_dir: Optional[Union[str, Path]] = None,
) -> dict[str, Path]:
    """Write Markdown and JSON renderings of *sheet* to *output_dir*.

    Defaults to ``<repo>/outputs/datasheets``; the directory is created if
    needed. Returns ``{"markdown": md_path, "json": json_path}``.
    """
    output_dir = Path(output_dir) if output_dir else _DEFAULT_OUTPUT_DIR
    output_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): name/version flow straight into the filename — a name
    # containing a path separator would escape output_dir. Manifest names
    # are slug-like today; sanitize here if that ever changes.
    base = f"{sheet.name}_{sheet.version}"
    md_path = output_dir / f"{base}.md"
    json_path = output_dir / f"{base}.json"

    md_path.write_text(render_markdown(sheet), encoding="utf-8")
    json_path.write_text(
        json.dumps(sheet.to_dict(), indent=2, ensure_ascii=False),
        encoding="utf-8",
    )

    logger.info("Wrote datasheet to %s and %s", md_path, json_path)
    return {"markdown": md_path, "json": json_path}


def generate(
    manifest: Union[dict, str, Path],
    output_dir: Optional[Union[str, Path]] = None,
    **kwargs: Any,
) -> dict[str, Path]:
    """End-to-end: load manifest, build the datasheet, render to disk."""
    sheet = build_datasheet(manifest, **kwargs)
    return write_datasheet(sheet, output_dir=output_dir)
"""Tests for governance.datasheet."""

from __future__ import annotations

import json

import pytest

from climatevision.governance.datasheet import (
    Datasheet,
    build_datasheet,
    generate,
    render_markdown,
    write_datasheet,
)


def _valid_manifest() -> dict:
    """Return a fresh, fully-populated manifest (safe for tests to mutate)."""
    return {
        "name": "sentinel2-deforestation",
        "version": "1.0.0",
        "motivation": {
            "purpose": "Detect Amazon basin deforestation events from Sentinel-2.",
            "creators": "ClimateVision Data Pipeline team",
            "funding": "Self-funded open-source initiative.",
        },
        "composition": {
            "instances": "12,480 256x256 tiles",
            "labels": "Binary deforestation mask per tile",
            "splits": "70/15/15 train/val/test by spatial cluster",
            "label_source": "Hansen Global Forest Change v1.10",
        },
        "collection_process": {
            "source": "Sentinel-2 L2A via Google Earth Engine",
            "timeframe": "2020-01-01 to 2023-12-31",
            "consent": "Public open-data licence; no human subjects.",
        },
        "preprocessing": {
            "cloud_masking": "QA60 + s2cloudless threshold 0.4",
            "normalisation": "Per-band z-score against training set means",
            "augmentation": "Random flip / 90deg rotate at train time only",
        },
        "uses": {
            "intended_uses": [
                "Training U-Net segmentation models for deforestation detection.",
                "Evaluating fairness of detection across forest biomes.",
            ]
        },
        "distribution": {
            "license": "CC-BY-4.0 (derived data)",
            "redistribution": "Allowed with attribution; do not redistribute raw Sentinel-2 tiles.",
        },
    }


def test_build_datasheet_returns_typed_object():
    manifest = _valid_manifest()
    sheet = build_datasheet(manifest)
    assert isinstance(sheet, Datasheet)
    assert (sheet.name, sheet.version) == ("sentinel2-deforestation", "1.0.0")
    assert sheet.motivation["purpose"].startswith("Detect")


def test_inappropriate_uses_default_when_omitted():
    # Manifest omits uses.inappropriate_uses, so the library default applies.
    defaults = build_datasheet(_valid_manifest()).uses["inappropriate_uses"]
    assert defaults, "default inappropriate_uses should be populated"


def test_inappropriate_uses_respect_override():
    manifest = _valid_manifest()
    override = ["custom override"]
    manifest["uses"]["inappropriate_uses"] = list(override)
    sheet = build_datasheet(manifest)
    assert sheet.uses["inappropriate_uses"] == override


def test_maintenance_has_default():
    maintenance = build_datasheet(_valid_manifest()).maintenance
    for expected_key in ("owner", "update_cadence"):
        assert expected_key in maintenance


def test_validate_rejects_missing_required_section():
    manifest = _valid_manifest()
    manifest["motivation"].pop("purpose")
    with pytest.raises(ValueError, match="motivation.purpose"):
        build_datasheet(manifest)


def test_validate_rejects_empty_required_field():
    manifest = _valid_manifest()
    manifest["composition"]["labels"] = ""
    with pytest.raises(ValueError, match="composition.labels"):
        build_datasheet(manifest)


def test_validate_rejects_missing_collection_timeframe():
    manifest = _valid_manifest()
    manifest["collection_process"].pop("timeframe")
    with pytest.raises(ValueError, match="collection_process.timeframe"):
        build_datasheet(manifest)


def test_render_markdown_includes_section_headings():
    md = render_markdown(build_datasheet(_valid_manifest()))
    expected_headings = [
        "# Datasheet:",
        "## Motivation",
        "## Composition",
        "## Collection Process",
        "## Uses",
        "## Distribution",
        "## Maintenance",
    ]
    for heading in expected_headings:
        assert heading in md, f"missing heading: {heading}"


def test_render_markdown_renders_lists_as_bullets():
    md = render_markdown(build_datasheet(_valid_manifest()))
    assert "- Training U-Net segmentation models" in md


def test_write_datasheet_round_trips_json(tmp_path):
    sheet = build_datasheet(_valid_manifest())
    written = write_datasheet(sheet, output_dir=tmp_path)
    loaded = json.loads(written["json"].read_text())
    assert loaded["name"] == sheet.name
    assert loaded["composition"]["splits"] == "70/15/15 train/val/test by spatial cluster"


def test_generate_end_to_end(tmp_path):
    manifest_path = tmp_path / "manifest.json"
    manifest_path.write_text(json.dumps(_valid_manifest()))
    written = generate(manifest_path, output_dir=tmp_path / "out")
    assert written["markdown"].exists()
    assert written["json"].exists()
    assert "Datasheet:" in written["markdown"].read_text()


def test_generate_loads_yaml(tmp_path):
    # importorskip both guards the dependency and hands back the module.
    yaml = pytest.importorskip("yaml")
    manifest_path = tmp_path / "manifest.yaml"
    manifest_path.write_text(yaml.safe_dump(_valid_manifest()))
    written = generate(manifest_path, output_dir=tmp_path / "out")
    assert written["markdown"].exists()