Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions scripts/generate_datasheet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/usr/bin/env python
"""
Generate a Datasheet for a ClimateVision training dataset.

Usage:
python scripts/generate_datasheet.py \\
--manifest data/manifests/sentinel2-deforestation.yaml \\
--output-dir outputs/datasheets/

Runs inside the release CI pipeline so every dataset version published
ships with a Gebru-style datasheet alongside its model cards.
"""

from __future__ import annotations

import argparse
import logging
import sys
from pathlib import Path

from climatevision.governance.datasheet import generate

logger = logging.getLogger("generate_datasheet")


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line arguments.

    Args:
        argv: Argument list to parse; ``None`` means use ``sys.argv[1:]``.

    Returns:
        Parsed namespace with ``manifest``, ``output_dir``, ``name``,
        ``version`` and ``verbose`` attributes.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--manifest",
        type=Path,
        required=True,
        help="Dataset manifest (yaml/json)",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=None,
        help="Where to write the datasheet",
    )
    parser.add_argument("--name", default=None, help="Override dataset name")
    parser.add_argument("--version", default=None, help="Override dataset version")
    parser.add_argument("-v", "--verbose", action="store_true")
    return parser.parse_args(argv)


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse args, configure logging, emit the datasheet.

    Prints one ``label: path`` line per artifact written by ``generate``.

    Returns:
        0 on success; exceptions raised by ``generate`` propagate.
    """
    args = parse_args(argv)
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )

    artifacts = generate(
        manifest=args.manifest,
        output_dir=args.output_dir,
        name=args.name,
        version=args.version,
    )
    for label, artifact_path in artifacts.items():
        print(f"{label}: {artifact_path}")
    return 0


if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status.
    raise SystemExit(main())
14 changes: 14 additions & 0 deletions src/climatevision/governance/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
- Regional bias and fairness auditing
- Anomaly detection for inference inputs/outputs
- Model audit trails and version tracking
- Datasheets for training datasets (Gebru et al., 2018)
"""

from .explainability import (
Expand Down Expand Up @@ -42,6 +43,13 @@
check_fairness_gate,
SUPPORTED_REGIONS,
)
from .datasheet import (
Datasheet,
build_datasheet,
generate as generate_datasheet,
render_markdown as render_datasheet_markdown,
write_datasheet,
)

__all__ = [
# Explainability
Expand Down Expand Up @@ -73,4 +81,10 @@
"RegionMetrics",
"check_fairness_gate",
"SUPPORTED_REGIONS",
# Datasheet
"Datasheet",
"build_datasheet",
"generate_datasheet",
"render_datasheet_markdown",
"write_datasheet",
]
215 changes: 215 additions & 0 deletions src/climatevision/governance/datasheet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
"""
Datasheets for the datasets that train ClimateVision models.

Companion to the Mitchell-style model cards in ``governance.model_card``:
where a model card describes the *model*, a datasheet describes the
*dataset* the model was trained on (Gebru et al., 2018, "Datasheets for
Datasets"). The two artifacts answer different questions and both need
to ship with a release.

The module mirrors the model_card public surface (``build``, ``render``,
``write``, ``generate``) so contributors only have to learn one pattern,
and the release CI pipeline can call them in sequence.

Sections covered:

- Motivation
- Composition
- Collection process
- Preprocessing, cleaning, labeling
- Uses (intended and inappropriate)
- Distribution
- Maintenance

Every section is a free-form ``dict`` of question -> answer so the schema
can grow without code changes; ``REQUIRED_QUESTIONS`` enforces the bare
minimum a release datasheet must answer.
"""

from __future__ import annotations

import json
import logging
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional, Union

logger = logging.getLogger(__name__)

_PROJECT_ROOT = Path(__file__).resolve().parents[3]
_DEFAULT_OUTPUT_DIR = _PROJECT_ROOT / "outputs" / "datasheets"

# Minimum question set a release datasheet must answer, keyed by section
# name (each key matches a ``Datasheet`` attribute). Enforced by ``_validate``;
# sections not listed here (preprocessing, distribution, maintenance) may be
# left empty.
REQUIRED_QUESTIONS = {
    "motivation": ("purpose", "creators"),
    "composition": ("instances", "labels", "splits"),
    "collection_process": ("source", "timeframe"),
    "uses": ("intended_uses", "inappropriate_uses"),
}


@dataclass
class Datasheet:
    """Structured Gebru-style datasheet for one training-dataset version.

    Every section attribute is a free-form mapping of question -> answer;
    ``REQUIRED_QUESTIONS`` defines the minimum a release must answer.
    """

    name: str
    version: str
    motivation: dict
    composition: dict
    collection_process: dict
    preprocessing: dict
    uses: dict
    distribution: dict
    maintenance: dict
    # ISO-8601 UTC timestamp captured when the instance is created.
    generated_at: str = field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )

    def to_dict(self) -> dict:
        """Return a JSON-serializable mapping of all fields.

        Nested section dicts are returned by reference (no deep copy),
        matching what JSON serialization needs.
        """
        # A dataclass without __slots__ keeps every field in the instance
        # __dict__ in declaration order, so a shallow copy of vars() is
        # equivalent to enumerating the fields by hand.
        return dict(vars(self))


# Fallback answers for uses.inappropriate_uses when the manifest omits them
# (see build_datasheet); ensures the required question is always answered.
_DEFAULT_INAPPROPRIATE_USES = [
    "Training models for real-time legal enforcement against individual landowners.",
    "Land-rights or sovereignty disputes without on-the-ground verification.",
    "Generative model training where label provenance is required to be human-verified.",
]

# Fallback maintenance section used when the manifest has no "maintenance" key.
_DEFAULT_MAINTENANCE = {
    "owner": "ClimateVision Governance <governance@climate-vision.org>",
    "update_cadence": "Reviewed each minor release; refreshed when source providers change.",
    "deprecation_policy": (
        "Versions are retained for two minor releases after supersession; "
        "models trained on deprecated versions are flagged in their model cards."
    ),
}


def _coerce_config(config: Union[dict, str, Path]) -> dict:
if isinstance(config, dict):
return config
path = Path(config)
text = path.read_text()
if path.suffix in {".yml", ".yaml"}:
try:
import yaml
except ImportError as exc: # pragma: no cover - import guard
raise RuntimeError("PyYAML is required to load YAML configs") from exc
return yaml.safe_load(text)
return json.loads(text)


def _validate(datasheet: "Datasheet") -> None:
    """Raise ``ValueError`` unless every REQUIRED_QUESTIONS entry is answered.

    An answer counts as missing when the key is absent or the value is
    ``None``, ``""`` or ``[]``.
    """
    unanswered = [
        f"{section_name}.{question}"
        for section_name, questions in REQUIRED_QUESTIONS.items()
        for question in questions
        # dict.get returns None for absent keys, which the membership
        # test below treats the same as an explicit empty answer.
        if getattr(datasheet, section_name).get(question) in (None, "", [])
    ]
    if unanswered:
        raise ValueError(f"datasheet missing required answers: {unanswered}")


def build_datasheet(
    manifest: Union[dict, str, Path],
    *,
    name: Optional[str] = None,
    version: Optional[str] = None,
) -> Datasheet:
    """Build and validate a :class:`Datasheet` from a dataset manifest.

    Args:
        manifest: Parsed mapping, or path to a YAML/JSON manifest file.
        name: Overrides the manifest's ``name`` when given.
        version: Overrides the manifest's ``version`` when given.

    Raises:
        ValueError: when required questions are left unanswered.
    """
    data = _coerce_config(manifest)

    def section(key: str) -> dict:
        # Shallow-copy each section so later mutation of the sheet never
        # leaks back into the caller's manifest mapping.
        return dict(data.get(key, {}))

    uses = section("uses")
    uses.setdefault("inappropriate_uses", list(_DEFAULT_INAPPROPRIATE_USES))

    sheet = Datasheet(
        name=name or data.get("name") or "climatevision-dataset",
        version=version or data.get("version") or "0.0.0",
        motivation=section("motivation"),
        composition=section("composition"),
        collection_process=section("collection_process"),
        preprocessing=section("preprocessing"),
        uses=uses,
        distribution=section("distribution"),
        maintenance=dict(data.get("maintenance", _DEFAULT_MAINTENANCE)),
    )
    _validate(sheet)
    return sheet


def _render_section(title: str, body: dict) -> list[str]:
if not body:
return [f"## {title}", "_Not documented._", ""]
lines = [f"## {title}"]
for key, value in body.items():
pretty_key = key.replace("_", " ").title()
if isinstance(value, list):
lines.append(f"### {pretty_key}")
lines.extend(f"- {item}" for item in value)
elif isinstance(value, dict):
lines.append(f"### {pretty_key}")
lines.append(f"```json\n{json.dumps(value, indent=2)}\n```")
else:
lines.append(f"- **{pretty_key}**: {value}")
lines.append("")
return lines


def render_markdown(sheet: Datasheet) -> str:
    """Render the full datasheet as a Markdown document string."""
    lines: list[str] = [
        f"# Datasheet: {sheet.name} ({sheet.version})",
        f"_Generated {sheet.generated_at}_",
        "",
        "_Format: Gebru et al., 2018, \"Datasheets for Datasets\"._",
        "",
    ]
    # Fixed section order mirrors the Gebru et al. questionnaire.
    for title, body in (
        ("Motivation", sheet.motivation),
        ("Composition", sheet.composition),
        ("Collection Process", sheet.collection_process),
        ("Preprocessing, Cleaning, Labeling", sheet.preprocessing),
        ("Uses", sheet.uses),
        ("Distribution", sheet.distribution),
        ("Maintenance", sheet.maintenance),
    ):
        lines.extend(_render_section(title, body))
    return "\n".join(lines) + "\n"


def write_datasheet(
    sheet: Datasheet,
    output_dir: Optional[Union[str, Path]] = None,
) -> dict[str, Path]:
    """Write the datasheet to disk as Markdown and JSON.

    Args:
        sheet: The datasheet to serialize.
        output_dir: Target directory; defaults to ``outputs/datasheets``
            under the project root. Created if missing.

    Returns:
        Mapping of artifact label (``"markdown"``/``"json"``) to the
        written file path.
    """
    output_dir = Path(output_dir) if output_dir else _DEFAULT_OUTPUT_DIR
    output_dir.mkdir(parents=True, exist_ok=True)

    base = f"{sheet.name}_{sheet.version}"
    md_path = output_dir / f"{base}.md"
    json_path = output_dir / f"{base}.json"

    # Pin UTF-8 explicitly: Path.write_text otherwise uses the locale's
    # preferred encoding, which raises/corrupts on non-ASCII answers on
    # platforms like Windows (cp1252).
    md_path.write_text(render_markdown(sheet), encoding="utf-8")
    json_path.write_text(json.dumps(sheet.to_dict(), indent=2), encoding="utf-8")

    logger.info("Wrote datasheet to %s and %s", md_path, json_path)
    return {"markdown": md_path, "json": json_path}


def generate(
    manifest: Union[dict, str, Path],
    output_dir: Optional[Union[str, Path]] = None,
    **kwargs: Any,
) -> dict[str, Path]:
    """End-to-end convenience: build the datasheet and render it to disk.

    Extra keyword arguments (``name``, ``version``) are forwarded to
    ``build_datasheet``; returns the paths written by ``write_datasheet``.
    """
    return write_datasheet(build_datasheet(manifest, **kwargs), output_dir=output_dir)
Loading