diff --git a/changelog.d/1022.added b/changelog.d/1022.added new file mode 100644 index 000000000..1923a8002 --- /dev/null +++ b/changelog.d/1022.added @@ -0,0 +1 @@ +Added typed Stage 5 release candidate schemas and Stage 4 candidate bundle readers. diff --git a/changelog.d/1024.added b/changelog.d/1024.added new file mode 100644 index 000000000..c71ad19c4 --- /dev/null +++ b/changelog.d/1024.added @@ -0,0 +1 @@ +Added a Stage 5 release candidate validation service backed by the shared validation core. diff --git a/changelog.d/1026.added b/changelog.d/1026.added new file mode 100644 index 000000000..8f15a5f66 --- /dev/null +++ b/changelog.d/1026.added @@ -0,0 +1 @@ +Added typed Stage 5 promotion result models around the existing release transaction engine. diff --git a/changelog.d/1037.added b/changelog.d/1037.added new file mode 100644 index 000000000..dc790375c --- /dev/null +++ b/changelog.d/1037.added @@ -0,0 +1 @@ +Added the Stage 5 release promotion contract and runtime manifest output. diff --git a/changelog.d/1044.added b/changelog.d/1044.added new file mode 100644 index 000000000..f9c04d62a --- /dev/null +++ b/changelog.d/1044.added @@ -0,0 +1 @@ +Added the Stage 5 published artifact index JSONL artifact. diff --git a/changelog.d/1066.added b/changelog.d/1066.added new file mode 100644 index 000000000..dc4b4cc92 --- /dev/null +++ b/changelog.d/1066.added @@ -0,0 +1 @@ +Added Stage 5 promoted runs discovery index generation. diff --git a/docs/engineering/skills/README.md b/docs/engineering/skills/README.md index 3e0f5b874..06ddfba94 100644 --- a/docs/engineering/skills/README.md +++ b/docs/engineering/skills/README.md @@ -24,3 +24,10 @@ Current skills: Stage-specific AI-facing engineering guides live under `docs/engineering/stages/`. Use them alongside these cross-cutting skills when modifying a stage-specific pipeline path. + +Current stage guides: + +- `build_outputs.md`: Stage 4 output-build library boundaries and test + expectations. 
+- `release_promotion.md`: Stage 5 release candidate identity, validation-report + schema, rerun comparison material, and side-effect boundaries. diff --git a/docs/engineering/stages/release_promotion.md b/docs/engineering/stages/release_promotion.md new file mode 100644 index 000000000..b289e7825 --- /dev/null +++ b/docs/engineering/stages/release_promotion.md @@ -0,0 +1,156 @@ +# Release Promotion Stage AI Guide + +This guide is for AI agents and maintainers modifying Stage 5 +(`5_validate_and_promote_release`) code. Stage 5 validates a staged release +candidate, promotes the exact candidate to public Hugging Face and GCS +destinations, writes release/version/completion metadata, and cleans staging +only after completion is certified. + +## Candidate Identity + +Use `policyengine_us_data.release_promotion.ReleasePromotionContext` as the +typed Stage 5 identity boundary. The context must keep these values distinct: + +- `run_id`: the canonical publication run correlation key. +- `candidate_version`: the candidate staging scope used in Hugging Face staging + paths such as `staging/{candidate_version}-{run_id}/...`. +- `release_version`: the final stable public release version. +- `base_release_version` and `release_bump`: optional provenance for how the + candidate scope was chosen. + +Do not resolve a different run ID from the environment inside lower-level +release-promotion logic. Environment resolution belongs at orchestration edges; +Stage 5 library code should receive explicit context. + +## Release Candidate Bundles + +Use `ReleaseCandidateInputBundle` to describe the artifacts Stage 5 is allowed +to validate and promote. Each artifact should be represented by a +`ReleaseArtifactSpec` with a production-relative path, artifact family, source +stage, and optional checksum/size metadata. + +The current compatibility path may build a bundle from the legacy staged path +set produced by Modal orchestration. 
Mark that reader as compatibility-only and +keep it retirable. + +The Stage 4 contract/inventory reader API now exists for migration work: +`build_release_candidate_bundle_from_stage4_contract()` accepts an in-memory +Stage 4 contract plus inventory records, and +`read_stage4_release_candidate_bundle()` reads the same shape from files. +Production Stage 5 code should not depend on Stage 4 contracts until the +contract and inventory are canonical, complete, and populated with semantic +artifact identity plus checksum/size material. + +## Validation Reports + +Stage 5 must use the shared validation schema for durable validation output: + +- `policyengine_us_data.stage_contracts.ValidationReport` +- `policyengine_us_data.stage_contracts.ValidationFinding` +- `policyengine_us_data.stage_contracts.DiagnosticRef` + +Do not create a Stage 5-specific durable validation report, check, finding, or +error schema for contracts, diagnostics, release candidates, status endpoints, +or step manifests. Release-specific details such as missing staged artifacts, +missing validation reports, finalized-release conflicts, version mismatches, or +destination conflicts should live in canonical finding metadata. + +Use `ReleaseCandidateValidator` for the `5a_validate_outputs` library seam. +It wraps `policyengine_us_data.validation_core` and calls existing release +guards through injected dependencies, including staged-artifact presence, +release-manifest preflight, matching finalized-manifest checks, and release +completion marker checks. Keep those dependencies injectable so unit tests do +not need Hugging Face, GCS, Modal, or production credentials. + +## Rerun Comparison Material + +Before public writes, rerun and reuse decisions should compare semantic +candidate identity rather than only checking whether output files exist. 
The +comparison material should include: + +- run ID, candidate version, release version, HF repository, and GCS bucket; +- Stage 4 output contract fingerprint when available; +- output inventory paths/checksums when available; +- validation report paths and their identities when available; +- expected production-relative artifact paths; +- the Stage 5 candidate bundle fingerprint. + +When required artifacts only have paths and no checksum/size identity, treat +the bundle as path-only and do not use its fingerprint for promotion reuse +decisions. + +Already-finalized releases are an idempotency case, not a shortcut around +candidate identity. A finalized release can be reused only when its completion +marker is valid and it matches the requested candidate. + +## Side Effects + +Candidate builders, schema adapters, and rerun comparison helpers should not +perform Hugging Face writes, GCS uploads, Modal calls, staging cleanup, or +release-manifest publication. Keep those operations behind explicit adapters or +services so tests can exercise candidate shape and validation logic without +credentials or network access. + +Use `FullPromotionResult` and its substep result objects when exposing Stage 5 +promotion outcomes to contracts, status APIs, or orchestration summaries. The +current compatibility wrapper, `promote_full_release_with_result()`, must keep +calling the existing transaction engine first and only wrap its dictionary +output afterward so the promotion order remains unchanged. + +## Release Promotion Contract + +Stage 5 writes `release_promotion_contract.json` under the run-local +`diagnostics/contracts/` directory after the promotion transaction succeeds and +before the Stage 5 step manifest is completed. 
The contract is the semantic +record for the Stage 5 boundary: it ties the canonical `run_id`, candidate +identity, Stage 4 output contract reference when available, validation report +paths, public Hugging Face and GCS refs, cleanup status, and typed +`FullPromotionResult` into one durable `StageContract`. + +The contract complements the public release files instead of replacing them: + +- `release_manifest.json` and `releases/{version}/release_manifest.json` remain + the public artifact inventory for the stable release. +- `version_manifest.json` remains the public version registry used by clients + and publication checks. +- `releases/{version}/release-complete.json` remains the final completion + marker and tag target proving the release was fully finalized. +- `release_promotion_contract.json` remains run-scoped diagnostics material for + dashboards, AI agents, rerun comparison, and promotion auditability. + +Runtime step manifests for `5_validate_and_promote_release` should include the +contract as a JSON `contract` output. They may still record legacy validated +input artifacts for compatibility, but the contract is the preferred semantic +entry point for Stage 5 status and lineage. + +## Published Artifact Index + +Stage 5 also writes `published_artifact_index.jsonl` under the run-local +`diagnostics/` directory. Each JSONL row describes one promoted artifact or +release metadata artifact with its canonical `run_id`, candidate version, +release version, source-stage metadata, final Hugging Face URI, and GCS URI +when the artifact is mirrored to GCS. + +Build index rows from typed release candidate and promotion-result objects, not +from console logs. Release manifest entries may supply final checksum, size, +revision, and kind fields for promoted data artifacts; the index should leave +the release manifest schema unchanged. 
The release promotion contract must +reference the index as a `published_artifact_index` output so dashboards and AI +systems can discover the per-artifact rows from the Stage 5 contract. + +## Promoted Runs Index + +Stage 5 writes `calibration/runs/index.json` as the run-oriented discovery +index for promoted releases. Keep this separate from `version_manifest.json`: +the version manifest remains the package-version registry, while the promoted +runs index is keyed by canonical `run_id` and points dashboards, status +surfaces, and AI agents to run diagnostics such as `run_manifest.json`, +`release_promotion_contract.json`, and `published_artifact_index.jsonl`. + +Update the promoted runs index only after release completion succeeds. For an +already-finalized promotion rerun, completion must first be verified by the +existing release completion marker check, then the same `run_id` entry may be +updated idempotently. Repeated promotions must not duplicate run entries or +release-version run lists. The release promotion contract should reference the +index as `promoted_runs_index` and include the update status so readers can see +whether the current promotion created or updated the run discovery record. 
def _promotion_result_from_stdout(promotion_stdout: str):
    """Parse typed promotion results from the promotion subprocess output.

    Args:
        promotion_stdout: Raw stdout captured from the full-release promotion
            subprocess. It is expected to be a single JSON object.

    Returns:
        The value of ``FullPromotionResult.from_legacy_dict`` applied to the
        decoded JSON object.

    Raises:
        RuntimeError: If the output is not valid JSON, or if it decodes to
            something other than a JSON object (``null``, a list, a scalar).
    """

    # Imported lazily, as in the other Stage 5 helpers in this module.
    from policyengine_us_data.release_promotion import FullPromotionResult

    try:
        payload = json.loads(promotion_stdout)
    except json.JSONDecodeError as exc:
        raise RuntimeError(
            "Full release promotion subprocess did not return JSON."
        ) from exc
    # json.loads accepts any JSON value; fail here with a clear message rather
    # than letting a non-mapping reach the legacy-dict adapter.
    if not isinstance(payload, dict):
        raise RuntimeError(
            "Full release promotion subprocess returned JSON that is not an "
            f"object (got {type(payload).__name__})."
        )
    return FullPromotionResult.from_legacy_dict(payload)


def _release_promotion_context_from_run_context(run_context: RunContext):
    """Build the Stage 5 library context from the orchestration run context.

    Identity fields are copied from ``run_context``. Optional fields use
    ``or None`` so falsy values (for example empty strings) become ``None``.
    The Hugging Face repository and GCS bucket names are fixed here, and the
    serialized run context is attached under ``metadata["run_context"]``.
    """

    from policyengine_us_data.release_promotion import ReleasePromotionContext

    return ReleasePromotionContext(
        run_id=run_context.run_id,
        candidate_version=run_context.candidate_version,
        release_version=run_context.release_version,
        hf_repo_name="policyengine/policyengine-us-data",
        gcs_bucket_name="policyengine-us-data",
        base_release_version=run_context.base_release_version or None,
        release_bump=run_context.release_bump or None,
        modal_app_name=run_context.modal_app_name or None,
        modal_environment=run_context.modal_environment or None,
        hf_staging_prefix=run_context.hf_staging_prefix or None,
        metadata={"run_context": run_context.to_dict()},
    )


def _release_artifact_metadata_by_path(
    run_id: str,
    rel_paths: list[str],
) -> dict[str, dict[str, object]]:
    """Return local checksum/size metadata for staged release artifacts.

    Args:
        run_id: Run identifier passed to ``_full_release_manifest_files``.
        rel_paths: Release-relative paths to look up.

    Returns:
        A mapping from release-relative path to ``{"sha256", "size_bytes"}``.
        Paths whose local file is missing or is not a regular file are
        omitted rather than reported as errors.
    """

    metadata: dict[str, dict[str, object]] = {}
    for local_path, rel_path in _full_release_manifest_files(run_id, rel_paths):
        path = Path(local_path)
        # Path.is_file() is already False for missing paths, so no separate
        # exists() check is needed.
        if not path.is_file():
            continue
        reference = ArtifactReference.from_path(path)
        metadata[rel_path] = {
            "sha256": f"sha256:{reference.sha256}",
            "size_bytes": reference.size_bytes,
        }
    return metadata


def _stage4_output_contract_repo_path_if_available(run_id: str) -> str | None:
    """Return the run-repo path for the Stage 4 contract when it exists locally.

    Three candidate locations under the run directory are probed in order and
    the first existing regular file wins. The result is expressed as
    ``calibration/runs/{run_id}/<path relative to the run directory>``.
    Returns ``None`` when no candidate exists.
    """

    run_dir = _run_dir(run_id)
    candidates = (
        run_dir / "diagnostics" / "contracts" / "output_build_contract.json",
        run_dir / "contracts" / "output_build_contract.json",
        run_dir / "output_build_contract.json",
    )
    for path in candidates:
        if path.is_file():
            return f"calibration/runs/{run_id}/{path.relative_to(run_dir).as_posix()}"
    return None
def _write_release_promotion_contract_for_run(
    *,
    meta: RunMetadata,
    run_context: RunContext,
    rel_paths: list[str],
    promotion_result,
) -> tuple[ArtifactReference, ...]:
    """Write Stage 5's run-local index/contract and return manifest references.

    The steps run in a fixed order because later steps read checksums and
    sizes of files written by earlier ones:

    1. Build the legacy candidate bundle from ``rel_paths`` plus local
       checksum/size metadata and the Stage 4 contract path, if found.
    2. Build and write the published artifact index, then reference it.
    3. Upsert this run into the promoted runs index, then reference it.
    4. Write the release promotion contract, which points at both indexes.

    Args:
        meta: Run metadata; ``branch``, ``sha`` and ``version`` are recorded.
        run_context: Orchestration run context for the run being promoted.
        rel_paths: Release-relative artifact paths in the candidate.
        promotion_result: Typed promotion result passed through to the index
            and contract builders.

    Returns:
        Manifest references for the contract, the published artifact index,
        and the promoted runs index, in that order.
    """

    from policyengine_us_data.release_promotion import (
        build_legacy_release_candidate_bundle,
        build_published_artifact_index,
        build_promoted_run_index_entry,
        published_artifact_index_artifact_ref,
        published_artifact_index_path,
        published_artifact_index_repo_path,
        promoted_runs_index_artifact_ref,
        promoted_runs_index_path,
        release_promotion_contract_repo_path,
        release_promotion_contract_path,
        update_promoted_runs_index,
        write_published_artifact_index,
        write_release_promotion_contract,
    )
    from policyengine_us_data.stage_contracts import ArtifactRef

    run_dir = _run_dir(run_context.run_id)
    release_context = _release_promotion_context_from_run_context(run_context)
    # One timestamp is shared by the promoted-run entry, the index update and
    # the contract so all three records agree.
    created_at = datetime.now(timezone.utc).isoformat()
    contract_path = release_promotion_contract_path(run_dir)
    candidate_bundle = build_legacy_release_candidate_bundle(
        context=release_context,
        rel_paths=rel_paths,
        artifact_metadata_by_path=_release_artifact_metadata_by_path(
            run_context.run_id,
            rel_paths,
        ),
        source_output_contract_path=_stage4_output_contract_repo_path_if_available(
            run_context.run_id
        ),
    )
    # The contract is described by URI before it is written, so it carries no
    # checksum or size here; it is passed to the index as a diagnostic artifact.
    contract_artifact = ArtifactRef(
        logical_name="release_promotion_contract",
        uri=(
            f"hf://{release_context.hf_repo_name}/"
            f"{release_promotion_contract_repo_path(release_context.run_id)}"
        ),
        media_type="application/json",
        metadata={
            "artifact_family": "stage_contract",
            "source_stage_id": "5_validate_and_promote_release",
            "relative_path": release_promotion_contract_repo_path(
                release_context.run_id
            ),
        },
    )
    published_index_path = published_artifact_index_path(run_dir)
    published_index_rows = build_published_artifact_index(
        candidate_bundle=candidate_bundle,
        promotion_result=promotion_result,
        diagnostic_artifacts=(contract_artifact,),
    )
    # Write first: the reference below is built from the file on disk.
    write_published_artifact_index(published_index_rows, published_index_path)
    published_index_manifest_ref = ArtifactReference.from_path(
        published_index_path,
        role="index",
        base_dir=run_dir,
        media_type="application/jsonl",
    )
    published_index_artifact = published_artifact_index_artifact_ref(
        release_context,
        row_count=len(published_index_rows),
        sha256=f"sha256:{published_index_manifest_ref.sha256}",
        size_bytes=published_index_manifest_ref.size_bytes,
    )
    # The promoted runs index is located from RUNS_DIR, not from this run's
    # own directory.
    promoted_index_path = promoted_runs_index_path(Path(RUNS_DIR))
    promoted_run_entry = build_promoted_run_index_entry(
        context=release_context,
        promotion_result=promotion_result,
        promoted_at=created_at,
        release_promotion_contract_path=release_promotion_contract_repo_path(
            release_context.run_id
        ),
        published_artifact_index_path=published_artifact_index_repo_path(
            release_context.run_id
        ),
        run_manifest_path=f"calibration/runs/{release_context.run_id}/run_manifest.json",
        step_manifest_path=(
            f"calibration/runs/{release_context.run_id}/steps/"
            f"{VALIDATE_AND_PROMOTE_RELEASE.id}.json"
        ),
        metadata={
            "writer": "modal_app.pipeline.promote_run",
            "branch": meta.branch,
            "code_sha": meta.sha,
            "package_version": meta.version,
        },
    )
    # The first element of the returned pair is unused here; only the update
    # record is needed.
    _, promoted_index_update = update_promoted_runs_index(
        path=promoted_index_path,
        entry=promoted_run_entry,
        updated_at=created_at,
    )
    # NOTE(review): manifest_path is "../<name>", presumably because the index
    # sits one level above the run directory -- confirm against
    # promoted_runs_index_path.
    promoted_index_manifest_ref = ArtifactReference.from_path(
        promoted_index_path,
        role="index",
        manifest_path=f"../{promoted_index_path.name}",
        media_type="application/json",
    )
    promoted_index_artifact = promoted_runs_index_artifact_ref(
        release_context,
        promoted_index_update,
        sha256=f"sha256:{promoted_index_manifest_ref.sha256}",
        size_bytes=promoted_index_manifest_ref.size_bytes,
    )
    # The contract is written last so it can reference both indexes.
    write_release_promotion_contract(
        contract_path=contract_path,
        candidate_bundle=candidate_bundle,
        promotion_result=promotion_result,
        created_at=created_at,
        code_sha=meta.sha,
        package_version=meta.version,
        published_artifact_index=published_index_artifact,
        promoted_runs_index=promoted_index_artifact,
        promoted_runs_index_update=promoted_index_update.to_dict(),
        metadata={
            "writer": "modal_app.pipeline.promote_run",
            "branch": meta.branch,
        },
    )
    return (
        ArtifactReference.from_path(
            contract_path,
            role="contract",
            base_dir=run_dir,
            media_type="application/json",
        ),
        published_index_manifest_ref,
        promoted_index_manifest_ref,
    )
Promotion side effects still live in the existing transaction engine +until later Stage 5 migration slices move them behind typed services. +""" + +from .artifacts import ( + BASE_RELEASE_ARTIFACT_PATHS, + ReleaseArtifactSpec, + dedupe_normalized_release_paths, + infer_artifact_identity, + infer_release_artifact_spec, + logical_name_for_release_path, + normalize_release_path, + strip_staging_prefix, +) +from .candidate import ( + ReleaseCandidateInputBundle, + build_legacy_release_candidate_bundle, + build_release_candidate_bundle_from_stage4_contract, + read_stage4_release_candidate_bundle, +) +from .context import ReleasePromotionContext +from .contract import ( + RELEASE_PROMOTION_CONTRACT_FILENAME, + RELEASE_PROMOTION_CONTRACT_TYPE, + ReleasePromotionContractBuilder, + build_release_promotion_contract, + release_promotion_contract_path, + release_promotion_contract_repo_path, + write_release_promotion_contract, +) +from .published_index import ( + PUBLISHED_ARTIFACT_INDEX_FILENAME, + PUBLISHED_ARTIFACT_INDEX_MEDIA_TYPE, + PublishedArtifactIndexRow, + build_published_artifact_index, + published_artifact_index_artifact_ref, + published_artifact_index_from_jsonl, + published_artifact_index_path, + published_artifact_index_repo_path, + published_artifact_index_to_jsonl, + read_published_artifact_index, + write_published_artifact_index, +) +from .promoted_runs_index import ( + PROMOTED_RUNS_INDEX_FILENAME, + PROMOTED_RUNS_INDEX_MEDIA_TYPE, + PromotedReleaseVersionEntry, + PromotedRunIndexEntry, + PromotedRunsIndex, + PromotedRunsIndexUpdate, + build_promoted_run_index_entry, + empty_promoted_runs_index, + load_promoted_runs_index, + promoted_runs_index_artifact_ref, + promoted_runs_index_from_json, + promoted_runs_index_path, + promoted_runs_index_repo_path, + promoted_runs_index_to_json, + read_promoted_runs_index, + update_promoted_runs_index, + upsert_promoted_run, + write_promoted_runs_index, +) +from .results import ( + CleanupPromotionResult, + 
CompletionMarkerPromotionResult, + FullPromotionResult, + GcsPromotionResult, + HuggingFacePromotionResult, + ReleaseManifestPromotionResult, + VersionManifestPromotionResult, +) +from .validation import build_release_candidate_shape_report +from .validation import ( + DEFAULT_REQUIRED_RELEASE_ARTIFACT_FAMILIES, + RELEASE_VALIDATION_SUBSTAGE_ID, + ReleaseCandidateValidationDependencies, + ReleaseCandidateValidator, + default_release_candidate_validation_dependencies, +) + +__all__ = [ + "BASE_RELEASE_ARTIFACT_PATHS", + "DEFAULT_REQUIRED_RELEASE_ARTIFACT_FAMILIES", + "RELEASE_VALIDATION_SUBSTAGE_ID", + "RELEASE_PROMOTION_CONTRACT_FILENAME", + "RELEASE_PROMOTION_CONTRACT_TYPE", + "PUBLISHED_ARTIFACT_INDEX_FILENAME", + "PUBLISHED_ARTIFACT_INDEX_MEDIA_TYPE", + "PROMOTED_RUNS_INDEX_FILENAME", + "PROMOTED_RUNS_INDEX_MEDIA_TYPE", + "CleanupPromotionResult", + "CompletionMarkerPromotionResult", + "FullPromotionResult", + "GcsPromotionResult", + "HuggingFacePromotionResult", + "PublishedArtifactIndexRow", + "PromotedReleaseVersionEntry", + "PromotedRunIndexEntry", + "PromotedRunsIndex", + "PromotedRunsIndexUpdate", + "ReleaseArtifactSpec", + "ReleaseCandidateInputBundle", + "ReleasePromotionContractBuilder", + "ReleasePromotionContext", + "ReleaseCandidateValidationDependencies", + "ReleaseCandidateValidator", + "ReleaseManifestPromotionResult", + "VersionManifestPromotionResult", + "build_legacy_release_candidate_bundle", + "build_published_artifact_index", + "build_promoted_run_index_entry", + "build_release_promotion_contract", + "build_release_candidate_bundle_from_stage4_contract", + "build_release_candidate_shape_report", + "default_release_candidate_validation_dependencies", + "dedupe_normalized_release_paths", + "empty_promoted_runs_index", + "infer_artifact_identity", + "infer_release_artifact_spec", + "load_promoted_runs_index", + "logical_name_for_release_path", + "normalize_release_path", + "published_artifact_index_artifact_ref", + 
"published_artifact_index_from_jsonl", + "published_artifact_index_path", + "published_artifact_index_repo_path", + "published_artifact_index_to_jsonl", + "promoted_runs_index_artifact_ref", + "promoted_runs_index_from_json", + "promoted_runs_index_path", + "promoted_runs_index_repo_path", + "promoted_runs_index_to_json", + "release_promotion_contract_path", + "release_promotion_contract_repo_path", + "read_published_artifact_index", + "read_promoted_runs_index", + "read_stage4_release_candidate_bundle", + "strip_staging_prefix", + "update_promoted_runs_index", + "upsert_promoted_run", + "write_published_artifact_index", + "write_promoted_runs_index", + "write_release_promotion_contract", +] diff --git a/policyengine_us_data/release_promotion/artifacts.py b/policyengine_us_data/release_promotion/artifacts.py new file mode 100644 index 000000000..c5045550a --- /dev/null +++ b/policyengine_us_data/release_promotion/artifacts.py @@ -0,0 +1,299 @@ +"""Release artifact identity helpers for Stage 5 candidate bundles.""" + +from __future__ import annotations + +from collections.abc import Mapping, Sequence +from dataclasses import dataclass, field +from pathlib import PurePosixPath +import posixpath +from typing import Any + +from policyengine_us_data.pipeline_metadata import pipeline_node +from policyengine_us_data.stage_contracts import ArtifactRef +from policyengine_us_data.stage_contracts._coercion import ( + freeze_mapping, + jsonable_value, + mapping_value, + optional_int_value, + optional_string, + optional_string_value, + require_non_empty, + required_string, + schema_version, + validate_optional_int, + validate_schema_version, +) +from policyengine_us_data.stage_contracts.constants import CONTRACT_SCHEMA_VERSION +from policyengine_us_data.stage_contracts.stages import ( + STAGE_1_BUILD_DATASETS, + STAGE_4_BUILD_OUTPUTS, + is_canonical_stage_id, +) + +BASE_RELEASE_ARTIFACT_PATHS = ( + "cps_2024.h5", + "policy_data.db", + "enhanced_cps_2024.h5", + 
BASE_RELEASE_LOGICAL_NAMES = {
    "cps_2024.h5": "cps_2024",
    "policy_data.db": "policy_data_db",
    "enhanced_cps_2024.h5": "enhanced_cps_2024",
    "small_enhanced_cps_2024.h5": "small_enhanced_cps_2024",
}


@pipeline_node(
    id="release_artifact_spec",
    label="ReleaseArtifactSpec",
    node_type="library",
    description="Normalized per-artifact identity for a Stage 5 release candidate.",
    status="transitional",
    stability="moving",
    pathways=["5_validate_and_promote_release"],
    validation_commands=[
        "uv run pytest tests/unit/release_promotion/test_candidate.py"
    ],
)
@dataclass(frozen=True, kw_only=True)
class ReleaseArtifactSpec:
    """Immutable identity record for a single Stage 5 release artifact.

    All fields are validated and normalized in ``__post_init__``; invalid
    input raises ``ValueError``.
    """

    logical_name: str
    relative_path: str
    artifact_family: str
    source_stage_id: str
    area_type: str | None = None
    area_id: str | None = None
    sha256: str | None = None
    size_bytes: int | None = None
    required: bool = True
    metadata: Mapping[str, Any] = field(default_factory=dict)
    schema_version: str = CONTRACT_SCHEMA_VERSION

    def __post_init__(self) -> None:
        validate_schema_version(self.schema_version, type(self).__name__)

        def assign(field_name: str, value: Any) -> None:
            # The dataclass is frozen, so normalized values are written back
            # through object.__setattr__.
            object.__setattr__(self, field_name, value)

        assign("logical_name", require_non_empty(self.logical_name, "logical_name"))
        assign("relative_path", normalize_release_path(self.relative_path))
        for text_field in ("artifact_family", "source_stage_id"):
            assign(
                text_field,
                require_non_empty(getattr(self, text_field), text_field),
            )
        if not is_canonical_stage_id(self.source_stage_id):
            raise ValueError(f"Invalid source_stage_id: {self.source_stage_id!r}")
        for optional_field in ("area_type", "area_id", "sha256"):
            assign(
                optional_field,
                optional_string_value(getattr(self, optional_field), optional_field),
            )
        validate_optional_int(self.size_bytes, "size_bytes")
        if self.size_bytes is not None and self.size_bytes < 0:
            raise ValueError("size_bytes must be non-negative")
        if not isinstance(self.required, bool):
            raise ValueError("required must be a boolean")
        assign("metadata", freeze_mapping(self.metadata, "metadata"))

    def to_artifact_ref(self, *, uri_prefix: str = "") -> ArtifactRef:
        """Convert this spec into a generic stage-contract ``ArtifactRef``.

        With a non-empty ``uri_prefix`` the URI is the prefix (trailing
        slashes removed) joined to ``relative_path``; otherwise it is the
        relative path alone. Identity fields are layered over the spec's own
        metadata and win on key collisions.
        """

        if uri_prefix:
            uri = f"{uri_prefix.rstrip('/')}/{self.relative_path}"
        else:
            uri = self.relative_path
        ref_metadata = {
            **jsonable_value(self.metadata),
            "artifact_family": self.artifact_family,
            "source_stage_id": self.source_stage_id,
            "area_type": self.area_type,
            "area_id": self.area_id,
            "required": self.required,
        }
        return ArtifactRef(
            logical_name=self.logical_name,
            uri=uri,
            sha256=self.sha256,
            size_bytes=self.size_bytes,
            metadata=ref_metadata,
        )

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-compatible dictionary of every field."""

        plain_fields = (
            "logical_name",
            "relative_path",
            "artifact_family",
            "source_stage_id",
            "area_type",
            "area_id",
            "sha256",
            "size_bytes",
            "required",
        )
        payload: dict[str, Any] = {name: getattr(self, name) for name in plain_fields}
        payload["metadata"] = jsonable_value(self.metadata)
        payload["schema_version"] = self.schema_version
        return payload

    @classmethod
    def from_dict(cls, data: Mapping[str, Any]) -> "ReleaseArtifactSpec":
        """Rebuild a spec from the mapping produced by ``to_dict``."""

        kwargs: dict[str, Any] = {
            "logical_name": required_string(data, "logical_name"),
            "relative_path": required_string(data, "relative_path"),
            "artifact_family": required_string(data, "artifact_family"),
            "source_stage_id": required_string(data, "source_stage_id"),
            "area_type": optional_string(data, "area_type"),
            "area_id": optional_string(data, "area_id"),
            "sha256": optional_string(data, "sha256"),
            "size_bytes": optional_int_value(data, "size_bytes"),
            "required": data.get("required", True),
            "metadata": mapping_value(data, "metadata"),
            "schema_version": schema_version(data),
        }
        return cls(**kwargs)


def normalize_release_path(path: str) -> str:
    """Return ``path`` as a clean repo-relative POSIX path.

    Backslashes become forward slashes and the result is collapsed with
    ``posixpath.normpath``. Raises ``ValueError`` for non-strings, empty
    paths, URIs, parent traversal, drive-style first segments, and absolute
    paths.
    """

    if not isinstance(path, str):
        raise ValueError("path must be a non-empty string")
    candidate = require_non_empty(path.strip().replace("\\", "/"), "path")
    if "://" in candidate:
        raise ValueError("release paths must be repo-relative, not URIs")
    # Checked before normpath, which would otherwise fold "a/../b" into "b".
    if any(segment == ".." for segment in PurePosixPath(candidate).parts):
        raise ValueError(f"release path must not contain parent traversal: {path!r}")
    normalized = posixpath.normpath(candidate)
    if normalized in {"", "."}:
        raise ValueError("release path must not be empty")
    first_segment, _, _ = normalized.partition("/")
    if ":" in first_segment:
        raise ValueError("release path must be repo-relative, not a drive path")
    if normalized == ".." or normalized.startswith(("/", "../")):
        raise ValueError(f"release path must stay inside the repo: {path!r}")
    return normalized
def strip_staging_prefix(path: str, staging_prefix: str | None) -> str:
    """Convert a staged HF path into its production-relative form.

    Without a staging prefix the normalized path is returned as-is. A path
    under the prefix has the prefix removed. Any other path under
    ``staging/`` raises ``ValueError``; everything else passes through.
    """

    release_path = normalize_release_path(path)
    if not staging_prefix:
        return release_path
    expected_root = f"{normalize_release_path(staging_prefix)}/"
    if release_path.startswith(expected_root):
        return release_path[len(expected_root) :]
    if release_path.startswith("staging/"):
        raise ValueError(
            "staged release path does not match the expected staging prefix"
        )
    return release_path


def dedupe_normalized_release_paths(
    paths: Sequence[str],
    *,
    staging_prefix: str | None = None,
) -> tuple[str, ...]:
    """Normalize each path and drop repeats, keeping first-seen order."""

    # dict keys keep insertion order, so later duplicates are ignored.
    ordered_unique = dict.fromkeys(
        strip_staging_prefix(raw_path, staging_prefix) for raw_path in paths
    )
    return tuple(ordered_unique)


def infer_release_artifact_spec(
    relative_path: str,
    *,
    sha256: str | None = None,
    size_bytes: int | None = None,
    required: bool = True,
    metadata: Mapping[str, Any] | None = None,
) -> ReleaseArtifactSpec:
    """Build a ``ReleaseArtifactSpec`` whose identity is derived from the path.

    Family, area and source stage come from ``infer_artifact_identity``; the
    logical name comes from ``logical_name_for_release_path``. Checksum,
    size, ``required`` and metadata are passed through unchanged.
    """

    release_path = normalize_release_path(relative_path)
    artifact_family, area_type, area_id, stage_id = infer_artifact_identity(
        release_path
    )
    return ReleaseArtifactSpec(
        logical_name=logical_name_for_release_path(release_path),
        relative_path=release_path,
        artifact_family=artifact_family,
        source_stage_id=stage_id,
        area_type=area_type,
        area_id=area_id,
        sha256=sha256,
        size_bytes=size_bytes,
        required=required,
        metadata=metadata or {},
    )


def infer_artifact_identity(
    relative_path: str,
) -> tuple[str, str | None, str | None, str]:
    """Classify a repo path as ``(family, area_type, area_id, source_stage)``.

    Base dataset paths map to Stage 1. ``national/US.h5`` and two-part paths
    under ``states/``, ``districts/`` or ``cities/`` map to Stage 4 with an
    area identity. Anything else is a generic Stage 4 ``release_artifact``.
    """

    release_path = normalize_release_path(relative_path)
    segments = PurePosixPath(release_path).parts
    if release_path in BASE_RELEASE_ARTIFACT_PATHS:
        return "base_dataset", None, None, STAGE_1_BUILD_DATASETS
    if segments == ("national", "US.h5"):
        return "national_h5", "national", "US", STAGE_4_BUILD_OUTPUTS
    if len(segments) == 2:
        directory, filename = segments
        singular_by_directory = {
            "states": "state",
            "districts": "district",
            "cities": "city",
        }
        singular = singular_by_directory.get(directory)
        if singular is not None:
            return (
                f"{singular}_h5",
                singular,
                PurePosixPath(filename).stem,
                STAGE_4_BUILD_OUTPUTS,
            )
    return "release_artifact", None, None, STAGE_4_BUILD_OUTPUTS
def logical_name_for_release_path(relative_path: str) -> str:
    """Map a release repo path to its logical name.

    The path is normalized first. A path listed in
    ``BASE_RELEASE_LOGICAL_NAMES`` uses the name registered there. Any other
    path is returned with its final suffix removed, or unchanged when it has
    no suffix.
    """

    normalized = normalize_release_path(relative_path)
    if normalized in BASE_RELEASE_LOGICAL_NAMES:
        return BASE_RELEASE_LOGICAL_NAMES[normalized]
    posix_path = PurePosixPath(normalized)
    return str(posix_path.with_suffix("")) if posix_path.suffix else normalized
@pipeline_node(
    id="release_candidate_input_bundle",
    label="ReleaseCandidateInputBundle",
    node_type="library",
    description="Typed Stage 5 input bundle describing artifacts eligible for release promotion.",
    status="transitional",
    stability="moving",
    pathways=["5_validate_and_promote_release"],
    validation_commands=[
        "uv run pytest tests/unit/release_promotion/test_candidate.py"
    ],
)
@dataclass(frozen=True, kw_only=True)
class ReleaseCandidateInputBundle:
    """Typed Stage 5 input bundle describing a candidate ready for promotion.

    Instances are frozen and keyword-only. ``__post_init__`` validates the
    fixed identity fields (``schema_version``, ``bundle_type``, ``stage_id``),
    requires a ``ReleasePromotionContext`` and at least one artifact, and
    stores normalized forms of the contract, validation-report, and
    diagnostics-manifest paths.

    Raises:
        ValueError: if ``bundle_type`` or ``stage_id`` differ from the module
            constants, ``context`` is not a ``ReleasePromotionContext``,
            ``artifacts`` is empty, or a path fails run-scoped normalization.
    """

    # Run, candidate, and release identity that all run-scoped paths are
    # checked against.
    context: ReleasePromotionContext
    # Artifacts eligible for promotion; must be non-empty.
    artifacts: tuple[ReleaseArtifactSpec, ...]
    # Optional location of the Stage 4 output contract this bundle came from.
    source_output_contract_path: str | None = None
    # Optional candidate fingerprint; the builders in this module leave it
    # None when required artifacts lack sha256/size identity.
    release_candidate_fingerprint: str | None = None
    # Validation report locations under the run's diagnostics directory.
    validation_report_paths: tuple[str, ...] = ()
    # Optional diagnostics manifest location under the same directory.
    diagnostics_manifest_path: str | None = None
    # Free-form metadata; frozen in __post_init__.
    metadata: Mapping[str, Any] = field(default_factory=dict)
    # Fixed discriminators; any other value is rejected in __post_init__.
    bundle_type: str = RELEASE_CANDIDATE_BUNDLE_TYPE
    stage_id: str = STAGE_5_VALIDATE_AND_PROMOTE_RELEASE
    schema_version: str = CONTRACT_SCHEMA_VERSION

    def __post_init__(self) -> None:
        """Validate identity fields and store normalized field values."""
        validate_schema_version(self.schema_version, self.__class__.__name__)
        if self.bundle_type != RELEASE_CANDIDATE_BUNDLE_TYPE:
            raise ValueError(f"bundle_type must be {RELEASE_CANDIDATE_BUNDLE_TYPE!r}")
        if self.stage_id != STAGE_5_VALIDATE_AND_PROMOTE_RELEASE:
            raise ValueError(
                f"stage_id must be {STAGE_5_VALIDATE_AND_PROMOTE_RELEASE!r}"
            )
        if not isinstance(self.context, ReleasePromotionContext):
            raise ValueError("context must be ReleasePromotionContext")
        # The dataclass is frozen, so normalized values are written back with
        # object.__setattr__ rather than plain attribute assignment.
        # NOTE(review): freeze_sequence presumably type-checks the items and
        # returns an immutable sequence - confirm against _coercion.
        object.__setattr__(
            self,
            "artifacts",
            freeze_sequence(self.artifacts, "artifacts", ReleaseArtifactSpec),
        )
        if not self.artifacts:
            raise ValueError("artifacts must include at least one release artifact")
        # The contract path is normalized by _normalize_run_contract_path,
        # which raises when it is outside calibration/runs/{run_id}/.
        object.__setattr__(
            self,
            "source_output_contract_path",
            (
                _normalize_run_contract_path(
                    self.source_output_contract_path,
                    self.context,
                )
                if self.source_output_contract_path is not None
                else None
            ),
        )
        object.__setattr__(
            self,
            "release_candidate_fingerprint",
            optional_string_value(
                self.release_candidate_fingerprint,
                "release_candidate_fingerprint",
            ),
        )
        # Report and manifest paths are normalized by
        # _normalize_run_diagnostic_path, which raises when they are outside
        # calibration/runs/{run_id}/diagnostics/.
        object.__setattr__(
            self,
            "validation_report_paths",
            tuple(
                _normalize_run_diagnostic_path(path, self.context)
                for path in self.validation_report_paths
            ),
        )
        object.__setattr__(
            self,
            "diagnostics_manifest_path",
            (
                _normalize_run_diagnostic_path(
                    self.diagnostics_manifest_path,
                    self.context,
                )
                if self.diagnostics_manifest_path is not None
                else None
            ),
        )
        object.__setattr__(
            self,
            "metadata",
            freeze_mapping(self.metadata, "metadata"),
        )

    def to_dict(self) -> dict[str, Any]:
        """Serialize the candidate bundle to JSON-compatible primitives."""

        return {
            "bundle_type": self.bundle_type,
            "stage_id": self.stage_id,
            "schema_version": self.schema_version,
            "context": self.context.to_dict(),
            "source_output_contract_path": self.source_output_contract_path,
            "release_candidate_fingerprint": self.release_candidate_fingerprint,
            "artifacts": [artifact.to_dict() for artifact in self.artifacts],
            "validation_report_paths": list(self.validation_report_paths),
            "diagnostics_manifest_path": self.diagnostics_manifest_path,
            "metadata": jsonable_value(self.metadata),
        }

    @classmethod
    def from_dict(cls, data: Mapping[str, Any]) -> "ReleaseCandidateInputBundle":
        """Restore a release candidate bundle from serialized data.

        ``context`` is the only key read with direct indexing, so a missing
        ``context`` raises ``KeyError``. The remaining keys are read through
        ``.get`` defaults or the coercion helpers, and the resulting instance
        is validated again by ``__post_init__``.
        """

        return cls(
            context=ReleasePromotionContext.from_dict(data["context"]),
            source_output_contract_path=optional_string(
                data,
                "source_output_contract_path",
            ),
            release_candidate_fingerprint=optional_string(
                data,
                "release_candidate_fingerprint",
            ),
            artifacts=tuple(
                ReleaseArtifactSpec.from_dict(item)
                for item in data.get("artifacts", ())
            ),
            # Each entry is wrapped in a one-key mapping so required_string
            # can be applied to a bare list item.
            validation_report_paths=tuple(
                required_string({"path": item}, "path")
                for item in data.get("validation_report_paths", ())
            ),
            diagnostics_manifest_path=optional_string(
                data,
                "diagnostics_manifest_path",
            ),
            metadata=data.get("metadata", {}),
            bundle_type=data.get("bundle_type", RELEASE_CANDIDATE_BUNDLE_TYPE),
            stage_id=data.get("stage_id", STAGE_5_VALIDATE_AND_PROMOTE_RELEASE),
            schema_version=schema_version(data),
        )
@pipeline_node(
    id="stage4_release_candidate_bundle_builder",
    label="Stage 4 Release Candidate Bundle Builder",
    node_type="library",
    description="Build a Stage 5 candidate bundle from Stage 4 contract and inventory records.",
    status="transitional",
    stability="moving",
    pathways=["5_validate_and_promote_release"],
    validation_commands=[
        "uv run pytest tests/unit/release_promotion/test_candidate.py"
    ],
)
def build_release_candidate_bundle_from_stage4_contract(
    *,
    context: ReleasePromotionContext,
    output_contract: StageContract,
    inventory_records: Iterable[Mapping[str, Any]] = (),
    source_output_contract_path: str | None = None,
    validation_report_paths: Sequence[str] = (),
    diagnostics_manifest_path: str | None = None,
) -> ReleaseCandidateInputBundle:
    """Assemble a Stage 5 candidate bundle from a Stage 4 output contract.

    The contract is checked against ``context`` first. Artifact specs are then
    read from the inventory records and from the contract outputs, and merged
    by release path. When ``diagnostics_manifest_path`` is falsy it is derived
    from the contract. The contract fingerprint, its stage id, and any
    diagnostics manifest identity are added to the fingerprint material.

    Raises:
        ValueError: if no artifact with a release-relative path is found, or
            if any helper rejects the contract, a record, or a path.
    """

    _validate_stage4_contract_context(output_contract, context)

    # Inventory records are converted before contract outputs, so an invalid
    # inventory record is reported first.
    specs_from_inventory = tuple(
        _artifact_spec_from_inventory_record(record, context=context)
        for record in inventory_records
    )
    specs_from_contract: list[ReleaseArtifactSpec] = []
    for output_artifact in output_contract.outputs:
        contract_spec = _artifact_spec_from_contract_artifact(
            output_artifact,
            context=context,
        )
        # Outputs without a release path, and diagnostics outputs, yield None.
        if contract_spec is not None:
            specs_from_contract.append(contract_spec)

    merged_specs = _merge_artifact_specs(
        tuple(specs_from_contract),
        specs_from_inventory,
    )
    if not merged_specs:
        raise ValueError(
            "Stage 4 candidate reader needs inventory records or output artifacts "
            "with release-relative paths"
        )

    manifest_path = diagnostics_manifest_path
    if not manifest_path:
        manifest_path = _diagnostics_manifest_path(output_contract, context=context)

    fingerprint_extras: dict[str, Any] = {
        "source_output_contract_fingerprint": output_contract.fingerprint.value,
        "source_output_contract_stage_id": output_contract.stage_id,
    }
    manifest_identity = _diagnostics_manifest_identity(
        output_contract,
        context=context,
    )
    if manifest_identity is not None:
        fingerprint_extras["diagnostics_manifest_identity"] = manifest_identity

    ordered_specs = sorted(merged_specs, key=lambda spec: spec.relative_path)
    return _candidate_bundle_with_fingerprint(
        context=context,
        artifacts=tuple(ordered_specs),
        source_output_contract_path=source_output_contract_path,
        validation_report_paths=validation_report_paths,
        diagnostics_manifest_path=manifest_path,
        reader="stage4_contract",
        extra_fingerprint_material=fingerprint_extras,
    )
def _candidate_bundle_with_fingerprint(
    *,
    context: ReleasePromotionContext,
    artifacts: tuple[ReleaseArtifactSpec, ...],
    source_output_contract_path: str | None,
    validation_report_paths: Sequence[str],
    diagnostics_manifest_path: str | None,
    reader: str,
    extra_fingerprint_material: Mapping[str, Any] | None = None,
) -> ReleaseCandidateInputBundle:
    """Build the bundle, with a fingerprint when artifact identity is complete.

    Artifacts are sorted by ``relative_path`` and the supplied paths are
    normalized against ``context``. A fingerprint is computed only when
    ``_fingerprint_identity_status`` reports ``"complete"``; otherwise it stays
    ``None``. The bundle metadata records the reader name, the status, and the
    paths of artifacts missing identity.
    """

    ordered_artifacts = tuple(sorted(artifacts, key=lambda spec: spec.relative_path))

    contract_path = None
    if source_output_contract_path is not None:
        contract_path = _normalize_run_contract_path(
            source_output_contract_path,
            context,
        )
    report_paths = tuple(
        _normalize_run_diagnostic_path(report_path, context)
        for report_path in validation_report_paths
    )
    manifest_path = None
    if diagnostics_manifest_path is not None:
        manifest_path = _normalize_run_diagnostic_path(
            diagnostics_manifest_path,
            context,
        )

    status, paths_missing_identity = _fingerprint_identity_status(ordered_artifacts)
    fingerprint = None
    if status == "complete":
        material = {
            "reader": reader,
            "context": _context_fingerprint_material(context),
            "artifacts": [
                _artifact_fingerprint_material(spec) for spec in ordered_artifacts
            ],
            "source_output_contract_path": contract_path,
            # Report paths are sorted in the material only; the bundle keeps
            # them in the order they were given.
            "validation_report_paths": sorted(report_paths),
            "diagnostics_manifest_path": manifest_path,
        }
        # Caller-supplied material is applied last and overrides any base key
        # of the same name.
        material.update(extra_fingerprint_material or {})
        fingerprint = fingerprint_material(material).value

    return ReleaseCandidateInputBundle(
        context=context,
        artifacts=ordered_artifacts,
        source_output_contract_path=contract_path,
        release_candidate_fingerprint=fingerprint,
        validation_report_paths=report_paths,
        diagnostics_manifest_path=manifest_path,
        metadata={
            "reader": reader,
            "fingerprint_status": status,
            "missing_fingerprint_identity_paths": paths_missing_identity,
        },
    )
def _artifact_specs_by_path(
    specs: Sequence[ReleaseArtifactSpec],
    *,
    source: str,
) -> dict[str, ReleaseArtifactSpec]:
    """Index ``specs`` by ``relative_path``, merging repeated paths.

    The first spec seen for a path is stored as-is. Each later spec with the
    same path is merged into the stored one by
    ``_merge_duplicate_artifact_spec``, with ``source`` passed through for its
    conflict messages. Keys keep first-seen order.
    """

    indexed: dict[str, ReleaseArtifactSpec] = {}
    for candidate in specs:
        key = candidate.relative_path
        existing = indexed.get(key)
        indexed[key] = (
            candidate
            if existing is None
            else _merge_duplicate_artifact_spec(
                contract_spec=existing,
                inventory_spec=candidate,
                source=source,
            )
        )
    return indexed
def _fingerprint_identity_status(
    artifacts: Sequence[ReleaseArtifactSpec],
) -> tuple[str, tuple[str, ...]]:
    """Report whether every required artifact has sha256 and size identity.

    Returns ``("complete", ())`` when no required artifact lacks ``sha256`` or
    ``size_bytes``. Otherwise returns
    ``("path_only_missing_artifact_identity", paths)``, where ``paths`` lists
    the affected ``relative_path`` values in input order. Artifacts that are
    not required are ignored.
    """

    incomplete: list[str] = []
    for spec in artifacts:
        if not spec.required:
            continue
        if spec.sha256 is None or spec.size_bytes is None:
            incomplete.append(spec.relative_path)
    if incomplete:
        return "path_only_missing_artifact_identity", tuple(incomplete)
    return "complete", ()
def _inventory_record_paths(record: Mapping[str, Any]) -> tuple[str, ...]:
    """Collect every non-empty string path field from an inventory record.

    The keys in ``_INVENTORY_PATH_KEYS`` are read from ``record`` itself and
    then, when ``record["artifact"]`` is a mapping, from that nested mapping.
    Values are returned in that lookup order and are not deduplicated.
    """

    sources: list[Mapping[str, Any]] = [record]
    nested = record.get("artifact")
    if isinstance(nested, Mapping):
        sources.append(nested)

    found: list[str] = []
    for source in sources:
        for key in _INVENTORY_PATH_KEYS:
            value = source.get(key)
            if isinstance(value, str) and value:
                found.append(value)
    return tuple(found)
def _diagnostics_manifest_path(
    output_contract: StageContract,
    *,
    context: ReleasePromotionContext,
) -> str | None:
    """Return the first resolvable diagnostics manifest path, or ``None``.

    Diagnostic references that look like a diagnostics manifest are tried
    first, followed by contract outputs classified as diagnostics artifacts.
    The first candidate whose path resolves through
    ``_diagnostic_artifact_path`` wins.
    """

    def _manifest_candidates():
        # Lazy, so each candidate is resolved before the next is inspected.
        for ref in _diagnostic_refs(output_contract):
            ref_artifact = ref.artifact
            if ref_artifact is None:
                continue
            if _is_diagnostics_manifest_ref(ref.name, ref.kind, ref_artifact):
                yield ref_artifact
        for output_artifact in output_contract.outputs:
            if _is_diagnostics_artifact(output_artifact):
                yield output_artifact

    for candidate in _manifest_candidates():
        resolved = _diagnostic_artifact_path(candidate, context)
        if resolved is not None:
            return resolved
    return None
artifact) + return None + + +def _diagnostic_artifact_identity( + path: str, + artifact: ArtifactRef, +) -> dict[str, Any]: + return { + "path": path, + "logical_name": artifact.logical_name, + "uri": artifact.uri, + "sha256": artifact.sha256, + "size_bytes": artifact.size_bytes, + } + + +def _optional_record_string(record: Mapping[str, Any], key: str) -> str | None: + value = record.get(key) + return value if isinstance(value, str) and value else None + + +def _optional_nested_record_string( + record: Mapping[str, Any], + key: str, +) -> str | None: + value = _record_value(record, key) + return value if isinstance(value, str) and value else None + + +def _optional_record_int(record: Mapping[str, Any], key: str) -> int | None: + value = _record_value(record, key) + if value is None: + return None + if isinstance(value, bool) or not isinstance(value, int): + raise ValueError(f"inventory record {key} must be an integer") + return value + + +def _artifact_spec_from_stage4_mapping( + record: Mapping[str, Any], + *, + relative_path: str, + metadata: Mapping[str, Any], + default_logical_name: str | None = None, + default_sha256: str | None = None, + default_size_bytes: int | None = None, + allow_inferred_semantics: bool = False, +) -> ReleaseArtifactSpec: + inferred_family, inferred_area_type, inferred_area_id, inferred_stage_id = ( + infer_artifact_identity(relative_path) + ) + size_bytes = _optional_record_int(record, "size_bytes") + return ReleaseArtifactSpec( + logical_name=_stage4_string( + record, + "logical_name", + default=default_logical_name, + inferred=( + logical_name_for_release_path(relative_path) + if allow_inferred_semantics + else None + ), + ), + relative_path=relative_path, + artifact_family=_stage4_string( + record, + "artifact_family", + inferred=inferred_family if allow_inferred_semantics else None, + ), + source_stage_id=_stage4_string( + record, + "source_stage_id", + inferred=inferred_stage_id if allow_inferred_semantics else None, + ), + 
def _stage4_string(
    record: Mapping[str, Any],
    key: str,
    *,
    default: str | None = None,
    inferred: str | None = None,
) -> str:
    """Return a required non-empty string field of a Stage 4 record.

    The value is looked up with ``_record_value``. When the key is absent the
    fallback is ``default`` if truthy, otherwise ``inferred``.

    Raises:
        ValueError: if the resolved value is not a non-empty string.
    """

    fallback = default or inferred
    candidate = _record_value(record, key, default=fallback)
    if isinstance(candidate, str) and candidate:
        return candidate
    raise ValueError(f"Stage 4 candidate records must include {key}")
def _validate_uri_repo(
    parsed_uri,
    context: ReleasePromotionContext,
) -> None:
    """Reject a parsed URI whose repo differs from ``context.hf_repo_name``.

    The repo is taken to be ``"{netloc}/{first path segment}"``. A URI that
    lacks a scheme, a netloc, or a first path segment is not checked.

    Raises:
        ValueError: if the derived repo name differs from
            ``context.hf_repo_name``.
    """

    if not (parsed_uri.scheme and parsed_uri.netloc):
        return
    first_segment, _, _ = parsed_uri.path.strip("/").partition("/")
    if not first_segment:
        return
    if f"{parsed_uri.netloc}/{first_segment}" != context.hf_repo_name:
        raise ValueError("external artifact URI repo must match context.hf_repo_name")
def _normalize_run_diagnostic_path(
    path: str,
    context: ReleasePromotionContext,
) -> str:
    """Normalize a diagnostics path and require it to belong to this run.

    The staging prefix from ``context`` is stripped first. The result must
    start with ``calibration/runs/{context.run_id}/diagnostics/``.

    Raises:
        ValueError: if the normalized path is outside that directory, or if
            ``strip_staging_prefix`` rejects the path.
    """

    release_path = strip_staging_prefix(path, context.hf_staging_prefix)
    expected_root = f"calibration/runs/{context.run_id}/diagnostics/"
    if release_path.startswith(expected_root):
        return release_path
    raise ValueError(
        "diagnostic and validation report paths must live under "
        f"{expected_root} for context.run_id"
    )
def _record_value(
    record: Mapping[str, Any],
    key: str,
    *,
    default: Any = None,
) -> Any:
    """Look up ``key`` on ``record``, then on its nested ``"artifact"`` mapping.

    A key present at the top level wins even when its value is ``None``. The
    nested mapping is consulted only when ``record["artifact"]`` is a
    ``Mapping``. ``default`` is returned when neither level has the key.
    """

    if key in record:
        return record[key]
    nested = record.get("artifact")
    if not isinstance(nested, Mapping) or key not in nested:
        return default
    return nested[key]
json.loads(stripped) + if not isinstance(payload, Mapping): + raise ValueError("output inventory JSONL rows must be mappings") + yield payload diff --git a/policyengine_us_data/release_promotion/context.py b/policyengine_us_data/release_promotion/context.py new file mode 100644 index 000000000..298a58b68 --- /dev/null +++ b/policyengine_us_data/release_promotion/context.py @@ -0,0 +1,181 @@ +"""Typed identity for Stage 5 release promotion.""" + +from __future__ import annotations + +from collections.abc import Mapping +from dataclasses import dataclass, field +from typing import Any + +from policyengine_us_data.pipeline_metadata import pipeline_node +from policyengine_us_data.stage_contracts._coercion import ( + freeze_mapping, + jsonable_value, + mapping_value, + optional_string, + optional_string_value, + require_non_empty, + required_string, + schema_version, + validate_schema_version, +) +from policyengine_us_data.stage_contracts.constants import CONTRACT_SCHEMA_VERSION +from policyengine_us_data.utils.run_context import ( + normalize_release_bump, + sanitize_run_id, + sanitize_staging_version, + stable_release_version, + staging_prefix, +) + + +@pipeline_node( + id="release_promotion_context", + label="ReleasePromotionContext", + node_type="library", + description="Typed Stage 5 run, candidate, release, and destination identity.", + status="transitional", + stability="moving", + pathways=["5_validate_and_promote_release"], + validation_commands=[ + "uv run pytest tests/unit/release_promotion/test_candidate.py" + ], +) +@dataclass(frozen=True, kw_only=True) +class ReleasePromotionContext: + """Canonical run, candidate, release, and destination identity for Stage 5.""" + + run_id: str + candidate_version: str + release_version: str + hf_repo_name: str + gcs_bucket_name: str + hf_repo_type: str = "model" + base_release_version: str | None = None + release_bump: str | None = None + modal_app_name: str | None = None + modal_environment: str | None = None + 
hf_staging_prefix: str | None = None + metadata: Mapping[str, Any] = field(default_factory=dict) + schema_version: str = CONTRACT_SCHEMA_VERSION + + def __post_init__(self) -> None: + validate_schema_version(self.schema_version, self.__class__.__name__) + object.__setattr__(self, "run_id", sanitize_run_id(self.run_id)) + object.__setattr__( + self, + "candidate_version", + sanitize_staging_version(self.candidate_version), + ) + object.__setattr__( + self, + "release_version", + stable_release_version(self.release_version), + ) + object.__setattr__( + self, + "hf_repo_name", + require_non_empty(self.hf_repo_name, "hf_repo_name"), + ) + object.__setattr__( + self, + "hf_repo_type", + require_non_empty(self.hf_repo_type, "hf_repo_type"), + ) + object.__setattr__( + self, + "gcs_bucket_name", + require_non_empty(self.gcs_bucket_name, "gcs_bucket_name"), + ) + object.__setattr__( + self, + "base_release_version", + ( + stable_release_version(self.base_release_version) + if self.base_release_version is not None + else None + ), + ) + object.__setattr__( + self, + "release_bump", + ( + normalize_release_bump(self.release_bump) + if self.release_bump is not None + else None + ), + ) + object.__setattr__( + self, + "modal_app_name", + optional_string_value(self.modal_app_name, "modal_app_name"), + ) + object.__setattr__( + self, + "modal_environment", + optional_string_value(self.modal_environment, "modal_environment"), + ) + derived_prefix = staging_prefix( + self.run_id, + candidate_version=self.candidate_version, + ) + prefix = self.hf_staging_prefix or derived_prefix + if prefix != derived_prefix: + raise ValueError( + "hf_staging_prefix must match run_id and candidate_version: " + f"{derived_prefix!r}" + ) + object.__setattr__( + self, + "hf_staging_prefix", + require_non_empty(prefix, "hf_staging_prefix"), + ) + object.__setattr__( + self, + "metadata", + freeze_mapping(self.metadata or {}, "metadata"), + ) + + @property + def candidate_scope(self) -> str: + """Return 
the candidate staging scope used for run-scoped HF paths.""" + + return self.candidate_version + + def to_dict(self) -> dict[str, Any]: + """Serialize the context to JSON-compatible primitives.""" + + return { + "run_id": self.run_id, + "candidate_version": self.candidate_version, + "release_version": self.release_version, + "hf_repo_name": self.hf_repo_name, + "hf_repo_type": self.hf_repo_type, + "gcs_bucket_name": self.gcs_bucket_name, + "base_release_version": self.base_release_version, + "release_bump": self.release_bump, + "modal_app_name": self.modal_app_name, + "modal_environment": self.modal_environment, + "hf_staging_prefix": self.hf_staging_prefix, + "metadata": jsonable_value(self.metadata), + "schema_version": self.schema_version, + } + + @classmethod + def from_dict(cls, data: Mapping[str, Any]) -> "ReleasePromotionContext": + """Restore a release promotion context from serialized data.""" + + return cls( + run_id=required_string(data, "run_id"), + candidate_version=required_string(data, "candidate_version"), + release_version=required_string(data, "release_version"), + hf_repo_name=required_string(data, "hf_repo_name"), + hf_repo_type=data.get("hf_repo_type", "model"), + gcs_bucket_name=required_string(data, "gcs_bucket_name"), + base_release_version=optional_string(data, "base_release_version"), + release_bump=optional_string(data, "release_bump"), + modal_app_name=optional_string(data, "modal_app_name"), + modal_environment=optional_string(data, "modal_environment"), + hf_staging_prefix=optional_string(data, "hf_staging_prefix"), + metadata=mapping_value(data, "metadata"), + schema_version=schema_version(data), + ) diff --git a/policyengine_us_data/release_promotion/contract.py b/policyengine_us_data/release_promotion/contract.py new file mode 100644 index 000000000..eadba1e37 --- /dev/null +++ b/policyengine_us_data/release_promotion/contract.py @@ -0,0 +1,562 @@ +"""Stage 5 release promotion contract assembly.""" + +from __future__ import 
annotations + +from collections.abc import Mapping, Sequence +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +from policyengine_us_data.pipeline_metadata import pipeline_node +from policyengine_us_data.stage_contracts import ( + ArtifactRef, + DiagnosticRef, + ExecutionRecord, + ReuseSummary, + StageContract, + SubstageRecord, + ValidationReport, + contract_type_for_stage, + write_contract, +) +from policyengine_us_data.stage_contracts._coercion import ( + freeze_mapping, + freeze_sequence, +) +from policyengine_us_data.stage_contracts.fingerprints import fingerprint_material +from policyengine_us_data.stage_contracts.stages import ( + STAGE_5_VALIDATE_AND_PROMOTE_RELEASE, +) + +from .candidate import ReleaseCandidateInputBundle +from .context import ReleasePromotionContext +from .results import FullPromotionResult + +RELEASE_PROMOTION_CONTRACT_FILENAME = "release_promotion_contract.json" +RELEASE_PROMOTION_CONTRACT_TYPE = contract_type_for_stage( + STAGE_5_VALIDATE_AND_PROMOTE_RELEASE +) + + +def release_promotion_contract_repo_path(run_id: str) -> str: + """Return the run-scoped repository path for the Stage 5 contract.""" + + return ( + f"calibration/runs/{run_id}/diagnostics/contracts/" + f"{RELEASE_PROMOTION_CONTRACT_FILENAME}" + ) + + +def release_promotion_contract_path(run_dir: str | Path) -> Path: + """Return the run-local diagnostics/contracts path for the Stage 5 contract.""" + + return ( + Path(run_dir) + / "diagnostics" + / "contracts" + / RELEASE_PROMOTION_CONTRACT_FILENAME + ) + + +@pipeline_node( + id="release_promotion_contract_builder", + label="ReleasePromotionContractBuilder", + node_type="library", + description="Build the canonical Stage 5 release promotion contract.", + status="transitional", + stability="moving", + pathways=["5_validate_and_promote_release"], + artifacts_in=["release candidate bundle", "typed promotion result"], + artifacts_out=["release_promotion_contract.json"], + 
validation_commands=["uv run pytest tests/unit/release_promotion/test_contract.py"], +) +@dataclass(frozen=True, kw_only=True) +class ReleasePromotionContractBuilder: + """Build a Stage 5 contract from candidate identity and promotion results.""" + + candidate_bundle: ReleaseCandidateInputBundle + promotion_result: FullPromotionResult + created_at: str + code_sha: str | None = None + package_version: str | None = None + validation: ValidationReport | None = None + diagnostics: Sequence[DiagnosticRef] = () + published_artifact_index: ArtifactRef | None = None + promoted_runs_index: ArtifactRef | None = None + promoted_runs_index_update: Mapping[str, Any] | None = None + metadata: Mapping[str, Any] = field(default_factory=dict) + + def __post_init__(self) -> None: + if not isinstance(self.candidate_bundle, ReleaseCandidateInputBundle): + raise ValueError("candidate_bundle must be ReleaseCandidateInputBundle") + if not isinstance(self.promotion_result, FullPromotionResult): + raise ValueError("promotion_result must be FullPromotionResult") + if self.published_artifact_index is not None and not isinstance( + self.published_artifact_index, ArtifactRef + ): + raise ValueError("published_artifact_index must be ArtifactRef") + if self.promoted_runs_index is not None and not isinstance( + self.promoted_runs_index, ArtifactRef + ): + raise ValueError("promoted_runs_index must be ArtifactRef") + if self.promoted_runs_index_update is not None: + object.__setattr__( + self, + "promoted_runs_index_update", + freeze_mapping( + self.promoted_runs_index_update, + "promoted_runs_index_update", + ), + ) + object.__setattr__( + self, + "diagnostics", + freeze_sequence(self.diagnostics, "diagnostics", DiagnosticRef), + ) + _validate_result_matches_candidate( + self.promotion_result, + self.candidate_bundle, + ) + + def build(self) -> StageContract: + """Return the canonical Stage 5 release promotion contract.""" + + context = self.candidate_bundle.context + inputs = 
_contract_inputs(self.candidate_bundle) + outputs = _contract_outputs( + context, + self.promotion_result, + published_artifact_index=self.published_artifact_index, + promoted_runs_index=self.promoted_runs_index, + ) + parameters = _contract_parameters( + self.candidate_bundle, + self.promotion_result, + published_artifact_index=self.published_artifact_index, + promoted_runs_index=self.promoted_runs_index, + ) + return StageContract( + contract_type=RELEASE_PROMOTION_CONTRACT_TYPE, + stage_id=STAGE_5_VALIDATE_AND_PROMOTE_RELEASE, + run_id=context.run_id, + created_at=self.created_at, + code_sha=self.code_sha, + package_version=self.package_version, + inputs=inputs, + outputs=outputs, + parameters=parameters, + fingerprint=fingerprint_material( + { + "stage_id": STAGE_5_VALIDATE_AND_PROMOTE_RELEASE, + "contract_type": RELEASE_PROMOTION_CONTRACT_TYPE, + "context": context.to_dict(), + "candidate_bundle": self.candidate_bundle.to_dict(), + "promotion_result": self.promotion_result.to_dict(), + "published_artifact_index": ( + self.published_artifact_index.to_dict() + if self.published_artifact_index is not None + else None + ), + "promoted_runs_index": ( + self.promoted_runs_index.to_dict() + if self.promoted_runs_index is not None + else None + ), + "promoted_runs_index_update": ( + dict(self.promoted_runs_index_update) + if self.promoted_runs_index_update is not None + else None + ), + "outputs": [output.to_dict() for output in outputs], + } + ), + substages=_substage_records( + candidate_inputs=inputs, + public_outputs=outputs, + promotion_result=self.promotion_result, + ), + execution=_execution_record(self.promotion_result), + validation=self.validation, + diagnostics=tuple(self.diagnostics), + metadata=_contract_metadata( + context=context, + candidate_bundle=self.candidate_bundle, + promotion_result=self.promotion_result, + outputs=outputs, + promoted_runs_index_update=self.promoted_runs_index_update, + extra=self.metadata, + ), + ) + + +def 
build_release_promotion_contract( + *, + candidate_bundle: ReleaseCandidateInputBundle, + promotion_result: FullPromotionResult, + created_at: str, + code_sha: str | None = None, + package_version: str | None = None, + validation: ValidationReport | None = None, + diagnostics: Sequence[DiagnosticRef] = (), + published_artifact_index: ArtifactRef | None = None, + promoted_runs_index: ArtifactRef | None = None, + promoted_runs_index_update: Mapping[str, Any] | None = None, + metadata: Mapping[str, Any] | None = None, +) -> StageContract: + """Build the Stage 5 release promotion contract.""" + + return ReleasePromotionContractBuilder( + candidate_bundle=candidate_bundle, + promotion_result=promotion_result, + created_at=created_at, + code_sha=code_sha, + package_version=package_version, + validation=validation, + diagnostics=diagnostics, + published_artifact_index=published_artifact_index, + promoted_runs_index=promoted_runs_index, + promoted_runs_index_update=promoted_runs_index_update, + metadata=metadata or {}, + ).build() + + +def write_release_promotion_contract( + *, + contract_path: str | Path, + candidate_bundle: ReleaseCandidateInputBundle, + promotion_result: FullPromotionResult, + created_at: str, + code_sha: str | None = None, + package_version: str | None = None, + validation: ValidationReport | None = None, + diagnostics: Sequence[DiagnosticRef] = (), + published_artifact_index: ArtifactRef | None = None, + promoted_runs_index: ArtifactRef | None = None, + promoted_runs_index_update: Mapping[str, Any] | None = None, + metadata: Mapping[str, Any] | None = None, +) -> StageContract: + """Build, write, and return the Stage 5 release promotion contract.""" + + contract = build_release_promotion_contract( + candidate_bundle=candidate_bundle, + promotion_result=promotion_result, + created_at=created_at, + code_sha=code_sha, + package_version=package_version, + validation=validation, + diagnostics=diagnostics, + 
published_artifact_index=published_artifact_index, + promoted_runs_index=promoted_runs_index, + promoted_runs_index_update=promoted_runs_index_update, + metadata=metadata, + ) + write_contract(contract, contract_path) + return contract + + +def _validate_result_matches_candidate( + result: FullPromotionResult, + candidate_bundle: ReleaseCandidateInputBundle, +) -> None: + context = candidate_bundle.context + if result.run_id != context.run_id: + raise ValueError("promotion_result.run_id must match context.run_id") + if result.candidate_version != context.candidate_version: + raise ValueError( + "promotion_result.candidate_version must match context.candidate_version" + ) + if result.release_version != context.release_version: + raise ValueError( + "promotion_result.release_version must match context.release_version" + ) + if result.artifact_count != len(candidate_bundle.artifacts): + raise ValueError( + "promotion_result.artifact_count must match candidate artifacts" + ) + + +def _contract_inputs( + candidate_bundle: ReleaseCandidateInputBundle, +) -> tuple[ArtifactRef, ...]: + context = candidate_bundle.context + inputs = [ + artifact.to_artifact_ref( + uri_prefix=f"hf://{context.hf_repo_name}/{context.hf_staging_prefix}", + ) + for artifact in candidate_bundle.artifacts + ] + if candidate_bundle.source_output_contract_path is not None: + inputs.append( + ArtifactRef( + logical_name="stage4_output_contract", + uri=f"hf://{context.hf_repo_name}/{candidate_bundle.source_output_contract_path}", + media_type="application/json", + metadata={ + "artifact_family": "stage_contract", + "source_stage_id": "4_build_outputs", + }, + ) + ) + for index, path in enumerate(candidate_bundle.validation_report_paths, start=1): + inputs.append( + ArtifactRef( + logical_name=f"validation_report_{index}", + uri=f"hf://{context.hf_repo_name}/{path}", + media_type="application/json", + metadata={"artifact_family": "validation_report"}, + ) + ) + if 
candidate_bundle.diagnostics_manifest_path is not None: + inputs.append( + ArtifactRef( + logical_name="diagnostics_manifest", + uri=( + f"hf://{context.hf_repo_name}/" + f"{candidate_bundle.diagnostics_manifest_path}" + ), + media_type="application/json", + metadata={"artifact_family": "diagnostics_manifest"}, + ) + ) + return tuple(inputs) + + +def _contract_outputs( + context: ReleasePromotionContext, + result: FullPromotionResult, + *, + published_artifact_index: ArtifactRef | None = None, + promoted_runs_index: ArtifactRef | None = None, +) -> tuple[ArtifactRef, ...]: + hf_base = f"hf://{context.hf_repo_name}" + completion_marker_path = ( + result.completion_marker.marker_path + or f"releases/{context.release_version}/release-complete.json" + ) + outputs = ( + ArtifactRef( + logical_name="huggingface_release_artifacts", + uri=f"{hf_base}/", + metadata={ + "artifact_family": "release_artifact_collection", + "artifact_count": result.artifact_count, + "promoted_count": result.hf.promoted_count, + "already_finalized": result.already_finalized, + }, + ), + ArtifactRef( + logical_name="gcs_release_artifacts", + uri=f"gs://{context.gcs_bucket_name}/", + metadata={ + "artifact_family": "release_artifact_collection", + "artifact_count": result.artifact_count, + "uploaded_count": result.gcs.uploaded_count, + "already_finalized": result.already_finalized, + }, + ), + ArtifactRef( + logical_name="release_manifest", + uri=f"{hf_base}/release_manifest.json", + media_type="application/json", + metadata={ + "artifact_family": "release_manifest", + "artifact_count": result.release_manifest.artifact_count, + }, + ), + ArtifactRef( + logical_name="versioned_release_manifest", + uri=f"{hf_base}/releases/{context.release_version}/release_manifest.json", + media_type="application/json", + metadata={ + "artifact_family": "release_manifest", + "artifact_count": result.release_manifest.artifact_count, + }, + ), + ArtifactRef( + logical_name="trace_tro", + 
uri=f"{hf_base}/trace.tro.jsonld", + media_type="application/ld+json", + metadata={"artifact_family": "trace_tro"}, + ), + ArtifactRef( + logical_name="versioned_trace_tro", + uri=f"{hf_base}/releases/{context.release_version}/trace.tro.jsonld", + media_type="application/ld+json", + metadata={"artifact_family": "trace_tro"}, + ), + ArtifactRef( + logical_name="version_manifest", + uri=f"{hf_base}/version_manifest.json", + media_type="application/json", + metadata={ + "artifact_family": "version_manifest", + "updated": result.version_manifest.updated, + }, + ), + ArtifactRef( + logical_name="release_completion_marker", + uri=f"{hf_base}/{completion_marker_path}", + media_type="application/json", + metadata={"artifact_family": "release_completion_marker"}, + ), + ) + if published_artifact_index is not None: + outputs = (*outputs, published_artifact_index) + if promoted_runs_index is not None: + outputs = (*outputs, promoted_runs_index) + return outputs + + +def _contract_parameters( + candidate_bundle: ReleaseCandidateInputBundle, + result: FullPromotionResult, + *, + published_artifact_index: ArtifactRef | None = None, + promoted_runs_index: ArtifactRef | None = None, +) -> dict[str, Any]: + context = candidate_bundle.context + return { + "run_id": context.run_id, + "candidate_version": context.candidate_version, + "release_version": context.release_version, + "base_release_version": context.base_release_version, + "release_bump": context.release_bump, + "hf_repo_name": context.hf_repo_name, + "hf_repo_type": context.hf_repo_type, + "gcs_bucket_name": context.gcs_bucket_name, + "hf_staging_prefix": context.hf_staging_prefix, + "artifact_count": result.artifact_count, + "release_candidate_fingerprint": ( + candidate_bundle.release_candidate_fingerprint + ), + "source_output_contract_path": candidate_bundle.source_output_contract_path, + "validation_report_paths": list(candidate_bundle.validation_report_paths), + "diagnostics_manifest_path": 
candidate_bundle.diagnostics_manifest_path, + "published_artifact_index_path": _artifact_relative_path( + published_artifact_index + ), + "promoted_runs_index_path": _artifact_relative_path(promoted_runs_index), + } + + +def _contract_metadata( + *, + context: ReleasePromotionContext, + candidate_bundle: ReleaseCandidateInputBundle, + promotion_result: FullPromotionResult, + outputs: Sequence[ArtifactRef], + promoted_runs_index_update: Mapping[str, Any] | None, + extra: Mapping[str, Any], +) -> dict[str, Any]: + outputs_by_name = {output.logical_name: output for output in outputs} + return { + **dict(extra), + "contract_file": RELEASE_PROMOTION_CONTRACT_FILENAME, + "contract_repo_path": release_promotion_contract_repo_path(context.run_id), + "candidate_bundle_type": candidate_bundle.bundle_type, + "candidate_metadata": candidate_bundle.metadata, + "cleanup": promotion_result.cleanup.to_dict(), + "already_finalized": promotion_result.already_finalized, + "promotion_result": promotion_result.to_dict(), + "published_artifact_index": ( + outputs_by_name["published_artifact_index"].to_dict() + if "published_artifact_index" in outputs_by_name + else None + ), + "promoted_runs_index": ( + outputs_by_name["promoted_runs_index"].to_dict() + if "promoted_runs_index" in outputs_by_name + else None + ), + "promoted_runs_index_update": ( + dict(promoted_runs_index_update) + if promoted_runs_index_update is not None + else None + ), + "public_refs": {output.logical_name: output.uri for output in outputs}, + } + + +def _artifact_relative_path(artifact: ArtifactRef | None) -> str | None: + if artifact is None: + return None + relative_path = artifact.metadata.get("relative_path") + return relative_path if isinstance(relative_path, str) and relative_path else None + + +def _execution_record(result: FullPromotionResult) -> ExecutionRecord: + return ExecutionRecord( + status="completed", + reuse_decision="reused" if result.already_finalized else "computed", + reuse_reason=( + 
"already_finalized" if result.already_finalized else "fresh_promotion" + ), + reuse_summary=ReuseSummary( + expected_outputs=result.artifact_count, + valid_reused_outputs=( + result.artifact_count if result.already_finalized else 0 + ), + recomputed_outputs=0 if result.already_finalized else result.artifact_count, + ), + ) + + +def _substage_records( + *, + candidate_inputs: Sequence[ArtifactRef], + public_outputs: Sequence[ArtifactRef], + promotion_result: FullPromotionResult, +) -> tuple[SubstageRecord, ...]: + outputs_by_name = {artifact.logical_name: artifact for artifact in public_outputs} + finalization_outputs = [ + outputs_by_name["release_manifest"], + outputs_by_name["versioned_release_manifest"], + outputs_by_name["trace_tro"], + outputs_by_name["versioned_trace_tro"], + outputs_by_name["version_manifest"], + outputs_by_name["release_completion_marker"], + ] + if "published_artifact_index" in outputs_by_name: + finalization_outputs.append(outputs_by_name["published_artifact_index"]) + if "promoted_runs_index" in outputs_by_name: + finalization_outputs.append(outputs_by_name["promoted_runs_index"]) + return ( + SubstageRecord( + substage_id="5a_validate_outputs", + status="completed", + inputs=tuple(candidate_inputs), + reuse_mode="observed_only", + metadata={"artifact_count": promotion_result.artifact_count}, + ), + SubstageRecord( + substage_id="5b_promote_huggingface", + status="completed", + outputs=(outputs_by_name["huggingface_release_artifacts"],), + reuse_mode="handoff", + metadata={ + "promoted_count": promotion_result.hf.promoted_count, + "already_finalized": promotion_result.already_finalized, + }, + ), + SubstageRecord( + substage_id="5c_promote_gcs", + status="completed", + outputs=(outputs_by_name["gcs_release_artifacts"],), + reuse_mode="handoff", + metadata={ + "uploaded_count": promotion_result.gcs.uploaded_count, + "already_finalized": promotion_result.already_finalized, + }, + ), + SubstageRecord( + 
substage_id="5d_write_version_manifest", + status="completed", + outputs=tuple(finalization_outputs), + reuse_mode="handoff", + metadata={ + "version_manifest_updated": promotion_result.version_manifest.updated, + "cleanup": promotion_result.cleanup.to_dict(), + "already_finalized": promotion_result.already_finalized, + }, + ), + ) diff --git a/policyengine_us_data/release_promotion/promoted_runs_index.py b/policyengine_us_data/release_promotion/promoted_runs_index.py new file mode 100644 index 000000000..5faaeb76b --- /dev/null +++ b/policyengine_us_data/release_promotion/promoted_runs_index.py @@ -0,0 +1,675 @@ +"""Run-oriented promoted release discovery index for Stage 5.""" + +from __future__ import annotations + +from collections.abc import Mapping +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +from policyengine_us_data.pipeline_metadata import pipeline_node +from policyengine_us_data.stage_contracts import ArtifactRef +from policyengine_us_data.stage_contracts._coercion import ( + freeze_mapping, + jsonable_value, + mapping_value, + optional_string, + optional_string_value, + require_non_empty, + required_string, + schema_version, + validate_optional_int, + validate_schema_version, +) +from policyengine_us_data.stage_contracts.constants import CONTRACT_SCHEMA_VERSION +from policyengine_us_data.stage_contracts.stages import ( + STAGE_5_VALIDATE_AND_PROMOTE_RELEASE, +) +from policyengine_us_data.utils.canonical_json import ( + canonical_json_dumps, + canonical_json_loads, +) + +from .context import ReleasePromotionContext +from .results import FullPromotionResult + +PROMOTED_RUNS_INDEX_FILENAME = "index.json" +PROMOTED_RUNS_INDEX_MEDIA_TYPE = "application/json" + + +def promoted_runs_index_repo_path() -> str: + """Return the repository path for the promoted runs discovery index.""" + + return f"calibration/runs/{PROMOTED_RUNS_INDEX_FILENAME}" + + +def promoted_runs_index_path(runs_dir: str | Path) -> Path: + 
"""Return the pipeline-volume path for the promoted runs discovery index.""" + + return Path(runs_dir) / PROMOTED_RUNS_INDEX_FILENAME + + +def promoted_runs_index_artifact_ref( + context: ReleasePromotionContext, + update: "PromotedRunsIndexUpdate", + *, + sha256: str | None = None, + size_bytes: int | None = None, +) -> ArtifactRef: + """Return a stage-contract reference to the promoted runs index.""" + + return ArtifactRef( + logical_name="promoted_runs_index", + uri=f"hf://{context.hf_repo_name}/{promoted_runs_index_repo_path()}", + sha256=sha256, + size_bytes=size_bytes, + media_type=PROMOTED_RUNS_INDEX_MEDIA_TYPE, + metadata={ + "artifact_family": "promoted_runs_index", + "source_stage_id": STAGE_5_VALIDATE_AND_PROMOTE_RELEASE, + "relative_path": promoted_runs_index_repo_path(), + "run_id": update.run_id, + "release_version": update.release_version, + "update_status": update.status, + "run_count": update.run_count, + "release_version_run_count": update.release_version_run_count, + "already_finalized": update.already_finalized, + }, + ) + + +@pipeline_node( + id="promoted_run_index_entry", + label="PromotedRunIndexEntry", + node_type="library", + description="One run-oriented discovery entry emitted by Stage 5 promotion.", + status="transitional", + stability="moving", + pathways=["5_validate_and_promote_release"], + artifacts_in=["typed promotion result", "release promotion contract"], + artifacts_out=["promoted_runs_index.json"], + validation_commands=[ + "uv run pytest tests/unit/release_promotion/test_promoted_runs_index.py" + ], +) +@dataclass(frozen=True, kw_only=True) +class PromotedRunIndexEntry: + """One promoted run entry keyed by canonical run ID.""" + + run_id: str + candidate_version: str + release_version: str + status: str + promoted_at: str + updated_at: str + artifact_count: int + hf_promoted_count: int + gcs_uploaded_count: int + release_manifest_artifacts: int + version_manifest_updated: bool + completion_marker_path: str + already_finalized: 
bool = False + release_promotion_contract_path: str | None = None + published_artifact_index_path: str | None = None + run_manifest_path: str | None = None + step_manifest_path: str | None = None + metadata: Mapping[str, Any] = field(default_factory=dict) + schema_version: str = CONTRACT_SCHEMA_VERSION + + def __post_init__(self) -> None: + validate_schema_version(self.schema_version, self.__class__.__name__) + for field_name in ( + "run_id", + "candidate_version", + "release_version", + "status", + "promoted_at", + "updated_at", + "completion_marker_path", + ): + object.__setattr__( + self, + field_name, + require_non_empty(getattr(self, field_name), field_name), + ) + for field_name in ( + "artifact_count", + "hf_promoted_count", + "gcs_uploaded_count", + "release_manifest_artifacts", + ): + _nonnegative_int_value(getattr(self, field_name), field_name) + object.__setattr__( + self, + "version_manifest_updated", + _bool_value(self.version_manifest_updated, "version_manifest_updated"), + ) + object.__setattr__( + self, + "already_finalized", + _bool_value(self.already_finalized, "already_finalized"), + ) + for field_name in ( + "release_promotion_contract_path", + "published_artifact_index_path", + "run_manifest_path", + "step_manifest_path", + ): + object.__setattr__( + self, + field_name, + optional_string_value(getattr(self, field_name), field_name), + ) + object.__setattr__(self, "metadata", freeze_mapping(self.metadata, "metadata")) + + def to_dict(self) -> dict[str, Any]: + """Serialize the promoted run entry to JSON-compatible primitives.""" + + return { + "run_id": self.run_id, + "candidate_version": self.candidate_version, + "release_version": self.release_version, + "status": self.status, + "promoted_at": self.promoted_at, + "updated_at": self.updated_at, + "artifact_count": self.artifact_count, + "hf_promoted_count": self.hf_promoted_count, + "gcs_uploaded_count": self.gcs_uploaded_count, + "release_manifest_artifacts": self.release_manifest_artifacts, + 
"version_manifest_updated": self.version_manifest_updated, + "completion_marker_path": self.completion_marker_path, + "already_finalized": self.already_finalized, + "release_promotion_contract_path": self.release_promotion_contract_path, + "published_artifact_index_path": self.published_artifact_index_path, + "run_manifest_path": self.run_manifest_path, + "step_manifest_path": self.step_manifest_path, + "metadata": jsonable_value(self.metadata), + "schema_version": self.schema_version, + } + + @classmethod + def from_dict(cls, data: Mapping[str, Any]) -> "PromotedRunIndexEntry": + """Restore a promoted run entry from serialized data.""" + + return cls( + run_id=required_string(data, "run_id"), + candidate_version=required_string(data, "candidate_version"), + release_version=required_string(data, "release_version"), + status=required_string(data, "status"), + promoted_at=required_string(data, "promoted_at"), + updated_at=required_string(data, "updated_at"), + artifact_count=_nonnegative_int(data, "artifact_count"), + hf_promoted_count=_nonnegative_int(data, "hf_promoted_count"), + gcs_uploaded_count=_nonnegative_int(data, "gcs_uploaded_count"), + release_manifest_artifacts=_nonnegative_int( + data, + "release_manifest_artifacts", + ), + version_manifest_updated=_bool_field(data, "version_manifest_updated"), + completion_marker_path=required_string(data, "completion_marker_path"), + already_finalized=_bool_field(data, "already_finalized", default=False), + release_promotion_contract_path=optional_string( + data, + "release_promotion_contract_path", + ), + published_artifact_index_path=optional_string( + data, + "published_artifact_index_path", + ), + run_manifest_path=optional_string(data, "run_manifest_path"), + step_manifest_path=optional_string(data, "step_manifest_path"), + metadata=mapping_value(data, "metadata"), + schema_version=schema_version(data), + ) + + +@dataclass(frozen=True, kw_only=True) +class PromotedReleaseVersionEntry: + """Release-version lookup 
entry inside the promoted runs index.""" + + release_version: str + latest_run_id: str + run_ids: tuple[str, ...] + updated_at: str + schema_version: str = CONTRACT_SCHEMA_VERSION + + def __post_init__(self) -> None: + validate_schema_version(self.schema_version, self.__class__.__name__) + object.__setattr__( + self, + "release_version", + require_non_empty(self.release_version, "release_version"), + ) + object.__setattr__( + self, + "latest_run_id", + require_non_empty(self.latest_run_id, "latest_run_id"), + ) + run_ids = _string_tuple(self.run_ids, "run_ids") + if not run_ids: + raise ValueError("run_ids must not be empty") + if len(set(run_ids)) != len(run_ids): + raise ValueError("run_ids must not contain duplicates") + if self.latest_run_id not in run_ids: + raise ValueError("latest_run_id must be present in run_ids") + object.__setattr__(self, "run_ids", run_ids) + object.__setattr__( + self, + "updated_at", + require_non_empty(self.updated_at, "updated_at"), + ) + + def to_dict(self) -> dict[str, Any]: + """Serialize the release-version lookup entry.""" + + return { + "release_version": self.release_version, + "latest_run_id": self.latest_run_id, + "run_ids": list(self.run_ids), + "updated_at": self.updated_at, + "schema_version": self.schema_version, + } + + @classmethod + def from_dict( + cls, + data: Mapping[str, Any], + ) -> "PromotedReleaseVersionEntry": + """Restore a release-version lookup entry.""" + + return cls( + release_version=required_string(data, "release_version"), + latest_run_id=required_string(data, "latest_run_id"), + run_ids=_string_tuple(data.get("run_ids", ()), "run_ids"), + updated_at=required_string(data, "updated_at"), + schema_version=schema_version(data), + ) + + +@pipeline_node( + id="promoted_runs_index", + label="PromotedRunsIndex", + node_type="library", + description="Run-oriented discovery index for promoted Stage 5 releases.", + status="transitional", + stability="moving", + pathways=["5_validate_and_promote_release"], + 
artifacts_in=["typed promotion result", "release promotion contract"], + artifacts_out=["calibration/runs/index.json"], + validation_commands=[ + "uv run pytest tests/unit/release_promotion/test_promoted_runs_index.py" + ], +) +@dataclass(frozen=True, kw_only=True) +class PromotedRunsIndex: + """Promoted run discovery index keyed by canonical run ID.""" + + updated_at: str + runs: Mapping[str, PromotedRunIndexEntry] = field(default_factory=dict) + release_versions: Mapping[str, PromotedReleaseVersionEntry] = field( + default_factory=dict + ) + schema_version: str = CONTRACT_SCHEMA_VERSION + + def __post_init__(self) -> None: + validate_schema_version(self.schema_version, self.__class__.__name__) + object.__setattr__( + self, + "updated_at", + require_non_empty(self.updated_at, "updated_at"), + ) + runs = _coerce_runs(self.runs) + release_versions = _coerce_release_versions(self.release_versions) + _validate_release_version_entries(runs, release_versions) + object.__setattr__(self, "runs", runs) + object.__setattr__(self, "release_versions", release_versions) + + def to_dict(self) -> dict[str, Any]: + """Serialize the promoted runs index.""" + + return { + "updated_at": self.updated_at, + "runs": { + run_id: self.runs[run_id].to_dict() for run_id in sorted(self.runs) + }, + "release_versions": { + release_version: self.release_versions[release_version].to_dict() + for release_version in sorted(self.release_versions) + }, + "schema_version": self.schema_version, + } + + @classmethod + def from_dict(cls, data: Mapping[str, Any]) -> "PromotedRunsIndex": + """Restore the promoted runs index from serialized data.""" + + return cls( + updated_at=required_string(data, "updated_at"), + runs=mapping_value(data, "runs"), + release_versions=mapping_value(data, "release_versions"), + schema_version=schema_version(data), + ) + + +@dataclass(frozen=True, kw_only=True) +class PromotedRunsIndexUpdate: + """Status for one promoted runs index upsert.""" + + status: str + run_id: str 
+ release_version: str + run_count: int + release_version_run_count: int + already_finalized: bool + updated_at: str + schema_version: str = CONTRACT_SCHEMA_VERSION + + def __post_init__(self) -> None: + validate_schema_version(self.schema_version, self.__class__.__name__) + if self.status not in {"created", "updated"}: + raise ValueError("status must be 'created' or 'updated'") + for field_name in ("run_id", "release_version", "updated_at"): + object.__setattr__( + self, + field_name, + require_non_empty(getattr(self, field_name), field_name), + ) + for field_name in ("run_count", "release_version_run_count"): + _nonnegative_int_value(getattr(self, field_name), field_name) + object.__setattr__( + self, + "already_finalized", + _bool_value(self.already_finalized, "already_finalized"), + ) + + def to_dict(self) -> dict[str, Any]: + """Serialize the index update status.""" + + return { + "status": self.status, + "run_id": self.run_id, + "release_version": self.release_version, + "run_count": self.run_count, + "release_version_run_count": self.release_version_run_count, + "already_finalized": self.already_finalized, + "updated_at": self.updated_at, + "schema_version": self.schema_version, + } + + +def empty_promoted_runs_index(updated_at: str) -> PromotedRunsIndex: + """Return an empty promoted runs index.""" + + return PromotedRunsIndex(updated_at=updated_at) + + +def build_promoted_run_index_entry( + *, + context: ReleasePromotionContext, + promotion_result: FullPromotionResult, + promoted_at: str, + release_promotion_contract_path: str | None = None, + published_artifact_index_path: str | None = None, + run_manifest_path: str | None = None, + step_manifest_path: str | None = None, + metadata: Mapping[str, Any] | None = None, +) -> PromotedRunIndexEntry: + """Build the run discovery entry for a successful Stage 5 promotion.""" + + _validate_result_matches_context(promotion_result, context) + completion_marker = ( + promotion_result.completion_marker.marker_path + or 
f"releases/{context.release_version}/release-complete.json" + ) + return PromotedRunIndexEntry( + run_id=context.run_id, + candidate_version=context.candidate_version, + release_version=context.release_version, + status="promoted", + promoted_at=promoted_at, + updated_at=promoted_at, + artifact_count=promotion_result.artifact_count, + hf_promoted_count=promotion_result.hf.promoted_count, + gcs_uploaded_count=promotion_result.gcs.uploaded_count, + release_manifest_artifacts=promotion_result.release_manifest.artifact_count, + version_manifest_updated=promotion_result.version_manifest.updated, + completion_marker_path=completion_marker, + already_finalized=promotion_result.already_finalized, + release_promotion_contract_path=release_promotion_contract_path, + published_artifact_index_path=published_artifact_index_path, + run_manifest_path=run_manifest_path, + step_manifest_path=step_manifest_path, + metadata=metadata or {}, + ) + + +def upsert_promoted_run( + index: PromotedRunsIndex, + entry: PromotedRunIndexEntry, + *, + updated_at: str, +) -> tuple[PromotedRunsIndex, PromotedRunsIndexUpdate]: + """Upsert one promoted run without duplicating run or version entries.""" + + existed = entry.run_id in index.runs + runs = dict(index.runs) + runs[entry.run_id] = entry + + release_versions = _release_versions_without_run(index, entry.run_id) + current_version = release_versions.get(entry.release_version) + if current_version is None: + run_ids = (entry.run_id,) + elif entry.run_id in current_version.run_ids: + run_ids = current_version.run_ids + else: + run_ids = (*current_version.run_ids, entry.run_id) + release_versions[entry.release_version] = PromotedReleaseVersionEntry( + release_version=entry.release_version, + latest_run_id=entry.run_id, + run_ids=run_ids, + updated_at=updated_at, + ) + updated = PromotedRunsIndex( + updated_at=updated_at, + runs=runs, + release_versions=release_versions, + ) + update = PromotedRunsIndexUpdate( + status="updated" if existed else 
"created", + run_id=entry.run_id, + release_version=entry.release_version, + run_count=len(updated.runs), + release_version_run_count=len( + updated.release_versions[entry.release_version].run_ids + ), + already_finalized=entry.already_finalized, + updated_at=updated_at, + ) + return updated, update + + +def promoted_runs_index_to_json(index: PromotedRunsIndex) -> str: + """Serialize the promoted runs index deterministically.""" + + return canonical_json_dumps(index.to_dict()) + + +def promoted_runs_index_from_json(payload: str) -> PromotedRunsIndex: + """Restore the promoted runs index from JSON text.""" + + return PromotedRunsIndex.from_dict(canonical_json_loads(payload)) + + +def read_promoted_runs_index(path: str | Path) -> PromotedRunsIndex: + """Read a promoted runs index from disk.""" + + return promoted_runs_index_from_json(Path(path).read_text(encoding="utf-8")) + + +def load_promoted_runs_index( + path: str | Path, + *, + updated_at: str, +) -> PromotedRunsIndex: + """Read a promoted runs index, returning an empty index when absent.""" + + index_path = Path(path) + if not index_path.exists(): + return empty_promoted_runs_index(updated_at) + return read_promoted_runs_index(index_path) + + +def write_promoted_runs_index( + index: PromotedRunsIndex, + path: str | Path, +) -> PromotedRunsIndex: + """Write the promoted runs index to disk.""" + + index_path = Path(path) + index_path.parent.mkdir(parents=True, exist_ok=True) + index_path.write_text(promoted_runs_index_to_json(index), encoding="utf-8") + return index + + +def update_promoted_runs_index( + *, + path: str | Path, + entry: PromotedRunIndexEntry, + updated_at: str, +) -> tuple[PromotedRunsIndex, PromotedRunsIndexUpdate]: + """Load, upsert, and persist one promoted run entry.""" + + index = load_promoted_runs_index(path, updated_at=updated_at) + updated, update = upsert_promoted_run(index, entry, updated_at=updated_at) + write_promoted_runs_index(updated, path) + return updated, update + + +def 
_validate_result_matches_context( + result: FullPromotionResult, + context: ReleasePromotionContext, +) -> None: + if result.run_id != context.run_id: + raise ValueError("promotion_result.run_id must match context.run_id") + if result.candidate_version != context.candidate_version: + raise ValueError( + "promotion_result.candidate_version must match context.candidate_version" + ) + if result.release_version != context.release_version: + raise ValueError( + "promotion_result.release_version must match context.release_version" + ) + + +def _release_versions_without_run( + index: PromotedRunsIndex, + run_id: str, +) -> dict[str, PromotedReleaseVersionEntry]: + release_versions: dict[str, PromotedReleaseVersionEntry] = {} + for release_version, entry in index.release_versions.items(): + run_ids = tuple(item for item in entry.run_ids if item != run_id) + if not run_ids: + continue + latest_run_id = ( + entry.latest_run_id if entry.latest_run_id in run_ids else run_ids[-1] + ) + release_versions[release_version] = PromotedReleaseVersionEntry( + release_version=release_version, + latest_run_id=latest_run_id, + run_ids=run_ids, + updated_at=entry.updated_at, + ) + return release_versions + + +def _coerce_runs( + value: Mapping[str, PromotedRunIndexEntry | Mapping[str, Any]], +) -> Mapping[str, PromotedRunIndexEntry]: + if not isinstance(value, Mapping): + raise ValueError("runs must be a mapping") + runs: dict[str, PromotedRunIndexEntry] = {} + for run_id, entry in value.items(): + if isinstance(entry, PromotedRunIndexEntry): + runs[str(run_id)] = entry + elif isinstance(entry, Mapping): + runs[str(run_id)] = PromotedRunIndexEntry.from_dict(entry) + else: + raise ValueError("runs entries must be PromotedRunIndexEntry mappings") + for run_id, entry in runs.items(): + if run_id != entry.run_id: + raise ValueError("runs keys must match entry.run_id") + return freeze_mapping(runs, "runs") + + +def _coerce_release_versions( + value: Mapping[str, PromotedReleaseVersionEntry | 
Mapping[str, Any]], +) -> Mapping[str, PromotedReleaseVersionEntry]: + if not isinstance(value, Mapping): + raise ValueError("release_versions must be a mapping") + release_versions: dict[str, PromotedReleaseVersionEntry] = {} + for release_version, entry in value.items(): + if isinstance(entry, PromotedReleaseVersionEntry): + release_versions[str(release_version)] = entry + elif isinstance(entry, Mapping): + release_versions[str(release_version)] = ( + PromotedReleaseVersionEntry.from_dict(entry) + ) + else: + raise ValueError( + "release_versions entries must be PromotedReleaseVersionEntry mappings" + ) + for release_version, entry in release_versions.items(): + if release_version != entry.release_version: + raise ValueError("release_versions keys must match entry.release_version") + return freeze_mapping(release_versions, "release_versions") + + +def _validate_release_version_entries( + runs: Mapping[str, PromotedRunIndexEntry], + release_versions: Mapping[str, PromotedReleaseVersionEntry], +) -> None: + for release_version, entry in release_versions.items(): + for run_id in entry.run_ids: + run = runs.get(run_id) + if run is None: + raise ValueError("release_versions run_ids must exist in runs") + if run.release_version != release_version: + raise ValueError( + "release_versions run_ids must match their release_version" + ) + + +def _nonnegative_int(data: Mapping[str, Any], field_name: str) -> int: + value = data.get(field_name) + return _nonnegative_int_value(value, field_name) + + +def _nonnegative_int_value(value: Any, field_name: str) -> int: + validate_optional_int(value, field_name) + if value is None: + raise ValueError(f"{field_name} must be an integer") + if value < 0: + raise ValueError(f"{field_name} must be non-negative") + return value + + +def _bool_value(value: Any, field_name: str) -> bool: + if not isinstance(value, bool): + raise ValueError(f"{field_name} must be a boolean") + return value + + +def _bool_field( + data: Mapping[str, Any], + 
field_name: str, + *, + default: bool | None = None, +) -> bool: + value = data.get(field_name, default) + return _bool_value(value, field_name) + + +def _string_tuple(value: Any, field_name: str) -> tuple[str, ...]: + if not isinstance(value, tuple | list): + raise ValueError(f"{field_name} must be a tuple or list") + return tuple(require_non_empty(item, field_name) for item in value) diff --git a/policyengine_us_data/release_promotion/published_index.py b/policyengine_us_data/release_promotion/published_index.py new file mode 100644 index 000000000..964e1219f --- /dev/null +++ b/policyengine_us_data/release_promotion/published_index.py @@ -0,0 +1,531 @@ +"""Published artifact index rows for Stage 5 release promotion.""" + +from __future__ import annotations + +from collections.abc import Mapping, Sequence +from dataclasses import dataclass, field +from pathlib import Path, PurePosixPath +from typing import Any + +from policyengine_us_data.pipeline_metadata import pipeline_node +from policyengine_us_data.stage_contracts import ArtifactRef +from policyengine_us_data.stage_contracts._coercion import ( + freeze_mapping, + jsonable_value, + mapping_value, + optional_int_value, + optional_string, + optional_string_value, + require_non_empty, + required_string, + schema_version, + validate_optional_int, + validate_schema_version, +) +from policyengine_us_data.stage_contracts.constants import CONTRACT_SCHEMA_VERSION +from policyengine_us_data.stage_contracts.stages import ( + STAGE_5_VALIDATE_AND_PROMOTE_RELEASE, +) +from policyengine_us_data.utils.canonical_json import canonical_json_dumps + +from .candidate import ReleaseCandidateInputBundle +from .context import ReleasePromotionContext +from .results import FullPromotionResult + +PUBLISHED_ARTIFACT_INDEX_FILENAME = "published_artifact_index.jsonl" +PUBLISHED_ARTIFACT_INDEX_MEDIA_TYPE = "application/jsonl" + + +def published_artifact_index_repo_path(run_id: str) -> str: + """Return the run-scoped repository path for 
the published artifact index.""" + + return f"calibration/runs/{run_id}/diagnostics/{PUBLISHED_ARTIFACT_INDEX_FILENAME}" + + +def published_artifact_index_path(run_dir: str | Path) -> Path: + """Return the run-local diagnostics path for the published artifact index.""" + + return Path(run_dir) / "diagnostics" / PUBLISHED_ARTIFACT_INDEX_FILENAME + + +def published_artifact_index_artifact_ref( + context: ReleasePromotionContext, + *, + row_count: int | None = None, + sha256: str | None = None, + size_bytes: int | None = None, +) -> ArtifactRef: + """Return a stage-contract reference to the published artifact index.""" + + metadata: dict[str, Any] = { + "artifact_family": "published_artifact_index", + "source_stage_id": STAGE_5_VALIDATE_AND_PROMOTE_RELEASE, + "relative_path": published_artifact_index_repo_path(context.run_id), + } + if row_count is not None: + metadata["row_count"] = row_count + return ArtifactRef( + logical_name="published_artifact_index", + uri=( + f"hf://{context.hf_repo_name}/" + f"{published_artifact_index_repo_path(context.run_id)}" + ), + sha256=sha256, + size_bytes=size_bytes, + media_type=PUBLISHED_ARTIFACT_INDEX_MEDIA_TYPE, + metadata=metadata, + ) + + +@pipeline_node( + id="published_artifact_index_row", + label="PublishedArtifactIndexRow", + node_type="library", + description="One published HF/GCS artifact row emitted by Stage 5.", + status="transitional", + stability="moving", + pathways=["5_validate_and_promote_release"], + artifacts_in=["release candidate bundle", "release manifest"], + artifacts_out=["published_artifact_index.jsonl"], + validation_commands=[ + "uv run pytest tests/unit/release_promotion/test_published_index.py" + ], +) +@dataclass(frozen=True, kw_only=True) +class PublishedArtifactIndexRow: + """One row in the Stage 5 published artifact JSONL index.""" + + run_id: str + candidate_version: str + release_version: str + logical_name: str + relative_path: str + artifact_role: str + artifact_family: str + source_stage_id: 
str + hf_uri: str + gcs_uri: str | None = None + sha256: str | None = None + size_bytes: int | None = None + area_type: str | None = None + area_id: str | None = None + release_manifest_key: str | None = None + release_manifest_revision: str | None = None + metadata: Mapping[str, Any] = field(default_factory=dict) + schema_version: str = CONTRACT_SCHEMA_VERSION + + def __post_init__(self) -> None: + validate_schema_version(self.schema_version, self.__class__.__name__) + for field_name in ( + "run_id", + "candidate_version", + "release_version", + "logical_name", + "relative_path", + "artifact_role", + "artifact_family", + "source_stage_id", + "hf_uri", + ): + object.__setattr__( + self, + field_name, + require_non_empty(getattr(self, field_name), field_name), + ) + object.__setattr__( + self, + "gcs_uri", + optional_string_value(self.gcs_uri, "gcs_uri"), + ) + object.__setattr__( + self, + "sha256", + optional_string_value(self.sha256, "sha256"), + ) + validate_optional_int(self.size_bytes, "size_bytes") + if self.size_bytes is not None and self.size_bytes < 0: + raise ValueError("size_bytes must be non-negative") + object.__setattr__( + self, + "area_type", + optional_string_value(self.area_type, "area_type"), + ) + object.__setattr__( + self, + "area_id", + optional_string_value(self.area_id, "area_id"), + ) + object.__setattr__( + self, + "release_manifest_key", + optional_string_value( + self.release_manifest_key, + "release_manifest_key", + ), + ) + object.__setattr__( + self, + "release_manifest_revision", + optional_string_value( + self.release_manifest_revision, + "release_manifest_revision", + ), + ) + object.__setattr__( + self, + "metadata", + freeze_mapping(self.metadata, "metadata"), + ) + + def to_dict(self) -> dict[str, Any]: + """Serialize the row to JSON-compatible primitives.""" + + return { + "run_id": self.run_id, + "candidate_version": self.candidate_version, + "release_version": self.release_version, + "logical_name": self.logical_name, + 
"relative_path": self.relative_path, + "artifact_role": self.artifact_role, + "artifact_family": self.artifact_family, + "source_stage_id": self.source_stage_id, + "hf_uri": self.hf_uri, + "gcs_uri": self.gcs_uri, + "sha256": self.sha256, + "size_bytes": self.size_bytes, + "area_type": self.area_type, + "area_id": self.area_id, + "release_manifest_key": self.release_manifest_key, + "release_manifest_revision": self.release_manifest_revision, + "metadata": jsonable_value(self.metadata), + "schema_version": self.schema_version, + } + + @classmethod + def from_dict(cls, data: Mapping[str, Any]) -> "PublishedArtifactIndexRow": + """Restore an index row from serialized data.""" + + return cls( + run_id=required_string(data, "run_id"), + candidate_version=required_string(data, "candidate_version"), + release_version=required_string(data, "release_version"), + logical_name=required_string(data, "logical_name"), + relative_path=required_string(data, "relative_path"), + artifact_role=required_string(data, "artifact_role"), + artifact_family=required_string(data, "artifact_family"), + source_stage_id=required_string(data, "source_stage_id"), + hf_uri=required_string(data, "hf_uri"), + gcs_uri=optional_string(data, "gcs_uri"), + sha256=optional_string(data, "sha256"), + size_bytes=optional_int_value(data, "size_bytes"), + area_type=optional_string(data, "area_type"), + area_id=optional_string(data, "area_id"), + release_manifest_key=optional_string(data, "release_manifest_key"), + release_manifest_revision=optional_string( + data, + "release_manifest_revision", + ), + metadata=mapping_value(data, "metadata"), + schema_version=schema_version(data), + ) + + +@pipeline_node( + id="published_artifact_index_builder", + label="Published Artifact Index Builder", + node_type="library", + description="Build the Stage 5 published artifact JSONL index.", + status="transitional", + stability="moving", + pathways=["5_validate_and_promote_release"], + artifacts_in=["release candidate 
bundle", "typed promotion result"], + artifacts_out=["published_artifact_index.jsonl"], + validation_commands=[ + "uv run pytest tests/unit/release_promotion/test_published_index.py" + ], +) +def build_published_artifact_index( + *, + candidate_bundle: ReleaseCandidateInputBundle, + promotion_result: FullPromotionResult, + release_manifest: Mapping[str, Any] | None = None, + diagnostic_artifacts: Sequence[ArtifactRef] = (), +) -> tuple[PublishedArtifactIndexRow, ...]: + """Build deterministic published artifact rows for a promoted release.""" + + _validate_result_matches_candidate(promotion_result, candidate_bundle) + context = candidate_bundle.context + manifest_artifacts = _release_manifest_artifacts(release_manifest) + rows = [ + *( + _candidate_artifact_row( + context=context, + artifact=artifact, + manifest_artifacts=manifest_artifacts, + ) + for artifact in candidate_bundle.artifacts + ), + *_release_metadata_rows(context, promotion_result), + *( + _diagnostic_artifact_row(context=context, artifact=artifact) + for artifact in sorted( + diagnostic_artifacts, + key=lambda item: item.logical_name, + ) + ), + ] + return tuple(sorted(rows, key=lambda row: (row.artifact_role, row.relative_path))) + + +def published_artifact_index_to_jsonl( + rows: Sequence[PublishedArtifactIndexRow], +) -> str: + """Serialize published artifact rows to deterministic JSONL.""" + + return "".join( + canonical_json_dumps( + row.to_dict(), + compact=True, + trailing_newline=False, + ) + + "\n" + for row in rows + ) + + +def published_artifact_index_from_jsonl( + payload: str, +) -> tuple[PublishedArtifactIndexRow, ...]: + """Read published artifact rows from JSONL text.""" + + import json + + return tuple( + PublishedArtifactIndexRow.from_dict(json.loads(line)) + for line in payload.splitlines() + if line.strip() + ) + + +def write_published_artifact_index( + rows: Sequence[PublishedArtifactIndexRow], + path: str | Path, +) -> tuple[PublishedArtifactIndexRow, ...]: + """Write published 
artifact rows to an explicit JSONL path.""" + + frozen_rows = tuple(rows) + index_path = Path(path) + index_path.parent.mkdir(parents=True, exist_ok=True) + index_path.write_text( + published_artifact_index_to_jsonl(frozen_rows), + encoding="utf-8", + ) + return frozen_rows + + +def read_published_artifact_index( + path: str | Path, +) -> tuple[PublishedArtifactIndexRow, ...]: + """Read published artifact rows from a JSONL path.""" + + return published_artifact_index_from_jsonl(Path(path).read_text(encoding="utf-8")) + + +def _validate_result_matches_candidate( + result: FullPromotionResult, + candidate_bundle: ReleaseCandidateInputBundle, +) -> None: + context = candidate_bundle.context + if result.run_id != context.run_id: + raise ValueError("promotion_result.run_id must match context.run_id") + if result.candidate_version != context.candidate_version: + raise ValueError( + "promotion_result.candidate_version must match context.candidate_version" + ) + if result.release_version != context.release_version: + raise ValueError( + "promotion_result.release_version must match context.release_version" + ) + if result.artifact_count != len(candidate_bundle.artifacts): + raise ValueError( + "promotion_result.artifact_count must match candidate artifacts" + ) + + +def _release_manifest_artifacts( + release_manifest: Mapping[str, Any] | None, +) -> dict[str, Mapping[str, Any]]: + if not release_manifest: + return {} + artifacts = release_manifest.get("artifacts", {}) + if not isinstance(artifacts, Mapping): + return {} + return { + str(key): artifact + for key, artifact in artifacts.items() + if isinstance(artifact, Mapping) + } + + +def _candidate_artifact_row( + *, + context: ReleasePromotionContext, + artifact, + manifest_artifacts: Mapping[str, Mapping[str, Any]], +) -> PublishedArtifactIndexRow: + manifest_key = _release_manifest_key(artifact.relative_path) + manifest_entry = manifest_artifacts.get(manifest_key, {}) + relative_path = str(manifest_entry.get("path") or 
artifact.relative_path) + manifest_size_bytes = _optional_manifest_int(manifest_entry, "size_bytes") + return PublishedArtifactIndexRow( + run_id=context.run_id, + candidate_version=context.candidate_version, + release_version=context.release_version, + logical_name=artifact.logical_name, + relative_path=relative_path, + artifact_role="release_artifact", + artifact_family=artifact.artifact_family, + source_stage_id=artifact.source_stage_id, + hf_uri=f"hf://{context.hf_repo_name}/{relative_path}", + gcs_uri=f"gs://{context.gcs_bucket_name}/{relative_path}", + sha256=_optional_manifest_string(manifest_entry, "sha256") or artifact.sha256, + size_bytes=manifest_size_bytes + if manifest_size_bytes is not None + else artifact.size_bytes, + area_type=artifact.area_type, + area_id=artifact.area_id, + release_manifest_key=manifest_key, + release_manifest_revision=_optional_manifest_string( + manifest_entry, + "revision", + ), + metadata={ + "release_manifest_kind": _optional_manifest_string( + manifest_entry, + "kind", + ), + "candidate_metadata": jsonable_value(artifact.metadata), + }, + ) + + +def _release_metadata_rows( + context: ReleasePromotionContext, + result: FullPromotionResult, +) -> tuple[PublishedArtifactIndexRow, ...]: + completion_marker = ( + result.completion_marker.marker_path + or f"releases/{context.release_version}/release-complete.json" + ) + artifacts = ( + ( + "release_manifest", + "release_manifest.json", + "release_manifest", + "application/json", + ), + ( + "versioned_release_manifest", + f"releases/{context.release_version}/release_manifest.json", + "release_manifest", + "application/json", + ), + ("trace_tro", "trace.tro.jsonld", "trace_tro", "application/ld+json"), + ( + "versioned_trace_tro", + f"releases/{context.release_version}/trace.tro.jsonld", + "trace_tro", + "application/ld+json", + ), + ( + "version_manifest", + "version_manifest.json", + "version_manifest", + "application/json", + ), + ( + "release_completion_marker", + 
completion_marker, + "release_completion_marker", + "application/json", + ), + ) + return tuple( + PublishedArtifactIndexRow( + run_id=context.run_id, + candidate_version=context.candidate_version, + release_version=context.release_version, + logical_name=logical_name, + relative_path=relative_path, + artifact_role="release_metadata", + artifact_family=artifact_family, + source_stage_id=STAGE_5_VALIDATE_AND_PROMOTE_RELEASE, + hf_uri=f"hf://{context.hf_repo_name}/{relative_path}", + metadata={"media_type": media_type}, + ) + for logical_name, relative_path, artifact_family, media_type in artifacts + ) + + +def _diagnostic_artifact_row( + *, + context: ReleasePromotionContext, + artifact: ArtifactRef, +) -> PublishedArtifactIndexRow: + relative_path = _artifact_relative_path(artifact, context) + return PublishedArtifactIndexRow( + run_id=context.run_id, + candidate_version=context.candidate_version, + release_version=context.release_version, + logical_name=artifact.logical_name, + relative_path=relative_path, + artifact_role="diagnostic", + artifact_family=str(artifact.metadata.get("artifact_family") or "diagnostic"), + source_stage_id=str( + artifact.metadata.get("source_stage_id") + or STAGE_5_VALIDATE_AND_PROMOTE_RELEASE + ), + hf_uri=artifact.uri, + sha256=artifact.sha256, + size_bytes=artifact.size_bytes, + metadata={ + key: value + for key, value in jsonable_value(artifact.metadata).items() + if key not in {"artifact_family", "source_stage_id"} + }, + ) + + +def _artifact_relative_path( + artifact: ArtifactRef, + context: ReleasePromotionContext, +) -> str: + metadata_path = artifact.metadata.get("relative_path") + if isinstance(metadata_path, str) and metadata_path: + return metadata_path + hf_prefix = f"hf://{context.hf_repo_name}/" + if artifact.uri.startswith(hf_prefix): + return artifact.uri[len(hf_prefix) :] + return artifact.uri + + +def _release_manifest_key(relative_path: str) -> str: + return PurePosixPath(relative_path).with_suffix("").as_posix() + + 
+def _optional_manifest_string( + manifest_entry: Mapping[str, Any], + key: str, +) -> str | None: + value = manifest_entry.get(key) + return value if isinstance(value, str) and value else None + + +def _optional_manifest_int( + manifest_entry: Mapping[str, Any], + key: str, +) -> int | None: + value = manifest_entry.get(key) + if isinstance(value, bool): + return None + return value if isinstance(value, int) else None diff --git a/policyengine_us_data/release_promotion/results.py b/policyengine_us_data/release_promotion/results.py new file mode 100644 index 000000000..e527c1f48 --- /dev/null +++ b/policyengine_us_data/release_promotion/results.py @@ -0,0 +1,435 @@ +"""Typed Stage 5 promotion result models.""" + +from __future__ import annotations + +from collections.abc import Mapping +from dataclasses import dataclass, field +from typing import Any + +from policyengine_us_data.pipeline_metadata import pipeline_node +from policyengine_us_data.stage_contracts._coercion import ( + freeze_mapping, + jsonable_value, + mapping_value, + optional_string, + optional_string_value, + require_non_empty, + required_string, + schema_version, + validate_schema_version, +) +from policyengine_us_data.stage_contracts.constants import CONTRACT_SCHEMA_VERSION + + +def _nonnegative_int(value: Any, field_name: str) -> int: + if isinstance(value, bool) or not isinstance(value, int): + raise ValueError(f"{field_name} must be an integer") + if value < 0: + raise ValueError(f"{field_name} must be non-negative") + return value + + +def _bool_value(value: Any, field_name: str) -> bool: + if not isinstance(value, bool): + raise ValueError(f"{field_name} must be a boolean") + return value + + +@dataclass(frozen=True, kw_only=True) +class HuggingFacePromotionResult: + """Result for copying staged Hugging Face artifacts to production paths.""" + + promoted_count: int + schema_version: str = CONTRACT_SCHEMA_VERSION + + def __post_init__(self) -> None: + 
validate_schema_version(self.schema_version, self.__class__.__name__) + object.__setattr__( + self, + "promoted_count", + _nonnegative_int(self.promoted_count, "promoted_count"), + ) + + def to_dict(self) -> dict[str, Any]: + """Serialize this result to JSON-compatible primitives.""" + + return { + "promoted_count": self.promoted_count, + "schema_version": self.schema_version, + } + + @classmethod + def from_dict(cls, data: Mapping[str, Any]) -> "HuggingFacePromotionResult": + """Restore a Hugging Face promotion result from a mapping.""" + + return cls( + promoted_count=_nonnegative_int( + data.get("promoted_count"), + "promoted_count", + ), + schema_version=schema_version(data), + ) + + +@dataclass(frozen=True, kw_only=True) +class GcsPromotionResult: + """Result for uploading staged Hugging Face artifacts to GCS.""" + + uploaded_count: int + schema_version: str = CONTRACT_SCHEMA_VERSION + + def __post_init__(self) -> None: + validate_schema_version(self.schema_version, self.__class__.__name__) + object.__setattr__( + self, + "uploaded_count", + _nonnegative_int(self.uploaded_count, "uploaded_count"), + ) + + def to_dict(self) -> dict[str, Any]: + """Serialize this result to JSON-compatible primitives.""" + + return { + "uploaded_count": self.uploaded_count, + "schema_version": self.schema_version, + } + + @classmethod + def from_dict(cls, data: Mapping[str, Any]) -> "GcsPromotionResult": + """Restore a GCS promotion result from a mapping.""" + + return cls( + uploaded_count=_nonnegative_int( + data.get("uploaded_count"), + "uploaded_count", + ), + schema_version=schema_version(data), + ) + + +@dataclass(frozen=True, kw_only=True) +class ReleaseManifestPromotionResult: + """Result for writing the release manifest and TRACE TRO artifacts.""" + + artifact_count: int + schema_version: str = CONTRACT_SCHEMA_VERSION + + def __post_init__(self) -> None: + validate_schema_version(self.schema_version, self.__class__.__name__) + object.__setattr__( + self, + 
"artifact_count", + _nonnegative_int(self.artifact_count, "artifact_count"), + ) + + def to_dict(self) -> dict[str, Any]: + """Serialize this result to JSON-compatible primitives.""" + + return { + "artifact_count": self.artifact_count, + "schema_version": self.schema_version, + } + + @classmethod + def from_dict(cls, data: Mapping[str, Any]) -> "ReleaseManifestPromotionResult": + """Restore a release manifest result from a mapping.""" + + return cls( + artifact_count=_nonnegative_int( + data.get("artifact_count"), + "artifact_count", + ), + schema_version=schema_version(data), + ) + + +@dataclass(frozen=True, kw_only=True) +class VersionManifestPromotionResult: + """Result for updating the public version manifest.""" + + updated: bool + schema_version: str = CONTRACT_SCHEMA_VERSION + + def __post_init__(self) -> None: + validate_schema_version(self.schema_version, self.__class__.__name__) + object.__setattr__(self, "updated", _bool_value(self.updated, "updated")) + + def to_dict(self) -> dict[str, Any]: + """Serialize this result to JSON-compatible primitives.""" + + return { + "updated": self.updated, + "schema_version": self.schema_version, + } + + @classmethod + def from_dict(cls, data: Mapping[str, Any]) -> "VersionManifestPromotionResult": + """Restore a version manifest result from a mapping.""" + + return cls( + updated=_bool_value(data.get("updated"), "updated"), + schema_version=schema_version(data), + ) + + +@dataclass(frozen=True, kw_only=True) +class CompletionMarkerPromotionResult: + """Result for writing or verifying the release completion marker.""" + + marker_path: str | None + schema_version: str = CONTRACT_SCHEMA_VERSION + + def __post_init__(self) -> None: + validate_schema_version(self.schema_version, self.__class__.__name__) + object.__setattr__( + self, + "marker_path", + optional_string_value(self.marker_path, "marker_path"), + ) + + def to_dict(self) -> dict[str, Any]: + """Serialize this result to JSON-compatible primitives.""" + + return 
{ + "marker_path": self.marker_path, + "schema_version": self.schema_version, + } + + @classmethod + def from_dict(cls, data: Mapping[str, Any]) -> "CompletionMarkerPromotionResult": + """Restore a completion marker result from a mapping.""" + + return cls( + marker_path=optional_string(data, "marker_path"), + schema_version=schema_version(data), + ) + + +@dataclass(frozen=True, kw_only=True) +class CleanupPromotionResult: + """Result for post-certification staging cleanup.""" + + cleaned_count: int + attempted: bool = True + schema_version: str = CONTRACT_SCHEMA_VERSION + + def __post_init__(self) -> None: + validate_schema_version(self.schema_version, self.__class__.__name__) + object.__setattr__( + self, + "cleaned_count", + _nonnegative_int(self.cleaned_count, "cleaned_count"), + ) + object.__setattr__(self, "attempted", _bool_value(self.attempted, "attempted")) + + def to_dict(self) -> dict[str, Any]: + """Serialize this result to JSON-compatible primitives.""" + + return { + "cleaned_count": self.cleaned_count, + "attempted": self.attempted, + "schema_version": self.schema_version, + } + + @classmethod + def from_dict(cls, data: Mapping[str, Any]) -> "CleanupPromotionResult": + """Restore a cleanup result from a mapping.""" + + return cls( + cleaned_count=_nonnegative_int( + data.get("cleaned_count"), + "cleaned_count", + ), + attempted=_bool_value(data.get("attempted", True), "attempted"), + schema_version=schema_version(data), + ) + + +@pipeline_node( + id="full_promotion_result", + label="FullPromotionResult", + node_type="library", + description="Typed Stage 5 result model for full release promotion outcomes.", + status="transitional", + stability="moving", + pathways=["5_validate_and_promote_release"], + artifacts_in=["release promotion transaction output"], + artifacts_out=["typed promotion result"], + validation_commands=["uv run pytest tests/unit/release_promotion/test_results.py"], +) +@dataclass(frozen=True, kw_only=True) +class FullPromotionResult: 
+ """Typed result for a full Stage 5 release promotion transaction.""" + + run_id: str + candidate_version: str + release_version: str + artifact_count: int + hf: HuggingFacePromotionResult + gcs: GcsPromotionResult + release_manifest: ReleaseManifestPromotionResult + version_manifest: VersionManifestPromotionResult + completion_marker: CompletionMarkerPromotionResult + cleanup: CleanupPromotionResult + already_finalized: bool = False + metadata: Mapping[str, Any] = field(default_factory=dict) + schema_version: str = CONTRACT_SCHEMA_VERSION + + def __post_init__(self) -> None: + validate_schema_version(self.schema_version, self.__class__.__name__) + object.__setattr__(self, "run_id", require_non_empty(self.run_id, "run_id")) + object.__setattr__( + self, + "candidate_version", + require_non_empty(self.candidate_version, "candidate_version"), + ) + object.__setattr__( + self, + "release_version", + require_non_empty(self.release_version, "release_version"), + ) + object.__setattr__( + self, + "artifact_count", + _nonnegative_int(self.artifact_count, "artifact_count"), + ) + _require_type(self.hf, "hf", HuggingFacePromotionResult) + _require_type(self.gcs, "gcs", GcsPromotionResult) + _require_type( + self.release_manifest, + "release_manifest", + ReleaseManifestPromotionResult, + ) + _require_type( + self.version_manifest, + "version_manifest", + VersionManifestPromotionResult, + ) + _require_type( + self.completion_marker, + "completion_marker", + CompletionMarkerPromotionResult, + ) + _require_type(self.cleanup, "cleanup", CleanupPromotionResult) + object.__setattr__( + self, + "already_finalized", + _bool_value(self.already_finalized, "already_finalized"), + ) + object.__setattr__( + self, + "metadata", + freeze_mapping(self.metadata, "metadata"), + ) + + def to_dict(self) -> dict[str, Any]: + """Serialize this result to JSON-compatible primitives.""" + + return { + "run_id": self.run_id, + "candidate_version": self.candidate_version, + "release_version": 
self.release_version, + "artifact_count": self.artifact_count, + "hf": self.hf.to_dict(), + "gcs": self.gcs.to_dict(), + "release_manifest": self.release_manifest.to_dict(), + "version_manifest": self.version_manifest.to_dict(), + "completion_marker": self.completion_marker.to_dict(), + "cleanup": self.cleanup.to_dict(), + "already_finalized": self.already_finalized, + "metadata": jsonable_value(self.metadata), + "schema_version": self.schema_version, + } + + @classmethod + def from_dict(cls, data: Mapping[str, Any]) -> "FullPromotionResult": + """Restore a full promotion result from a mapping.""" + + return cls( + run_id=required_string(data, "run_id"), + candidate_version=required_string(data, "candidate_version"), + release_version=required_string(data, "release_version"), + artifact_count=_nonnegative_int( + data.get("artifact_count"), + "artifact_count", + ), + hf=HuggingFacePromotionResult.from_dict(mapping_value(data, "hf")), + gcs=GcsPromotionResult.from_dict(mapping_value(data, "gcs")), + release_manifest=ReleaseManifestPromotionResult.from_dict( + mapping_value(data, "release_manifest"), + ), + version_manifest=VersionManifestPromotionResult.from_dict( + mapping_value(data, "version_manifest"), + ), + completion_marker=CompletionMarkerPromotionResult.from_dict( + mapping_value(data, "completion_marker"), + ), + cleanup=CleanupPromotionResult.from_dict(mapping_value(data, "cleanup")), + already_finalized=_bool_value( + data.get("already_finalized", False), + "already_finalized", + ), + metadata=mapping_value(data, "metadata"), + schema_version=schema_version(data), + ) + + @classmethod + def from_legacy_dict(cls, data: Mapping[str, Any]) -> "FullPromotionResult": + """Build a typed result from the existing promotion dictionary output.""" + + already_finalized = _bool_value( + data.get("already_finalized", False), + "already_finalized", + ) + return cls( + run_id=required_string(data, "run_id"), + candidate_version=required_string(data, 
"candidate_version"), + release_version=required_string(data, "release_version"), + artifact_count=_nonnegative_int( + data.get("artifact_count"), + "artifact_count", + ), + hf=HuggingFacePromotionResult( + promoted_count=_nonnegative_int( + data.get("hf_promoted"), + "hf_promoted", + ), + ), + gcs=GcsPromotionResult( + uploaded_count=_nonnegative_int( + data.get("gcs_uploaded"), + "gcs_uploaded", + ), + ), + release_manifest=ReleaseManifestPromotionResult( + artifact_count=_nonnegative_int( + data.get("release_manifest_artifacts"), + "release_manifest_artifacts", + ), + ), + version_manifest=VersionManifestPromotionResult( + updated=_bool_value( + data.get("version_manifest_updated", not already_finalized), + "version_manifest_updated", + ), + ), + completion_marker=CompletionMarkerPromotionResult( + marker_path=optional_string(data, "release_completion_marker"), + ), + cleanup=CleanupPromotionResult( + cleaned_count=_nonnegative_int( + data.get("staging_cleaned"), + "staging_cleaned", + ), + attempted=_bool_value( + data.get("staging_cleanup_attempted", True), + "staging_cleanup_attempted", + ), + ), + already_finalized=already_finalized, + ) + + +def _require_type(value: Any, field_name: str, expected_type: type) -> None: + if not isinstance(value, expected_type): + raise ValueError(f"{field_name} must be {expected_type.__name__}") diff --git a/policyengine_us_data/release_promotion/validation.py b/policyengine_us_data/release_promotion/validation.py new file mode 100644 index 000000000..ac5b0ce71 --- /dev/null +++ b/policyengine_us_data/release_promotion/validation.py @@ -0,0 +1,572 @@ +"""Canonical validation-report adapters for Stage 5 release candidates.""" + +from __future__ import annotations + +from collections import Counter +from collections.abc import Callable, Mapping, Sequence +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, TypeAlias + +from policyengine_us_data.pipeline_metadata import pipeline_node +from 
policyengine_us_data.stage_contracts import ValidationFinding, ValidationReport +from policyengine_us_data.stage_contracts.stages import ( + STAGE_5_VALIDATE_AND_PROMOTE_RELEASE, +) +from policyengine_us_data.utils.error_redaction import ( + DEFAULT_ERROR_MESSAGE_MAX_CHARS, + redacted_bounded_error_text, +) +from policyengine_us_data.validation_core import ( + ValidationArtifactResolver, + ValidationCheck, + ValidationContext, + ValidationRunner, + ValidationSuite, +) + +from .candidate import ReleaseCandidateInputBundle + +ManifestFile: TypeAlias = tuple[Path | str, str] + +RELEASE_VALIDATION_SUBSTAGE_ID = "5a_validate_outputs" +DEFAULT_REQUIRED_RELEASE_ARTIFACT_FAMILIES = ( + "national_h5", + "state_h5", + "district_h5", + "city_h5", +) + + +@dataclass(frozen=True, kw_only=True) +class ReleaseCandidateValidationDependencies: + """Side-effecting release checks used by ``ReleaseCandidateValidator``.""" + + get_matching_finalized_release_manifest: Callable[..., Mapping[str, Any] | None] + list_missing_staged_artifacts: Callable[..., Sequence[str]] + list_missing_validation_reports: Callable[..., Sequence[str]] + preflight_release_manifest_publish: Callable[..., tuple[bool, Sequence[str]]] + release_completion_marker_exists: Callable[..., bool] + + +@dataclass(frozen=True, kw_only=True) +class _FinalizedReleaseState: + manifest: Mapping[str, Any] | None = None + error: Exception | None = None + checked: bool = False + + +def default_release_candidate_validation_dependencies() -> ( + ReleaseCandidateValidationDependencies +): + """Return production adapters for Stage 5 candidate validation checks.""" + + from policyengine_us_data.utils import data_upload + + return ReleaseCandidateValidationDependencies( + get_matching_finalized_release_manifest=( + data_upload.get_matching_finalized_release_manifest + ), + list_missing_staged_artifacts=data_upload.list_missing_staged_artifacts, + list_missing_validation_reports=_list_missing_validation_reports_on_hf, + 
preflight_release_manifest_publish=( + data_upload.preflight_release_manifest_publish + ), + release_completion_marker_exists=data_upload.release_completion_marker_exists_on_hf, + ) + + +@pipeline_node( + id="release_candidate_shape_report", + label="Release Candidate Shape Report", + node_type="validation", + description="Adapt Stage 5 release-candidate shape checks into the canonical validation report schema.", + status="transitional", + stability="moving", + pathways=["5_validate_and_promote_release"], + validation_commands=[ + "uv run pytest tests/unit/release_promotion/test_candidate.py" + ], +) +def build_release_candidate_shape_report( + bundle: ReleaseCandidateInputBundle, +) -> ValidationReport: + """Describe candidate-bundle shape using the shared validation schema.""" + + families = Counter(artifact.artifact_family for artifact in bundle.artifacts) + return ValidationReport( + status="pass", + findings=( + ValidationFinding( + check_id="release_candidate_identity_declared", + status="pass", + message="Release candidate declares one canonical run and release identity.", + metadata={ + "run_id": bundle.context.run_id, + "candidate_version": bundle.context.candidate_version, + "release_version": bundle.context.release_version, + "hf_staging_prefix": bundle.context.hf_staging_prefix, + "release_candidate_fingerprint": ( + bundle.release_candidate_fingerprint + ), + }, + ), + ValidationFinding( + check_id="release_candidate_artifacts_declared", + status="pass", + message="Release candidate declares typed release artifacts.", + metric="artifact_count", + value=len(bundle.artifacts), + threshold=1, + metadata={ + "artifact_families": dict(sorted(families.items())), + "required_artifacts": sum( + 1 for artifact in bundle.artifacts if artifact.required + ), + }, + ), + ), + metadata={ + "stage_id": STAGE_5_VALIDATE_AND_PROMOTE_RELEASE, + "substage_id": RELEASE_VALIDATION_SUBSTAGE_ID, + "run_id": bundle.context.run_id, + "release_candidate_fingerprint": 
bundle.release_candidate_fingerprint, + "validation_kind": "candidate_shape", + }, + ) + + +@pipeline_node( + id="release_candidate_validator", + label="ReleaseCandidateValidator", + node_type="validation", + description="Stage 5 validation-core adapter for release candidates before promotion side effects.", + status="transitional", + stability="moving", + pathways=["5_validate_and_promote_release"], + artifacts_in=["ReleaseCandidateInputBundle", "staged release artifacts"], + artifacts_out=["ValidationReport"], + validation_commands=[ + "uv run pytest tests/unit/release_promotion/test_validation.py" + ], +) +@dataclass(frozen=True, kw_only=True) +class ReleaseCandidateValidator: + """Validate a Stage 5 release candidate before public release writes.""" + + dependencies: ReleaseCandidateValidationDependencies = field( + default_factory=default_release_candidate_validation_dependencies, + ) + required_artifact_families: tuple[str, ...] = ( + DEFAULT_REQUIRED_RELEASE_ARTIFACT_FAMILIES + ) + runner: ValidationRunner = field(default_factory=ValidationRunner) + + def validate( + self, + bundle: ReleaseCandidateInputBundle, + *, + files_with_paths: Sequence[ManifestFile] = (), + ) -> ValidationReport: + """Run Stage 5 candidate checks and return a canonical report.""" + + if not isinstance(bundle, ReleaseCandidateInputBundle): + raise TypeError("bundle must be a ReleaseCandidateInputBundle") + files = tuple(files_with_paths) + finalized_state = self._matching_finalized_release(bundle, files) + suite = self._validation_suite( + bundle=bundle, + files_with_paths=files, + finalized_state=finalized_state, + ) + return self.runner.run(suite, _validation_context(bundle, files)) + + def _matching_finalized_release( + self, + bundle: ReleaseCandidateInputBundle, + files_with_paths: Sequence[ManifestFile], + ) -> _FinalizedReleaseState: + if not files_with_paths: + return _FinalizedReleaseState() + try: + manifest = self.dependencies.get_matching_finalized_release_manifest( + 
files_with_paths=list(files_with_paths), + version=bundle.context.release_version, + hf_repo_name=bundle.context.hf_repo_name, + hf_repo_type=bundle.context.hf_repo_type, + model_package_name="policyengine-us", + ) + except Exception as exc: + return _FinalizedReleaseState(error=exc, checked=True) + return _FinalizedReleaseState(manifest=manifest, checked=True) + + def _validation_suite( + self, + *, + bundle: ReleaseCandidateInputBundle, + files_with_paths: Sequence[ManifestFile], + finalized_state: _FinalizedReleaseState, + ) -> ValidationSuite: + return ValidationSuite( + suite_id="release_candidate_validation", + stage_id=STAGE_5_VALIDATE_AND_PROMOTE_RELEASE, + substage_id=RELEASE_VALIDATION_SUBSTAGE_ID, + checks=( + ValidationCheck( + check_id="release_candidate_required_artifact_families", + stage_id=STAGE_5_VALIDATE_AND_PROMOTE_RELEASE, + substage_id=RELEASE_VALIDATION_SUBSTAGE_ID, + description="Required release artifact families are present.", + run=lambda context: self._check_required_artifact_families( + bundle, + ), + ), + ValidationCheck( + check_id="release_candidate_finalized_release_state", + stage_id=STAGE_5_VALIDATE_AND_PROMOTE_RELEASE, + substage_id=RELEASE_VALIDATION_SUBSTAGE_ID, + description="Already-finalized releases have a completion marker.", + run=lambda context: self._check_finalized_release_state( + bundle, + finalized_state, + ), + ), + ValidationCheck( + check_id="release_candidate_staged_artifacts_present", + stage_id=STAGE_5_VALIDATE_AND_PROMOTE_RELEASE, + substage_id=RELEASE_VALIDATION_SUBSTAGE_ID, + description="All candidate artifacts exist under the staging prefix.", + run=lambda context: self._check_staged_artifacts_present( + bundle, + finalized_state, + ), + ), + ValidationCheck( + check_id="release_candidate_validation_reports_present", + stage_id=STAGE_5_VALIDATE_AND_PROMOTE_RELEASE, + substage_id=RELEASE_VALIDATION_SUBSTAGE_ID, + description="Run-scoped validation reports exist before release completion.", + run=lambda 
context: self._check_validation_reports_present( + bundle, + finalized_state, + ), + ), + ValidationCheck( + check_id="release_candidate_release_manifest_preflight", + stage_id=STAGE_5_VALIDATE_AND_PROMOTE_RELEASE, + substage_id=RELEASE_VALIDATION_SUBSTAGE_ID, + description="Release manifest preflight can finalize local-area artifacts.", + run=lambda context: self._check_release_manifest_preflight( + bundle, + files_with_paths, + finalized_state, + ), + ), + ), + ) + + def _check_required_artifact_families( + self, + bundle: ReleaseCandidateInputBundle, + ) -> ValidationFinding: + check_id = "release_candidate_required_artifact_families" + family_counts = Counter( + artifact.artifact_family + for artifact in bundle.artifacts + if artifact.required + ) + missing_families = sorted( + family + for family in self.required_artifact_families + if family_counts.get(family, 0) < 1 + ) + if missing_families: + return _finding( + check_id, + "fail", + "Release candidate is missing required artifact families.", + metric="missing_required_artifact_families", + value=missing_families, + threshold=list(self.required_artifact_families), + artifact_family_counts=dict(sorted(family_counts.items())), + ) + return _finding( + check_id, + "pass", + "Release candidate includes required artifact families.", + metric="required_artifact_family_count", + value=len(self.required_artifact_families), + artifact_family_counts=dict(sorted(family_counts.items())), + ) + + def _check_finalized_release_state( + self, + bundle: ReleaseCandidateInputBundle, + finalized_state: _FinalizedReleaseState, + ) -> ValidationFinding: + check_id = "release_candidate_finalized_release_state" + if not finalized_state.checked: + return _finding( + check_id, + "fail", + "Finalized-release comparison requires local files with repo paths.", + metric="manifest_files", + value=0, + ) + if finalized_state.error is not None: + redacted_error = redacted_bounded_error_text( + str(finalized_state.error), + 
max_chars=DEFAULT_ERROR_MESSAGE_MAX_CHARS, + ) + return _finding( + check_id, + "fail", + "Could not compare the candidate against finalized releases.", + metric="finalized_release_lookup", + value=finalized_state.error.__class__.__name__, + exception_type=finalized_state.error.__class__.__name__, + exception_message=redacted_error.text, + exception_message_truncated=redacted_error.truncated, + ) + if finalized_state.manifest is None: + return _finding( + check_id, + "pass", + "Release is not already finalized with a matching manifest.", + metric="already_finalized", + value=False, + ) + marker_exists = self.dependencies.release_completion_marker_exists( + version=bundle.context.release_version, + hf_repo_name=bundle.context.hf_repo_name, + hf_repo_type=bundle.context.hf_repo_type, + ) + if not marker_exists: + return _finding( + check_id, + "fail", + "Matching finalized release is missing its completion marker.", + metric="release_completion_marker_exists", + value=False, + already_finalized=True, + ) + return _finding( + check_id, + "pass", + "Matching finalized release has a completion marker.", + metric="release_completion_marker_exists", + value=True, + already_finalized=True, + ) + + def _check_staged_artifacts_present( + self, + bundle: ReleaseCandidateInputBundle, + finalized_state: _FinalizedReleaseState, + ) -> ValidationFinding: + check_id = "release_candidate_staged_artifacts_present" + if _skip_side_effect_checks(finalized_state): + return _skipped_for_finalized_state(check_id, finalized_state) + missing_paths = sorted( + self.dependencies.list_missing_staged_artifacts( + _release_paths(bundle), + candidate_version=bundle.context.candidate_version, + hf_repo_name=bundle.context.hf_repo_name, + hf_repo_type=bundle.context.hf_repo_type, + run_id=bundle.context.run_id, + ) + ) + if missing_paths: + return _finding( + check_id, + "fail", + "Release candidate is missing staged artifacts.", + metric="missing_staged_artifacts", + value=missing_paths, + ) + 
return _finding( + check_id, + "pass", + "All release candidate artifacts are present in staging.", + metric="missing_staged_artifacts", + value=[], + ) + + def _check_validation_reports_present( + self, + bundle: ReleaseCandidateInputBundle, + finalized_state: _FinalizedReleaseState, + ) -> ValidationFinding: + check_id = "release_candidate_validation_reports_present" + if _skip_side_effect_checks(finalized_state): + return _skipped_for_finalized_state(check_id, finalized_state) + if not bundle.validation_report_paths: + return _finding( + check_id, + "fail", + "Release candidate does not include validation report paths.", + metric="validation_report_paths", + value=[], + ) + missing_paths = sorted( + self.dependencies.list_missing_validation_reports( + bundle.validation_report_paths, + context=bundle.context, + ) + ) + if missing_paths: + return _finding( + check_id, + "fail", + "Release candidate is missing validation reports.", + metric="missing_validation_reports", + value=missing_paths, + ) + return _finding( + check_id, + "pass", + "Release candidate validation reports are present.", + metric="validation_report_paths", + value=list(bundle.validation_report_paths), + ) + + def _check_release_manifest_preflight( + self, + bundle: ReleaseCandidateInputBundle, + files_with_paths: Sequence[ManifestFile], + finalized_state: _FinalizedReleaseState, + ) -> ValidationFinding: + check_id = "release_candidate_release_manifest_preflight" + if _skip_side_effect_checks(finalized_state): + return _skipped_for_finalized_state(check_id, finalized_state) + if not files_with_paths: + return _finding( + check_id, + "fail", + "Release manifest preflight requires local files with repo paths.", + metric="manifest_files", + value=0, + ) + should_finalize, missing_prefixes = ( + self.dependencies.preflight_release_manifest_publish( + list(files_with_paths), + version=bundle.context.release_version, + new_repo_paths=_release_paths(bundle), + hf_repo_name=bundle.context.hf_repo_name, 
+ hf_repo_type=bundle.context.hf_repo_type, + ) + ) + if not should_finalize: + return _finding( + check_id, + "fail", + "Release manifest preflight cannot finalize local-area artifacts.", + metric="missing_local_area_prefixes", + value=sorted(missing_prefixes), + ) + return _finding( + check_id, + "pass", + "Release manifest preflight can finalize the candidate.", + metric="missing_local_area_prefixes", + value=[], + ) + + +def _validation_context( + bundle: ReleaseCandidateInputBundle, + files_with_paths: Sequence[ManifestFile], +) -> ValidationContext: + return ValidationContext( + run_id=bundle.context.run_id, + stage_id=STAGE_5_VALIDATE_AND_PROMOTE_RELEASE, + substage_id=RELEASE_VALIDATION_SUBSTAGE_ID, + resolver=ValidationArtifactResolver( + artifacts={ + artifact.logical_name: artifact.to_artifact_ref( + uri_prefix=bundle.context.hf_staging_prefix, + ) + for artifact in bundle.artifacts + }, + ), + metadata={ + "candidate_version": bundle.context.candidate_version, + "release_version": bundle.context.release_version, + "hf_repo_name": bundle.context.hf_repo_name, + "hf_repo_type": bundle.context.hf_repo_type, + "release_candidate_fingerprint": bundle.release_candidate_fingerprint, + "manifest_file_count": len(files_with_paths), + }, + ) + + +def _release_paths(bundle: ReleaseCandidateInputBundle) -> tuple[str, ...]: + return tuple(artifact.relative_path for artifact in bundle.artifacts) + + +def _finding( + check_id: str, + status: str, + message: str, + *, + metric: str | None = None, + value: Any | None = None, + threshold: Any | None = None, + **metadata: Any, +) -> ValidationFinding: + return ValidationFinding( + check_id=check_id, + status=status, + message=message, + metric=metric, + value=value, + threshold=threshold, + metadata=metadata, + ) + + +def _skip_side_effect_checks(finalized_state: _FinalizedReleaseState) -> bool: + return finalized_state.error is not None or finalized_state.manifest is not None + + +def _skipped_for_finalized_state( + 
check_id: str, + finalized_state: _FinalizedReleaseState, +) -> ValidationFinding: + if finalized_state.error is not None: + return _finding( + check_id, + "pass", + "Check skipped because finalized-release comparison failed.", + metric="finalized_release_lookup", + value="failed", + ) + return _finding( + check_id, + "pass", + "Check skipped because the matching release is already finalized.", + metric="already_finalized", + value=True, + ) + + +def _list_missing_validation_reports_on_hf( + validation_report_paths: Sequence[str], + *, + context, +) -> list[str]: + import os + + from huggingface_hub import HfApi + + if not validation_report_paths: + return [f"calibration/runs/{context.run_id}/diagnostics/"] + token = os.environ.get("HUGGING_FACE_TOKEN") + repo_files = set( + HfApi().list_repo_files( + repo_id=context.hf_repo_name, + repo_type=context.hf_repo_type, + token=token, + ) + ) + return sorted(path for path in validation_report_paths if path not in repo_files) diff --git a/policyengine_us_data/utils/release_promotion.py b/policyengine_us_data/utils/release_promotion.py index def0b4cea..a0c1ff5e5 100644 --- a/policyengine_us_data/utils/release_promotion.py +++ b/policyengine_us_data/utils/release_promotion.py @@ -9,10 +9,14 @@ import logging from dataclasses import dataclass from pathlib import Path -from typing import Any, Callable, Sequence +from typing import TYPE_CHECKING, Any, Callable, Sequence +from policyengine_us_data.pipeline_metadata import pipeline_node from policyengine_us_data.utils.release_completion import release_completion_marker_path +if TYPE_CHECKING: + from policyengine_us_data.release_promotion import FullPromotionResult + ManifestFile = tuple[Path, str] ReleaseManifest = dict[str, Any] @@ -135,11 +139,36 @@ def promote_full_release( "hf_promoted": promoted_hf, "gcs_uploaded": uploaded_gcs, "release_manifest_artifacts": len(release_manifest["artifacts"]), + "version_manifest_updated": True, "release_completion_marker": 
completion_marker.get("marker_path"), "staging_cleaned": cleaned, + "staging_cleanup_attempted": config.cleanup_staging, } +@pipeline_node( + id="typed_full_release_promotion", + label="Typed Full Release Promotion", + node_type="library", + description="Compatibility wrapper that returns typed Stage 5 promotion results from the existing transaction engine.", + status="transitional", + stability="moving", + pathways=["5_validate_and_promote_release"], + artifacts_in=["staged release artifacts", "release manifest inputs"], + artifacts_out=["FullPromotionResult"], + validation_commands=["uv run pytest tests/unit/release_promotion/test_results.py"], +) +def promote_full_release_with_result( + config: FullReleasePromotionConfig, + deps: FullReleasePromotionDependencies, +) -> "FullPromotionResult": + """Run the existing transaction engine and wrap its output in a typed result.""" + + from policyengine_us_data.release_promotion import FullPromotionResult + + return FullPromotionResult.from_legacy_dict(promote_full_release(config, deps)) + + def _validated_release_paths( config: FullReleasePromotionConfig, deps: FullReleasePromotionDependencies, @@ -229,8 +258,10 @@ def _finish_already_finalized_release( "hf_promoted": 0, "gcs_uploaded": 0, "release_manifest_artifacts": len(finalized_manifest["artifacts"]), + "version_manifest_updated": False, "release_completion_marker": completion_marker_path, "staging_cleaned": cleaned, + "staging_cleanup_attempted": config.cleanup_staging, "already_finalized": True, } diff --git a/tests/unit/release_promotion/test_candidate.py b/tests/unit/release_promotion/test_candidate.py new file mode 100644 index 000000000..da60c824c --- /dev/null +++ b/tests/unit/release_promotion/test_candidate.py @@ -0,0 +1,1049 @@ +import json +from collections.abc import Mapping, Sequence +from typing import Any + +import pytest + +from policyengine_us_data.release_promotion import ( + ReleaseCandidateInputBundle, + ReleaseArtifactSpec, + 
ReleasePromotionContext, + build_legacy_release_candidate_bundle, + build_release_candidate_bundle_from_stage4_contract, + build_release_candidate_shape_report, + infer_release_artifact_spec, + normalize_release_path, + read_stage4_release_candidate_bundle, +) +from policyengine_us_data.stage_contracts import ( + ArtifactRef, + DiagnosticRef, + ExecutionRecord, + StageContract, + ValidationReport, + contract_to_json, +) +from policyengine_us_data.stage_contracts.fingerprints import fingerprint_material + + +def _context() -> ReleasePromotionContext: + return ReleasePromotionContext( + run_id="run-123", + candidate_version="1.73.0rc1", + release_version="1.73.0", + hf_repo_name="policyengine/policyengine-us-data", + gcs_bucket_name="policyengine-us-data", + base_release_version="1.72.0", + release_bump="minor", + ) + + +def _stage4_contract( + *, + fingerprint_marker: str = "default", + relative_path: str = "states/AL.h5", + run_id: str = "run-123", + execution_status: str = "completed", +) -> StageContract: + outputs = ( + ArtifactRef( + logical_name="state_al_h5", + uri="hf://policyengine/policyengine-us-data/staging/1.73.0rc1-run-123/states/AL.h5", + sha256="sha256:state-al", + size_bytes=12, + metadata={ + "relative_path": relative_path, + "artifact_family": "state_h5", + "source_stage_id": "4_build_outputs", + "area_type": "state", + "area_id": "AL", + }, + ), + ) + return StageContract( + contract_type="output_build", + stage_id="4_build_outputs", + run_id=run_id, + created_at="2026-05-18T12:00:00Z", + outputs=outputs, + fingerprint=fingerprint_material( + { + "stage_id": "4_build_outputs", + "outputs": [output.to_dict() for output in outputs], + "fingerprint_marker": fingerprint_marker, + } + ), + execution=ExecutionRecord(status=execution_status, reuse_decision="computed"), + ) + + +def _stage4_contract_with_outputs( + outputs: Sequence[ArtifactRef], + *, + diagnostics: Sequence[DiagnosticRef] = (), + validation: ValidationReport | None = None, + 
fingerprint_payload: Mapping[str, Any] | None = None, +) -> StageContract: + output_tuple = tuple(outputs) + return StageContract( + contract_type="output_build", + stage_id="4_build_outputs", + run_id="run-123", + created_at="2026-05-18T12:00:00Z", + outputs=output_tuple, + diagnostics=tuple(diagnostics), + validation=validation, + fingerprint=fingerprint_material( + fingerprint_payload + or {"outputs": [output.to_dict() for output in output_tuple]} + ), + execution=ExecutionRecord(status="completed", reuse_decision="computed"), + ) + + +def _inventory_record( + path: str, + *, + key: str = "path", + logical_name: str = "district_nc_01_h5", + artifact_family: str = "district_h5", + area_type: str = "district", + area_id: str = "NC-01", + sha256: str = "sha256:nc-01", + size_bytes: int = 42, + run_id: str = "run-123", +) -> dict: + return { + key: path, + "logical_name": logical_name, + "artifact_family": artifact_family, + "source_stage_id": "4_build_outputs", + "area_type": area_type, + "area_id": area_id, + "sha256": sha256, + "size_bytes": size_bytes, + "run_id": run_id, + "stage_id": "4_build_outputs", + } + + +def _legacy_identity_metadata() -> dict[str, dict]: + return { + "states/AL.h5": {"sha256": "sha256:state-al", "size_bytes": 12}, + "policy_data.db": {"sha256": "sha256:policy-db", "size_bytes": 24}, + } + + +def test_release_path_normalization_rejects_parent_paths() -> None: + assert normalize_release_path("./states//AL.h5") == "states/AL.h5" + + with pytest.raises(ValueError, match="parent traversal"): + normalize_release_path("../states/AL.h5") + with pytest.raises(ValueError, match="parent traversal"): + normalize_release_path("states/../release_manifest.json") + + +@pytest.mark.parametrize( + "path", + [ + "hf://repo/states/AL.h5", + "s3://bucket/states/AL.h5", + "/states/AL.h5", + "C:\\tmp\\AL.h5", + "..\\states\\AL.h5", + ], +) +def test_release_path_normalization_rejects_external_or_absolute_paths( + path, +) -> None: + with 
pytest.raises(ValueError): + normalize_release_path(path) + + +def test_release_artifact_spec_infers_area_and_source_stage() -> None: + state = infer_release_artifact_spec("states/AL.h5") + base = infer_release_artifact_spec("policy_data.db") + national = infer_release_artifact_spec("national/US.h5") + + assert state.artifact_family == "state_h5" + assert state.area_type == "state" + assert state.area_id == "AL" + assert state.source_stage_id == "4_build_outputs" + assert base.logical_name == "policy_data_db" + assert base.source_stage_id == "1_build_datasets" + assert national.area_type == "national" + + +def test_release_promotion_context_round_trips_with_staging_prefix() -> None: + context = _context() + + restored = ReleasePromotionContext.from_dict(context.to_dict()) + + assert restored == context + assert restored.candidate_scope == "1.73.0rc1" + assert restored.hf_staging_prefix == "staging/1.73.0rc1-run-123" + assert restored.schema_version + + +def test_release_promotion_context_serializes_canonical_identity() -> None: + context = ReleasePromotionContext( + run_id="Run ID", + candidate_version="Candidate Scope", + release_version="1.73.0rc1", + hf_repo_name="policyengine/policyengine-us-data", + gcs_bucket_name="policyengine-us-data", + base_release_version="1.72.0rc1", + release_bump="MINOR", + ) + + assert context.run_id == "run-id" + assert context.candidate_version == "Candidate-Scope" + assert context.release_version == "1.73.0" + assert context.base_release_version == "1.72.0" + assert context.release_bump == "minor" + assert context.hf_staging_prefix == "staging/Candidate-Scope-run-id" + + +def test_release_promotion_context_rejects_mismatched_staging_prefix() -> None: + with pytest.raises(ValueError, match="hf_staging_prefix"): + ReleasePromotionContext( + run_id="run-123", + candidate_version="1.73.0rc1", + release_version="1.73.0", + hf_repo_name="policyengine/policyengine-us-data", + gcs_bucket_name="policyengine-us-data", + 
hf_staging_prefix="staging/other-run", + ) + + +def test_legacy_candidate_bundle_dedupes_and_strips_staging_prefix() -> None: + bundle = build_legacy_release_candidate_bundle( + context=_context(), + rel_paths=[ + "states/AL.h5", + "staging/1.73.0rc1-run-123/national/US.h5", + "policy_data.db", + "states/AL.h5", + ], + validation_report_paths=[ + "calibration/runs/run-123/diagnostics/validation_report.json" + ], + ) + + assert isinstance(bundle, ReleaseCandidateInputBundle) + assert [artifact.relative_path for artifact in bundle.artifacts] == [ + "national/US.h5", + "policy_data.db", + "states/AL.h5", + ] + assert bundle.release_candidate_fingerprint is None + assert bundle.metadata["fingerprint_status"] == ( + "path_only_missing_artifact_identity" + ) + assert bundle.validation_report_paths == ( + "calibration/runs/run-123/diagnostics/validation_report.json", + ) + assert bundle.metadata["reader"] == "legacy_staged_paths" + + +def test_legacy_candidate_bundle_rejects_wrong_run_staging_prefix() -> None: + with pytest.raises(ValueError, match="expected staging prefix"): + build_legacy_release_candidate_bundle( + context=_context(), + rel_paths=["staging/other-run/states/AL.h5"], + ) + + +def test_legacy_candidate_fingerprint_tracks_semantic_material() -> None: + base = build_legacy_release_candidate_bundle( + context=_context(), + rel_paths=["states/AL.h5", "policy_data.db", "states/AL.h5"], + artifact_metadata_by_path=_legacy_identity_metadata(), + validation_report_paths=[ + "calibration/runs/run-123/diagnostics/validation_report.json" + ], + diagnostics_manifest_path="calibration/runs/run-123/diagnostics/manifest.json", + ) + reordered_duplicate = build_legacy_release_candidate_bundle( + context=_context(), + rel_paths=["policy_data.db", "states/AL.h5", "states/AL.h5"], + artifact_metadata_by_path=_legacy_identity_metadata(), + validation_report_paths=[ + "calibration/runs/run-123/diagnostics/validation_report.json" + ], + 
diagnostics_manifest_path="calibration/runs/run-123/diagnostics/manifest.json", + ) + changed_report = build_legacy_release_candidate_bundle( + context=_context(), + rel_paths=["states/AL.h5", "policy_data.db"], + artifact_metadata_by_path=_legacy_identity_metadata(), + validation_report_paths=[ + "calibration/runs/run-123/diagnostics/validation_summary.json" + ], + diagnostics_manifest_path="calibration/runs/run-123/diagnostics/manifest.json", + ) + changed_diagnostics = build_legacy_release_candidate_bundle( + context=_context(), + rel_paths=["states/AL.h5", "policy_data.db"], + artifact_metadata_by_path=_legacy_identity_metadata(), + validation_report_paths=[ + "calibration/runs/run-123/diagnostics/validation_report.json" + ], + diagnostics_manifest_path="calibration/runs/run-123/diagnostics/other.json", + ) + changed_artifacts = build_legacy_release_candidate_bundle( + context=_context(), + rel_paths=["states/AL.h5"], + artifact_metadata_by_path=_legacy_identity_metadata(), + validation_report_paths=[ + "calibration/runs/run-123/diagnostics/validation_report.json" + ], + diagnostics_manifest_path="calibration/runs/run-123/diagnostics/manifest.json", + ) + + assert [artifact.relative_path for artifact in reordered_duplicate.artifacts] == [ + "policy_data.db", + "states/AL.h5", + ] + assert reordered_duplicate.release_candidate_fingerprint == ( + base.release_candidate_fingerprint + ) + assert changed_report.release_candidate_fingerprint != ( + base.release_candidate_fingerprint + ) + assert changed_diagnostics.release_candidate_fingerprint != ( + base.release_candidate_fingerprint + ) + assert changed_artifacts.release_candidate_fingerprint != ( + base.release_candidate_fingerprint + ) + + +def test_candidate_fingerprint_excludes_arbitrary_metadata() -> None: + first = build_legacy_release_candidate_bundle( + context=ReleasePromotionContext( + run_id="run-123", + candidate_version="1.73.0rc1", + release_version="1.73.0", + 
hf_repo_name="policyengine/policyengine-us-data", + gcs_bucket_name="policyengine-us-data", + metadata={"attempt": 1}, + ), + rel_paths=["states/AL.h5"], + artifact_metadata_by_path={ + "states/AL.h5": { + "sha256": "sha256:state-al", + "size_bytes": 12, + "provenance": "first", + } + }, + ) + second = build_legacy_release_candidate_bundle( + context=ReleasePromotionContext( + run_id="run-123", + candidate_version="1.73.0rc1", + release_version="1.73.0", + hf_repo_name="policyengine/policyengine-us-data", + gcs_bucket_name="policyengine-us-data", + metadata={"attempt": 2}, + ), + rel_paths=["states/AL.h5"], + artifact_metadata_by_path={ + "states/AL.h5": { + "sha256": "sha256:state-al", + "size_bytes": 12, + "provenance": "second", + } + }, + ) + + assert first.release_candidate_fingerprint == second.release_candidate_fingerprint + + +def test_candidate_fingerprint_uses_normalized_paths() -> None: + base = build_legacy_release_candidate_bundle( + context=_context(), + rel_paths=["states/AL.h5"], + artifact_metadata_by_path={ + "states/AL.h5": _legacy_identity_metadata()["states/AL.h5"] + }, + validation_report_paths=[ + "./calibration/runs/run-123/diagnostics/validation_report.json" + ], + diagnostics_manifest_path="./calibration/runs/run-123/diagnostics/manifest.json", + ) + equivalent = build_legacy_release_candidate_bundle( + context=_context(), + rel_paths=["./states/AL.h5"], + artifact_metadata_by_path={ + "./states/AL.h5": _legacy_identity_metadata()["states/AL.h5"] + }, + validation_report_paths=[ + "calibration/runs/run-123/diagnostics/validation_report.json" + ], + diagnostics_manifest_path="calibration/runs/run-123/diagnostics/manifest.json", + ) + + assert ( + base.release_candidate_fingerprint == equivalent.release_candidate_fingerprint + ) + + +def test_stage4_candidate_reader_accepts_inventory_records() -> None: + bundle = build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=_stage4_contract(), + 
inventory_records=[ + _inventory_record("staging/1.73.0rc1-run-123/districts/NC-01.h5") + ], + source_output_contract_path="calibration/runs/run-123/output_build_contract.json", + ) + + assert [artifact.relative_path for artifact in bundle.artifacts] == [ + "districts/NC-01.h5", + "states/AL.h5", + ] + assert bundle.artifacts[0].artifact_family == "district_h5" + assert bundle.artifacts[0].sha256 == "sha256:nc-01" + assert bundle.source_output_contract_path == ( + "calibration/runs/run-123/output_build_contract.json" + ) + assert bundle.metadata["reader"] == "stage4_contract" + + +@pytest.mark.parametrize( + ("record", "expected_path"), + [ + ( + _inventory_record( + "national/US.h5", + key="expected_release_path", + logical_name="national_us_h5", + artifact_family="national_h5", + area_type="national", + area_id="US", + ), + "national/US.h5", + ), + ( + _inventory_record( + "staging/1.73.0rc1-run-123/national/US.h5", + key="staging_path", + logical_name="national_us_h5", + artifact_family="national_h5", + area_type="national", + area_id="US", + ), + "national/US.h5", + ), + ( + _inventory_record( + "national/US.h5", + key="output_relative_path", + logical_name="national_us_h5", + artifact_family="national_h5", + area_type="national", + area_id="US", + ), + "national/US.h5", + ), + ( + _inventory_record( + "states/AL.h5", + key="repo_path", + logical_name="state_al_h5", + artifact_family="state_h5", + area_type="state", + area_id="AL", + sha256="sha256:state-al", + size_bytes=12, + ), + "states/AL.h5", + ), + ( + { + "artifact": _inventory_record( + "districts/NC-01.h5", + key="output_relative_path", + ) + }, + "districts/NC-01.h5", + ), + ], +) +def test_stage4_candidate_reader_accepts_supported_inventory_path_shapes( + record, + expected_path, +) -> None: + bundle = build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=_stage4_contract(), + inventory_records=[record], + ) + + assert expected_path in {artifact.relative_path 
for artifact in bundle.artifacts} + assert "states/AL.h5" in {artifact.relative_path for artifact in bundle.artifacts} + + +def test_stage4_candidate_reader_requires_inventory_path_fields_to_agree() -> None: + matching_record = _inventory_record( + "national/US.h5", + key="expected_release_path", + logical_name="national_us_h5", + artifact_family="national_h5", + area_type="national", + area_id="US", + ) + matching_record["staging_path"] = "staging/1.73.0rc1-run-123/national/US.h5" + bundle = build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=_stage4_contract(), + inventory_records=[matching_record], + ) + + assert "national/US.h5" in {artifact.relative_path for artifact in bundle.artifacts} + + conflicting_record = { + **matching_record, + "staging_path": "staging/1.73.0rc1-run-123/cities/NYC.h5", + } + with pytest.raises(ValueError, match="path fields must agree"): + build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=_stage4_contract(), + inventory_records=[conflicting_record], + ) + + wrong_prefix_record = { + **matching_record, + "staging_path": "staging/other-run/national/US.h5", + } + with pytest.raises(ValueError, match="expected staging prefix"): + build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=_stage4_contract(), + inventory_records=[wrong_prefix_record], + ) + + +def test_stage4_candidate_reader_rejects_run_mismatches() -> None: + with pytest.raises(ValueError, match="run_id"): + build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=_stage4_contract(run_id="other-run"), + ) + + with pytest.raises(ValueError, match="run_id"): + build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=_stage4_contract(), + inventory_records=[_inventory_record("states/AL.h5", run_id="other-run")], + ) + + +def test_stage4_candidate_reader_rejects_stage_mismatches() -> None: 
+ with pytest.raises(ValueError, match="Stage 4"): + build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=StageContract( + contract_type="dataset_build_output", + stage_id="1_build_datasets", + run_id="run-123", + created_at="2026-05-18T12:00:00Z", + outputs=(), + fingerprint=fingerprint_material({"stage_id": "1_build_datasets"}), + execution=ExecutionRecord(status="completed"), + ), + ) + + with pytest.raises(ValueError, match="stage_id"): + build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=_stage4_contract(), + inventory_records=[ + { + **_inventory_record("states/AL.h5"), + "stage_id": "3_fit_weights", + } + ], + ) + + +def test_stage4_candidate_reader_rejects_incomplete_contracts() -> None: + for execution_status in ("pending", "running", "failed", "skipped"): + with pytest.raises(ValueError, match="execution.status"): + build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=_stage4_contract(execution_status=execution_status), + ) + + +@pytest.mark.parametrize( + "execution_status", ["completed", "reused", "partially_reused"] +) +def test_stage4_candidate_reader_accepts_release_safe_execution_statuses( + execution_status, +) -> None: + bundle = build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=_stage4_contract(execution_status=execution_status), + ) + + assert [artifact.relative_path for artifact in bundle.artifacts] == ["states/AL.h5"] + + +def test_stage4_candidate_reader_strips_or_rejects_staged_contract_paths() -> None: + bundle = build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=_stage4_contract( + relative_path="staging/1.73.0rc1-run-123/states/AL.h5" + ), + ) + + assert [artifact.relative_path for artifact in bundle.artifacts] == ["states/AL.h5"] + + with pytest.raises(ValueError, match="expected staging prefix"): + 
build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=_stage4_contract( + relative_path="staging/other-run/states/AL.h5" + ), + ) + + +def test_stage4_candidate_reader_rejects_duplicate_identity_conflicts() -> None: + with pytest.raises(ValueError, match="sha256"): + build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=_stage4_contract(), + inventory_records=[ + _inventory_record( + "states/AL.h5", + logical_name="state_al_h5", + artifact_family="state_h5", + area_type="state", + area_id="AL", + sha256="sha256:different", + size_bytes=12, + ) + ], + ) + + with pytest.raises(ValueError, match="sha256"): + build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=_stage4_contract(), + inventory_records=[ + _inventory_record("districts/NC-01.h5", sha256="sha256:first"), + _inventory_record("districts/NC-01.h5", sha256="sha256:second"), + ], + ) + + +def test_stage4_candidate_fingerprint_tracks_source_contract_fingerprint() -> None: + first = build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=_stage4_contract(fingerprint_marker="first"), + ) + second = build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=_stage4_contract(fingerprint_marker="second"), + ) + + assert first.release_candidate_fingerprint != second.release_candidate_fingerprint + + +def test_stage4_candidate_reader_falls_back_to_contract_output_paths() -> None: + bundle = build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=_stage4_contract(), + ) + + assert [artifact.relative_path for artifact in bundle.artifacts] == ["states/AL.h5"] + assert bundle.artifacts[0].sha256 == "sha256:state-al" + + +def test_stage4_candidate_reader_can_use_artifact_uri_without_path_metadata() -> None: + output = ArtifactRef( + logical_name="state_al_h5", + 
uri="hf://policyengine/policyengine-us-data/staging/1.73.0rc1-run-123/states/AL.h5", + sha256="sha256:state-al", + size_bytes=12, + ) + contract = _stage4_contract_with_outputs((output,)) + + bundle = build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=contract, + ) + + assert [artifact.relative_path for artifact in bundle.artifacts] == ["states/AL.h5"] + assert bundle.artifacts[0].artifact_family == "state_h5" + + +@pytest.mark.parametrize( + ("uri", "match"), + [ + ( + "hf://policyengine/policyengine-us-data/staging/1.73.0rc1-other-run/states/AL.h5", + "expected staging prefix", + ), + ( + "hf://policyengine/policyengine-us-data/states/AL.h5", + "expected staging prefix", + ), + ( + "hf://other/policyengine-us-data/staging/1.73.0rc1-run-123/states/AL.h5", + "hf_repo_name", + ), + ( + "hf://policyengine/policyengine-us-data/staging/1.73.0rc1-run-123/districts/NC-01.h5", + "metadata path must match", + ), + ], +) +def test_stage4_candidate_reader_validates_contract_artifact_uri_against_metadata( + uri, + match, +) -> None: + output = ArtifactRef( + logical_name="state_al_h5", + uri=uri, + sha256="sha256:state-al", + size_bytes=12, + metadata={ + "relative_path": "states/AL.h5", + "artifact_family": "state_h5", + "source_stage_id": "4_build_outputs", + "area_type": "state", + "area_id": "AL", + }, + ) + + with pytest.raises(ValueError, match=match): + build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=_stage4_contract_with_outputs((output,)), + ) + + +def test_stage4_candidate_reader_rejects_external_production_uris() -> None: + output = ArtifactRef( + logical_name="state_al_h5", + uri="hf://policyengine/policyengine-us-data/states/AL.h5", + sha256="sha256:state-al", + size_bytes=12, + ) + contract = _stage4_contract_with_outputs((output,)) + + with pytest.raises(ValueError, match="expected staging prefix"): + build_release_candidate_bundle_from_stage4_contract( + context=_context(), 
+ output_contract=contract, + ) + + +def test_stage4_candidate_reader_rejects_wrong_repo_staged_uris() -> None: + output = ArtifactRef( + logical_name="state_al_h5", + uri="hf://other/policyengine-us-data/staging/1.73.0rc1-run-123/states/AL.h5", + sha256="sha256:state-al", + size_bytes=12, + ) + contract = _stage4_contract_with_outputs((output,)) + + with pytest.raises(ValueError, match="hf_repo_name"): + build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=contract, + ) + + +def test_stage4_candidate_reader_rejects_uri_only_base_artifact() -> None: + output = ArtifactRef( + logical_name="policy_data_db", + uri="hf://policyengine/policyengine-us-data/policy_data.db", + sha256="sha256:policy-db", + size_bytes=24, + ) + + with pytest.raises(ValueError, match="expected staging prefix"): + build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=_stage4_contract_with_outputs((output,)), + ) + + +def test_stage4_candidate_bundle_can_read_contract_and_inventory_files( + tmp_path, +) -> None: + contract_path = tmp_path / "output_build_contract.json" + inventory_path = tmp_path / "output_inventory.jsonl" + contract_path.write_text(contract_to_json(_stage4_contract()), encoding="utf-8") + inventory_path.write_text( + json.dumps( + _inventory_record( + "cities/NYC.h5", + key="relative_path", + logical_name="city_nyc_h5", + artifact_family="city_h5", + area_type="city", + area_id="NYC", + sha256="sha256:nyc", + size_bytes=6, + ) + ) + + "\n", + encoding="utf-8", + ) + + bundle = read_stage4_release_candidate_bundle( + context=_context(), + output_contract_path=contract_path, + output_inventory_path=inventory_path, + source_output_contract_path="calibration/runs/run-123/output_build_contract.json", + ) + + assert bundle.source_output_contract_path == ( + "calibration/runs/run-123/output_build_contract.json" + ) + assert [artifact.relative_path for artifact in bundle.artifacts] == [ + "cities/NYC.h5", + 
"states/AL.h5", + ] + assert bundle.artifacts[0].area_type == "city" + + +def test_stage4_candidate_reader_uses_named_diagnostics_manifest() -> None: + diagnostics_manifest = ArtifactRef( + logical_name="diagnostics_manifest", + uri="hf://policyengine/policyengine-us-data/calibration/runs/run-123/diagnostics/manifest.json", + metadata={ + "relative_path": "calibration/runs/run-123/diagnostics/manifest.json" + }, + ) + other_diagnostic = ArtifactRef( + logical_name="worker_log", + uri="hf://policyengine/policyengine-us-data/calibration/runs/run-123/diagnostics/worker.log", + metadata={"relative_path": "calibration/runs/run-123/diagnostics/worker.log"}, + ) + contract = _stage4_contract_with_outputs( + _stage4_contract().outputs, + diagnostics=( + DiagnosticRef(name="worker_log", kind="log", artifact=other_diagnostic), + DiagnosticRef( + name="diagnostics_manifest", + kind="json", + artifact=diagnostics_manifest, + ), + ), + fingerprint_payload={"diagnostics": "manifest"}, + ) + + bundle = build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=contract, + ) + + assert bundle.diagnostics_manifest_path == ( + "calibration/runs/run-123/diagnostics/manifest.json" + ) + + +def test_stage4_candidate_reader_uses_uri_only_and_validation_diagnostics() -> None: + diagnostics_manifest = ArtifactRef( + logical_name="diagnostics_manifest", + uri="hf://policyengine/policyengine-us-data/calibration/runs/run-123/diagnostics/manifest.json", + ) + contract = _stage4_contract_with_outputs( + _stage4_contract().outputs, + validation=ValidationReport( + status="pass", + diagnostics=( + DiagnosticRef( + name="diagnostics_manifest", + kind="json", + artifact=diagnostics_manifest, + ), + ), + ), + fingerprint_payload={"diagnostics": "validation"}, + ) + + bundle = build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=contract, + ) + + assert bundle.diagnostics_manifest_path == ( + 
"calibration/runs/run-123/diagnostics/manifest.json" + ) + + +def test_stage4_candidate_reader_scopes_diagnostics_and_validation_paths() -> None: + with pytest.raises(ValueError, match="context.run_id"): + build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=_stage4_contract(), + diagnostics_manifest_path=( + "calibration/runs/other-run/diagnostics/manifest.json" + ), + ) + + with pytest.raises(ValueError, match="context.run_id"): + build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=_stage4_contract(), + validation_report_paths=( + "calibration/runs/other-run/diagnostics/validation_report.json", + ), + ) + + +def test_stage4_candidate_reader_scopes_source_contract_path() -> None: + with pytest.raises(ValueError, match="context.run_id"): + build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=_stage4_contract(), + source_output_contract_path=( + "calibration/runs/other-run/output_build_contract.json" + ), + ) + + +def test_stage4_candidate_reader_validates_diagnostics_manifest_uri() -> None: + diagnostics_manifest = ArtifactRef( + logical_name="diagnostics_manifest", + uri="hf://policyengine/policyengine-us-data/calibration/runs/run-123/diagnostics/other.json", + metadata={ + "relative_path": "calibration/runs/run-123/diagnostics/manifest.json" + }, + ) + contract = _stage4_contract_with_outputs( + _stage4_contract().outputs, + diagnostics=( + DiagnosticRef( + name="diagnostics_manifest", + kind="json", + artifact=diagnostics_manifest, + ), + ), + fingerprint_payload={"diagnostics": "manifest"}, + ) + + with pytest.raises(ValueError, match="metadata path must match"): + build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=contract, + ) + + +def test_stage4_candidate_fingerprint_tracks_diagnostics_manifest_identity() -> None: + def contract_with_manifest_sha(sha256: str) -> StageContract: + 
diagnostics_manifest = ArtifactRef( + logical_name="diagnostics_manifest", + uri="hf://policyengine/policyengine-us-data/calibration/runs/run-123/diagnostics/manifest.json", + sha256=sha256, + size_bytes=100, + metadata={ + "relative_path": "calibration/runs/run-123/diagnostics/manifest.json" + }, + ) + return _stage4_contract_with_outputs( + _stage4_contract().outputs, + diagnostics=( + DiagnosticRef( + name="diagnostics_manifest", + kind="json", + artifact=diagnostics_manifest, + ), + ), + fingerprint_payload={"stage4": "same"}, + ) + + first = build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=contract_with_manifest_sha("sha256:first"), + ) + second = build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=contract_with_manifest_sha("sha256:second"), + ) + + assert first.release_candidate_fingerprint != second.release_candidate_fingerprint + + +def test_stage4_candidate_reader_keeps_diagnostics_out_of_release_artifacts() -> None: + diagnostics_output = ArtifactRef( + logical_name="diagnostics_manifest", + uri="hf://policyengine/policyengine-us-data/calibration/runs/run-123/diagnostics/manifest.json", + metadata={ + "relative_path": "calibration/runs/run-123/diagnostics/manifest.json", + "artifact_family": "diagnostics", + "source_stage_id": "4_build_outputs", + }, + ) + base_contract = _stage4_contract() + contract = _stage4_contract_with_outputs( + (*base_contract.outputs, diagnostics_output), + fingerprint_payload={"outputs": "with_diagnostics"}, + ) + + bundle = build_release_candidate_bundle_from_stage4_contract( + context=_context(), + output_contract=contract, + ) + + assert [artifact.relative_path for artifact in bundle.artifacts] == ["states/AL.h5"] + assert bundle.diagnostics_manifest_path == ( + "calibration/runs/run-123/diagnostics/manifest.json" + ) + + +def test_release_candidate_bundle_round_trips_through_dict_and_json() -> None: + artifact = infer_release_artifact_spec( 
+ "states/AL.h5", + sha256="sha256:state-al", + size_bytes=12, + metadata={"source": "fixture"}, + ) + bundle = ReleaseCandidateInputBundle( + context=_context(), + artifacts=(artifact,), + source_output_contract_path="calibration/runs/run-123/output_build_contract.json", + release_candidate_fingerprint="sha256:fixture", + validation_report_paths=( + "calibration/runs/run-123/diagnostics/validation_report.json", + ), + diagnostics_manifest_path="calibration/runs/run-123/diagnostics/manifest.json", + metadata={"reader": "fixture"}, + ) + + payload = bundle.to_dict() + restored = ReleaseCandidateInputBundle.from_dict(json.loads(json.dumps(payload))) + + assert payload["bundle_type"] == "release_candidate_input_bundle" + assert payload["stage_id"] == "5_validate_and_promote_release" + assert payload["schema_version"] + assert ReleaseArtifactSpec.from_dict(artifact.to_dict()) == artifact + assert restored.to_dict() == payload + + +def test_release_candidate_shape_report_uses_canonical_validation_schema() -> None: + bundle = build_legacy_release_candidate_bundle( + context=_context(), + rel_paths=["states/AL.h5"], + ) + + report = build_release_candidate_shape_report(bundle) + + assert isinstance(report, ValidationReport) + assert report.status == "pass" + assert [finding.check_id for finding in report.findings] == [ + "release_candidate_identity_declared", + "release_candidate_artifacts_declared", + ] + assert report.metadata["stage_id"] == "5_validate_and_promote_release" + assert report.metadata["release_candidate_fingerprint"] == ( + bundle.release_candidate_fingerprint + ) diff --git a/tests/unit/release_promotion/test_contract.py b/tests/unit/release_promotion/test_contract.py new file mode 100644 index 000000000..6222e160c --- /dev/null +++ b/tests/unit/release_promotion/test_contract.py @@ -0,0 +1,263 @@ +import json + +import pytest + +from policyengine_us_data.release_promotion import ( + RELEASE_PROMOTION_CONTRACT_FILENAME, + RELEASE_PROMOTION_CONTRACT_TYPE, 
+ FullPromotionResult, + ReleasePromotionContext, + build_legacy_release_candidate_bundle, + build_release_promotion_contract, + published_artifact_index_artifact_ref, + promoted_runs_index_artifact_ref, + PromotedRunsIndexUpdate, + release_promotion_contract_path, + release_promotion_contract_repo_path, + write_release_promotion_contract, +) +from policyengine_us_data.stage_contracts import ( + ArtifactRef, + DiagnosticRef, + StageContract, + ValidationFinding, + ValidationReport, + contract_to_json, + read_contract, +) + + +def _context() -> ReleasePromotionContext: + return ReleasePromotionContext( + run_id="run-123", + candidate_version="1.73.0rc1", + release_version="1.73.0", + hf_repo_name="policyengine/policyengine-us-data", + gcs_bucket_name="policyengine-us-data", + base_release_version="1.72.0", + release_bump="minor", + modal_app_name="us-data-run-123", + modal_environment="main", + ) + + +def _candidate_bundle(): + return build_legacy_release_candidate_bundle( + context=_context(), + rel_paths=["states/AL.h5", "policy_data.db"], + artifact_metadata_by_path={ + "states/AL.h5": {"sha256": "sha256:state-al", "size_bytes": 12}, + "policy_data.db": {"sha256": "sha256:policy-db", "size_bytes": 24}, + }, + validation_report_paths=[ + "calibration/runs/run-123/diagnostics/validation_report.json" + ], + diagnostics_manifest_path="calibration/runs/run-123/diagnostics/manifest.json", + source_output_contract_path=( + "calibration/runs/run-123/diagnostics/contracts/output_build_contract.json" + ), + ) + + +def _promotion_result(*, already_finalized: bool = False) -> FullPromotionResult: + return FullPromotionResult.from_legacy_dict( + { + "run_id": "run-123", + "candidate_version": "1.73.0rc1", + "release_version": "1.73.0", + "artifact_count": 2, + "hf_promoted": 0 if already_finalized else 2, + "gcs_uploaded": 0 if already_finalized else 2, + "release_manifest_artifacts": 2, + "version_manifest_updated": not already_finalized, + "release_completion_marker": 
"releases/1.73.0/release-complete.json", + "staging_cleaned": 3, + "staging_cleanup_attempted": True, + "already_finalized": already_finalized, + } + ) + + +def _validation_report() -> ValidationReport: + diagnostic = DiagnosticRef( + name="validation_report", + kind="json", + artifact=ArtifactRef( + logical_name="validation_report", + uri=( + "hf://policyengine/policyengine-us-data/calibration/runs/" + "run-123/diagnostics/validation_report.json" + ), + media_type="application/json", + ), + ) + return ValidationReport( + status="pass", + findings=( + ValidationFinding( + check_id="release_candidate_identity_declared", + status="pass", + message="candidate identity is declared", + ), + ), + diagnostics=(diagnostic,), + metadata={"stage_id": "5_validate_and_promote_release"}, + ) + + +def _promoted_runs_index_update() -> PromotedRunsIndexUpdate: + return PromotedRunsIndexUpdate( + status="created", + run_id="run-123", + release_version="1.73.0", + run_count=1, + release_version_run_count=1, + already_finalized=False, + updated_at="2026-05-18T12:00:00+00:00", + ) + + +def test_release_promotion_contract_records_candidate_and_public_refs() -> None: + promoted_runs_update = _promoted_runs_index_update() + contract = build_release_promotion_contract( + candidate_bundle=_candidate_bundle(), + promotion_result=_promotion_result(), + created_at="2026-05-18T12:00:00+00:00", + code_sha="abc123", + package_version="1.73.0", + validation=_validation_report(), + published_artifact_index=published_artifact_index_artifact_ref( + _context(), + row_count=9, + sha256="sha256:index", + size_bytes=123, + ), + promoted_runs_index=promoted_runs_index_artifact_ref( + _context(), + promoted_runs_update, + sha256="sha256:runs-index", + size_bytes=456, + ), + promoted_runs_index_update=promoted_runs_update.to_dict(), + metadata={"writer": "test"}, + ) + + input_names = {artifact.logical_name for artifact in contract.inputs} + output_names = {artifact.logical_name for artifact in 
contract.outputs} + + assert contract.contract_type == RELEASE_PROMOTION_CONTRACT_TYPE + assert contract.stage_id == "5_validate_and_promote_release" + assert contract.run_id == "run-123" + assert "stage4_output_contract" in input_names + assert "validation_report_1" in input_names + assert "diagnostics_manifest" in input_names + assert output_names == { + "huggingface_release_artifacts", + "gcs_release_artifacts", + "release_manifest", + "versioned_release_manifest", + "trace_tro", + "versioned_trace_tro", + "version_manifest", + "release_completion_marker", + "published_artifact_index", + "promoted_runs_index", + } + assert contract.execution.status == "completed" + assert contract.execution.reuse_decision == "computed" + assert contract.execution.reuse_summary.expected_outputs == 2 + assert contract.parameters["release_candidate_fingerprint"] + assert contract.parameters["source_output_contract_path"] == ( + "calibration/runs/run-123/diagnostics/contracts/output_build_contract.json" + ) + assert contract.parameters["published_artifact_index_path"] == ( + "calibration/runs/run-123/diagnostics/published_artifact_index.jsonl" + ) + assert contract.parameters["promoted_runs_index_path"] == ( + "calibration/runs/index.json" + ) + assert contract.metadata["contract_file"] == RELEASE_PROMOTION_CONTRACT_FILENAME + assert contract.metadata["already_finalized"] is False + assert contract.metadata["cleanup"]["cleaned_count"] == 3 + assert contract.metadata["published_artifact_index"]["metadata"]["row_count"] == 9 + assert ( + contract.metadata["promoted_runs_index"]["metadata"]["update_status"] + == "created" + ) + assert contract.metadata["promoted_runs_index_update"]["run_count"] == 1 + assert contract.metadata["public_refs"]["release_manifest"] == ( + "hf://policyengine/policyengine-us-data/release_manifest.json" + ) + assert contract.metadata["public_refs"]["published_artifact_index"].endswith( + "published_artifact_index.jsonl" + ) + assert 
contract.metadata["public_refs"]["promoted_runs_index"].endswith( + "calibration/runs/index.json" + ) + assert [substage.substage_id for substage in contract.substages] == [ + "5a_validate_outputs", + "5b_promote_huggingface", + "5c_promote_gcs", + "5d_write_version_manifest", + ] + assert StageContract.from_dict(json.loads(contract_to_json(contract))) == contract + + +def test_release_promotion_contract_records_already_finalized_reuse() -> None: + contract = build_release_promotion_contract( + candidate_bundle=_candidate_bundle(), + promotion_result=_promotion_result(already_finalized=True), + created_at="2026-05-18T12:00:00+00:00", + ) + + assert contract.execution.reuse_decision == "reused" + assert contract.execution.reuse_reason == "already_finalized" + assert contract.execution.reuse_summary.valid_reused_outputs == 2 + assert contract.execution.reuse_summary.recomputed_outputs == 0 + + +def test_write_release_promotion_contract_writes_run_diagnostics_path(tmp_path) -> None: + contract_path = release_promotion_contract_path(tmp_path / "run-123") + + written = write_release_promotion_contract( + contract_path=contract_path, + candidate_bundle=_candidate_bundle(), + promotion_result=_promotion_result(), + created_at="2026-05-18T12:00:00+00:00", + ) + + assert contract_path == ( + tmp_path + / "run-123" + / "diagnostics" + / "contracts" + / "release_promotion_contract.json" + ) + assert read_contract(contract_path) == written + assert release_promotion_contract_repo_path("run-123") == ( + "calibration/runs/run-123/diagnostics/contracts/release_promotion_contract.json" + ) + + +def test_release_promotion_contract_rejects_mismatched_result_identity() -> None: + result = FullPromotionResult.from_legacy_dict( + { + "run_id": "other-run", + "candidate_version": "1.73.0rc1", + "release_version": "1.73.0", + "artifact_count": 2, + "hf_promoted": 2, + "gcs_uploaded": 2, + "release_manifest_artifacts": 2, + "version_manifest_updated": True, + "release_completion_marker": 
"releases/1.73.0/release-complete.json", + "staging_cleaned": 3, + } + ) + + with pytest.raises(ValueError, match="run_id"): + build_release_promotion_contract( + candidate_bundle=_candidate_bundle(), + promotion_result=result, + created_at="2026-05-18T12:00:00+00:00", + ) diff --git a/tests/unit/release_promotion/test_promoted_runs_index.py b/tests/unit/release_promotion/test_promoted_runs_index.py new file mode 100644 index 000000000..a95cedcc4 --- /dev/null +++ b/tests/unit/release_promotion/test_promoted_runs_index.py @@ -0,0 +1,218 @@ +import pytest + +from policyengine_us_data.release_promotion import ( + FullPromotionResult, + ReleasePromotionContext, + build_promoted_run_index_entry, + load_promoted_runs_index, + promoted_runs_index_artifact_ref, + promoted_runs_index_from_json, + promoted_runs_index_path, + promoted_runs_index_repo_path, + promoted_runs_index_to_json, + read_promoted_runs_index, + update_promoted_runs_index, +) + + +def _context( + *, + run_id: str = "run-123", + candidate_version: str = "1.73.0rc1", + release_version: str = "1.73.0", +) -> ReleasePromotionContext: + return ReleasePromotionContext( + run_id=run_id, + candidate_version=candidate_version, + release_version=release_version, + hf_repo_name="policyengine/policyengine-us-data", + gcs_bucket_name="policyengine-us-data", + base_release_version="1.72.0", + release_bump="minor", + ) + + +def _promotion_result( + *, + run_id: str = "run-123", + candidate_version: str = "1.73.0rc1", + release_version: str = "1.73.0", + already_finalized: bool = False, +) -> FullPromotionResult: + return FullPromotionResult.from_legacy_dict( + { + "run_id": run_id, + "candidate_version": candidate_version, + "release_version": release_version, + "artifact_count": 2, + "hf_promoted": 0 if already_finalized else 2, + "gcs_uploaded": 0 if already_finalized else 2, + "release_manifest_artifacts": 2, + "version_manifest_updated": not already_finalized, + "release_completion_marker": 
"releases/1.73.0/release-complete.json", + "staging_cleaned": 3, + "staging_cleanup_attempted": True, + "already_finalized": already_finalized, + } + ) + + +def _entry( + *, + run_id: str = "run-123", + candidate_version: str = "1.73.0rc1", + release_version: str = "1.73.0", + already_finalized: bool = False, + promoted_at: str = "2026-05-20T12:00:00+00:00", +): + return build_promoted_run_index_entry( + context=_context( + run_id=run_id, + candidate_version=candidate_version, + release_version=release_version, + ), + promotion_result=_promotion_result( + run_id=run_id, + candidate_version=candidate_version, + release_version=release_version, + already_finalized=already_finalized, + ), + promoted_at=promoted_at, + release_promotion_contract_path=( + f"calibration/runs/{run_id}/diagnostics/contracts/" + "release_promotion_contract.json" + ), + published_artifact_index_path=( + f"calibration/runs/{run_id}/diagnostics/published_artifact_index.jsonl" + ), + run_manifest_path=f"calibration/runs/{run_id}/run_manifest.json", + step_manifest_path=( + f"calibration/runs/{run_id}/steps/5_validate_and_promote_release.json" + ), + metadata={"branch": "main", "package_version": release_version}, + ) + + +def test_promoted_runs_index_creates_run_oriented_discovery_file(tmp_path) -> None: + path = promoted_runs_index_path(tmp_path / "runs") + entry = _entry() + + index, update = update_promoted_runs_index( + path=path, + entry=entry, + updated_at="2026-05-20T12:00:01+00:00", + ) + + assert update.status == "created" + assert update.run_count == 1 + assert update.release_version_run_count == 1 + assert index.runs["run-123"].run_id == "run-123" + assert index.runs["run-123"].status == "promoted" + assert index.release_versions["1.73.0"].latest_run_id == "run-123" + assert index.release_versions["1.73.0"].run_ids == ("run-123",) + assert read_promoted_runs_index(path) == index + assert promoted_runs_index_from_json(promoted_runs_index_to_json(index)) == index + assert 
promoted_runs_index_repo_path() == "calibration/runs/index.json" + + +def test_promoted_runs_index_updates_same_run_without_duplicates(tmp_path) -> None: + path = promoted_runs_index_path(tmp_path / "runs") + update_promoted_runs_index( + path=path, + entry=_entry(), + updated_at="2026-05-20T12:00:01+00:00", + ) + + index, update = update_promoted_runs_index( + path=path, + entry=_entry( + already_finalized=True, + promoted_at="2026-05-20T12:05:00+00:00", + ), + updated_at="2026-05-20T12:05:01+00:00", + ) + + assert update.status == "updated" + assert update.already_finalized is True + assert len(index.runs) == 1 + assert index.runs["run-123"].already_finalized is True + assert index.release_versions["1.73.0"].run_ids == ("run-123",) + + +def test_promoted_runs_index_tracks_duplicate_release_versions_once_per_run( + tmp_path, +) -> None: + path = promoted_runs_index_path(tmp_path / "runs") + update_promoted_runs_index( + path=path, + entry=_entry(), + updated_at="2026-05-20T12:00:01+00:00", + ) + update_promoted_runs_index( + path=path, + entry=_entry( + run_id="run-456", + candidate_version="1.73.0rc2", + promoted_at="2026-05-20T13:00:00+00:00", + ), + updated_at="2026-05-20T13:00:01+00:00", + ) + + index, update = update_promoted_runs_index( + path=path, + entry=_entry( + run_id="run-456", + candidate_version="1.73.0rc2", + promoted_at="2026-05-20T13:05:00+00:00", + ), + updated_at="2026-05-20T13:05:01+00:00", + ) + + release = index.release_versions["1.73.0"] + assert update.status == "updated" + assert len(index.runs) == 2 + assert release.latest_run_id == "run-456" + assert release.run_ids == ("run-123", "run-456") + assert update.release_version_run_count == 2 + + +def test_promoted_runs_index_artifact_ref_records_update_status(tmp_path) -> None: + index, update = update_promoted_runs_index( + path=promoted_runs_index_path(tmp_path / "runs"), + entry=_entry(), + updated_at="2026-05-20T12:00:01+00:00", + ) + + artifact = promoted_runs_index_artifact_ref( + 
_context(), + update, + sha256="sha256:index", + size_bytes=123, + ) + + assert len(index.runs) == 1 + assert artifact.logical_name == "promoted_runs_index" + assert artifact.media_type == "application/json" + assert artifact.metadata["update_status"] == "created" + assert artifact.metadata["relative_path"] == "calibration/runs/index.json" + + +def test_load_promoted_runs_index_returns_empty_index_for_missing_file( + tmp_path, +) -> None: + index = load_promoted_runs_index( + promoted_runs_index_path(tmp_path / "runs"), + updated_at="2026-05-20T12:00:00+00:00", + ) + + assert index.runs == {} + assert index.release_versions == {} + + +def test_promoted_run_entry_rejects_mismatched_result_identity() -> None: + with pytest.raises(ValueError, match="run_id"): + build_promoted_run_index_entry( + context=_context(), + promotion_result=_promotion_result(run_id="other-run"), + promoted_at="2026-05-20T12:00:00+00:00", + ) diff --git a/tests/unit/release_promotion/test_published_index.py b/tests/unit/release_promotion/test_published_index.py new file mode 100644 index 000000000..28620fb8f --- /dev/null +++ b/tests/unit/release_promotion/test_published_index.py @@ -0,0 +1,229 @@ +import json + +import pytest + +from policyengine_us_data.release_promotion import ( + FullPromotionResult, + ReleasePromotionContext, + build_legacy_release_candidate_bundle, + build_published_artifact_index, + published_artifact_index_artifact_ref, + published_artifact_index_from_jsonl, + published_artifact_index_path, + published_artifact_index_repo_path, + published_artifact_index_to_jsonl, + read_published_artifact_index, + write_published_artifact_index, +) +from policyengine_us_data.stage_contracts import ArtifactRef + + +def _context() -> ReleasePromotionContext: + return ReleasePromotionContext( + run_id="run-123", + candidate_version="1.73.0rc1", + release_version="1.73.0", + hf_repo_name="policyengine/policyengine-us-data", + gcs_bucket_name="policyengine-us-data", + 
base_release_version="1.72.0", + release_bump="minor", + ) + + +def _rel_paths() -> list[str]: + return [ + "policy_data.db", + "states/AL.h5", + "districts/NC-01.h5", + "cities/NYC.h5", + "national/US.h5", + ] + + +def _candidate_bundle(): + return build_legacy_release_candidate_bundle( + context=_context(), + rel_paths=_rel_paths(), + artifact_metadata_by_path={ + path: {"sha256": f"sha256:candidate-{index}", "size_bytes": index} + for index, path in enumerate(_rel_paths(), start=1) + }, + ) + + +def _promotion_result() -> FullPromotionResult: + return FullPromotionResult.from_legacy_dict( + { + "run_id": "run-123", + "candidate_version": "1.73.0rc1", + "release_version": "1.73.0", + "artifact_count": len(_rel_paths()), + "hf_promoted": len(_rel_paths()), + "gcs_uploaded": len(_rel_paths()), + "release_manifest_artifacts": len(_rel_paths()), + "version_manifest_updated": True, + "release_completion_marker": "releases/1.73.0/release-complete.json", + "staging_cleaned": len(_rel_paths()) + 1, + } + ) + + +def _release_manifest() -> dict: + return { + "artifacts": { + path.removesuffix(".h5").removesuffix(".db"): { + "kind": "database" if path.endswith(".db") else "microdata", + "path": path, + "repo_id": "policyengine/policyengine-us-data", + "revision": "1.73.0", + "sha256": f"manifest-{index}", + "size_bytes": 100 + index, + } + for index, path in enumerate(_rel_paths(), start=1) + } + } + + +def _diagnostic_artifact() -> ArtifactRef: + return ArtifactRef( + logical_name="release_promotion_contract", + uri=( + "hf://policyengine/policyengine-us-data/calibration/runs/" + "run-123/diagnostics/contracts/release_promotion_contract.json" + ), + media_type="application/json", + metadata={ + "artifact_family": "stage_contract", + "source_stage_id": "5_validate_and_promote_release", + "relative_path": ( + "calibration/runs/run-123/diagnostics/contracts/" + "release_promotion_contract.json" + ), + }, + ) + + +def _rows(): + return build_published_artifact_index( + 
candidate_bundle=_candidate_bundle(), + promotion_result=_promotion_result(), + release_manifest=_release_manifest(), + diagnostic_artifacts=(_diagnostic_artifact(),), + ) + + +def _by_path(rows): + return {row.relative_path: row for row in rows} + + +def test_published_artifact_index_records_release_artifact_destinations() -> None: + rows = _by_path(_rows()) + + state = rows["states/AL.h5"] + district = rows["districts/NC-01.h5"] + city = rows["cities/NYC.h5"] + national = rows["national/US.h5"] + base = rows["policy_data.db"] + + assert state.artifact_family == "state_h5" + assert state.area_type == "state" + assert state.area_id == "AL" + assert state.hf_uri == "hf://policyengine/policyengine-us-data/states/AL.h5" + assert state.gcs_uri == "gs://policyengine-us-data/states/AL.h5" + assert state.release_manifest_key == "states/AL" + assert state.release_manifest_revision == "1.73.0" + assert state.sha256 == "manifest-2" + assert district.artifact_family == "district_h5" + assert district.area_id == "NC-01" + assert city.artifact_family == "city_h5" + assert city.area_id == "NYC" + assert national.artifact_family == "national_h5" + assert national.area_type == "national" + assert base.artifact_family == "base_dataset" + assert base.source_stage_id == "1_build_datasets" + assert base.gcs_uri == "gs://policyengine-us-data/policy_data.db" + + +def test_published_artifact_index_records_manifest_and_diagnostic_rows() -> None: + rows = _by_path(_rows()) + + release_manifest = rows["release_manifest.json"] + version_manifest = rows["version_manifest.json"] + completion_marker = rows["releases/1.73.0/release-complete.json"] + diagnostic = rows[ + "calibration/runs/run-123/diagnostics/contracts/release_promotion_contract.json" + ] + + assert release_manifest.artifact_role == "release_metadata" + assert release_manifest.artifact_family == "release_manifest" + assert release_manifest.gcs_uri is None + assert version_manifest.artifact_family == "version_manifest" + assert 
completion_marker.artifact_family == "release_completion_marker" + assert diagnostic.artifact_role == "diagnostic" + assert diagnostic.artifact_family == "stage_contract" + assert diagnostic.hf_uri.endswith("release_promotion_contract.json") + + +def test_published_artifact_index_jsonl_round_trips_deterministically() -> None: + rows = _rows() + + payload = published_artifact_index_to_jsonl(rows) + restored = published_artifact_index_from_jsonl(payload) + + assert payload.endswith("\n") + assert len(payload.splitlines()) == len(rows) + assert restored == tuple(rows) + assert published_artifact_index_to_jsonl(restored) == payload + + +def test_write_published_artifact_index_writes_explicit_path(tmp_path) -> None: + path = published_artifact_index_path(tmp_path / "run-123") + rows = _rows() + + written = write_published_artifact_index(rows, path) + + assert written == tuple(rows) + assert read_published_artifact_index(path) == tuple(rows) + assert json.loads(path.read_text(encoding="utf-8").splitlines()[0]) + assert published_artifact_index_repo_path("run-123") == ( + "calibration/runs/run-123/diagnostics/published_artifact_index.jsonl" + ) + + +def test_published_artifact_index_artifact_ref_records_row_count() -> None: + artifact = published_artifact_index_artifact_ref( + _context(), + row_count=12, + sha256="sha256:index", + size_bytes=123, + ) + + assert artifact.logical_name == "published_artifact_index" + assert artifact.media_type == "application/jsonl" + assert artifact.metadata["row_count"] == 12 + assert artifact.metadata["relative_path"] == ( + "calibration/runs/run-123/diagnostics/published_artifact_index.jsonl" + ) + + +def test_published_artifact_index_rejects_mismatched_result_identity() -> None: + result = FullPromotionResult.from_legacy_dict( + { + "run_id": "other-run", + "candidate_version": "1.73.0rc1", + "release_version": "1.73.0", + "artifact_count": len(_rel_paths()), + "hf_promoted": len(_rel_paths()), + "gcs_uploaded": len(_rel_paths()), + 
"release_manifest_artifacts": len(_rel_paths()), + "version_manifest_updated": True, + "release_completion_marker": "releases/1.73.0/release-complete.json", + "staging_cleaned": len(_rel_paths()) + 1, + } + ) + + with pytest.raises(ValueError, match="run_id"): + build_published_artifact_index( + candidate_bundle=_candidate_bundle(), + promotion_result=result, + ) diff --git a/tests/unit/release_promotion/test_results.py b/tests/unit/release_promotion/test_results.py new file mode 100644 index 000000000..820504ea7 --- /dev/null +++ b/tests/unit/release_promotion/test_results.py @@ -0,0 +1,269 @@ +from pathlib import Path + +import pytest + +from policyengine_us_data.release_promotion import FullPromotionResult +from policyengine_us_data.utils.release_promotion import ( + FullReleasePromotionConfig, + FullReleasePromotionDependencies, + promote_full_release_with_result, +) + + +def _make_files(tmp_path, rel_paths): + files = [] + for rel_path in rel_paths: + path = tmp_path / rel_path + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(rel_path, encoding="utf-8") + files.append((path, rel_path)) + return tuple(files) + + +class FakeFullReleasePromotionDependencies: + def __init__( + self, + *, + finalized_manifest=None, + marker_exists=True, + missing_staged_artifacts=(), + preflight_result=(True, []), + cleanup_error: Exception | None = None, + ) -> None: + self.finalized_manifest = finalized_manifest + self.marker_exists = marker_exists + self.missing_staged_artifacts = tuple(missing_staged_artifacts) + self.preflight_result = preflight_result + self.cleanup_error = cleanup_error + self.calls = [] + + def as_dependencies(self) -> FullReleasePromotionDependencies: + return FullReleasePromotionDependencies( + dedupe_preserving_order=self.dedupe_preserving_order, + download_staged_artifacts_for_manifest=( + self.download_staged_artifacts_for_manifest + ), + get_matching_finalized_release_manifest=( + self.get_matching_finalized_release_manifest + ), + 
list_missing_staged_artifacts=self.list_missing_staged_artifacts, + preflight_release_manifest_publish=self.preflight_release_manifest_publish, + promote_staging_to_production_hf=self.promote_staging_to_production_hf, + upload_from_hf_staging_to_gcs=self.upload_from_hf_staging_to_gcs, + publish_release_manifest_to_hf=self.publish_release_manifest_to_hf, + upload_final_version_manifest=self.upload_final_version_manifest, + upload_release_completion_marker=self.upload_release_completion_marker, + release_completion_marker_exists=self.release_completion_marker_exists, + cleanup_staging_hf=self.cleanup_staging_hf, + ) + + def dedupe_preserving_order(self, paths): + seen = set() + deduped = [] + for path in paths: + if path not in seen: + seen.add(path) + deduped.append(path) + return deduped + + def download_staged_artifacts_for_manifest(self, *args, **kwargs): + self.calls.append("download") + return [] + + def get_matching_finalized_release_manifest(self, *args, **kwargs): + self.calls.append("check_finalized") + return self.finalized_manifest + + def list_missing_staged_artifacts(self, *args, **kwargs): + self.calls.append("validate_staging") + return list(self.missing_staged_artifacts) + + def preflight_release_manifest_publish(self, *args, **kwargs): + self.calls.append("preflight_manifest") + return self.preflight_result + + def promote_staging_to_production_hf(self, paths, *args, **kwargs): + self.calls.append("promote_hf") + return len(paths) + + def upload_from_hf_staging_to_gcs(self, paths, *args, **kwargs): + self.calls.append("upload_gcs") + return len(paths) + + def publish_release_manifest_to_hf(self, files_with_paths, *args, **kwargs): + self.calls.append("release_manifest") + return { + "artifacts": { + Path(repo_path).with_suffix("").as_posix(): { + "path": repo_path, + "sha256": f"sha256:{repo_path}", + } + for _, repo_path in files_with_paths + } + } + + def upload_final_version_manifest(self, *args, **kwargs): + self.calls.append("version_manifest") 
+ + def upload_release_completion_marker(self, *args, **kwargs): + self.calls.append("release_complete") + return {"marker_path": "releases/1.73.0/release-complete.json"} + + def release_completion_marker_exists(self, *args, **kwargs): + self.calls.append("check_marker") + return self.marker_exists + + def cleanup_staging_hf(self, paths, *args, **kwargs): + self.calls.append("cleanup_staging") + if self.cleanup_error is not None: + raise self.cleanup_error + return len(paths) + + +def _config( + rel_paths, + files_with_paths, + *, + cleanup_staging=True, +) -> FullReleasePromotionConfig: + return FullReleasePromotionConfig( + rel_paths=rel_paths, + candidate_version="1.73.0rc1", + release_version="1.73.0", + run_id="run-123", + files_with_paths=files_with_paths, + extra_cleanup_paths=("_run_context.json",), + cleanup_staging=cleanup_staging, + ) + + +def test_full_promotion_result_wraps_legacy_dict() -> None: + result = FullPromotionResult.from_legacy_dict( + { + "run_id": "run-123", + "candidate_version": "1.73.0rc1", + "release_version": "1.73.0", + "artifact_count": 2, + "hf_promoted": 2, + "gcs_uploaded": 2, + "release_manifest_artifacts": 2, + "release_completion_marker": "releases/1.73.0/release-complete.json", + "staging_cleaned": 3, + } + ) + + assert result.hf.promoted_count == 2 + assert result.gcs.uploaded_count == 2 + assert result.release_manifest.artifact_count == 2 + assert result.version_manifest.updated is True + assert result.completion_marker.marker_path == ( + "releases/1.73.0/release-complete.json" + ) + assert result.cleanup.cleaned_count == 3 + assert FullPromotionResult.from_dict(result.to_dict()) == result + + +def test_promote_full_release_with_result_preserves_transaction_order(tmp_path) -> None: + rel_paths = ("cps_2024.h5", "states/AL.h5", "national/US.h5") + files = _make_files(tmp_path, rel_paths) + fake_deps = FakeFullReleasePromotionDependencies() + + result = promote_full_release_with_result( + _config(rel_paths, files), + 
fake_deps.as_dependencies(), + ) + + assert fake_deps.calls == [ + "check_finalized", + "validate_staging", + "preflight_manifest", + "promote_hf", + "upload_gcs", + "release_manifest", + "version_manifest", + "release_complete", + "cleanup_staging", + ] + assert isinstance(result, FullPromotionResult) + assert result.run_id == "run-123" + assert result.artifact_count == 3 + assert result.hf.promoted_count == 3 + assert result.gcs.uploaded_count == 3 + assert result.release_manifest.artifact_count == 3 + assert result.version_manifest.updated is True + assert result.cleanup.cleaned_count == 4 + assert result.already_finalized is False + + +def test_promote_full_release_with_result_handles_already_finalized(tmp_path) -> None: + rel_paths = ("states/AL.h5",) + files = _make_files(tmp_path, rel_paths) + fake_deps = FakeFullReleasePromotionDependencies( + finalized_manifest={"artifacts": {"states/AL": {"path": "states/AL.h5"}}}, + marker_exists=True, + ) + + result = promote_full_release_with_result( + _config(rel_paths, files), + fake_deps.as_dependencies(), + ) + + assert fake_deps.calls == ["check_finalized", "check_marker", "cleanup_staging"] + assert result.already_finalized is True + assert result.hf.promoted_count == 0 + assert result.gcs.uploaded_count == 0 + assert result.release_manifest.artifact_count == 1 + assert result.version_manifest.updated is False + assert result.completion_marker.marker_path == ( + "releases/1.73.0/release-complete.json" + ) + + +def test_promote_full_release_with_result_represents_cleanup_failure(tmp_path) -> None: + rel_paths = ("states/AL.h5",) + files = _make_files(tmp_path, rel_paths) + fake_deps = FakeFullReleasePromotionDependencies( + cleanup_error=RuntimeError("cleanup unavailable"), + ) + + result = promote_full_release_with_result( + _config(rel_paths, files), + fake_deps.as_dependencies(), + ) + + assert "cleanup_staging" in fake_deps.calls + assert result.cleanup.attempted is True + assert result.cleanup.cleaned_count 
== 0 + assert result.hf.promoted_count == 1 + assert result.gcs.uploaded_count == 1 + + +def test_promote_full_release_with_result_represents_skipped_cleanup(tmp_path) -> None: + rel_paths = ("states/AL.h5",) + files = _make_files(tmp_path, rel_paths) + fake_deps = FakeFullReleasePromotionDependencies() + + result = promote_full_release_with_result( + _config(rel_paths, files, cleanup_staging=False), + fake_deps.as_dependencies(), + ) + + assert "cleanup_staging" not in fake_deps.calls + assert result.cleanup.attempted is False + assert result.cleanup.cleaned_count == 0 + + +def test_promote_full_release_with_result_fails_before_public_writes(tmp_path) -> None: + rel_paths = ("states/AL.h5",) + files = _make_files(tmp_path, rel_paths) + fake_deps = FakeFullReleasePromotionDependencies( + missing_staged_artifacts=("staging/1.73.0rc1-run-123/states/AL.h5",), + ) + + with pytest.raises(FileNotFoundError, match="Missing staged release artifacts"): + promote_full_release_with_result( + _config(rel_paths, files), + fake_deps.as_dependencies(), + ) + + assert fake_deps.calls == ["check_finalized", "validate_staging"] diff --git a/tests/unit/release_promotion/test_validation.py b/tests/unit/release_promotion/test_validation.py new file mode 100644 index 000000000..eacf66fed --- /dev/null +++ b/tests/unit/release_promotion/test_validation.py @@ -0,0 +1,297 @@ +from pathlib import Path +from typing import Any + +from policyengine_us_data.release_promotion import ( + ReleaseCandidateInputBundle, + ReleaseCandidateValidationDependencies, + ReleaseCandidateValidator, + ReleasePromotionContext, + infer_release_artifact_spec, +) +from policyengine_us_data.stage_contracts import ValidationFinding, ValidationReport + + +def _context() -> ReleasePromotionContext: + return ReleasePromotionContext( + run_id="run-123", + candidate_version="1.73.0rc1", + release_version="1.73.0", + hf_repo_name="policyengine/policyengine-us-data", + gcs_bucket_name="policyengine-us-data", + ) + + +def 
_bundle( + *, + paths: tuple[str, ...] = ( + "national/US.h5", + "states/AL.h5", + "districts/NC-01.h5", + "cities/NYC.h5", + ), + validation_report_paths: tuple[str, ...] = ( + "calibration/runs/run-123/diagnostics/validation_report.json", + ), +) -> ReleaseCandidateInputBundle: + return ReleaseCandidateInputBundle( + context=_context(), + artifacts=tuple( + infer_release_artifact_spec( + path, + sha256=f"sha256:{path}", + size_bytes=100, + ) + for path in paths + ), + validation_report_paths=validation_report_paths, + ) + + +def _manifest_files( + paths: tuple[str, ...] = ( + "national/US.h5", + "states/AL.h5", + "districts/NC-01.h5", + "cities/NYC.h5", + ), +) -> tuple[tuple[Path, str], ...]: + return tuple((Path(path), path) for path in paths) + + +class FakeReleaseCandidateValidationDependencies: + def __init__( + self, + *, + finalized_manifest: dict[str, Any] | None = None, + finalized_error: Exception | None = None, + marker_exists: bool = False, + missing_staged_artifacts: tuple[str, ...] = (), + missing_validation_reports: tuple[str, ...] 
= (), + preflight_result: tuple[bool, list[str]] = (True, []), + ) -> None: + self.finalized_manifest = finalized_manifest + self.finalized_error = finalized_error + self.marker_exists = marker_exists + self.missing_staged_artifacts = missing_staged_artifacts + self.missing_validation_reports = missing_validation_reports + self.preflight_result = preflight_result + self.calls: list[str] = [] + + def as_dependencies(self) -> ReleaseCandidateValidationDependencies: + return ReleaseCandidateValidationDependencies( + get_matching_finalized_release_manifest=( + self.get_matching_finalized_release_manifest + ), + list_missing_staged_artifacts=self.list_missing_staged_artifacts, + list_missing_validation_reports=self.list_missing_validation_reports, + preflight_release_manifest_publish=self.preflight_release_manifest_publish, + release_completion_marker_exists=self.release_completion_marker_exists, + ) + + def get_matching_finalized_release_manifest(self, *args, **kwargs): + self.calls.append("get_matching_finalized_release_manifest") + if self.finalized_error is not None: + raise self.finalized_error + return self.finalized_manifest + + def list_missing_staged_artifacts(self, *args, **kwargs): + self.calls.append("list_missing_staged_artifacts") + return list(self.missing_staged_artifacts) + + def list_missing_validation_reports(self, *args, **kwargs): + self.calls.append("list_missing_validation_reports") + return list(self.missing_validation_reports) + + def preflight_release_manifest_publish(self, *args, **kwargs): + self.calls.append("preflight_release_manifest_publish") + return self.preflight_result + + def release_completion_marker_exists(self, *args, **kwargs): + self.calls.append("release_completion_marker_exists") + return self.marker_exists + + +def _validator( + fake_deps: FakeReleaseCandidateValidationDependencies, +) -> ReleaseCandidateValidator: + return ReleaseCandidateValidator(dependencies=fake_deps.as_dependencies()) + + +def _finding(report: 
ValidationReport, check_id: str) -> ValidationFinding: + return next(finding for finding in report.findings if finding.check_id == check_id) + + +def test_release_candidate_validator_passes_complete_candidate() -> None: + fake_deps = FakeReleaseCandidateValidationDependencies() + + report = _validator(fake_deps).validate( + _bundle(), + files_with_paths=_manifest_files(), + ) + + assert isinstance(report, ValidationReport) + assert report.status == "pass" + assert [finding.status for finding in report.findings] == ["pass"] * 5 + assert report.metadata["suite_id"] == "release_candidate_validation" + assert report.metadata["substage_id"] == "5a_validate_outputs" + assert fake_deps.calls == [ + "get_matching_finalized_release_manifest", + "list_missing_staged_artifacts", + "list_missing_validation_reports", + "preflight_release_manifest_publish", + ] + + +def test_release_candidate_validator_reports_missing_required_families() -> None: + fake_deps = FakeReleaseCandidateValidationDependencies() + + report = _validator(fake_deps).validate( + _bundle(paths=("national/US.h5", "states/AL.h5", "districts/NC-01.h5")), + files_with_paths=_manifest_files( + ("national/US.h5", "states/AL.h5", "districts/NC-01.h5") + ), + ) + + finding = _finding(report, "release_candidate_required_artifact_families") + assert report.status == "fail" + assert finding.status == "fail" + assert finding.value == ("city_h5",) + + +def test_release_candidate_validator_reports_missing_staged_artifacts() -> None: + fake_deps = FakeReleaseCandidateValidationDependencies( + missing_staged_artifacts=("staging/1.73.0rc1-run-123/states/AL.h5",), + ) + + report = _validator(fake_deps).validate( + _bundle(), + files_with_paths=_manifest_files(), + ) + + finding = _finding(report, "release_candidate_staged_artifacts_present") + assert report.status == "fail" + assert finding.status == "fail" + assert finding.value == ("staging/1.73.0rc1-run-123/states/AL.h5",) + + +def 
test_release_candidate_validator_reports_missing_validation_reports() -> None: + missing_report = "calibration/runs/run-123/diagnostics/validation_report.json" + fake_deps = FakeReleaseCandidateValidationDependencies( + missing_validation_reports=(missing_report,), + ) + + report = _validator(fake_deps).validate( + _bundle(), + files_with_paths=_manifest_files(), + ) + + finding = _finding(report, "release_candidate_validation_reports_present") + assert report.status == "fail" + assert finding.status == "fail" + assert finding.value == (missing_report,) + + +def test_release_candidate_validator_requires_validation_report_paths() -> None: + fake_deps = FakeReleaseCandidateValidationDependencies() + + report = _validator(fake_deps).validate( + _bundle(validation_report_paths=()), + files_with_paths=_manifest_files(), + ) + + finding = _finding(report, "release_candidate_validation_reports_present") + assert report.status == "fail" + assert finding.status == "fail" + assert finding.value == () + assert "list_missing_validation_reports" not in fake_deps.calls + + +def test_release_candidate_validator_reports_incomplete_local_area_prefixes() -> None: + fake_deps = FakeReleaseCandidateValidationDependencies( + preflight_result=(False, ["districts/"]), + ) + + report = _validator(fake_deps).validate( + _bundle(), + files_with_paths=_manifest_files(), + ) + + finding = _finding(report, "release_candidate_release_manifest_preflight") + assert report.status == "fail" + assert finding.status == "fail" + assert finding.value == ("districts/",) + + +def test_release_candidate_validator_requires_manifest_files_for_preflight() -> None: + fake_deps = FakeReleaseCandidateValidationDependencies() + + report = _validator(fake_deps).validate(_bundle()) + + finalized_finding = _finding( + report, + "release_candidate_finalized_release_state", + ) + preflight_finding = _finding( + report, + "release_candidate_release_manifest_preflight", + ) + assert report.status == "fail" + assert 
finalized_finding.status == "fail" + assert preflight_finding.status == "fail" + assert "get_matching_finalized_release_manifest" not in fake_deps.calls + + +def test_release_candidate_validator_accepts_finalized_release_with_marker() -> None: + fake_deps = FakeReleaseCandidateValidationDependencies( + finalized_manifest={"artifacts": {"national": {"path": "national/US.h5"}}}, + marker_exists=True, + ) + + report = _validator(fake_deps).validate( + _bundle(), + files_with_paths=_manifest_files(), + ) + + assert report.status == "pass" + assert _finding(report, "release_candidate_finalized_release_state").value is True + assert "release_completion_marker_exists" in fake_deps.calls + assert "list_missing_staged_artifacts" not in fake_deps.calls + assert "list_missing_validation_reports" not in fake_deps.calls + assert "preflight_release_manifest_publish" not in fake_deps.calls + + +def test_release_candidate_validator_rejects_finalized_release_without_marker() -> None: + fake_deps = FakeReleaseCandidateValidationDependencies( + finalized_manifest={"artifacts": {"national": {"path": "national/US.h5"}}}, + marker_exists=False, + ) + + report = _validator(fake_deps).validate( + _bundle(), + files_with_paths=_manifest_files(), + ) + + finding = _finding(report, "release_candidate_finalized_release_state") + assert report.status == "fail" + assert finding.status == "fail" + assert finding.value is False + assert "list_missing_staged_artifacts" not in fake_deps.calls + assert "preflight_release_manifest_publish" not in fake_deps.calls + + +def test_release_candidate_validator_reports_finalized_lookup_errors() -> None: + fake_deps = FakeReleaseCandidateValidationDependencies( + finalized_error=RuntimeError("finalized manifest unavailable"), + ) + + report = _validator(fake_deps).validate( + _bundle(), + files_with_paths=_manifest_files(), + ) + + finding = _finding(report, "release_candidate_finalized_release_state") + assert report.status == "fail" + assert finding.status 
== "fail" + assert finding.value == "RuntimeError" + assert "list_missing_staged_artifacts" not in fake_deps.calls + assert "preflight_release_manifest_publish" not in fake_deps.calls diff --git a/tests/unit/test_pipeline.py b/tests/unit/test_pipeline.py index 69f67bb82..a1848bb72 100644 --- a/tests/unit/test_pipeline.py +++ b/tests/unit/test_pipeline.py @@ -15,7 +15,10 @@ _calibration_package_parameters, _new_run_metadata, _pipeline_error_summary, + _promotion_result_from_stdout, + _release_artifact_metadata_by_path, _run_required_promotion_subprocess, + _stage4_output_contract_repo_path_if_available, _try_reload_pipeline_volume_after_h5_builds, ) from modal_app.step_manifests.state import RunMetadata # noqa: E402 @@ -128,6 +131,60 @@ def test_pipeline_error_summary_falls_back_to_bounded_traceback(monkeypatch): assert "old traceback" not in summary +def test_promotion_result_from_stdout_returns_typed_result(): + result = _promotion_result_from_stdout( + json.dumps( + { + "run_id": "run-123", + "candidate_version": "1.73.0rc1", + "release_version": "1.73.0", + "artifact_count": 1, + "hf_promoted": 1, + "gcs_uploaded": 1, + "release_manifest_artifacts": 1, + "version_manifest_updated": True, + "release_completion_marker": ("releases/1.73.0/release-complete.json"), + "staging_cleaned": 2, + } + ) + ) + + assert result.run_id == "run-123" + assert result.artifact_count == 1 + assert result.hf.promoted_count == 1 + + +def test_release_artifact_metadata_by_path_uses_local_files(tmp_path, monkeypatch): + artifact = tmp_path / "states" / "AL.h5" + artifact.parent.mkdir(parents=True) + artifact.write_text("state fixture", encoding="utf-8") + + monkeypatch.setattr( + "modal_app.pipeline._full_release_manifest_files", + lambda run_id, rel_paths: [(artifact, "states/AL.h5")], + ) + + metadata = _release_artifact_metadata_by_path("run-123", ["states/AL.h5"]) + + assert metadata["states/AL.h5"]["sha256"].startswith("sha256:") + assert metadata["states/AL.h5"]["size_bytes"] == 
artifact.stat().st_size + + +def test_stage4_output_contract_repo_path_detects_run_local_contract( + tmp_path, + monkeypatch, +): + run_dir = tmp_path / "run-123" + contract_path = run_dir / "diagnostics" / "contracts" / "output_build_contract.json" + contract_path.parent.mkdir(parents=True) + contract_path.write_text("{}", encoding="utf-8") + monkeypatch.setattr("modal_app.pipeline._run_dir", lambda run_id: run_dir) + + assert _stage4_output_contract_repo_path_if_available("run-123") == ( + "calibration/runs/run-123/diagnostics/contracts/output_build_contract.json" + ) + + def test_new_run_metadata_accepts_release_context_fields_once(): context = RunContext.from_mapping( { diff --git a/tests/unit/test_pipeline_source_contracts.py b/tests/unit/test_pipeline_source_contracts.py index a022ef3fe..eed217061 100644 --- a/tests/unit/test_pipeline_source_contracts.py +++ b/tests/unit/test_pipeline_source_contracts.py @@ -29,6 +29,9 @@ def test_promote_run_uses_single_full_release_promotion() -> None: assert "_apply_run_context_env(promotion_context)" in source assert "_promote_full_release_from_staging(" in source assert "promotion_context.to_dict()" in source + assert "_promotion_result_from_stdout(promotion_stdout)" in source + assert "_write_release_promotion_contract_for_run(" in source + assert "release_promotion_refs" in source assert "promote_publish.remote(" not in source assert "promote_national_publish.remote(" not in source assert "upload_datasets(" not in source @@ -100,6 +103,41 @@ def test_promote_run_uses_unified_staged_release_path() -> None: assert 'extra_cleanup_paths=["_run_context.json"]' in source +def test_promote_run_writes_release_promotion_contract_output() -> None: + tree = ast.parse(PIPELINE_SOURCE.read_text()) + helper = _function_def(tree, "_write_release_promotion_contract_for_run") + stage4_helper = _function_def( + tree, + "_stage4_output_contract_repo_path_if_available", + ) + helper_source = 
ast.get_source_segment(PIPELINE_SOURCE.read_text(), helper) + stage4_source = ast.get_source_segment(PIPELINE_SOURCE.read_text(), stage4_helper) + + assert "release_promotion_contract_path(run_dir)" in helper_source + assert "build_legacy_release_candidate_bundle(" in helper_source + assert "build_published_artifact_index(" in helper_source + assert "build_promoted_run_index_entry(" in helper_source + assert "write_published_artifact_index(" in helper_source + assert "update_promoted_runs_index(" in helper_source + assert "write_release_promotion_contract(" in helper_source + assert 'role="contract"' in helper_source + assert 'role="index"' in helper_source + assert 'media_type="application/json"' in helper_source + assert 'media_type="application/jsonl"' in helper_source + assert 'manifest_path=f"../{promoted_index_path.name}"' in helper_source + assert ( + "source_output_contract_path=_stage4_output_contract_repo_path_if_available" + in (helper_source) + ) + assert "published_artifact_index=published_index_artifact" in helper_source + assert "promoted_runs_index=promoted_index_artifact" in helper_source + assert "promoted_runs_index_update=promoted_index_update.to_dict()" in ( + helper_source + ) + assert 'diagnostics" / "contracts" / "output_build_contract.json"' in stage4_source + assert "calibration/runs/{run_id}/" in stage4_source + + def test_run_pipeline_refreshes_diagnostics_even_when_h5_outputs_reused() -> None: tree = ast.parse(PIPELINE_SOURCE.read_text()) run_pipeline = _function_def(tree, "run_pipeline")