diff --git a/.gitignore b/.gitignore index 3517fb7..3aff33a 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 +30,11 @@ share/python-wheels/ .installed.cfg *.egg MANIFEST +# Explicitly un-ignore the manifest module folder (MANIFEST above is for Python packaging artifacts) +!databusclient/manifest/ +!databusclient/manifest/** +databusclient/manifest/__pycache__/ +*.py[cod] # PyInstaller # Usually these files are written by a python script from a template diff --git a/README.md b/README.md index e5a2b52..a3b9f98 100644 --- a/README.md +++ b/README.md @@ -675,4 +675,30 @@ Or to ensure compatibility with the `pyproject.toml` configured dependencies, ru ```bash poetry run pytest tests/ -``` \ No newline at end of file +``` + +## Manifest + +All three commands support an optional `--manifest` flag that writes a structured JSON-LD record of the operation to disk: + +```bash +databusclient download https://databus.dbpedia.org/dbpedia/generic/labels/2023.12.01 \ + --manifest ./manifests/labels-download.jsonld + +databusclient deploy --version-id https://databus.dbpedia.org/myaccount/mygroup/mydata/1.0 \ + --title "My Dataset" --abstract "..." --description "..." \ + --license https://creativecommons.org/licenses/by-sa/3.0/ \ + --apikey YOUR_KEY --manifest ./manifests/deploy-run.jsonld \ + myfile.nt + +databusclient delete https://databus.dbpedia.org/myaccount/mygroup/mydata/1.0 \ + --databus-key YOUR_KEY --manifest ./manifests/delete-run.jsonld +``` + +The manifest records input parameters, per-file URLs, checksums, byte sizes, timestamps, and success/failure status for each file. It uses the DataID vocabulary and is versioned via `dbus:schemaVersion`. + +- If the target path already exists, the manifest is written to an auto-suffixed path (e.g. `run_1.jsonld`) with a warning. +- Sensitive fields (API keys, vault tokens) are never written. +- If manifest writing fails, a warning is printed and the exit code reflects the actual operation result. + +See `examples/reproducible-download.md` for a full walkthrough. \ No newline at end of file diff --git a/databusclient/api/delete.py b/databusclient/api/delete.py index 8e2d916..735e4eb 100644 --- a/databusclient/api/delete.py +++ b/databusclient/api/delete.py @@ -116,6 +116,7 @@ def _delete_resource( dry_run: bool = False, force: bool = False, queue: DeleteQueue = None, + manifest_context=None, ): """Delete a single Databus resource (version, artifact, group). @@ -144,6 +145,8 @@ def _delete_resource( if dry_run: print(f"[DRY RUN] Would delete: {databusURI}") + if manifest_context is not None: + manifest_context.record_file(url=databusURI, status="dry_run") return if queue is not None: @@ -156,6 +159,8 @@ def _delete_resource( if response.status_code in (200, 204): print(f"Successfully deleted: {databusURI}") + if manifest_context is not None: + manifest_context.record_file(url=databusURI, status="success") else: raise Exception( f"Failed to delete {databusURI}: {response.status_code} - {response.text}" @@ -168,6 +173,7 @@ def _delete_list( dry_run: bool = False, force: bool = False, queue: DeleteQueue = None, + manifest_context=None, ): """Delete a list of Databus resources. @@ -180,7 +186,7 @@ def _delete_list( """ for databusURI in databusURIs: _delete_resource( - databusURI, databus_key, dry_run=dry_run, force=force, queue=queue + databusURI, databus_key, dry_run=dry_run, force=force, queue=queue, manifest_context=manifest_context ) @@ -190,6 +196,7 @@ def _delete_artifact( dry_run: bool = False, force: bool = False, queue: DeleteQueue = None, + manifest_context=None, ): """Delete an artifact and all its versions. @@ -223,11 +230,11 @@ def _delete_artifact( else: # Delete all versions _delete_list( - version_uris, databus_key, dry_run=dry_run, force=force, queue=queue + version_uris, databus_key, dry_run=dry_run, force=force, queue=queue, manifest_context=manifest_context ) # Finally, delete the artifact itself - _delete_resource(databusURI, databus_key, dry_run=dry_run, force=force, queue=queue) + _delete_resource(databusURI, databus_key, dry_run=dry_run, force=force, queue=queue,manifest_context=manifest_context) def _delete_group( @@ -236,6 +243,7 @@ def _delete_group( dry_run: bool = False, force: bool = False, queue: DeleteQueue = None, + manifest_context=None, ): """Delete a group and all its artifacts and versions. @@ -266,14 +274,14 @@ def _delete_group( # Delete all artifacts (which deletes their versions) for artifact_uri in artifact_uris: _delete_artifact( - artifact_uri, databus_key, dry_run=dry_run, force=force, queue=queue + artifact_uri, databus_key, dry_run=dry_run, force=force, queue=queue, manifest_context=manifest_context ) # Finally, delete the group itself - _delete_resource(databusURI, databus_key, dry_run=dry_run, force=force, queue=queue) + _delete_resource(databusURI, databus_key, dry_run=dry_run, force=force, queue=queue,manifest_context=manifest_context) -def delete(databusURIs: List[str], databus_key: str, dry_run: bool, force: bool): +def delete(databusURIs: List[str], databus_key: str, dry_run: bool, force: bool, manifest_context=None): """Delete a dataset from the databus. Delete a group, artifact, or version identified by the given databus URI. @@ -296,24 +304,24 @@ def delete(databusURIs: List[str], databus_key: str, dry_run: bool, force: bool) if group == "collections" and artifact is not None: print(f"Deleting collection: {databusURI}") _delete_resource( - databusURI, databus_key, dry_run=dry_run, force=force, queue=queue + databusURI, databus_key, dry_run=dry_run, force=force, queue=queue, manifest_context=manifest_context ) elif file is not None: print(f"Deleting file is not supported via API: {databusURI}") elif version is not None: print(f"Deleting version: {databusURI}") _delete_resource( - databusURI, databus_key, dry_run=dry_run, force=force, queue=queue + databusURI, databus_key, dry_run=dry_run, force=force, queue=queue, manifest_context=manifest_context ) elif artifact is not None: print(f"Deleting artifact and all its versions: {databusURI}") _delete_artifact( - databusURI, databus_key, dry_run=dry_run, force=force, queue=queue + databusURI, databus_key, dry_run=dry_run, force=force, queue=queue, manifest_context=manifest_context ) elif group is not None and group != "collections": print(f"Deleting group and all its artifacts and versions: {databusURI}") _delete_group( - databusURI, databus_key, dry_run=dry_run, force=force, queue=queue + databusURI, databus_key, dry_run=dry_run, force=force, queue=queue, manifest_context=manifest_context ) else: print(f"Deleting {databusURI} is not supported.") diff --git a/databusclient/api/download.py b/databusclient/api/download.py index bd71555..6f87010 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -12,6 +12,7 @@ import requests from SPARQLWrapper import JSON, SPARQLWrapper from tqdm import tqdm +from datetime import datetime, timezone from databusclient.api.utils import ( fetch_databus_jsonld, @@ -327,6 +328,7 @@ def _download_file( base_uri=None, validate_checksum: bool = False, expected_checksum: str | None = None, + manifest_context=None, ) -> None: """Download a file from the internet with a progress bar using tqdm. @@ -495,32 +497,43 @@ def _download_file( raise IOError("Downloaded size does not match Content-Length header") # --- 6. Validate checksum on original downloaded file (BEFORE conversion) --- + actual_checksum = None if validate_checksum: - # reuse compute_sha256_and_length from webdav extension try: - actual, _ = compute_sha256_and_length(filename) + actual_checksum, _ = compute_sha256_and_length(filename) except (OSError, IOError) as e: print(f"WARNING: error computing checksum for {filename}: {e}") - actual = None + actual_checksum = None if expected_checksum is None: print( f"WARNING: no expected checksum available for {filename}; skipping validation" ) - elif actual is None: + elif actual_checksum is None: print( f"WARNING: could not compute checksum for {filename}; skipping validation" ) else: - if actual.lower() != expected_checksum.lower(): + if actual_checksum.lower() != expected_checksum.lower(): try: - os.remove(filename) # delete corrupted file + os.remove(filename) except OSError: pass raise IOError( - f"Checksum mismatch for {filename}: expected {expected_checksum}, got {actual}" + f"Checksum mismatch for {filename}: expected {expected_checksum}, got {actual_checksum}" ) + # Record file to manifest after all verification passes. + # Use actual computed checksum if available, otherwise fall back to expected. + if manifest_context is not None: + manifest_context.record_file( + url=url, + status="success", + sha256=actual_checksum or expected_checksum, + size_bytes=total_size_in_bytes if total_size_in_bytes else None, + downloaded_at=datetime.now(timezone.utc).isoformat(), + ) + # --- 7. Unified compression/format conversion pass --- source_compression = _detect_compression_format(file) should_convert_compression, source_fmt = _should_convert_compression( @@ -713,6 +726,7 @@ def _download_files( convert_format: str = None, graph_name: str = None, base_uri: str = None, + manifest_context=None, validate_checksum: bool = False, checksums: dict | None = None, ) -> None: @@ -749,6 +763,7 @@ def _download_files( base_uri=base_uri, validate_checksum=validate_checksum, expected_checksum=expected, + manifest_context=manifest_context, ) def _get_sparql_query_of_collection(uri: str, databus_key: str | None = None) -> str: @@ -896,6 +911,7 @@ def _download_collection( convert_format: str = None, graph_name: str = None, base_uri: str = None, + manifest_context=None, validate_checksum: bool = False, ) -> None: """Download all files in a databus collection. @@ -935,6 +951,7 @@ def _download_collection( convert_format=convert_format, graph_name=graph_name, base_uri=base_uri, + manifest_context=manifest_context, validate_checksum=validate_checksum, checksums=checksums if checksums else None, ) @@ -951,6 +968,7 @@ def _download_version( convert_format: str = None, graph_name: str = None, base_uri: str = None, + manifest_context=None, validate_checksum: bool = False, ) -> None: """Download all files in a databus artifact version. @@ -988,6 +1006,7 @@ def _download_version( convert_format=convert_format, graph_name=graph_name, base_uri=base_uri, + manifest_context=manifest_context, validate_checksum=validate_checksum, checksums=checksums, ) @@ -1005,6 +1024,7 @@ def _download_artifact( convert_format: str = None, graph_name: str = None, base_uri: str = None, + manifest_context=None, validate_checksum: bool = False, ) -> None: """Download files in a databus artifact. @@ -1049,6 +1069,7 @@ def _download_artifact( convert_format=convert_format, graph_name=graph_name, base_uri=base_uri, + manifest_context=manifest_context, validate_checksum=validate_checksum, checksums=checksums, ) @@ -1127,6 +1148,7 @@ def _download_group( convert_format: str = None, graph_name: str = None, base_uri: str = None, + manifest_context=None, validate_checksum: bool = False, ) -> None: """Download files in a databus group. @@ -1161,6 +1183,7 @@ def _download_group( convert_format=convert_format, graph_name=graph_name, base_uri=base_uri, + manifest_context=manifest_context, validate_checksum=validate_checksum, ) @@ -1213,6 +1236,7 @@ def download( graph_name=None, base_uri=None, validate_checksum: bool = False, + manifest_context=None, ) -> None: """Download datasets from databus. @@ -1262,6 +1286,7 @@ def download( convert_format, graph_name=graph_name, base_uri=base_uri, + manifest_context=manifest_context, validate_checksum=validate_checksum, ) elif file is not None: @@ -1285,6 +1310,7 @@ def download( convert_format=convert_format, graph_name=graph_name, base_uri=base_uri, + manifest_context=manifest_context, validate_checksum=validate_checksum, expected_checksum=expected, ) @@ -1301,6 +1327,7 @@ def download( convert_format=convert_format, graph_name=graph_name, base_uri=base_uri, + manifest_context=manifest_context, validate_checksum=validate_checksum, ) elif artifact is not None: @@ -1319,6 +1346,7 @@ def download( convert_format=convert_format, graph_name=graph_name, base_uri=base_uri, + manifest_context=manifest_context, validate_checksum=validate_checksum, ) elif group is not None and group != "collections": @@ -1337,6 +1365,7 @@ def download( convert_format=convert_format, graph_name=graph_name, base_uri=base_uri, + manifest_context=manifest_context, validate_checksum=validate_checksum, ) elif account is not None: @@ -1377,6 +1406,7 @@ def download( convert_format=convert_format, graph_name=graph_name, base_uri=base_uri, + manifest_context=manifest_context, validate_checksum=validate_checksum, checksums=checksums if checksums else None, ) \ No newline at end of file diff --git a/databusclient/cli.py b/databusclient/cli.py index 277d0d6..ed35aa5 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -8,6 +8,8 @@ import databusclient.api.deploy as api_deploy from databusclient.api.delete import delete as api_delete from databusclient.api.download import download as api_download, DownloadAuthError +from databusclient.manifest.context import ManifestContext +from databusclient.manifest.writer import ManifestWriter from databusclient.extensions import webdav @@ -59,6 +61,12 @@ def app(): "webdav_url", help="WebDAV URL (e.g., https://cloud.example.com/remote.php/webdav)", ) +@click.option( + "--manifest", + "manifest_path", + default=None, + help="Write a JSON-LD manifest of this operation to PATH.", +) @click.option("--remote", help="rclone remote name (e.g., 'nextcloud')") @click.option("--path", help="Remote path on Nextcloud (e.g., 'datasets/mydataset')") @click.argument("distributions", nargs=-1) @@ -74,6 +82,7 @@ def deploy( remote, path, distributions: List[str], + manifest_path, ): """ Flexible deploy to Databus command supporting three modes:\n @@ -91,31 +100,80 @@ def deploy( raise click.UsageError( "Invalid combination: when using WebDAV/Nextcloud mode, please provide --webdav-url, --remote, and --path together." ) + + manifest_context = None + if manifest_path: + manifest_context = ManifestContext(command="deploy") + manifest_context.record_params({ + "version_id": version_id, + "title": title, + "abstract": abstract, + "description": description, + "license_url": license_url, + "distributions": list(distributions) if distributions else [], + "metadata_file": metadata_file, + }) + + def _write_manifest(): + if manifest_path and manifest_context is not None: + try: + actual_path = ManifestWriter.write(manifest_context, manifest_path) + click.echo(f"Manifest written to {actual_path}") + except (OSError, IOError) as e: + click.echo( + f"WARNING: Manifest could not be written to {manifest_path}: {e}", + err=True, + ) # === Mode 1: Classic Deploy === if distributions and not (metadata_file or webdav_url or remote or path): click.echo("[MODE] Classic deploy with distributions") click.echo(f"Deploying dataset version: {version_id}") - - dataid = api_deploy.create_dataset( - version_id=version_id, - artifact_version_title=title, - artifact_version_abstract=abstract, - artifact_version_description=description, - license_url=license_url, - distributions=distributions, - ) - api_deploy.deploy(dataid=dataid, api_key=apikey) + try: + dataid = api_deploy.create_dataset( + version_id=version_id, + artifact_version_title=title, + artifact_version_abstract=abstract, + artifact_version_description=description, + license_url=license_url, + distributions=distributions, + ) + api_deploy.deploy(dataid=dataid, api_key=apikey) + if manifest_context: + for dist in distributions: + url = str(dist).split("|")[0] + manifest_context.record_file(url=url, status="success") + except Exception as exc: + if manifest_context: + manifest_context.record_operation_error(exc) + raise + finally: + _write_manifest() return # === Mode 2: Metadata File === if metadata_file: click.echo(f"[MODE] Deploy from metadata file: {metadata_file}") - with open(metadata_file, "r") as f: - metadata = json.load(f) - api_deploy.deploy_from_metadata( - metadata, version_id, title, abstract, description, license_url, apikey - ) + try: + with open(metadata_file, "r") as f: + metadata = json.load(f) + api_deploy.deploy_from_metadata( + metadata, version_id, title, abstract, description, license_url, apikey + ) + if manifest_context: + for entry in metadata: + manifest_context.record_file( + url=entry.get("url", ""), + status="success", + sha256=entry.get("checksum"), + size_bytes=entry.get("size"), + ) + except Exception as exc: + if manifest_context: + manifest_context.record_operation_error(exc) + raise + finally: + _write_manifest() return # === Mode 3: Upload & Deploy (Nextcloud) === @@ -124,20 +182,32 @@ def deploy( raise click.UsageError( "Please provide files to upload when using WebDAV/Nextcloud mode." ) - - # Check that all given paths exist and are files or directories. invalid = [f for f in distributions if not os.path.exists(f)] if invalid: raise click.UsageError( f"The following input files or folders do not exist: {', '.join(invalid)}" ) - click.echo("[MODE] Upload & Deploy to DBpedia Databus via Nextcloud") click.echo(f"→ Uploading to: {remote}:{path}") - metadata = webdav.upload_to_webdav(distributions, remote, path, webdav_url) - api_deploy.deploy_from_metadata( - metadata, version_id, title, abstract, description, license_url, apikey - ) + try: + metadata = webdav.upload_to_webdav(distributions, remote, path, webdav_url) + api_deploy.deploy_from_metadata( + metadata, version_id, title, abstract, description, license_url, apikey + ) + if manifest_context: + for entry in metadata: + manifest_context.record_file( + url=entry.get("url", ""), + status="success", + sha256=entry.get("checksum"), + size_bytes=entry.get("size"), + ) + except Exception as exc: + if manifest_context: + manifest_context.record_operation_error(exc) + raise + finally: + _write_manifest() return raise click.UsageError( @@ -147,7 +217,6 @@ def deploy( " - Upload & deploy: use --webdav-url, --remote, --path, and file arguments" ) - @app.command() @click.argument("databusuris", nargs=-1, required=True) @click.option( @@ -222,6 +291,12 @@ def deploy( help="Base URI for CSV -> RDF Triple conversion (Layer 3). " "Required when converting CSV/TSV to RDF triple formats.", ) +@click.option( + "--manifest", + "manifest_path", + default=None, + help="Write a JSON-LD manifest of this operation to PATH (e.g. --manifest manifest.jsonld).", +) @click.option( "--validate-checksum", is_flag=True, help="Validate checksums of downloaded files" ) @@ -239,11 +314,38 @@ def download( graph_name, base_uri, validate_checksum, + manifest_path, ): """ Download datasets from databus, optionally using vault access if vault options are provided. Supports on-the-fly compression format conversion using --convert-to and --convert-from options. """ + # Determine auth method for manifest (never store the token itself) + auth_method = None + if vault_token: + auth_method = "vault_token" + elif databus_key: + auth_method = "databus_key" + + manifest_context = None + if manifest_path: + manifest_context = ManifestContext( + command="download", + endpoint=databus, + auth_method=auth_method, + ) + # Record safe replay params — sensitive fields excluded + manifest_context.record_params({ + "databusURIs": list(databusuris), + "compression": compression, + "convert_format": convert_format, + "graph_name": graph_name, + "base_uri": base_uri, + "all_versions": all_versions, + "validate_checksum": validate_checksum, + "authurl": authurl, + "clientid": clientid, + }) try: api_download( localDir=localdir, @@ -259,11 +361,26 @@ def download( graph_name=graph_name, base_uri=base_uri, validate_checksum=validate_checksum, + manifest_context=manifest_context, ) except DownloadAuthError as e: + if manifest_context: + manifest_context.record_operation_error(e) raise click.ClickException(str(e)) except ValueError as e: + if manifest_context: + manifest_context.record_operation_error(e) raise click.ClickException(str(e)) + finally: + if manifest_path and manifest_context is not None: + try: + actual_path = ManifestWriter.write(manifest_context, manifest_path) + click.echo(f"Manifest written to {actual_path}") + except (OSError, IOError) as e: + click.echo( + f"WARNING: Manifest could not be written to {manifest_path}: {e}", + err=True, + ) @@ -278,7 +395,13 @@ def download( @click.option( "--force", is_flag=True, help="Force deletion without confirmation prompt" ) -def delete(databusuris: List[str], databus_key: str, dry_run: bool, force: bool): +@click.option( + "--manifest", + "manifest_path", + default=None, + help="Write a JSON-LD manifest of this operation to PATH.", +) +def delete(databusuris: List[str], databus_key: str, dry_run: bool, force: bool, manifest_path): """ Delete a dataset from the databus. @@ -286,12 +409,36 @@ def delete(databusuris: List[str], databus_key: str, dry_run: bool, force: bool) Will recursively delete all data associated with the dataset. """ - api_delete( - databusURIs=databusuris, - databus_key=databus_key, - dry_run=dry_run, - force=force, - ) + manifest_context = None + if manifest_path: + manifest_context = ManifestContext(command="delete") + manifest_context.record_params({ + "databusURIs": list(databusuris), + "dry_run": dry_run, + }) + + try: + api_delete( + databusURIs=databusuris, + databus_key=databus_key, + dry_run=dry_run, + force=force, + manifest_context=manifest_context, + ) + except Exception as exc: + if manifest_context: + manifest_context.record_operation_error(exc) + raise + finally: + if manifest_path and manifest_context is not None: + try: + actual_path = ManifestWriter.write(manifest_context, manifest_path) + click.echo(f"Manifest written to {actual_path}") + except (OSError, IOError) as e: + click.echo( + f"WARNING: Manifest could not be written to {manifest_path}: {e}", + err=True, + ) if __name__ == "__main__": diff --git a/databusclient/manifest/__init__.py b/databusclient/manifest/__init__.py new file mode 100644 index 0000000..791429d --- /dev/null +++ b/databusclient/manifest/__init__.py @@ -0,0 +1,5 @@ +"""Manifest system for the Databus Python Client. + +Provides ManifestContext (records operation details in memory) +and ManifestWriter (serializes to JSON-LD on disk). +""" \ No newline at end of file diff --git a/databusclient/manifest/context.py b/databusclient/manifest/context.py new file mode 100644 index 0000000..790f909 --- /dev/null +++ b/databusclient/manifest/context.py @@ -0,0 +1,159 @@ +"""ManifestContext — in-memory record of a Databus operation. + +Created by the CLI when --manifest is passed. Threaded through +API functions as manifest_context=None. When None, all recording +calls are no-ops and existing behavior is completely unchanged. +""" + +from __future__ import annotations + +import traceback as tb +from datetime import datetime, timezone +from typing import Optional + + +class ManifestContext: + """Records all details of a Databus operation in memory. + + Created once per CLI invocation when --manifest is passed. + Passed as manifest_context parameter to download(), deploy(), + delete(). When --manifest is not passed, manifest_context is + None and all recording is skipped. + + Sensitive fields (vault_token, api_key, local_dir) are never + passed to this class — they are excluded at the CLI layer. + """ + + def __init__( + self, + command: str, + endpoint: Optional[str] = None, + auth_method: Optional[str] = None, + ) -> None: + """Initialise a new manifest context. + + Args: + command: CLI command name ('download', 'deploy', 'delete'). + endpoint: SPARQL endpoint URL used (if applicable). + auth_method: Authentication method used ('vault_token', + 'databus_key', or None for public access). + """ + self.command = command + self.endpoint = endpoint + self.auth_method = auth_method + self.issued: str = datetime.now(timezone.utc).isoformat() + self.replay_params: dict = {} + self.files: list = [] + self.operation_error: Optional[dict] = None + + def record_params(self, params: dict) -> None: + """Save the replay parameters for this operation. + + Stores CLI-level inputs needed to reconstruct the operation. + Sensitive fields (vault_token, api_key, local_dir) must be + excluded by the caller before passing params here. + + Args: + params: Dict of safe CLI parameters to store for replay. + """ + self.replay_params = params + + def record_file( + self, + url: str, + status: str, + sha256: Optional[str] = None, + size_bytes: Optional[int] = None, + compression: Optional[str] = None, + downloaded_at: Optional[str] = None, + error_message: Optional[str] = None, + error_traceback: Optional[str] = None, + retry_count: int = 0, + ) -> None: + """Record the outcome of processing a single file or URI. + + Called after each file download, deploy distribution, or + delete operation completes (successfully or not). + + Args: + url: The file URL or Databus URI that was processed. + status: 'success' or 'failed'. + sha256: SHA-256 checksum of the file (if available). + size_bytes: File size in bytes (if available). + compression: Compression format of the file (if applicable). + downloaded_at: ISO-8601 timestamp of when file was processed. + error_message: Error description if status is 'failed'. + error_traceback: Full traceback string if status is 'failed'. + retry_count: Number of retries attempted (default 0). + """ + entry: dict = { + "url": url, + "status": status, + } + if sha256 is not None: + entry["sha256"] = sha256 + if size_bytes is not None: + entry["size_bytes"] = size_bytes + if compression is not None: + entry["compression"] = compression + if downloaded_at is not None: + entry["downloaded_at"] = downloaded_at + if error_message is not None: + entry["error_message"] = error_message + if error_traceback is not None: + entry["error_traceback"] = error_traceback + if retry_count: + entry["retry_count"] = retry_count + + self.files.append(entry) + + def record_file_error(self, url: str, exc: Exception) -> None: + """Convenience method to record a failed file from an exception. + + Captures the exception message and full traceback automatically. + + Args: + url: The file URL or Databus URI that failed. + exc: The exception that caused the failure. + """ + self.record_file( + url=url, + status="failed", + error_message=str(exc), + error_traceback=tb.format_exc(), + ) + + def record_operation_error(self, exc: Exception) -> None: + """Record a top-level operation failure. + + Used when the entire operation fails (e.g. DeployError, auth failure) + rather than an individual file failing. Captures the exception message + and full traceback so the manifest is useful for debugging even when + no per-file recording happened. + + Args: + exc: The exception that caused the operation to fail. + """ + self.operation_error = { + "error_message": str(exc), + "error_traceback": tb.format_exc(), + "error_type": type(exc).__name__, + } + + def summary(self) -> dict: + """Return execution summary counts. + + Returns: + Dict with total, succeeded, failed counts and total_bytes. + """ + succeeded = sum(1 for f in self.files if f["status"] == "success") + failed = sum(1 for f in self.files if f["status"] == "failed") + total_bytes = sum( + f.get("size_bytes", 0) for f in self.files if f["status"] == "success" + ) + return { + "total": len(self.files), + "succeeded": succeeded, + "failed": failed, + "total_bytes": total_bytes, + } \ No newline at end of file diff --git a/databusclient/manifest/writer.py b/databusclient/manifest/writer.py new file mode 100644 index 0000000..5e606aa --- /dev/null +++ b/databusclient/manifest/writer.py @@ -0,0 +1,161 @@ +"""ManifestWriter — serializes a ManifestContext to a JSON-LD file. + +Called once at the end of a CLI operation. If writing fails, +a warning is printed and the CLI exits with code 0 (the actual +operation already succeeded). +""" + +from __future__ import annotations + +import json +import os + +from databusclient.manifest.context import ManifestContext + +# JSON-LD context using DataID vocabulary — same vocabulary used +# by the Databus platform itself for semantic interoperability. +_JSONLD_CONTEXT = { + "dataid": "http://dataid.dbpedia.org/ns#", + "dcat": "http://www.w3.org/ns/dcat#", + "dcterms": "http://purl.org/dc/terms/", + "xsd": "http://www.w3.org/2001/XMLSchema#", + "dbus": "http://databus.dbpedia.org/manifest/ns#", +} + +_SCHEMA_VERSION = "1.0" +_CLIENT_VERSION = "0.15" + + +class ManifestWriter: + """Serializes a ManifestContext to a JSON-LD manifest file.""" + + @staticmethod + def write(context: ManifestContext, path: str) -> str: + """Write the manifest to a JSON-LD file at the given path. + + Creates parent directories if they do not exist. + If a file already exists at `path`, auto-suffixes with _1, _2, etc. + and prints a warning rather than silently overwriting. + On failure, raises OSError — callers should catch and warn. + + Args: + context: The completed ManifestContext to serialize. + path: File path to write the manifest to. + + Raises: + OSError: If the file cannot be written, or if path is a directory. + """ + if path.endswith(("/", "\\")) or os.path.isdir(path): + stripped = path.rstrip("/\\") + raise OSError( + f"--manifest path '{path}' is a directory, not a file. " + f"Please provide a full file path, e.g. '{stripped}/manifest.jsonld'." + ) + + summary = context.summary() + + # Build file entries using DataID vocabulary + file_entries = [] + for f in context.files: + entry: dict = { + "@type": "dataid:File", + "dcat:downloadURL": f["url"], + "dbus:status": f["status"], + } + if f.get("sha256"): + entry["dataid:checksum"] = f["sha256"] + if f.get("size_bytes") is not None: + entry["dataid:byteSize"] = f["size_bytes"] + if f.get("compression"): + entry["dataid:compression"] = f["compression"] + if f.get("downloaded_at"): + entry["dbus:processedAt"] = { + "@value": f["downloaded_at"], + "@type": "xsd:dateTime", + } + if f.get("error_message"): + entry["dbus:errorMessage"] = f["error_message"] + if f.get("error_traceback"): + entry["dbus:errorTraceback"] = f["error_traceback"] + if f.get("retry_count"): + entry["dbus:retryCount"] = f["retry_count"] + file_entries.append(entry) + + manifest = { + "@context": _JSONLD_CONTEXT, + "@type": "dbus:OperationManifest", + "dbus:schemaVersion": _SCHEMA_VERSION, + "dbus:clientVersion": _CLIENT_VERSION, + "dbus:command": context.command, + "dcterms:issued": { + "@value": context.issued, + "@type": "xsd:dateTime", + }, + } + + if context.endpoint: + manifest["dbus:endpoint"] = context.endpoint + if context.auth_method: + manifest["dbus:authMethod"] = context.auth_method + if context.replay_params: + manifest["dbus:replayParams"] = context.replay_params + + manifest["dataid:distribution"] = { + "@type": "dataid:Distribution", + "dataid:file": file_entries, + } + + manifest["dbus:executionResult"] = { + "@type": "dbus:ExecutionSummary", + "dbus:totalFiles": summary["total"], + "dbus:succeeded": summary["succeeded"], + "dbus:failed": summary["failed"], + "dbus:totalBytes": summary["total_bytes"], + } + + if context.operation_error: + manifest["dbus:operationError"] = { + "@type": "dbus:OperationError", + "dbus:errorType": context.operation_error["error_type"], + "dbus:errorMessage": context.operation_error["error_message"], + "dbus:errorTraceback": context.operation_error["error_traceback"], + } + + parent = os.path.dirname(os.path.abspath(path)) + if parent: + os.makedirs(parent, exist_ok=True) + + final_path = ManifestWriter._resolve_available_path(path) + if final_path != path: + print( + f"WARNING: manifest already exists at '{path}', " + f"creating '{final_path}' instead" + ) + + with open(final_path, "w", encoding="utf-8") as f: + json.dump(manifest, f, indent=2, ensure_ascii=False) + return final_path + + @staticmethod + def _resolve_available_path(path: str) -> str: + """Return a non-colliding path, auto-suffixing with _1, _2, ... if needed. + + If `path` does not exist, it is returned unchanged. If it exists, + appends _1, _2, etc. before the extension until a free path is found. + + Args: + path: Desired manifest file path. + + Returns: + A path that does not currently exist on disk. + """ + if not os.path.exists(path): + return path + + base, ext = os.path.splitext(path) + counter = 1 + while True: + candidate = f"{base}_{counter}{ext}" + if not os.path.exists(candidate): + return candidate + counter += 1 \ No newline at end of file diff --git a/examples/reproducible-download.md b/examples/reproducible-download.md new file mode 100644 index 0000000..d5e1919 --- /dev/null +++ b/examples/reproducible-download.md @@ -0,0 +1,39 @@ +# Reproducible Research Download with Manifest + +This example shows how to use `--manifest` to record a download +operation for reproducibility. + +## Running the download + +```bash +databusclient download \ + https://databus.dbpedia.org/dbpedia/generic/labels/2023.12.01 \ + --localdir ./data \ + --manifest ./manifests/labels-download.jsonld +``` + +This produces: +- Downloaded files in `./data/` +- A manifest at `./manifests/labels-download.jsonld` recording: + - The exact Databus URIs downloaded + - SHA-256 checksum and size of each file + - Timestamp of the download + - All parameters needed to reproduce the operation + +## What the manifest contains + +The manifest is a JSON-LD file using the DataID vocabulary. +Key fields: + +- `dbus:replayParams` — the exact parameters used, sufficient to + re-run the download six months later and get the same files +- `dataid:file` — one entry per downloaded file with checksum, + size, and status +- `dbus:executionResult` — summary of succeeded/failed files + +## Verifying the download later + +Six months later, a colleague can verify the same data was +downloaded by checking the checksums in the manifest against +the files on disk, or by inspecting the `dbus:replayParams` +to understand exactly what was fetched and when. \ No newline at end of file diff --git a/tests/test_manifest.py b/tests/test_manifest.py new file mode 100644 index 0000000..573978c --- /dev/null +++ b/tests/test_manifest.py @@ -0,0 +1,319 @@ +"""Tests for the manifest system (Milestone 2). + +Unit tests: ManifestContext records correctly, ManifestWriter +produces valid JSON-LD. +Edge case: manifest write failure warns but does not fail operation. +""" + +import json +import os +import tempfile + +import pytest + +from databusclient.manifest.context import ManifestContext +from databusclient.manifest.writer import ManifestWriter + + +# --------------------------------------------------------------------------- +# ManifestContext tests +# --------------------------------------------------------------------------- + +def test_context_records_command_and_timestamp(): + ctx = ManifestContext(command="download") + assert ctx.command == "download" + assert ctx.issued is not None + assert "T" in ctx.issued # ISO-8601 format + + +def test_context_records_params(): + ctx = ManifestContext(command="download") + ctx.record_params({"databusURIs": ["https://example.org/data"], "compression": "gz"}) + assert ctx.replay_params["databusURIs"] == ["https://example.org/data"] + assert ctx.replay_params["compression"] == "gz" + + +def test_context_records_successful_file(): + ctx = ManifestContext(command="download") + ctx.record_file( + url="https://example.org/file.ttl", + status="success", + sha256="abc123", + size_bytes=1024, + ) + assert len(ctx.files) == 1 + assert ctx.files[0]["status"] == "success" + assert ctx.files[0]["sha256"] == "abc123" + assert ctx.files[0]["size_bytes"] == 1024 + + +def test_context_records_failed_file(): + ctx = ManifestContext(command="download") + ctx.record_file( + url="https://example.org/file.ttl", + status="failed", + error_message="Connection timeout", + error_traceback="Traceback...", + ) + assert ctx.files[0]["status"] == "failed" + assert ctx.files[0]["error_message"] == "Connection timeout" + + +def test_context_record_file_error_convenience(): + ctx = ManifestContext(command="download") + try: + raise ValueError("test error") + except ValueError as e: + ctx.record_file_error("https://example.org/file.ttl", e) + assert ctx.files[0]["status"] == "failed" + assert "test error" in ctx.files[0]["error_message"] + assert ctx.files[0]["error_traceback"] is not None + + +def test_context_summary_counts(): + ctx = ManifestContext(command="download") + ctx.record_file(url="https://a.org/1", status="success", size_bytes=100) + ctx.record_file(url="https://a.org/2", status="success", size_bytes=200) + ctx.record_file(url="https://a.org/3", status="failed") + s = ctx.summary() + assert s["total"] == 3 + assert s["succeeded"] == 2 + assert s["failed"] == 1 + assert s["total_bytes"] == 300 + + +def test_context_sensitive_fields_not_stored(): + """Sensitive fields must never appear in replay_params.""" + ctx = ManifestContext(command="download", auth_method="vault_token") + ctx.record_params({ + "databusURIs": ["https://example.org"], + "compression": "gz", + }) + # auth_method is stored (it's safe — describes the method, not the credential) + assert ctx.auth_method == "vault_token" + # but the actual token must not be in replay_params + assert "vault_token" not in ctx.replay_params + assert "databus_key" not in ctx.replay_params + assert "token" not in ctx.replay_params + + +# --------------------------------------------------------------------------- +# ManifestWriter tests +# --------------------------------------------------------------------------- + +def test_writer_produces_valid_jsonld(): + ctx = ManifestContext(command="download", endpoint="https://databus.dbpedia.org/sparql") + ctx.record_params({"databusURIs": ["https://example.org/data"]}) + ctx.record_file( + url="https://example.org/file.ttl", + status="success", + sha256="abc123", + size_bytes=1024, + ) + + with tempfile.NamedTemporaryFile(suffix=".jsonld", delete=False) as f: + path = f.name + os.remove(path) # NamedTemporaryFile creates an empty file; remove it so write() doesn't auto-suffix + try: + actual_path = ManifestWriter.write(ctx, path) + with open(actual_path, "r", encoding="utf-8") as f: + manifest = json.load(f) + + assert manifest["@type"] == "dbus:OperationManifest" + assert manifest["dbus:command"] == "download" + assert manifest["dbus:schemaVersion"] == "1.0" + assert "dcterms:issued" in manifest + assert manifest["dbus:endpoint"] == "https://databus.dbpedia.org/sparql" + assert manifest["dbus:replayParams"]["databusURIs"] == ["https://example.org/data"] + + files = manifest["dataid:distribution"]["dataid:file"] + assert len(files) == 1 + assert files[0]["dcat:downloadURL"] == "https://example.org/file.ttl" + assert files[0]["dbus:status"] == "success" + assert files[0]["dataid:checksum"] == "abc123" + assert files[0]["dataid:byteSize"] == 1024 + + result = manifest["dbus:executionResult"] + assert result["dbus:totalFiles"] == 1 + assert result["dbus:succeeded"] == 1 + assert result["dbus:failed"] == 0 + finally: + if os.path.exists(actual_path): + os.remove(actual_path) + + +def test_writer_creates_parent_directories(): + ctx = ManifestContext(command="delete") + ctx.record_file(url="https://example.org/v1", status="success") + + with tempfile.TemporaryDirectory() as tmpdir: + path = os.path.join(tmpdir, "nested", "dir", "manifest.jsonld") + ManifestWriter.write(ctx, path) + assert os.path.exists(path) + + +def test_writer_records_failed_file(): + ctx = ManifestContext(command="download") + ctx.record_file( + url="https://example.org/file.ttl", + status="failed", + error_message="Timeout", + error_traceback="Traceback...", + ) + + with tempfile.NamedTemporaryFile(suffix=".jsonld", delete=False) as f: + path = f.name + os.remove(path) + try: + actual_path = ManifestWriter.write(ctx, path) + with open(actual_path, "r", encoding="utf-8") as f: + manifest = json.load(f) + files = manifest["dataid:distribution"]["dataid:file"] + assert files[0]["dbus:status"] == "failed" + assert files[0]["dbus:errorMessage"] == "Timeout" + finally: + if os.path.exists(actual_path): + os.remove(actual_path) + + +def test_writer_failure_raises_oserror(): + """Writer raises OSError on invalid path — caller should catch and warn. + + Uses a file as the parent directory, which is always invalid on all + platforms (Windows and Unix) since you cannot create a directory + inside a file. + """ + ctx = ManifestContext(command="download") + ctx.record_file(url="https://example.org/f", status="success") + + with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as f: + file_as_parent = f.name + + try: + # Use an existing file as if it were a parent directory — + # always raises OSError on all platforms + invalid_path = os.path.join(file_as_parent, "manifest.jsonld") + with pytest.raises(OSError): + ManifestWriter.write(ctx, invalid_path) + finally: + if os.path.exists(file_as_parent): + os.remove(file_as_parent) + +def test_writer_auto_suffix_on_collision(): + """If the manifest path already exists, auto-suffix with _1 and warn.""" + ctx1 = ManifestContext(command="download") + ctx1.record_file(url="https://example.org/f1", status="success") + + ctx2 = ManifestContext(command="download") + ctx2.record_file(url="https://example.org/f2", status="success") + + with tempfile.TemporaryDirectory() as tmpdir: + path = os.path.join(tmpdir, "run.jsonld") + + first_path = ManifestWriter.write(ctx1, path) + assert first_path == path + + second_path = ManifestWriter.write(ctx2, path) + assert second_path == os.path.join(tmpdir, "run_1.jsonld") + + # Original file must be untouched (still has ctx1's data) + with open(first_path, "r", encoding="utf-8") as f: + original = json.load(f) + assert original["dataid:distribution"]["dataid:file"][0]["dcat:downloadURL"] == "https://example.org/f1" + + with open(second_path, "r", encoding="utf-8") as f: + suffixed = json.load(f) + assert suffixed["dataid:distribution"]["dataid:file"][0]["dcat:downloadURL"] == "https://example.org/f2" + + +def test_writer_auto_suffix_increments(): + """Repeated collisions increment the suffix: _1, then _2.""" + with tempfile.TemporaryDirectory() as tmpdir: + path = os.path.join(tmpdir, "run.jsonld") + + ctx_a = ManifestContext(command="download") + ctx_b = ManifestContext(command="download") + ctx_c = ManifestContext(command="download") + + path_a = ManifestWriter.write(ctx_a, path) + path_b = ManifestWriter.write(ctx_b, path) + path_c = ManifestWriter.write(ctx_c, path) + + assert path_a == path + assert path_b == os.path.join(tmpdir, "run_1.jsonld") + assert path_c == os.path.join(tmpdir, "run_2.jsonld") + + +def test_writer_rejects_invalid_path(): + """Directory paths (existing dir, or trailing slash) raise OSError.""" + ctx = ManifestContext(command="download") + ctx.record_file(url="https://example.org/f", status="success") + + with tempfile.TemporaryDirectory() as tmpdir: + # Case 1: path is an existing directory + with pytest.raises(OSError, match="is a directory"): + ManifestWriter.write(ctx, tmpdir) + + # Case 2: path ends with a trailing slash + trailing_slash_path = os.path.join(tmpdir, "subdir") + os.sep + with pytest.raises(OSError, match="is a directory"): + ManifestWriter.write(ctx, trailing_slash_path) + +def test_context_records_operation_error(): + """record_operation_error captures exception type, message, and traceback.""" + ctx = ManifestContext(command="deploy") + try: + raise ValueError("Authentication failed.") + except ValueError as e: + ctx.record_operation_error(e) + + assert ctx.operation_error is not None + assert ctx.operation_error["error_type"] == "ValueError" + assert "Authentication failed." in ctx.operation_error["error_message"] + assert ctx.operation_error["error_traceback"] is not None + + +def test_writer_includes_operation_error(): + """ManifestWriter writes dbus:operationError when operation_error is set.""" + ctx = ManifestContext(command="deploy") + ctx.record_params({"version_id": "https://example.org/v1"}) + try: + raise RuntimeError("DeployError: bad API key") + except RuntimeError as e: + ctx.record_operation_error(e) + + with tempfile.NamedTemporaryFile(suffix=".jsonld", delete=False) as f: + path = f.name + os.remove(path) + try: + actual_path = ManifestWriter.write(ctx, path) + with open(actual_path, "r", encoding="utf-8") as f: + manifest = json.load(f) + + assert "dbus:operationError" in manifest + err = manifest["dbus:operationError"] + assert err["@type"] == "dbus:OperationError" + assert err["dbus:errorType"] == "RuntimeError" + assert "bad API key" in err["dbus:errorMessage"] + assert err["dbus:errorTraceback"] is not None + finally: + if os.path.exists(actual_path): + os.remove(actual_path) + + +def test_writer_no_operation_error_field_when_success(): + """dbus:operationError is absent from manifest when operation succeeded.""" + ctx = ManifestContext(command="download") + ctx.record_file(url="https://example.org/f", status="success") + + with tempfile.NamedTemporaryFile(suffix=".jsonld", delete=False) as f: + path = f.name + os.remove(path) + try: + actual_path = ManifestWriter.write(ctx, path) + with open(actual_path, "r", encoding="utf-8") as f: + manifest = json.load(f) + assert "dbus:operationError" not in manifest + finally: + if os.path.exists(actual_path): + os.remove(actual_path) \ No newline at end of file