Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
7b4c635
gsoc26: Layer 2 with tests initial commit
Jun 5, 2026
5d224f6
gsoc26: Refactor Layer 2 with handler architecture and improved tests
Jun 6, 2026
f2fe92e
Improvements to Format layer implementation
Jun 10, 2026
180c255
Review comments resolved under issue #59
Jun 12, 2026
34039f2
#61: replacing --convert_from & --convert_to with --compression
Jun 13, 2026
cd5e990
gsoc26: layer2 complete + implementation for #61
Jun 14, 2026
508bf86
gsoc2026: mapping layer initial commit
Jun 15, 2026
9aba4f6
fix: errors from testing fixed.
Jun 16, 2026
549ea3b
docs: update README for --format and --compression flags
Jun 16, 2026
2c50924
docs: update README for --format and --compression flags
Jun 16, 2026
4ff66d9
fix: resolve PR review comments.
Jun 19, 2026
9a541ee
Merge branch 'feature/format-conversion' into gsoc-2026
Jun 20, 2026
6c0200e
fix: remove duplicate import after merging layer2 fixes into layer3
Jun 20, 2026
d5db670
docs: add Layer 3 mapping conversion flags and examples
Jun 20, 2026
cac241f
merge: sync with upstream/gsoc-2026 after PR #62 merge
Jun 23, 2026
5c905f5
Initial commit: Manifest System
Jun 26, 2026
6a3f728
fix: move all test data to tests/resources, add round trip IR compari…
Jun 28, 2026
092702a
fix: align Quad->Triple->Quad round trip test as required.
Jun 29, 2026
3185c6b
fix: fixed review comments
Jun 29, 2026
9948f58
fix: fixed review comments(2)
Jun 29, 2026
1836d50
Merge branch 'gsoc-2026' of https://github.com/dbpedia/databus-python…
Jun 29, 2026
6a9c5e1
Merge branch 'gsoc-2026' into dev
Jun 29, 2026
9bfcc21
Complete Implementation of Milestone 2
Jun 30, 2026
ebe0fe4
edge case handled
Jun 30, 2026
393aa6c
docs: add Manifest section to README documenting --manifest flag
Jun 30, 2026
485bd41
feat: capture operation-level errors in manifest via dbus:operationError
Jul 2, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
# Explicitly un-ignore the manifest module folder (MANIFEST above is for Python packaging artifacts)
!databusclient/manifest/
!databusclient/manifest/**
databusclient/manifest/__pycache__/
*.py[cod]

# PyInstaller
# Usually these files are written by a python script from a template
Expand Down
28 changes: 27 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -675,4 +675,30 @@ Or to ensure compatibility with the `pyproject.toml` configured dependencies, ru

```bash
poetry run pytest tests/
```
```

## Manifest

The `download`, `deploy`, and `delete` commands each support an optional `--manifest` flag that writes a structured JSON-LD record of the operation to disk:

```bash
databusclient download https://databus.dbpedia.org/dbpedia/generic/labels/2023.12.01 \
--manifest ./manifests/labels-download.jsonld

databusclient deploy --version-id https://databus.dbpedia.org/myaccount/mygroup/mydata/1.0 \
--title "My Dataset" --abstract "..." --description "..." \
--license https://creativecommons.org/licenses/by-sa/3.0/ \
--apikey YOUR_KEY --manifest ./manifests/deploy-run.jsonld \
myfile.nt

databusclient delete https://databus.dbpedia.org/myaccount/mygroup/mydata/1.0 \
--databus-key YOUR_KEY --manifest ./manifests/delete-run.jsonld
```

The manifest records the operation's input parameters and, for each file, its URL and success/failure status, plus its checksum, byte size, and timestamp where available. It uses the DataID vocabulary and is versioned via `dbus:schemaVersion`.

- If the target path already exists, the manifest is written to an auto-suffixed path (e.g. `run_1.jsonld`) with a warning.
- Sensitive fields (API keys, vault tokens) are never written.
- If writing the manifest fails, a warning is printed; the exit code still reflects the result of the operation itself, not of the manifest write.

See `examples/reproducible-download.md` for a full walkthrough.
28 changes: 18 additions & 10 deletions databusclient/api/delete.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ def _delete_resource(
dry_run: bool = False,
force: bool = False,
queue: DeleteQueue = None,
manifest_context=None,
):
"""Delete a single Databus resource (version, artifact, group).

Expand Down Expand Up @@ -144,6 +145,8 @@ def _delete_resource(

if dry_run:
print(f"[DRY RUN] Would delete: {databusURI}")
if manifest_context is not None:
manifest_context.record_file(url=databusURI, status="dry_run")
return

if queue is not None:
Expand All @@ -156,6 +159,8 @@ def _delete_resource(

if response.status_code in (200, 204):
print(f"Successfully deleted: {databusURI}")
if manifest_context is not None:
manifest_context.record_file(url=databusURI, status="success")
else:
raise Exception(
f"Failed to delete {databusURI}: {response.status_code} - {response.text}"
Expand All @@ -168,6 +173,7 @@ def _delete_list(
dry_run: bool = False,
force: bool = False,
queue: DeleteQueue = None,
manifest_context=None,
):
"""Delete a list of Databus resources.

Expand All @@ -180,7 +186,7 @@ def _delete_list(
"""
for databusURI in databusURIs:
_delete_resource(
databusURI, databus_key, dry_run=dry_run, force=force, queue=queue
databusURI, databus_key, dry_run=dry_run, force=force, queue=queue, manifest_context=manifest_context
)


Expand All @@ -190,6 +196,7 @@ def _delete_artifact(
dry_run: bool = False,
force: bool = False,
queue: DeleteQueue = None,
manifest_context=None,
):
"""Delete an artifact and all its versions.

Expand Down Expand Up @@ -223,11 +230,11 @@ def _delete_artifact(
else:
# Delete all versions
_delete_list(
version_uris, databus_key, dry_run=dry_run, force=force, queue=queue
version_uris, databus_key, dry_run=dry_run, force=force, queue=queue, manifest_context=manifest_context
)

# Finally, delete the artifact itself
_delete_resource(databusURI, databus_key, dry_run=dry_run, force=force, queue=queue)
_delete_resource(databusURI, databus_key, dry_run=dry_run, force=force, queue=queue,manifest_context=manifest_context)


def _delete_group(
Expand All @@ -236,6 +243,7 @@ def _delete_group(
dry_run: bool = False,
force: bool = False,
queue: DeleteQueue = None,
manifest_context=None,
):
"""Delete a group and all its artifacts and versions.

Expand Down Expand Up @@ -266,14 +274,14 @@ def _delete_group(
# Delete all artifacts (which deletes their versions)
for artifact_uri in artifact_uris:
_delete_artifact(
artifact_uri, databus_key, dry_run=dry_run, force=force, queue=queue
artifact_uri, databus_key, dry_run=dry_run, force=force, queue=queue, manifest_context=manifest_context
)

# Finally, delete the group itself
_delete_resource(databusURI, databus_key, dry_run=dry_run, force=force, queue=queue)
_delete_resource(databusURI, databus_key, dry_run=dry_run, force=force, queue=queue,manifest_context=manifest_context)


def delete(databusURIs: List[str], databus_key: str, dry_run: bool, force: bool):
def delete(databusURIs: List[str], databus_key: str, dry_run: bool, force: bool, manifest_context=None):
"""Delete a dataset from the databus.

Delete a group, artifact, or version identified by the given databus URI.
Expand All @@ -296,24 +304,24 @@ def delete(databusURIs: List[str], databus_key: str, dry_run: bool, force: bool)
if group == "collections" and artifact is not None:
print(f"Deleting collection: {databusURI}")
_delete_resource(
databusURI, databus_key, dry_run=dry_run, force=force, queue=queue
databusURI, databus_key, dry_run=dry_run, force=force, queue=queue, manifest_context=manifest_context
)
elif file is not None:
print(f"Deleting file is not supported via API: {databusURI}")
elif version is not None:
print(f"Deleting version: {databusURI}")
_delete_resource(
databusURI, databus_key, dry_run=dry_run, force=force, queue=queue
databusURI, databus_key, dry_run=dry_run, force=force, queue=queue, manifest_context=manifest_context
)
elif artifact is not None:
print(f"Deleting artifact and all its versions: {databusURI}")
_delete_artifact(
databusURI, databus_key, dry_run=dry_run, force=force, queue=queue
databusURI, databus_key, dry_run=dry_run, force=force, queue=queue, manifest_context=manifest_context
)
elif group is not None and group != "collections":
print(f"Deleting group and all its artifacts and versions: {databusURI}")
_delete_group(
databusURI, databus_key, dry_run=dry_run, force=force, queue=queue
databusURI, databus_key, dry_run=dry_run, force=force, queue=queue, manifest_context=manifest_context
)
else:
print(f"Deleting {databusURI} is not supported.")
Expand Down
44 changes: 37 additions & 7 deletions databusclient/api/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import requests
from SPARQLWrapper import JSON, SPARQLWrapper
from tqdm import tqdm
from datetime import datetime, timezone

from databusclient.api.utils import (
fetch_databus_jsonld,
Expand Down Expand Up @@ -327,6 +328,7 @@ def _download_file(
base_uri=None,
validate_checksum: bool = False,
expected_checksum: str | None = None,
manifest_context=None,
) -> None:
"""Download a file from the internet with a progress bar using tqdm.

Expand Down Expand Up @@ -495,32 +497,43 @@ def _download_file(
raise IOError("Downloaded size does not match Content-Length header")

# --- 6. Validate checksum on original downloaded file (BEFORE conversion) ---
actual_checksum = None
if validate_checksum:
# reuse compute_sha256_and_length from webdav extension
try:
actual, _ = compute_sha256_and_length(filename)
actual_checksum, _ = compute_sha256_and_length(filename)
except (OSError, IOError) as e:
print(f"WARNING: error computing checksum for {filename}: {e}")
actual = None
actual_checksum = None

if expected_checksum is None:
print(
f"WARNING: no expected checksum available for {filename}; skipping validation"
)
elif actual is None:
elif actual_checksum is None:
print(
f"WARNING: could not compute checksum for {filename}; skipping validation"
)
else:
if actual.lower() != expected_checksum.lower():
if actual_checksum.lower() != expected_checksum.lower():
try:
os.remove(filename) # delete corrupted file
os.remove(filename)
except OSError:
pass
raise IOError(
f"Checksum mismatch for {filename}: expected {expected_checksum}, got {actual}"
f"Checksum mismatch for {filename}: expected {expected_checksum}, got {actual_checksum}"
)

# Record file to manifest after all verification passes.
# Use actual computed checksum if available, otherwise fall back to expected.
if manifest_context is not None:
manifest_context.record_file(
url=url,
status="success",
sha256=actual_checksum or expected_checksum,
size_bytes=total_size_in_bytes if total_size_in_bytes else None,
downloaded_at=datetime.now(timezone.utc).isoformat(),
)

# --- 7. Unified compression/format conversion pass ---
source_compression = _detect_compression_format(file)
should_convert_compression, source_fmt = _should_convert_compression(
Expand Down Expand Up @@ -713,6 +726,7 @@ def _download_files(
convert_format: str = None,
graph_name: str = None,
base_uri: str = None,
manifest_context=None,
validate_checksum: bool = False,
checksums: dict | None = None,
) -> None:
Expand Down Expand Up @@ -749,6 +763,7 @@ def _download_files(
base_uri=base_uri,
validate_checksum=validate_checksum,
expected_checksum=expected,
manifest_context=manifest_context,
)

def _get_sparql_query_of_collection(uri: str, databus_key: str | None = None) -> str:
Expand Down Expand Up @@ -896,6 +911,7 @@ def _download_collection(
convert_format: str = None,
graph_name: str = None,
base_uri: str = None,
manifest_context=None,
validate_checksum: bool = False,
) -> None:
"""Download all files in a databus collection.
Expand Down Expand Up @@ -935,6 +951,7 @@ def _download_collection(
convert_format=convert_format,
graph_name=graph_name,
base_uri=base_uri,
manifest_context=manifest_context,
validate_checksum=validate_checksum,
checksums=checksums if checksums else None,
)
Expand All @@ -951,6 +968,7 @@ def _download_version(
convert_format: str = None,
graph_name: str = None,
base_uri: str = None,
manifest_context=None,
validate_checksum: bool = False,
) -> None:
"""Download all files in a databus artifact version.
Expand Down Expand Up @@ -988,6 +1006,7 @@ def _download_version(
convert_format=convert_format,
graph_name=graph_name,
base_uri=base_uri,
manifest_context=manifest_context,
validate_checksum=validate_checksum,
checksums=checksums,
)
Expand All @@ -1005,6 +1024,7 @@ def _download_artifact(
convert_format: str = None,
graph_name: str = None,
base_uri: str = None,
manifest_context=None,
validate_checksum: bool = False,
) -> None:
"""Download files in a databus artifact.
Expand Down Expand Up @@ -1049,6 +1069,7 @@ def _download_artifact(
convert_format=convert_format,
graph_name=graph_name,
base_uri=base_uri,
manifest_context=manifest_context,
validate_checksum=validate_checksum,
checksums=checksums,
)
Expand Down Expand Up @@ -1127,6 +1148,7 @@ def _download_group(
convert_format: str = None,
graph_name: str = None,
base_uri: str = None,
manifest_context=None,
validate_checksum: bool = False,
) -> None:
"""Download files in a databus group.
Expand Down Expand Up @@ -1161,6 +1183,7 @@ def _download_group(
convert_format=convert_format,
graph_name=graph_name,
base_uri=base_uri,
manifest_context=manifest_context,
validate_checksum=validate_checksum,
)

Expand Down Expand Up @@ -1213,6 +1236,7 @@ def download(
graph_name=None,
base_uri=None,
validate_checksum: bool = False,
manifest_context=None,
) -> None:
"""Download datasets from databus.

Expand Down Expand Up @@ -1262,6 +1286,7 @@ def download(
convert_format,
graph_name=graph_name,
base_uri=base_uri,
manifest_context=manifest_context,
validate_checksum=validate_checksum,
)
elif file is not None:
Expand All @@ -1285,6 +1310,7 @@ def download(
convert_format=convert_format,
graph_name=graph_name,
base_uri=base_uri,
manifest_context=manifest_context,
validate_checksum=validate_checksum,
expected_checksum=expected,
)
Expand All @@ -1301,6 +1327,7 @@ def download(
convert_format=convert_format,
graph_name=graph_name,
base_uri=base_uri,
manifest_context=manifest_context,
validate_checksum=validate_checksum,
)
elif artifact is not None:
Expand All @@ -1319,6 +1346,7 @@ def download(
convert_format=convert_format,
graph_name=graph_name,
base_uri=base_uri,
manifest_context=manifest_context,
validate_checksum=validate_checksum,
)
elif group is not None and group != "collections":
Expand All @@ -1337,6 +1365,7 @@ def download(
convert_format=convert_format,
graph_name=graph_name,
base_uri=base_uri,
manifest_context=manifest_context,
validate_checksum=validate_checksum,
)
elif account is not None:
Expand Down Expand Up @@ -1377,6 +1406,7 @@ def download(
convert_format=convert_format,
graph_name=graph_name,
base_uri=base_uri,
manifest_context=manifest_context,
validate_checksum=validate_checksum,
checksums=checksums if checksums else None,
)
Loading
Loading