Skip to content

Commit 815e3b8

Browse files
committed
Support repairing metadata files
closes #1099 Assisted by: Claude Sonnet 4
1 parent b51d79b commit 815e3b8

File tree

3 files changed

+248
-8
lines changed

3 files changed

+248
-8
lines changed

CHANGES/1099.feature

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Added support for recreating and fixing metadata files to the `repair_metadata` endpoint.

pulp_python/app/tasks/repair.py

Lines changed: 129 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66

77
from django.db.models import Prefetch
88
from django.db.models.query import QuerySet
9+
from pulpcore.app.models import Artifact, Domain
910
from pulp_python.app.models import PythonPackageContent, PythonRepository
1011
from pulp_python.app.utils import (
12+
artifact_to_metadata_artifact,
1113
artifact_to_python_content_data,
1214
fetch_json_release_metadata,
1315
parse_metadata,
@@ -41,16 +43,25 @@ def repair(repository_pk: UUID) -> None:
4143
content_set = repository.latest_version().content.values_list("pk", flat=True)
4244
content = PythonPackageContent.objects.filter(pk__in=content_set)
4345

44-
num_repaired, pkgs_not_repaired = repair_metadata(content)
46+
num_repaired, pkgs_not_repaired, num_metadata_repaired, pkgs_metadata_not_repaired = (
47+
repair_metadata(content)
48+
)
49+
# Convert set() to 0
50+
if not pkgs_not_repaired:
51+
pkgs_not_repaired = 0
52+
if not pkgs_metadata_not_repaired:
53+
pkgs_metadata_not_repaired = 0
54+
4555
log.info(
4656
_(
4757
"{} packages' metadata repaired. Not repaired packages due to either "
48-
"inaccessible URL or mismatched sha256: {}."
49-
).format(num_repaired, pkgs_not_repaired)
58+
"inaccessible URL or mismatched sha256: {}. "
59+
"{} metadata files repaired. Packages whose metadata files could not be repaired: {}."
60+
).format(num_repaired, pkgs_not_repaired, num_metadata_repaired, pkgs_metadata_not_repaired)
5061
)
5162

5263

53-
def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[str]]:
64+
def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[str], int, set[str]]:
5465
"""
5566
Repairs metadata for a queryset of PythonPackageContent objects
5667
and updates the progress report.
@@ -59,9 +70,11 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s
5970
content (QuerySet[PythonPackageContent]): The queryset of items to repair.
6071
6172
Returns:
62-
tuple[int, set[str]]: A tuple containing:
73+
tuple[int, set[str], int, set[str]]: A tuple containing:
6374
- The number of packages that were repaired.
6475
- A set of packages' PKs that were not repaired.
76+
- The number of metadata files that were repaired.
77+
- A set of packages' PKs without repaired metadata artifacts.
6578
"""
6679
immediate_content = (
6780
content.filter(contentartifact__artifact__isnull=False)
@@ -87,6 +100,11 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s
87100
# Keep track of on-demand packages that were not repaired
88101
pkgs_not_repaired = set()
89102

103+
# Metadata artifacts and content artifacts
104+
metadata_batch = []
105+
total_metadata_repaired = 0
106+
pkgs_metadata_not_repaired = set()
107+
90108
progress_report = ProgressReport(
91109
message="Repairing packages' metadata",
92110
code="repair.metadata",
@@ -102,6 +120,14 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s
102120
.artifact
103121
)
104122
new_data = artifact_to_python_content_data(package.filename, main_artifact, domain)
123+
total_metadata_repaired += update_metadata_artifact_if_needed(
124+
package,
125+
new_data.get("metadata_sha256"),
126+
main_artifact,
127+
domain,
128+
metadata_batch,
129+
pkgs_metadata_not_repaired,
130+
)
105131
total_repaired += update_package_if_needed(
106132
package, new_data, batch, set_of_update_fields
107133
)
@@ -163,7 +189,12 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s
163189
total_repaired += len(batch)
164190
PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)
165191

166-
return total_repaired, pkgs_not_repaired
192+
if metadata_batch:
193+
not_repaired = _process_metadata_batch(metadata_batch)
194+
pkgs_metadata_not_repaired.update(not_repaired)
195+
total_metadata_repaired += len(metadata_batch) - len(not_repaired)
196+
197+
return total_repaired, pkgs_not_repaired, total_metadata_repaired, pkgs_metadata_not_repaired
167198

168199

169200
def update_package_if_needed(
@@ -202,3 +233,95 @@ def update_package_if_needed(
202233
set_of_update_fields.clear()
203234

204235
return total_repaired
236+
237+
def update_metadata_artifact_if_needed(
    package: PythonPackageContent,
    new_metadata_sha256: str | None,
    main_artifact: Artifact,
    domain: Domain,
    metadata_batch: list[tuple],
    pkgs_metadata_not_repaired: set[str],
) -> int:
    """
    Queue a metadata-artifact repair for a wheel package when one is needed.

    A "create" operation is queued when the package has no ContentArtifact whose
    relative path ends with ".metadata". An "update" operation is queued when such a
    ContentArtifact exists, the package's stored metadata_sha256 differs from
    new_metadata_sha256, and the linked artifact is missing or has a different sha256.
    Packages that are not wheels, or for which no new_metadata_sha256 is available,
    are skipped.

    Once the queue holds at least BULK_SIZE operations it is processed and emptied.

    Args:
        package: Package to check for metadata changes.
        new_metadata_sha256: The correct metadata_sha256 extracted from the main artifact, or None.
        main_artifact: The main package artifact used to generate metadata.
        domain: The domain in which the metadata artifact will be created.
        metadata_batch: List of tuples for batch processing (updated in-place).
        pkgs_metadata_not_repaired: Set of package PKs that failed repair (updated in-place).

    Returns:
        Number of repaired metadata artifacts. This is non-zero only when the call
        flushed the batch; queued-but-unflushed operations are not counted here.
    """
    if not package.filename.endswith(".whl") or not new_metadata_sha256:
        return 0

    # One `.first()` call tells us both whether a metadata ContentArtifact exists and
    # which one to fix, instead of evaluating the queryset for truthiness and then
    # querying again for the first row.
    ca = package.contentartifact_set.filter(relative_path__endswith=".metadata").first()

    if ca is None:
        # Create missing
        metadata_batch.append(("create", package, main_artifact, None, domain))
    elif new_metadata_sha256 != package.metadata_sha256:
        # Fix existing
        metadata_artifact = ca.artifact
        if metadata_artifact is None or metadata_artifact.sha256 != new_metadata_sha256:
            metadata_batch.append(("update", package, main_artifact, ca, domain))

    # `>=` (rather than `==`) still flushes if the caller hands in a batch that has
    # already grown past BULK_SIZE.
    if len(metadata_batch) < BULK_SIZE:
        return 0

    not_repaired = _process_metadata_batch(metadata_batch)
    pkgs_metadata_not_repaired.update(not_repaired)
    # Count from the real batch length so the number stays right for any batch size.
    repaired = len(metadata_batch) - len(not_repaired)
    metadata_batch.clear()
    return repaired
287+
288+
def _process_metadata_batch(metadata_batch: list[tuple]) -> set[str]:
    """
    Run the queued metadata repair operations.

    For every queued entry a metadata artifact is built from the package's main
    artifact, assigned to the given domain and saved. "create" entries get a new
    ContentArtifact at "<package filename>.metadata"; "update" entries have their
    existing ContentArtifact re-pointed to the new artifact. The ContentArtifact
    writes are issued in bulk after the whole batch has been walked.

    Args:
        metadata_batch: List of (action, package, main_artifact, content_artifact, domain) tuples.

    Returns:
        Set of package PKs for which no metadata artifact could be produced.
    """
    failed_pks = set()
    pending_creates = []
    pending_updates = []

    for action, package, main_artifact, content_artifact, domain in metadata_batch:
        metadata_artifact = artifact_to_metadata_artifact(package.filename, main_artifact)
        if not metadata_artifact:
            # No artifact was produced for this package; report it back to the caller.
            failed_pks.add(package.pk)
            continue

        metadata_artifact.pulp_domain = domain
        metadata_artifact.save()

        if action == "create":
            pending_creates.append(
                ContentArtifact(
                    artifact=metadata_artifact,
                    content=package,
                    relative_path=f"{package.filename}.metadata",
                )
            )
        elif action == "update":
            content_artifact.artifact = metadata_artifact
            pending_updates.append(content_artifact)

    if pending_creates:
        ContentArtifact.objects.bulk_create(pending_creates)
    if pending_updates:
        ContentArtifact.objects.bulk_update(pending_updates, ["artifact"])

    return failed_pks

pulp_python/tests/functional/api/test_repair.py

Lines changed: 118 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
@pytest.fixture
1212
def create_content_direct(python_bindings):
13-
def _create(artifact_filename, content_data):
13+
def _create(artifact_filename, content_data, metadata_artifact_filename=None):
1414
commands = (
1515
"from pulpcore.plugin.models import Artifact, ContentArtifact; "
1616
"from pulpcore.plugin.util import get_url; "
@@ -21,8 +21,15 @@ def _create(artifact_filename, content_data):
2121
"c.save(); "
2222
f"ca = ContentArtifact(artifact=a, content=c, relative_path=c.filename); "
2323
"ca.save(); "
24-
"print(get_url(c))"
2524
)
25+
if metadata_artifact_filename:
26+
commands += (
27+
f"a2 = Artifact.init_and_validate('{metadata_artifact_filename}'); "
28+
"a2.save(); "
29+
f"ca2 = ContentArtifact(artifact=a2, content=c, relative_path=c.filename + '.metadata'); "
30+
"ca2.save(); "
31+
)
32+
commands += "print(get_url(c))"
2633
process = subprocess.run(["pulpcore-manager", "shell", "-c", commands], capture_output=True)
2734

2835
assert process.returncode == 0
@@ -214,3 +221,112 @@ def test_metadata_repair_endpoint(
214221
assert new_content.author == author
215222
assert new_content.packagetype == packagetype
216223
assert new_content.requires_python == requires_python
224+
225+
def test_metadata_artifact_repair_endpoint(
    create_content_direct,
    delete_orphans_pre,
    download_python_file,
    monitor_task,
    move_to_repository,
    pulpcore_bindings,
    python_bindings,
    python_repo_factory,
):
    """
    Check that the `Repositories.repair_metadata` endpoint fixes a package's
    metadata_sha256 together with its metadata Artifact and ContentArtifact.
    """
    # 1. Setup tested data
    python_repo = python_repo_factory()
    packages_url = urljoin(PYTHON_FIXTURES_URL, "packages/")

    # Each scenario: (wheel filename, metadata_sha256 stored before the repair,
    # metadata_sha256 expected after the repair).
    scenarios = [
        # missing metadata_sha256, missing metadata Artifact + ContentArtifact
        (
            "scipy-1.1.0-cp27-none-win_amd64.whl",
            None,
            "15ae132303b2774a0d839d01c618cf99fc92716adfaaa2bc1267142ab2b76b98",
        ),
        # correct metadata_sha256, missing metadata Artifact + ContentArtifact
        (
            "scipy-1.1.0-cp27-cp27m-manylinux1_x86_64.whl",
            "7f303850d9be88fff27eaeb393c2fd3a6c1a130e21758b8294fc5bb2f38e02f6",
            "7f303850d9be88fff27eaeb393c2fd3a6c1a130e21758b8294fc5bb2f38e02f6",
        ),
        # wrong metadata_sha256, missing metadata Artifact + ContentArtifact
        # (the last two wheels have the same metadata file)
        (
            "scipy-1.1.0-cp34-none-win32.whl",
            "1234",
            "747d24e500308067c4e5fd0e20fb2d4fd6595a3fb7b1d2ffa717217fb6a53364",
        ),
        # wrong metadata_sha256, wrong metadata Artifact, correct metadata ContentArtifact
        (
            "scipy-1.1.0-cp35-none-win32.whl",
            "5678",
            "747d24e500308067c4e5fd0e20fb2d4fd6595a3fb7b1d2ffa717217fb6a53364",
        ),
    ]

    wheel_files = [
        download_python_file(filename, urljoin(packages_url, filename))
        for filename, _, _ in scenarios
    ]
    # The last package is given the first wheel's metadata file as its (wrong)
    # metadata artifact.
    wrong_metadata_name = f"{scenarios[0][0]}.metadata"
    wrong_metadata_file = download_python_file(
        wrong_metadata_name, urljoin(packages_url, wrong_metadata_name)
    )

    # Build PythonPackageContent data
    payloads = [
        {"name": "scipy", "version": "1.1.0", "filename": filename, "metadata_sha256": sha256}
        for filename, sha256, _ in scenarios
    ]

    # 2. Create content
    extra_args = [(), (), (), (wrong_metadata_file,)]
    created = [
        create_content_direct(wheel_file, payload, *extra)
        for wheel_file, payload, extra in zip(wheel_files, payloads, extra_args)
    ]

    content_hrefs = {}
    for payload, unit in zip(payloads, created):
        for field, test_value in payload.items():
            assert getattr(unit, field) == test_value
        content_hrefs[payload["filename"]] = unit.pulp_href
    move_to_repository(python_repo.pulp_href, list(content_hrefs.values()))

    # 3. Repair metadata and metadata files
    response = python_bindings.RepositoriesPythonApi.repair_metadata(python_repo.pulp_href)
    monitor_task(response.task)

    # 4. Check new metadata and metadata files
    main_artifact_hrefs = set()
    metadata_artifact_hrefs = set()
    for filename, _, expected_sha256 in scenarios:
        href = content_hrefs[filename]
        listing = pulpcore_bindings.ContentApi.list(pulp_href__in=[href]).results
        assert listing
        artifacts = listing[0].artifacts
        assert len(artifacts) == 2

        main_artifact_href = artifacts.get(filename)
        main_artifact_hrefs.add(main_artifact_href)
        main_artifact = pulpcore_bindings.ArtifactsApi.read(main_artifact_href)

        metadata_artifact_href = artifacts.get(f"{filename}.metadata")
        metadata_artifact_hrefs.add(metadata_artifact_href)
        metadata_artifact = pulpcore_bindings.ArtifactsApi.read(metadata_artifact_href)

        pkg = python_bindings.ContentPackagesApi.read(href)
        assert pkg.metadata_sha256 == expected_sha256
        assert main_artifact.sha256 == pkg.sha256
        assert metadata_artifact.sha256 == pkg.metadata_sha256

    # Check deduplication: four distinct wheels, but two of them share one metadata file
    assert len(main_artifact_hrefs) == 4
    assert len(metadata_artifact_hrefs) == 3

0 commit comments

Comments
 (0)