-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathtest_bindata_consistency.py
More file actions
166 lines (133 loc) · 5.45 KB
/
test_bindata_consistency.py
File metadata and controls
166 lines (133 loc) · 5.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""Validation tests for BinData ↔ manifest ↔ section XML consistency.
Ensures generated HWPX never references non-existent BinData entries.
"""
import os
import re
import zipfile
import tempfile
from lxml import etree
import sys
sys.path.insert(0, os.path.dirname(__file__))
from hwp2hwpx.converter import convert_file
# XML namespace URIs used when walking HWPX documents, keyed by the
# conventional prefix (OPF package manifest plus the Hancom HWPML parts).
NS = dict(
    opf="http://www.idpf.org/2007/opf/",
    hp="http://www.hancom.co.kr/hwpml/2011/paragraph",
    hc="http://www.hancom.co.kr/hwpml/2011/core",
    hs="http://www.hancom.co.kr/hwpml/2011/section",
)
def _extract_hwpx_info(hwpx_path):
    """Collect the three sets of image identifiers found in an HWPX archive.

    Args:
        hwpx_path: Path of the HWPX (ZIP) file to inspect.

    Returns:
        A tuple ``(manifest_ids, bindata_files, section_refs)`` of sets:

        * ``manifest_ids`` - ``id`` attributes of the ``item`` elements in
          ``Contents/content.hpf`` whose ``href`` contains ``BinData/``.
        * ``bindata_files`` - extension-less base names of the archive
          members stored under ``BinData/`` (e.g. ``"image1"`` for
          ``BinData/image1.png``).
        * ``section_refs`` - non-empty ``binaryItemIDRef`` attributes of the
          ``img`` elements in every ``Contents/section<N>.xml`` member.

    Raises:
        KeyError: If the archive has no ``Contents/content.hpf`` member.
    """
    # Compiled once; matched with fullmatch() below so that look-alike
    # members such as "Contents/section0.xml.bak" are not parsed as sections.
    section_name = re.compile(r"Contents/section\d+\.xml")

    with zipfile.ZipFile(hwpx_path, "r") as zf:
        names = zf.namelist()

        # Actual BinData payloads in the ZIP. Directory entries end with "/"
        # and would otherwise contribute an empty label.
        bindata_files = {
            os.path.splitext(os.path.basename(name))[0]
            for name in names
            if name.startswith("BinData/") and not name.endswith("/")
        }

        # Manifest entries in content.hpf that point into BinData/.
        hpf_tree = etree.fromstring(zf.read("Contents/content.hpf"))
        manifest_ids = {
            item.get("id", "")
            for item in hpf_tree.iter("{%s}item" % NS["opf"])
            if "BinData/" in item.get("href", "")
        }

        # Image references made from the section XML documents.
        section_refs = set()
        for name in names:
            if not section_name.fullmatch(name):
                continue
            sec_tree = etree.fromstring(zf.read(name))
            for img in sec_tree.iter("{%s}img" % NS["hc"]):
                ref = img.get("binaryItemIDRef", "")
                if ref:
                    section_refs.add(ref)

    return manifest_ids, bindata_files, section_refs
def validate_hwpx(hwpx_path):
    """Check that manifest, BinData files and section references agree.

    Args:
        hwpx_path: Path of the HWPX file to check.

    Returns:
        A list of human-readable error strings, empty when the file is
        consistent. At most one entry is produced per rule, in this order:
        manifest ids without a BinData file, section references without a
        manifest entry, section references without a BinData file.
    """
    manifest_ids, bindata_files, section_refs = _extract_hwpx_info(hwpx_path)

    dangling_manifest = manifest_ids.difference(bindata_files)
    unlisted_refs = section_refs.difference(manifest_ids)
    fileless_refs = section_refs.difference(bindata_files)

    # (offending ids, message) per rule; only rules with offenders are reported.
    findings = (
        (
            dangling_manifest,
            f"Manifest references missing BinData files: {sorted(dangling_manifest)}",
        ),
        (
            unlisted_refs,
            f"Section XML references missing manifest entries: {sorted(unlisted_refs)}",
        ),
        (
            fileless_refs,
            f"Section XML references missing BinData files: {sorted(fileless_refs)}",
        ),
    )
    return [message for offenders, message in findings if offenders]
def test_all_hwp_files():
"""Regression test: convert all test/*.hwp and validate BinData consistency."""
test_dir = os.path.join(os.path.dirname(__file__), "test")
if not os.path.isdir(test_dir):
print("SKIP: test/ directory not found")
return
passed = 0
failed = 0
errors_found = []
for entry in sorted(os.listdir(test_dir)):
hwp_path = os.path.join(test_dir, entry, "from.hwp")
if not os.path.exists(hwp_path):
continue
with tempfile.NamedTemporaryFile(suffix=".hwpx", delete=False) as tmp:
tmp_path = tmp.name
try:
convert_file(hwp_path, tmp_path)
errors = validate_hwpx(tmp_path)
if errors:
failed += 1
errors_found.append((entry, errors))
print(f" FAIL {entry}")
for e in errors:
print(f" {e}")
else:
passed += 1
print(f" OK {entry}")
except Exception as ex:
failed += 1
errors_found.append((entry, [str(ex)]))
print(f" ERROR {entry}: {ex}")
finally:
if os.path.exists(tmp_path):
os.unlink(tmp_path)
print(f"\nResults: {passed} passed, {failed} failed out of {passed + failed}")
return failed == 0
def test_picture_case():
"""Specific test for the picture test case with known images."""
hwp_path = os.path.join(os.path.dirname(__file__), "test", "picture", "from.hwp")
if not os.path.exists(hwp_path):
print("SKIP: test/picture/from.hwp not found")
return True
with tempfile.NamedTemporaryFile(suffix=".hwpx", delete=False) as tmp:
tmp_path = tmp.name
try:
convert_file(hwp_path, tmp_path)
manifest_ids, bindata_files, section_refs = _extract_hwpx_info(tmp_path)
print(f" Manifest IDs: {sorted(manifest_ids)}")
print(f" BinData files: {sorted(bindata_files)}")
print(f" Section refs: {sorted(section_refs)}")
errors = validate_hwpx(tmp_path)
if errors:
print(" FAIL: " + "; ".join(errors))
return False
else:
print(" OK: All references valid")
return True
finally:
if os.path.exists(tmp_path):
os.unlink(tmp_path)
if __name__ == "__main__":
print("=== Picture case validation ===")
test_picture_case()
print()
print("=== Full regression test ===")
success = test_all_hwp_files()
sys.exit(0 if success else 1)