Skip to content

Commit b8fbc91

Browse files
committed
fix(etl): rebuild results from artifacts when logs are unavailable in CI
1 parent 648cfcb commit b8fbc91

1 file changed

Lines changed: 170 additions & 0 deletions

File tree

src/etl.py

Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
# Captures the (optionally negative) integer that follows an "EXIT_CODE=" marker.
EXIT_CODE_RE = re.compile(r"\bEXIT_CODE=(-?\d+)\b")
2020
# Captures the unsigned integer that follows a "seed=" marker.
SEED_RE = re.compile(r"\bseed=(\d+)\b")
2121
# Group 1: an identifier ending in "Error" or "Exception"; group 2: the text
# after the colon on the same line.
EXCEPTION_RE = re.compile(r"([A-Za-z_]\w*(?:Error|Exception)):\s*(.+)")
22+
# Matches a leading "R<digits>_" prefix (case-sensitive); group 1 is the
# "R<digits>" token without the trailing underscore.
RUN_ID_REGIME_RE = re.compile(r"^(R\d+)_")
23+
# Finds an "R<digits>" token anywhere in a string, case-insensitively, provided
# it is delimited on each side by "_", "-", or the start/end of the string.
TOKEN_REGIME_RE = re.compile(r"(?:^|[_-])(R\d+)(?:[_-]|$)", re.IGNORECASE)
2224

2325

2426
def to_float(value):
@@ -724,6 +726,160 @@ def extract_run_data_from_bench_log(log_path):
724726
return results
725727

726728

729+
def get_nested(mapping, path, default=None):
    """Walk *path* through nested dicts and return the value found there.

    Args:
        mapping: The object to start from (normally a dict).
        path: Iterable of keys to follow, outermost first.
        default: Value returned when the walk cannot be completed.

    Returns:
        The value at the end of *path*. *default* is returned as soon as a
        step lands on a non-dict, a missing key, or a value of ``None``.
        An empty *path* returns *mapping* unchanged.
    """
    node = mapping
    for step in path:
        # A non-dict node cannot be descended into; treat it like a miss.
        node = node.get(step) if isinstance(node, dict) else None
        if node is None:
            return default
    return node
738+
739+
740+
def normalize_status(value):
    """Collapse a raw status value into a canonical upper-case label.

    Args:
        value: Any status-like value; it is stringified, stripped and
            upper-cased before comparison.

    Returns:
        ``None`` for ``None`` or blank input, ``'OK'`` for the success
        spellings, ``'FAIL'`` for the failure spellings, and otherwise the
        cleaned upper-case text itself.
    """
    if value is None:
        return None

    label = str(value).strip().upper()
    if label == '':
        return None

    # Known spellings and the canonical label each one maps to.
    canonical = {
        'SUCCESS': 'OK',
        'SUCCEEDED': 'OK',
        'OK': 'OK',
        'FAIL': 'FAIL',
        'FAILED': 'FAIL',
        'ERROR': 'FAIL',
    }
    return canonical.get(label, label)
751+
752+
753+
def infer_target_regime(run_id, run_payload):
    """Work out the regime label ("R<digits>") for a run.

    The run id is tried first, using ``RUN_ID_REGIME_RE`` anchored at its
    start. If that fails, ``config.target_regime``, ``run.target_regime`` and
    ``run.name`` from *run_payload* are searched in that order with
    ``TOKEN_REGIME_RE``.

    Args:
        run_id: Run identifier string; may be empty or ``None``.
        run_payload: Parsed run metadata, read through ``get_nested``.

    Returns:
        The upper-cased regime token, or ``None`` when no source yields one.
    """
    prefix_match = RUN_ID_REGIME_RE.match(run_id) if run_id else None
    if prefix_match:
        return prefix_match.group(1).upper()

    # Payload fields consulted, in priority order, when the id has no prefix.
    fallback_paths = (
        ('config', 'target_regime'),
        ('run', 'target_regime'),
        ('run', 'name'),
    )
    for field_path in fallback_paths:
        candidate = get_nested(run_payload, field_path)
        if candidate:
            token_match = TOKEN_REGIME_RE.search(str(candidate))
            if token_match:
                return token_match.group(1).upper()

    return None
771+
772+
773+
def extract_run_data_from_artifact_dir(artifact_dir):
    """Build a result record for one run from its artifact directory.

    The record is assembled from ``run.json`` inside *artifact_dir*; the
    directory name is used as the run id. Links to ``config.yaml`` and
    ``run.log`` are added when those files exist, flattened method params and
    sampling stats are merged in, and ``ensure_fallback_artifacts`` is called
    on the record before it is returned.

    Args:
        artifact_dir: Path (or path-like string) of a single run's artifacts.

    Returns:
        A dict describing the run, or ``None`` when *artifact_dir* is not a
        directory, ``run.json`` is missing or unreadable, or its top-level
        JSON value is not an object.
    """
    artifact_dir = Path(artifact_dir)
    run_json_path = artifact_dir / 'run.json'
    if not (artifact_dir.is_dir() and run_json_path.exists()):
        return None

    run_id = artifact_dir.name
    try:
        with open(run_json_path, 'r', encoding='utf-8', errors='ignore') as handle:
            payload = json.load(handle)
    except Exception as e:
        # Best effort: a broken artifact is reported and skipped, not fatal.
        print(f"Error reading artifact run json {run_json_path}: {e}")
        return None

    if not isinstance(payload, dict):
        return None

    raw_config = payload.get('config')
    config_payload = raw_config if isinstance(raw_config, dict) else {}

    def first_truthy(*lookups):
        # Mirrors an ``a or b or c`` chain: the first truthy lookup wins,
        # otherwise the value of the last lookup is returned as-is.
        found = None
        for source, field_path in lookups:
            found = get_nested(source, field_path)
            if found:
                break
        return found

    method_id = first_truthy(
        (payload, ('artifacts', 'method', 'id')),
        (payload, ('method', 'id')),
        (config_payload, ('method', 'id')),
    )
    paradigm = first_truthy(
        (payload, ('artifacts', 'method', 'kind')),
        (payload, ('run', 'benchmark_mode')),
    )
    modality = first_truthy(
        (payload, ('artifacts', 'dataset', 'info', 'modality')),
        (config_payload, ('dataset', 'modality')),
    )
    dataset_id = first_truthy(
        (payload, ('artifacts', 'dataset', 'id')),
        (config_payload, ('dataset', 'id')),
    )

    status = normalize_status(get_nested(payload, ('run', 'status')))

    # Prefer run.exit_code; run.error_code is consulted only when it is absent.
    raw_exit = get_nested(payload, ('run', 'exit_code'))
    if raw_exit is None:
        raw_exit = get_nested(payload, ('run', 'error_code'))
    exit_code = to_int(raw_exit)
    if exit_code is None and status == 'OK':
        exit_code = 0

    error_value = payload.get('error')
    if isinstance(error_value, dict):
        error_value = json.dumps(error_value, ensure_ascii=False)
    elif not (error_value is None or isinstance(error_value, str)):
        error_value = str(error_value)
    if status == 'FAIL' and not error_value:
        # Synthesize a message so a failed run never has an empty error.
        if exit_code is None:
            error_value = "status=FAIL"
        else:
            error_value = f"status=FAIL exit_code={exit_code}"

    raw_data_urls = {'run': f"data/artifacts/{run_id}/run.json"}
    if (artifact_dir / 'config.yaml').exists():
        raw_data_urls['config'] = f"data/artifacts/{run_id}/config.yaml"
    if (artifact_dir / 'run.log').exists():
        raw_data_urls['log'] = f"data/artifacts/{run_id}/run.log"

    # Key insertion order is kept stable because it shows up in the dumped JSON.
    results = {
        'run_id': run_id,
        'raw_data_urls': raw_data_urls,
        'history': [],
        'method_id': method_id,
        'paradigm': paradigm,
        'modality': modality,
        'dataset_id': dataset_id,
        'target_regime': infer_target_regime(run_id, payload),
        'status': status,
        'exit_code': exit_code,
        'error': error_value,
        'seed': to_int(get_nested(payload, ('run', 'seed'))),
        'duration_s': to_float(get_nested(payload, ('run', 'duration_s'))),
    }
    metric_fields = (
        ('test_accuracy', ('metrics', 'test', 'accuracy')),
        ('test_macro_f1', ('metrics', 'test', 'macro_f1')),
        ('val.accuracy', ('metrics', 'val', 'accuracy')),
        ('val.macro_f1', ('metrics', 'val', 'macro_f1')),
    )
    for column, field_path in metric_fields:
        results[column] = to_float(get_nested(payload, field_path))

    method_params = get_nested(config_payload, ('method', 'params'))
    if isinstance(method_params, dict):
        results.update(flatten_dict(method_params, parent_key='params').items())

    sampling_stats = get_nested(payload, ('artifacts', 'sampling', 'stats'))
    if isinstance(sampling_stats, dict):
        results.update(
            flatten_dict(sampling_stats, parent_key='sampling.stats').items()
        )

    ensure_fallback_artifacts(results, artifact_dir, run_id, source_row=None)
    return results
881+
882+
727883
def main():
728884
output_dir = Path('dashboard/public/data')
729885
output_dir.mkdir(parents=True, exist_ok=True)
@@ -755,6 +911,20 @@ def main():
755911
all_runs.append(run_data)
756912
seen_run_ids.add(run_id)
757913

914+
if not all_runs:
915+
artifacts_root = output_dir / 'artifacts'
916+
if artifacts_root.exists():
917+
print('No logs found; rebuilding results from artifacts.')
918+
for artifact_dir in sorted(artifacts_root.iterdir()):
919+
run_data = extract_run_data_from_artifact_dir(artifact_dir)
920+
if not run_data:
921+
continue
922+
run_id = run_data.get('run_id')
923+
if not run_id or run_id in seen_run_ids:
924+
continue
925+
all_runs.append(run_data)
926+
seen_run_ids.add(run_id)
927+
758928
output_file = output_dir / 'results.json'
759929
with open(output_file, 'w', encoding='utf-8') as f:
760930
json.dump(all_runs, f, indent=2)

0 commit comments

Comments
 (0)