From 1d3c803198a996e7794c3ebae8e5b2c9887b5418 Mon Sep 17 00:00:00 2001 From: Charles Martin Date: Sat, 21 Mar 2026 17:29:02 -0700 Subject: [PATCH] Retry OpenML selection until a valid binary dataset loads --- ...omBinary_W1W2W7W8W9_Alpha_Checkpoint.ipynb | 86 +++++++++++++------ 1 file changed, 62 insertions(+), 24 deletions(-) diff --git a/XGBWW_OpenML_RandomBinary_W1W2W7W8W9_Alpha_Checkpoint.ipynb b/XGBWW_OpenML_RandomBinary_W1W2W7W8W9_Alpha_Checkpoint.ipynb index f1f5803..f8a1d9c 100644 --- a/XGBWW_OpenML_RandomBinary_W1W2W7W8W9_Alpha_Checkpoint.ipynb +++ b/XGBWW_OpenML_RandomBinary_W1W2W7W8W9_Alpha_Checkpoint.ipynb @@ -65,7 +65,7 @@ "RANDOM_STATE = 42\n", "FORCE_FRESH_START = False # True = ignore prior checkpoints and start over for selected dataset\n", "RESTART_RUNTIME_AFTER_INSTALL = False\n", - "REUSE_LAST_MODEL = False # True = resume last selected OpenML binary dataset\n", + "REUSE_LAST_MODEL = True # Keep selected dataset/model id across restarts unless FORCE_FRESH_START=True\n", "\n", "print('Runs root:', RUNS_ROOT)\n", "print('Started at:', datetime.utcnow().isoformat() + 'Z')\n" @@ -306,42 +306,80 @@ "\n", "filters = Filters(min_rows=100, max_rows=None, max_features=100000)\n", "selection_history_path = RUNS_ROOT / 'selection_history.json'\n", + "MAX_DATASET_SELECTION_ATTEMPTS = 50\n", "\n", "if selection_history_path.exists():\n", " selection_history = json.loads(selection_history_path.read_text())\n", "else:\n", " selection_history = {'selected_dataset_uids': [], 'last_selected_dataset_uid': None, 'events': []}\n", "\n", - "if REUSE_LAST_MODEL and selection_history.get('last_selected_dataset_uid'):\n", - " selected_dataset_uid = selection_history['last_selected_dataset_uid']\n", - "else:\n", - " catalog = openml.datasets.list_datasets(output_format='dataframe')\n", - " did_col = 'did' if 'did' in catalog.columns else 'dataset_id'\n", - " class_col = 'NumberOfClasses' if 'NumberOfClasses' in catalog.columns else None\n", - " status_col = 'status' if 'status' in catalog.columns else None\n", + "resume_dataset_uid = selection_history.get('last_selected_dataset_uid')\n", "\n", - " if class_col is None:\n", - " raise RuntimeError('OpenML catalog missing NumberOfClasses column; cannot pre-filter binary datasets.')\n", + "catalog = openml.datasets.list_datasets(output_format='dataframe')\n", + "did_col = 'did' if 'did' in catalog.columns else 'dataset_id'\n", + "class_col = 'NumberOfClasses' if 'NumberOfClasses' in catalog.columns else None\n", + "status_col = 'status' if 'status' in catalog.columns else None\n", "\n", - " binary_catalog = catalog[catalog[class_col] == 2].copy()\n", - " if status_col is not None:\n", - " binary_catalog = binary_catalog[binary_catalog[status_col].astype(str).str.lower().isin(['active', 'deactivated'])]\n", + "if class_col is None:\n", + " raise RuntimeError('OpenML catalog missing NumberOfClasses column; cannot pre-filter binary datasets.')\n", "\n", - " available_ids = [int(v) for v in binary_catalog[did_col].dropna().astype(int).tolist()]\n", - " previously_selected = set(selection_history.get('selected_dataset_uids', []))\n", - " untested_ids = [did for did in available_ids if f'openml:{did}' not in previously_selected]\n", + "binary_catalog = catalog[catalog[class_col] == 2].copy()\n", + "if status_col is not None:\n", + " binary_catalog = binary_catalog[binary_catalog[status_col].astype(str).str.lower().isin(['active', 'deactivated'])]\n", "\n", - " pool = untested_ids if untested_ids else available_ids\n", - " selected_did = random.choice(pool)\n", - " selected_dataset_uid = f'openml:{selected_did}'\n", + "available_ids = [int(v) for v in binary_catalog[did_col].dropna().astype(int).tolist()]\n", + "previously_selected = set(selection_history.get('selected_dataset_uids', []))\n", + "untested_ids = [did for did in available_ids if f'openml:{did}' not in previously_selected]\n", "\n", - "configure_checkpoint_paths(selected_dataset_uid)\n", + "pool = untested_ids if untested_ids else available_ids\n", + "random.shuffle(pool)\n", + "random_candidates = [f'openml:{did}' for did in pool[:MAX_DATASET_SELECTION_ATTEMPTS]]\n", + "\n", + "candidate_uids = []\n", + "if (not FORCE_FRESH_START) and REUSE_LAST_MODEL and resume_dataset_uid:\n", + " candidate_uids.append(resume_dataset_uid)\n", + " print(f\"Trying previously selected dataset/model id first: {resume_dataset_uid}\")\n", + "elif FORCE_FRESH_START and resume_dataset_uid:\n", + " print('FORCE_FRESH_START=True -> selecting a new random dataset/model id.')\n", + "\n", + "for uid in random_candidates:\n", + " if uid not in candidate_uids:\n", + " candidate_uids.append(uid)\n", "\n", - "X, y, meta = load_dataset(selected_dataset_uid, filters=filters)\n", + "X = y = meta = None\n", + "selected_dataset_uid = None\n", + "selection_failures = []\n", + "\n", + "for attempt, candidate_uid in enumerate(candidate_uids, start=1):\n", + " configure_checkpoint_paths(candidate_uid)\n", + " try:\n", + " X_try, y_try, meta_try = load_dataset(candidate_uid, filters=filters)\n", + " except Exception as err:\n", + " msg = f\"attempt {attempt}: {candidate_uid} rejected by loader ({type(err).__name__}: {err})\"\n", + " print(msg)\n", + " selection_failures.append(msg)\n", + " continue\n", "\n", - "y = np.asarray(y)\n", - "if len(np.unique(y)) != 2:\n", - " raise ValueError(f'Dataset is not binary classification; dataset_uid={selected_dataset_uid}; classes={np.unique(y)}')\n", + " y_try = np.asarray(y_try)\n", + " unique_classes = np.unique(y_try)\n", + " if len(unique_classes) != 2:\n", + " msg = f\"attempt {attempt}: {candidate_uid} is not binary after load (classes={unique_classes})\"\n", + " print(msg)\n", + " selection_failures.append(msg)\n", + " continue\n", + "\n", + " X, y, meta = X_try, y_try, meta_try\n", + " selected_dataset_uid = candidate_uid\n", + " print(f\"Using dataset/model id: {selected_dataset_uid} (attempt {attempt})\")\n", + " break\n", + "\n", + "if selected_dataset_uid is None:\n", + " raise RuntimeError(\n", + " 'Unable to find a valid binary OpenML dataset/model id after retries. '\n", + " f'Tried {len(candidate_uids)} candidate(s). Last failures: {selection_failures[-5:]}'\n", + " )\n", + "\n", + "configure_checkpoint_paths(selected_dataset_uid)\n", "\n", "if hasattr(X, 'tocsr'):\n", " X = X.tocsr().astype(np.float32)\n",