Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 62 additions & 24 deletions XGBWW_OpenML_RandomBinary_W1W2W7W8W9_Alpha_Checkpoint.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
"RANDOM_STATE = 42\n",
"FORCE_FRESH_START = False # True = ignore prior checkpoints and start over for selected dataset\n",
"RESTART_RUNTIME_AFTER_INSTALL = False\n",
"REUSE_LAST_MODEL = False # True = resume last selected OpenML binary dataset\n",
"REUSE_LAST_MODEL = True # Keep selected dataset/model id across restarts unless FORCE_FRESH_START=True\n",
"\n",
"print('Runs root:', RUNS_ROOT)\n",
"print('Started at:', datetime.utcnow().isoformat() + 'Z')\n"
Expand Down Expand Up @@ -306,42 +306,80 @@
"\n",
"filters = Filters(min_rows=100, max_rows=None, max_features=100000)\n",
"selection_history_path = RUNS_ROOT / 'selection_history.json'\n",
"MAX_DATASET_SELECTION_ATTEMPTS = 50\n",
"\n",
"if selection_history_path.exists():\n",
" selection_history = json.loads(selection_history_path.read_text())\n",
"else:\n",
" selection_history = {'selected_dataset_uids': [], 'last_selected_dataset_uid': None, 'events': []}\n",
"\n",
"if REUSE_LAST_MODEL and selection_history.get('last_selected_dataset_uid'):\n",
" selected_dataset_uid = selection_history['last_selected_dataset_uid']\n",
"else:\n",
" catalog = openml.datasets.list_datasets(output_format='dataframe')\n",
" did_col = 'did' if 'did' in catalog.columns else 'dataset_id'\n",
" class_col = 'NumberOfClasses' if 'NumberOfClasses' in catalog.columns else None\n",
" status_col = 'status' if 'status' in catalog.columns else None\n",
"resume_dataset_uid = selection_history.get('last_selected_dataset_uid')\n",
"\n",
" if class_col is None:\n",
" raise RuntimeError('OpenML catalog missing NumberOfClasses column; cannot pre-filter binary datasets.')\n",
"catalog = openml.datasets.list_datasets(output_format='dataframe')\n",
"did_col = 'did' if 'did' in catalog.columns else 'dataset_id'\n",
"class_col = 'NumberOfClasses' if 'NumberOfClasses' in catalog.columns else None\n",
"status_col = 'status' if 'status' in catalog.columns else None\n",
"\n",
" binary_catalog = catalog[catalog[class_col] == 2].copy()\n",
" if status_col is not None:\n",
" binary_catalog = binary_catalog[binary_catalog[status_col].astype(str).str.lower().isin(['active', 'deactivated'])]\n",
"if class_col is None:\n",
" raise RuntimeError('OpenML catalog missing NumberOfClasses column; cannot pre-filter binary datasets.')\n",
"\n",
" available_ids = [int(v) for v in binary_catalog[did_col].dropna().astype(int).tolist()]\n",
" previously_selected = set(selection_history.get('selected_dataset_uids', []))\n",
" untested_ids = [did for did in available_ids if f'openml:{did}' not in previously_selected]\n",
"binary_catalog = catalog[catalog[class_col] == 2].copy()\n",
"if status_col is not None:\n",
" binary_catalog = binary_catalog[binary_catalog[status_col].astype(str).str.lower().isin(['active', 'deactivated'])]\n",
"\n",
" pool = untested_ids if untested_ids else available_ids\n",
" selected_did = random.choice(pool)\n",
" selected_dataset_uid = f'openml:{selected_did}'\n",
"available_ids = [int(v) for v in binary_catalog[did_col].dropna().astype(int).tolist()]\n",
"previously_selected = set(selection_history.get('selected_dataset_uids', []))\n",
"untested_ids = [did for did in available_ids if f'openml:{did}' not in previously_selected]\n",
"\n",
"configure_checkpoint_paths(selected_dataset_uid)\n",
"pool = untested_ids if untested_ids else available_ids\n",
"random.shuffle(pool)\n",
"random_candidates = [f'openml:{did}' for did in pool[:MAX_DATASET_SELECTION_ATTEMPTS]]\n",
"\n",
"candidate_uids = []\n",
"if (not FORCE_FRESH_START) and REUSE_LAST_MODEL and resume_dataset_uid:\n",
" candidate_uids.append(resume_dataset_uid)\n",
" print(f\"Trying previously selected dataset/model id first: {resume_dataset_uid}\")\n",
"elif FORCE_FRESH_START and resume_dataset_uid:\n",
" print('FORCE_FRESH_START=True -> selecting a new random dataset/model id.')\n",
"\n",
"for uid in random_candidates:\n",
" if uid not in candidate_uids:\n",
" candidate_uids.append(uid)\n",
"\n",
"X, y, meta = load_dataset(selected_dataset_uid, filters=filters)\n",
"X = y = meta = None\n",
"selected_dataset_uid = None\n",
"selection_failures = []\n",
"\n",
"for attempt, candidate_uid in enumerate(candidate_uids, start=1):\n",
" configure_checkpoint_paths(candidate_uid)\n",
" try:\n",
" X_try, y_try, meta_try = load_dataset(candidate_uid, filters=filters)\n",
" except Exception as err:\n",
" msg = f\"attempt {attempt}: {candidate_uid} rejected by loader ({type(err).__name__}: {err})\"\n",
" print(msg)\n",
" selection_failures.append(msg)\n",
" continue\n",
"\n",
"y = np.asarray(y)\n",
"if len(np.unique(y)) != 2:\n",
" raise ValueError(f'Dataset is not binary classification; dataset_uid={selected_dataset_uid}; classes={np.unique(y)}')\n",
" y_try = np.asarray(y_try)\n",
" unique_classes = np.unique(y_try)\n",
" if len(unique_classes) != 2:\n",
" msg = f\"attempt {attempt}: {candidate_uid} is not binary after load (classes={unique_classes})\"\n",
" print(msg)\n",
" selection_failures.append(msg)\n",
" continue\n",
"\n",
" X, y, meta = X_try, y_try, meta_try\n",
" selected_dataset_uid = candidate_uid\n",
" print(f\"Using dataset/model id: {selected_dataset_uid} (attempt {attempt})\")\n",
" break\n",
"\n",
"if selected_dataset_uid is None:\n",
" raise RuntimeError(\n",
" 'Unable to find a valid binary OpenML dataset/model id after retries. '\n",
" f'Tried {len(candidate_uids)} candidate(s). Last failures: {selection_failures[-5:]}'\n",
" )\n",
"\n",
"configure_checkpoint_paths(selected_dataset_uid)\n",
"\n",
"if hasattr(X, 'tocsr'):\n",
" X = X.tocsr().astype(np.float32)\n",
Expand Down