robertsamples · robertsamples · Jun 30, 2026 · Jun 29, 2026 · Jun 29, 2026 · Jun 29, 2026
diff --git a/.gitignore b/.gitignore
diff --git a/code/MSFaST.py b/code/MSFaST.py
@@ -10,7 +10,6 @@
 from groupsets import normalize_graphfilters
 from datetime import datetime
 import time
-from pathlib import Path
 
 #---Classes---
 
@@ -187,6 +186,11 @@ def run_MSFaST(params):
     # Filtering and error propagation
     print('Filtering data')
     ionfilters = {}
+    # Initialise here (not only inside `if analysis_params.grpave:`) so the
+    # unconditional groupionlists[...] writes further down (and the blank
+    # filter, which reads it) can't raise NameError if grpave is ever off.
+    # The GUI currently forces grpave=True, but loaded sessions/tests need not.
+    groupionlists = {}
     if analysis_params.relfil:
         ionfilters = filter.relationalfilter(analysis_params, ionfilters)
         if analysis_params.merge:
@@ -254,7 +258,7 @@ def run_MSFaST(params):
     msdata_filtered = pd.read_csv(analysis_params.outputdir / (analysis_params.filename.stem + '_filtered.csv'), sep = ',', header = [0, 1, 2], index_col = [0, 1, 2])
     analysisrec = open(analysis_params.outputdir / 'analysisinfo.txt',"w")
     analysisrec.writelines(['Analysis Date: ' + str(datetime.now()) + '\n',
-                            'Runetime: ' + str(round(runtime, 2)) + ' seconds\n',
+                            'Runtime: ' + str(round(runtime, 2)) + ' seconds\n',
                             'Input file: ' + str(analysis_params.filename) + '\n',
                             'Sample list: ' + str(analysis_params.samplelistfilename) + '\n',
                             'Extract metadata file: ' + str(analysis_params.extractmetadatafilename) + '\n',
@@ -280,10 +284,10 @@ def run_MSFaST(params):
     text = ''
     if analysis_params.relfil:
         text += 'Features failing peak correction filtering: ' + str(len(ionfilters['relfil'].ions)) + '/' + str(len(msdata_unformatted.index)) + ' ' + str(round(100 * len(ionfilters['relfil'].ions) / len(msdata_unformatted.index), 2)) + '%\n'
-    if analysis_params.blnkfltr: #FIX THIS REF TO "BLANKS"
+    if analysis_params.blnkfltr:
         text += 'Features failing blank filtering: ' + str(len(groupionlists[analysis_params.blnkgrp])) + '/' + str(len(msdata_unformatted.index)) + ' ' + str(round(100 * len(groupionlists[analysis_params.blnkgrp]) / len(msdata_unformatted.index), 2)) + '%\n'
     if analysis_params.decon:
-        text += 'Features failing blank filtering: ' + str(len(ionfilters['insource'].ions)) + '/' + str(len(msdata_unformatted.index)) + ' ' + str(round(100 * len(ionfilters['insource'].ions) / len(msdata_unformatted.index), 2)) + '%\n'
+        text += 'Features failing in-source/deconvolution filtering: ' + str(len(ionfilters['insource'].ions)) + '/' + str(len(msdata_unformatted.index)) + ' ' + str(round(100 * len(ionfilters['insource'].ions) / len(msdata_unformatted.index), 2)) + '%\n'
     if analysis_params.CVfil:
         text += 'Features failing CV filtering: ' + str(len(ionfilters['cv'].ions)) + '/' + str(len(msdata_unformatted.index)) + ' ' + str(round(100 * len(ionfilters['cv'].ions) / len(msdata_unformatted.index), 2)) + '%\n'
     text += 'Features failing any filters: ' + str(len(msdata_unformatted.index) - len(msdata_filtered.index)) + '/' + str(len(msdata_unformatted.index)) + ' ' + str(round(100 * (len(msdata_unformatted.index) - len(msdata_filtered.index)) / len(msdata_unformatted.index), 2)) + '%\n'
@@ -310,7 +314,7 @@ def run_MSFaST(params):
                             'RT/mz/FC: ' + str(analysis_params.FC3Dplt) + ' ' + str(analysis_params.statstgrps) + '\n',
                             'KMD/mz ' + str(analysis_params.KMD) + '\n',
                             #'KMD/mz/RT ' + str(analysis_params.___) + '\n',
-                            'PCA unfitlered: ' + str(analysis_params.PCA) + '\n',
+                            'PCA unfiltered: ' + str(analysis_params.PCA) + '\n',
                             'PCA filtered: ' + str(analysis_params.PCA) + '\n',
                             'Dendrogram (ward) unfiltered: ' + str(analysis_params.Dendrogram) + '\n',
                             'Dendrogram (ward) Filtered: ' + str(analysis_params.Dendrogram) + '\n',

diff --git a/code/crashreport.py b/code/crashreport.py
@@ -0,0 +1,185 @@
+"""
+MPACT
+Copyright 2022, Robert M. Samples, Sara P. Puckett, and Marcy J. Balunas
+
+Qt-free crash/error reporting. Installs a ``sys.excepthook`` that, on any
+otherwise-unhandled exception:
+
+1. formats a full report (traceback + environment: MPACT/Python/platform
+   versions, timestamp, optional context such as the tail of the run log),
+2. writes it to a timestamped file under a crash-log directory (so there's a
+   durable record even if the user dismisses the dialog), and
+3. hands the report to a GUI callback that asks the user whether to send it.
+
+The "send" path is deliberately backend-free: it builds a pre-filled GitHub
+*new issue* URL (title + body) for the MPACT repo, so reporting is one click
+in the browser and nothing leaves the user's machine until they choose to
+submit it. That satisfies "prompt the user before sending" without any cloud
+egress, DSN, or account.
+
+Why not Sentry (the obvious off-the-shelf option): ``sentry-sdk`` is excellent
+for hosted/web services but (a) sends events to a Sentry project by default --
+exactly the silent-egress this tool should avoid for a desktop research app,
+(b) needs a DSN/account to be provisioned, and (c) still needs a custom
+``before_send`` hook + dialog to honour "ask first." For a single-user desktop
+tool the local-log + pre-filled-GitHub-issue flow gives the same practical
+benefit (a complete traceback in the maintainer's hands) with no infrastructure
+and no privacy surprise. If MPACT ever ships to many non-technical users and a
+central error feed becomes worth it, Sentry with ``before_send`` gating is the
+documented upgrade path.
+
+This module is Qt-free and unit-tested (see ``tests/test_crashreport.py``); the
+GUI dialog is injected as a plain callback.
+"""
+
+import os
+import platform
+import sys
+import time
+import traceback
+import urllib.parse
+
+DEFAULT_REPO = 'robertsamples/mpact'
+# GitHub rejects extremely long issue URLs; keep the prefilled body well under
+# the practical limit so the link always opens (the full report is always in
+# the log file regardless).
+_MAX_ISSUE_BODY = 6000
+
+
+def _app_version():
+    """Return the MPACT version as a string, or ``'unknown'`` if unavailable.
+
+    The version is read from ``mpactupdate.__version__``. The import happens
+    lazily inside the function and every failure is absorbed, because this is
+    called while a crash report is being formatted and must not raise.
+
+    Returns:
+        ``str(mpactupdate.__version__)`` when the import succeeds, otherwise
+        the literal ``'unknown'``.
+    """
+    try:
+        from mpactupdate import __version__
+        # Coerce to str: format_report concatenates this value with '+', so a
+        # non-string version (e.g. a tuple) would raise TypeError there and
+        # the whole report would be lost.
+        return str(__version__)
+    except Exception:
+        # Deliberate best-effort: a missing or broken version module must
+        # never prevent a crash report from being produced.
+        return 'unknown'
+
+
+def format_report(exc_type, exc_value, exc_tb, context=None, now=None):
+    """Render an exception triple as a plain-text crash report.
+
+    Args:
+        exc_type/exc_value/exc_tb: the ``sys.exc_info()``-style triple.
+        context: optional extra text; when truthy it is appended (via
+            ``str()``) under a trailing "Context" heading.
+        now: epoch seconds used for the "Time" line; defaults to the current
+            time (injectable for tests).
+
+    Returns:
+        The report as a single newline-terminated string: a header with the
+        timestamp and MPACT/Python/platform versions, the formatted
+        traceback, and the optional context section.
+    """
+    if now is None:
+        now = time.time()
+    when = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(now))
+    trace = ''.join(traceback.format_exception(exc_type, exc_value, exc_tb))
+
+    sections = [
+        'MPACT crash report',
+        '==================',
+        'Time: ' + when,
+        'MPACT version: ' + _app_version(),
+        'Python: ' + sys.version.split()[0],
+        'Platform: ' + platform.platform(),
+        '',
+        'Traceback:',
+        trace.rstrip(),
+    ]
+    if context:
+        sections.extend(['', 'Context:', str(context).rstrip()])
+    return '\n'.join(sections) + '\n'
+
+
+def write_log(report, log_dir, now=None):
+    """Write ``report`` to a new timestamped file under ``log_dir``.
+
+    Creates ``log_dir`` if needed. The file is named
+    ``mpact_crash_YYYYmmdd_HHMMSS.log``; if that name is already taken (two
+    crashes within the same second), ``_1``, ``_2``, ... is inserted before
+    the extension so an earlier log is never overwritten.
+
+    Args:
+        report: the report text to write.
+        log_dir: directory that receives the log file.
+        now: epoch seconds used for the filename timestamp; defaults to the
+            current time (injectable for tests).
+
+    Returns:
+        The path written, or ``None`` if the write failed (reporting must
+        never raise from inside an excepthook).
+    """
+    now = time.time() if now is None else now
+    try:
+        os.makedirs(log_dir, exist_ok=True)
+        stem = 'mpact_crash_' + time.strftime('%Y%m%d_%H%M%S', time.localtime(now))
+        # Bounded retry so a pathological directory can't loop forever.
+        for attempt in range(1000):
+            suffix = '_' + str(attempt) if attempt else ''
+            path = os.path.join(log_dir, stem + suffix + '.log')
+            try:
+                # 'x' = exclusive create: fails rather than truncating an
+                # existing log written earlier in the same second.
+                handle = open(path, 'x', encoding='utf-8', errors='replace')
+            except FileExistsError:
+                continue
+            with handle:
+                handle.write(report)
+            return path
+        return None
+    except Exception:
+        # Deliberate best-effort: a failed log write must not mask the
+        # original exception being reported.
+        return None
+
+
+def one_line_summary(exc_type, exc_value):
+    """Return a short ``TypeName: message`` string (used as an issue title).
+
+    Only the first line of the stripped message is kept; when the message is
+    empty or whitespace-only the bare type name is returned.
+    """
+    type_name = getattr(exc_type, '__name__', str(exc_type))
+    text = str(exc_value).strip()
+    if not text:
+        return type_name
+    first_line = text.splitlines()[0]
+    if not first_line:
+        return type_name
+    summary = type_name + ': ' + first_line
+    # Tidy the edges: surrounding whitespace and any trailing colon(s).
+    return summary.strip().rstrip(':').strip()
+
+
+def build_issue_url(report, title, repo=DEFAULT_REPO):
+    """Build a GitHub 'new issue' URL with a prefilled title and body.
+
+    The body is the report wrapped in a code fence and truncated to
+    :data:`_MAX_ISSUE_BODY` so the URL stays openable. The title is capped as
+    well, since it is built from the exception message and a long
+    single-line message would otherwise make the URL arbitrarily long. The
+    full untruncated report always lives in the on-disk log.
+
+    Args:
+        report: the crash report text.
+        title: the issue title.
+        repo: ``owner/name`` of the GitHub repository.
+
+    Returns:
+        The ``https://github.com/<repo>/issues/new?...`` URL as a string.
+    """
+    # NOTE(review): 200 keeps the title under GitHub's issue-title limit
+    # (believed to be 256 characters -- confirm) with room to spare.
+    max_title = 200
+    title = str(title)
+    if len(title) > max_title:
+        title = title[:max_title - 3].rstrip() + '...'
+    body = report
+    if len(body) > _MAX_ISSUE_BODY:
+        body = body[:_MAX_ISSUE_BODY] + '\n...\n[truncated -- see attached crash log]'
+    body_md = ('**Describe what you were doing when this happened:**\n\n\n'
+               '---\n```\n' + body + '\n```\n')
+    query = urllib.parse.urlencode({'title': title, 'body': body_md})
+    return 'https://github.com/' + repo + '/issues/new?' + query
+
+
+def make_excepthook(report_handler, log_dir=None, repo=DEFAULT_REPO,
+                    context_provider=None, prev_hook=None):
+    """Build (but don't install) an excepthook.
+
+    Args:
+        report_handler: callable ``handler(report, log_path, issue_url)`` that
+            shows the user the report and offers to send it. Exceptions raised
+            by the handler are swallowed (an excepthook must not itself raise).
+            May be ``None``, in which case only the log is written.
+        log_dir: directory for crash logs (skipped if None).
+        repo: GitHub repo for the prefilled issue URL.
+        context_provider: optional zero-arg callable returning extra context
+            text to embed (called defensively; failure is ignored).
+        prev_hook: a previous excepthook to chain to (defaults to the standard
+            ``sys.__excepthook__`` so the traceback still reaches the console).
+
+    Returns:
+        A function with the ``(exc_type, exc_value, exc_tb)`` signature. It
+        always chains to ``prev_hook`` first. ``KeyboardInterrupt`` is then
+        left alone: no log is written and ``report_handler`` is not called,
+        because Ctrl-C is a user request rather than a crash.
+    """
+    prev_hook = prev_hook if prev_hook is not None else sys.__excepthook__
+
+    def _hook(exc_type, exc_value, exc_tb):
+        # Always let the default hook print to stderr first (and never let our
+        # own reporting suppress that or raise over it).
+        try:
+            prev_hook(exc_type, exc_value, exc_tb)
+        except Exception:
+            pass
+        # sys.excepthook receives KeyboardInterrupt too; don't turn an
+        # interrupt into a crash log and a "send report?" prompt.
+        if isinstance(exc_type, type) and issubclass(exc_type, KeyboardInterrupt):
+            return
+        try:
+            context = None
+            if context_provider is not None:
+                try:
+                    context = context_provider()
+                except Exception:
+                    context = None
+            report = format_report(exc_type, exc_value, exc_tb, context=context)
+            log_path = write_log(report, log_dir) if log_dir else None
+            title = 'Crash: ' + one_line_summary(exc_type, exc_value)
+            issue_url = build_issue_url(report, title, repo=repo)
+            if report_handler is not None:
+                report_handler(report, log_path, issue_url)
+        except Exception:
+            # Reporting failed -- the default hook already printed the real
+            # traceback, so just give up quietly rather than masking it.
+            pass
+
+    return _hook
+
+
+def install_excepthook(report_handler, log_dir=None, repo=DEFAULT_REPO,
+                       context_provider=None):
+    """Replace ``sys.excepthook`` with the crash-reporting hook.
+
+    The hook that was installed beforehand is chained to by the new hook and
+    is also returned, so callers can restore it later.
+    """
+    previous_hook = sys.excepthook
+    crash_hook = make_excepthook(
+        report_handler,
+        log_dir=log_dir,
+        repo=repo,
+        context_provider=context_provider,
+        prev_hook=previous_hook,
+    )
+    sys.excepthook = crash_hook
+    return previous_hook
diff --git a/code/dbsearch.py b/code/dbsearch.py
@@ -11,7 +11,6 @@
 """
 
 import numpy as np
-import pandas as pd
 
 from csvcache import cached_read_csv, invalidate
 
@@ -39,18 +38,54 @@ def search_npatlas(outputdir, filename_stem, atlas, ppm_threshold):
     msdata = cached_read_csv(outputdir / (filename_stem + '_filtered.csv'),
                               sep=',', header=[2], index_col=None).iloc[:, :3]
 
-    for _, mrow in msdata.iterrows():
-        # Iterates over iondict, filters DB matches within window.
-        # Repeats for adducts, uses length of concat DF for feature hits
-        mass = mrow['m/z']
-        hits_h = atlas[abs(1000000 * (atlas['compound_m_plus_h'] - mass) / atlas['compound_m_plus_h']) < ppm_threshold].copy()
-        hits_h['ppm'] = abs(1000000 * (hits_h['compound_m_plus_h'] - mass) / hits_h['compound_m_plus_h'])
-        hits_na = atlas[abs(1000000 * (atlas['compound_m_plus_na'] - mass) / atlas['compound_m_plus_na']) < ppm_threshold].copy()
-        hits_na['ppm'] = abs(1000000 * (hits_na['compound_m_plus_na'] - mass) / hits_na['compound_m_plus_na'])
-        hits = pd.concat([hits_h, hits_na])
+    # Pre-sort the two adduct-mass columns once so each feature only tests a
+    # tiny m/z window (via searchsorted) instead of scanning all ~36k atlas
+    # rows twice -- the old per-feature ``atlas[boolean_mask]`` over the whole
+    # table was O(features x atlas_rows). The exact original ppm test
+    # (``abs(1e6*(atlas_mz - mass)/atlas_mz) < ppm_threshold``) is re-applied to
+    # the windowed candidates, so the matched set is bit-for-bit identical; the
+    # window (mass*(1 +/- 2*t)) is a safe superset of the true ppm window for
+    # the small tolerances used here. Verified output-identical (hitdb frames,
+    # incl. row order + ppm, and the iondict 'hits' column) against the old
+    # implementation on the real example dataset (~5x faster there).
+    mph = atlas['compound_m_plus_h'].to_numpy(dtype=float)
+    mna = atlas['compound_m_plus_na'].to_numpy(dtype=float)
+    order_h = np.argsort(mph, kind='stable'); sorted_h = mph[order_h]
+    order_na = np.argsort(mna, kind='stable'); sorted_na = mna[order_na]
+    t = ppm_threshold / 1e6
+
+    def _match(mass, sorted_vals, order, col_vals):
+        # Atlas positions whose ppm error vs `mass` is below threshold, in
+        # ascending atlas-position (i.e. original boolean-mask) order, plus
+        # their ppm values.
+        #
+        # Args:
+        #   mass:        the feature m/z being searched.
+        #   sorted_vals: one adduct-mass column in ascending order.
+        #   order:       the argsort of that column, i.e.
+        #                sorted_vals == col_vals[order].
+        #   col_vals:    the same column in its original (atlas) order.
+        # Returns:
+        #   (positions, ppm): two aligned arrays -- the matching atlas
+        #   positions and the ppm error of each.
+        # Reads `t` and `ppm_threshold` from the enclosing function.
+        #
+        # Bracket a deliberately wide m/z window (2x the tolerance either
+        # side) with two binary searches; the exact test below narrows it.
+        lo = np.searchsorted(sorted_vals, mass * (1 - 2 * t), side='left')
+        hi = np.searchsorted(sorted_vals, mass * (1 + 2 * t), side='right')
+        # Map the window in sorted space back to atlas positions.
+        cand = order[lo:hi]
+        if cand.size == 0:
+            # No candidates: return the empty position array and an empty
+            # float array for the ppm values.
+            return cand, cand.astype(float)
+        cv = col_vals[cand]
+        # Re-apply the exact ppm test to the candidates, then sort the
+        # surviving positions into ascending atlas order.
+        sel = np.sort(cand[np.abs(1e6 * (cv - mass) / cv) < ppm_threshold])
+        sel_cv = col_vals[sel]
+        return sel, np.abs(1e6 * (sel_cv - mass) / sel_cv)
+
+    masses = msdata['m/z'].to_numpy(dtype=float)
+    compounds = msdata.iloc[:, 0].to_numpy()
+    counts = np.empty(len(masses), dtype=float)
+    for i in range(len(masses)):
+        mass = masses[i]
+        # m+h matches then m+na matches, concatenated in that order (matching
+        # the old ``pd.concat([hits_h, hits_na])``) and slicing the atlas once.
+        pos_h, ppm_h = _match(mass, sorted_h, order_h, mph)
+        pos_na, ppm_na = _match(mass, sorted_na, order_na, mna)
+        positions = np.concatenate([pos_h, pos_na])
+        hits = atlas.iloc[positions].copy()
+        hits['ppm'] = np.concatenate([ppm_h, ppm_na])
         hits = hits.sort_values(by=['ppm'])
-        hitdb[mrow['Compound']] = hits
-        iondict.loc[mrow['Compound'], 'hits'] = hits.shape[0]
+        hitdb[compounds[i]] = hits
+        counts[i] = positions.size
+    # One vectorised column assignment instead of a per-feature ``.loc`` scalar
+    # set (msdata's Compound ids are unique, a subset of iondict's index).
+    iondict.loc[compounds, 'hits'] = counts
 
     iondict.to_csv(outputdir / 'iondict.csv', header=True, index=True)
     # iondict.csv just changed on disk (gained/updated the 'hits' column) --