From f3fe407be9bad7762ed04cdb317d931668376e11 Mon Sep 17 00:00:00 2001 From: Robert Samples Date: Mon, 29 Jun 2026 01:39:18 -0400 Subject: [PATCH 01/20] Reword and reprioritize main.py TODO list --- code/main.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/code/main.py b/code/main.py index 4a25052..6028b3d 100644 --- a/code/main.py +++ b/code/main.py @@ -73,24 +73,22 @@ - add pca option and allow visualization of key features on multivar plt? #TODO# -- in source spectra viewer in tab plot -- do overall data quality score, AUC +- in source spectra viewer in spectrum details tab plot with preexisting in source fragment deconvolution algoirthm +- clean up import sections and general code for better maintability and good syntax/standards +- do overall data quality score, AUC on CV plot or something, may be present in a different form already - standardize method and class names -- database management, options -- fix up analysisinfo file output - -- mzmine msp file import -- add other ordination options +- add terminal output with current line to status bar instead of just static status messages, perhaps with expand button to show full terminal output +- potentially consider other database options like HMDB etc +- fix up analysisinfo file output with better and more useful log ingo +- add other ordination options like pca, pls-da, etc etc - add custom keyword arguments for each plot to make calling them easier -- add runcheck before searching when switching to search tab -- Figure out way to have only active plot be updated and then to update others - when plot is switched -- make it so groups can be reordered +- add runcheck before searching when switching to search tab if not present +- make it so groups can be reordered in the groupsets widgets? - consider if indexing and feature highly functions in plot options have any easy wins for optimization or disk use. 
(prob not) - make goto buttons just one class and lambda an index for the stacked widgets when connecting! - +likely items that need more thought and planning - maybe have a comparison mode for many different strains with and without elicitor - specificity/sensitivity plot - other statistical models From 9feb9ec362ce2f23bed794fa089d00ddabd2045a Mon Sep 17 00:00:00 2001 From: Robert Samples Date: Mon, 29 Jun 2026 01:43:16 -0400 Subject: [PATCH 02/20] Add search-tab run-check, clean up main.py's dead imports - goto_search now tells the user to run an analysis first instead of silently doing nothing when the search tab is opened before self.analysisrun is set, closing the "add runcheck before searching" TODO. - Removed main.py's unused PyQt5/stdlib/groupsets imports (platform, GroupSet, several never-referenced Qt classes), verified via pyflakes + grep cross-check; no behavior change, 130 existing tests still pass. Co-Authored-By: Claude Sonnet 4.6 --- code/main.py | 12 ++++++------ code/ui_functions.py | 2 ++ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/code/main.py b/code/main.py index 6028b3d..442b315 100644 --- a/code/main.py +++ b/code/main.py @@ -12,11 +12,10 @@ import time import string -import platform from PyQt5 import QtCore, QtWidgets -from PyQt5.QtWidgets import QMainWindow, QSizeGrip, QGraphicsDropShadowEffect, QFileDialog, QListWidgetItem, QColorDialog -from PyQt5.QtCore import (QCoreApplication, QPropertyAnimation, QDate, QDateTime, QMetaObject, QObject, QPoint, QRect, QSize, QTime, QUrl, Qt, QEvent) -from PyQt5.QtGui import QBrush, QColor, QIcon, QPalette, QPainter, QPixmap +from PyQt5.QtWidgets import QMainWindow, QSizeGrip +from PyQt5.QtCore import QObject, Qt +from PyQt5.QtGui import QPixmap from pathlib import Path # Install/verify non-stock dependencies (epam.indigo, UpSetPlot, squarify) @@ -34,7 +33,7 @@ import files from MSFaST import run_MSFaST, analysis_parameters -from groupsets import GroupSet, GroupSetModel, 
build_query_dict +from groupsets import GroupSetModel, build_query_dict from plotslots import PlotSlotRegistry from paramfields import save_checkbox_fields from csvcache import cached_read_csv, invalidate as invalidate_csv_cache @@ -75,6 +74,8 @@ #TODO# - in source spectra viewer in spectrum details tab plot with preexisting in source fragment deconvolution algoirthm - clean up import sections and general code for better maintability and good syntax/standards + ~main.py's own import section done (dead PyQt5/stdlib/groupsets imports removed, + verified unused via pyflakes + grep, no behavior change); other files not yet swept - do overall data quality score, AUC on CV plot or something, may be present in a different form already - standardize method and class names - add terminal output with current line to status bar instead of just static status messages, perhaps with expand button to show full terminal output @@ -82,7 +83,6 @@ - fix up analysisinfo file output with better and more useful log ingo - add other ordination options like pca, pls-da, etc etc - add custom keyword arguments for each plot to make calling them easier -- add runcheck before searching when switching to search tab if not present - make it so groups can be reordered in the groupsets widgets? - consider if indexing and feature highly functions in plot options have any easy wins for optimization or disk use. 
(prob not) - make goto buttons just one class and lambda an index for the stacked widgets diff --git a/code/ui_functions.py b/code/ui_functions.py index e0bbf68..1019600 100644 --- a/code/ui_functions.py +++ b/code/ui_functions.py @@ -211,6 +211,8 @@ def goto_search(self): self.dbsearchdone = True stop_functime('dbsearch complete') reset_runtime() + elif not self.analysisrun: + self.error('Run an analysis before searching.') #plotbar functions def goto_review(self): From b5a804c757a34c1031d1179478c254b9c52e2327 Mon Sep 17 00:00:00 2001 From: Robert Samples Date: Mon, 29 Jun 2026 01:45:56 -0400 Subject: [PATCH 03/20] Add GroupSetModel.move() for groupset reordering (model layer only) Lays the Qt-free foundation for the "groups can be reordered" TODO: GroupSetModel.move(from_index, to_index) reorders groupsets and keeps the selection on the moved/shifted item by identity (not GroupSet's value-based __eq__, since two freshly-added default groupsets compare equal). 8 new tests in test_groupsets.py. UI wiring (drag-and-drop on listWidget_pltgrps) intentionally left for later -- it would need to be verified against a live GUI session to confirm it interacts correctly with updatesets()'s existing blockSignals dance, which isn't something to ship unverified. Co-Authored-By: Claude Sonnet 4.6 --- code/groupsets.py | 28 ++++++++++++++ code/main.py | 6 +++ code/tests/test_groupsets.py | 72 ++++++++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+) diff --git a/code/groupsets.py b/code/groupsets.py index 6bee426..39789d8 100644 --- a/code/groupsets.py +++ b/code/groupsets.py @@ -122,6 +122,34 @@ def remove(self, index=None): del self._items[index] self.select(self._selected) + def move(self, from_index, to_index): + """Reorder the groupset at ``from_index`` to ``to_index``. + + Both indices are clamped to the valid range; out-of-range or equal + indices are a no-op. 
Selection follows the moved item, so the + groupset that was selected before the move is still selected after + (by identity, not by index) -- a drag-and-drop reorder shouldn't + change which groupset is being edited. + """ + if not self._items: + return + from_index = max(0, min(from_index, len(self._items) - 1)) + to_index = max(0, min(to_index, len(self._items) - 1)) + if from_index == to_index: + return + selected_item = self.selected + groupset = self._items.pop(from_index) + self._items.insert(to_index, groupset) + if selected_item is not None: + # Identity, not '==' -- GroupSet.__eq__ is value-based, and two + # distinct groupsets can compare equal (e.g. freshly added ones + # before either is edited), so list.index() could pick the wrong + # one. + for i, item in enumerate(self._items): + if item is selected_item: + self._selected = i + break + def update(self, index, *, name=None, src=None, incl=None, excl=None, colour=None): """Overwrite the given fields of the groupset at ``index``.""" groupset = self._items[index] diff --git a/code/main.py b/code/main.py index 442b315..5f21441 100644 --- a/code/main.py +++ b/code/main.py @@ -84,6 +84,12 @@ - add other ordination options like pca, pls-da, etc etc - add custom keyword arguments for each plot to make calling them easier - make it so groups can be reordered in the groupsets widgets? + ~model-layer support done: GroupSetModel.move() (groupsets.py), tested in + test_groupsets.py. UI drag-drop wiring (listWidget_pltgrps InternalMove + + syncing its rowsMoved signal to model.move()) not done -- needs a live + GUI session to verify the selection-tracking interacts correctly with + updatesets()'s existing blockSignals dance, which isn't something to + guess at unverified - consider if indexing and feature highly functions in plot options have any easy wins for optimization or disk use. (prob not) - make goto buttons just one class and lambda an index for the stacked widgets when connecting! 
diff --git a/code/tests/test_groupsets.py b/code/tests/test_groupsets.py index 297b1b3..d4c89c4 100644 --- a/code/tests/test_groupsets.py +++ b/code/tests/test_groupsets.py @@ -99,6 +99,78 @@ def test_remove_all_items_leaves_selection_at_negative_one(): assert model.selected is None +# --------------------------------------------------------------------------- # +# GroupSetModel: move (reordering) +# --------------------------------------------------------------------------- # + +def test_move_reorders_items(): + model = GroupSetModel() + model.add('a') + model.add('b') + model.add('c') + model.move(0, 2) + assert [g.name for g in model] == ['b', 'c', 'a'] + + +def test_move_keeps_selection_on_the_moved_item(): + model = GroupSetModel() + model.add('a') + model.add('b') + model.add('c') + model.select(0) # 'a' selected + model.move(0, 2) + assert model.selected.name == 'a' + assert model.selected_index == 2 + + +def test_move_keeps_selection_on_a_different_item_that_shifted_position(): + model = GroupSetModel() + model.add('a') + model.add('b') + model.add('c') + model.select(1) # 'b' selected + model.move(0, 2) # moves 'a' past 'b', so 'b' shifts from index 1 to 0 + assert model.selected.name == 'b' + assert model.selected_index == 0 + + +def test_move_with_equal_indices_is_a_noop(): + model = GroupSetModel() + model.add('a') + model.add('b') + model.move(1, 1) + assert [g.name for g in model] == ['a', 'b'] + + +def test_move_clamps_out_of_range_indices(): + model = GroupSetModel() + model.add('a') + model.add('b') + model.add('c') + model.move(-5, 99) + assert [g.name for g in model] == ['b', 'c', 'a'] + + +def test_move_on_empty_model_is_a_noop(): + model = GroupSetModel() + model.move(0, 1) # must not raise + assert len(model) == 0 + + +def test_move_disambiguates_value_equal_groupsets_by_identity(): + # Two freshly-added default groupsets compare equal (GroupSet.__eq__ is + # value-based, and both start with identical fields), so move() must + # track the 
selected item by identity, not by list.index()'s '=='. + model = GroupSetModel() + model.add('dup') + model.add('dup') + model.select(0) + first = model.selected + model.move(0, 1) + assert model.selected is first + assert model.selected_index == 1 + + # --------------------------------------------------------------------------- # # GroupSetModel: CRUD # --------------------------------------------------------------------------- # From 504410f9a67068227f122d86b0b316e2af4d84c3 Mon Sep 17 00:00:00 2001 From: Robert Samples Date: Mon, 29 Jun 2026 01:47:03 -0400 Subject: [PATCH 04/20] Clarify stale PCA TODO note PCA itself already exists (plot_PCA/goto_pca/checkbox field) -- the remaining gap is specifically loadings/biplot visualization of which features drive each component, which plot_PCA doesn't do yet. Reworded so the TODO doesn't read as if PCA support is still missing. Co-Authored-By: Claude Sonnet 4.6 --- code/main.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/code/main.py b/code/main.py index 5f21441..641a07c 100644 --- a/code/main.py +++ b/code/main.py @@ -69,7 +69,10 @@ -add bypass for plots based on checkmark. possibly use if check: ... else: button.hide() then pass - distribution of CVs on bottom of cvplt? -- add pca option and allow visualization of key features on multivar plt? 
+- allow visualization of key features (loadings/biplot) on multivar plt + (PCA itself already exists -- plot_PCA/goto_pca/checkbox field -- this is + specifically about showing which original features drive each component, + which plot_PCA doesn't do yet) #TODO# - in source spectra viewer in spectrum details tab plot with preexisting in source fragment deconvolution algoirthm From c0c160c013316202ddd33eab3dd47e3603b8a28d Mon Sep 17 00:00:00 2001 From: Robert Samples Date: Mon, 29 Jun 2026 13:17:19 -0400 Subject: [PATCH 05/20] Add Qt-free multivariate ordination backend (PCA/NMDS/PLS-DA) New code/ordination.py: PCA, NMDS (the metric-MDS-warm-started non-metric MDS already used by the soon-to-be-renamed plot_PCA, kept verbatim), and PLS-DA, plus a Qt-free port of the data-loading/ technical-replicate-collapsing logic the plot currently has hardcoded off (plotting.py: `parent.collapsereps = False#...`). The collapse-replicate logic is a near-verbatim port of the original, not a rewrite -- its header-juggling (round-tripping through a CSV to relabel an unstack() result) is easy to get subtly wrong by inspection alone, so it's preserved as-is and verified empirically instead: test_ordination.py constructs a synthetic peak table with 3 samples across 2 biological groups, 3 technical-replicate injections each, and asserts collapsing lands on exactly 3 rows (one per Sample) -- not 9 (uncollapsed) and not 2 (would mean biological replicates got merged too). Cross-checked against real example data with a scratch script (27 injections / 9 samples / 3 groups -> collapses to 9, not 3). Also caught and fixed a real bug while validating against real data: PLSRegression's default scale=True standardizes X internally, so the original explained-variance-ratio calc (component score variance / unscaled total variance) silently produced ratios around 1e-6 instead of the ~0.7 a well-separated dataset should show. Fixed with scale=False, matching PCA's plain-centered treatment. 
OPLS-DA is intentionally not implemented (see ordination.py's module docstring) -- no scikit-learn support, and the alternatives (an unmaintained third-party package, or a from-scratch implementation with no reference dataset to validate against) are both riskier than shipping PCA/NMDS/PLS-DA now and revisiting OPLS-DA later. Co-Authored-By: Claude Sonnet 4.6 --- code/ordination.py | 240 ++++++++++++++++++++++++++++++++++ code/tests/test_ordination.py | 195 +++++++++++++++++++++++++++ 2 files changed, 435 insertions(+) create mode 100644 code/ordination.py create mode 100644 code/tests/test_ordination.py diff --git a/code/ordination.py b/code/ordination.py new file mode 100644 index 0000000..1457168 --- /dev/null +++ b/code/ordination.py @@ -0,0 +1,240 @@ +""" +MPACT +Copyright 2022, Robert M. Samples, Sara P. Puckett, and Marcy J. Balunas + +Qt-free multivariate ordination backend: PCA, NMDS, and PLS-DA on the +samples x features intensity matrix, plus the data prep (technical-replicate +collapsing) and loadings-selection logic the "multivariate" plot tab needs. + +OPLS-DA is intentionally not implemented here: scikit-learn has no native +support, and the only alternatives (the unmaintained ``pyopls`` package, or a +from-scratch orthogonal-signal-correction implementation) are both +meaningfully riskier than PCA/NMDS/PLS-DA without a reference dataset to +validate against. Logged as future work, not started. + +This module is Qt-free and unit-tested (see ``tests/test_ordination.py``). +""" + +import numpy as np +import pandas as pd +from sklearn.cross_decomposition import PLSRegression +from sklearn.decomposition import PCA +from sklearn.metrics import pairwise_distances +from sklearn import manifold + + +def load_ordination_matrix(file, raw_msdata_header, collapse_replicates): + """Load the samples x features intensity matrix used for ordination. 
+ + This is a near-verbatim port of the data-loading half of the original + (dead-checkbox) ``plot_PCA.plot()`` -- deliberately not redesigned, since + the original's row-grouping math is correct (verified empirically in + ``test_ordination.py``/the scratch script, not re-derived by inspection + here -- this header-juggling is genuinely easy to get subtly wrong by + reasoning about it instead of testing it). Only the hardcoded + ``collapsereps = False`` is now a real parameter. + + Args: + file: path to the canonical ``_filtered.csv`` peak table (3-row + header: Biolgroup, Sample, Injection; see devnotes.md). + raw_msdata_header: the same peak table's 3 header rows, read + *raw* (``header=None, index_col=[0,1,2]).iloc[:3,:].transpose()``, + exactly as the original code reads it -- NOT yet renamed or + re-indexed; that happens inside this function (for the + ``collapse_replicates=True`` case, a *different* header -- + read from the freshly-collapsed intermediate file -- is used + instead, matching the original's control flow exactly). + collapse_replicates: if True, average technical replicates (multiple + Injections under the same Sample) together, keeping biological + replicates (distinct Samples) separate. If False, every + Injection is its own row, as-is. + + Returns: + (X, biolgroup): ``X`` is a DataFrame indexed by sample identifier + ('File'; an Injection name, or a Sample name when collapsed), + columns = features. ``biolgroup`` is a Series, same index as ``X``, + mapping each sample to its biological group. + """ + if collapse_replicates: + # Average technical replicates (Injection) while keeping biological + # replicates (Sample) distinct -- level order is Compound, m/z, RT, + # Biolgroup, Sample, Injection (MSFaST.py's msdata_header.columns + # assignment). 
Round-trips through a CSV (matching the original) + # so the relabeled 3-row header can be read back the same way the + # uncollapsed path reads the real file, rather than hand-deriving + # unstack()'s resulting column-level order. + msdata = pd.read_csv(file, sep=',', header=[0, 1, 2], index_col=[0, 1, 2]) + try: + msdata = msdata.stack([0, 1, 2], future_stack=True) + except TypeError: + msdata = msdata.stack([0, 1, 2]) + msdata = msdata.groupby(level=[0, 1, 2, 3, 4]).mean().unstack(level=[-1, -2]) + collapsed_columns = msdata.columns.to_list() + msdata = msdata.reset_index() + header = [('', '', 'Compound'), ('', '', 'm/z'), ('', '', 'Retention time (min)')] + for elem in collapsed_columns: + header.append((elem[1], '', elem[0])) + msdata.columns = pd.MultiIndex.from_tuples(header) + msdata.to_csv('averagepca.csv', header=True, index=False) + + msdata_header = pd.read_csv('averagepca.csv', sep=',', header=None, + index_col=[0, 1, 2]).iloc[:3, :].transpose() + pcadf = (pd.read_csv('averagepca.csv', sep=',', header=[2], index_col=[0]) + .drop(['m/z', 'Retention time (min)'], axis=1) + .transpose().astype(float).reset_index().rename(columns={'index': 'File'})) + else: + msdata_header = raw_msdata_header + pcadf = (pd.read_csv(file, sep=',', header=[2], index_col=[0]) + .drop(['m/z', 'Retention time (min)'], axis=1) + .transpose().astype(float).reset_index().rename(columns={'index': 'File'})) + + msdata_header.columns = ['Biolgroup', 'Sample', 'Injection'] + msdata_header = msdata_header.set_index('Injection') + + x = pcadf.set_index('File') + biolgroup = pd.Series( + {file_id: msdata_header.loc[file_id, 'Biolgroup'] for file_id in pcadf['File']}, + name='Biolgroup', + ) + biolgroup.index.name = 'File' + return x, biolgroup + + +def run_pca(x, n_components): + """Plain PCA on the samples x features matrix. 
+ + Returns: + (scores, loadings, explained_variance_ratio): ``scores`` is a + DataFrame (index=samples, columns=PC1..PCn); ``loadings`` is a + DataFrame (index=features, columns=PC1..PCn) of each feature's + contribution to each component; ``explained_variance_ratio`` is an + ndarray of length ``n_components``. + """ + pca = PCA(n_components=n_components) + scores = pca.fit_transform(x.values - x.values.mean(axis=0)) + columns = [f'PC{i + 1}' for i in range(n_components)] + scores = pd.DataFrame(scores, index=x.index, columns=columns) + loadings = pd.DataFrame(pca.components_.T, index=x.columns, columns=columns) + return scores, loadings, pca.explained_variance_ratio_ + + +def run_nmds(x, n_components): + """Non-metric MDS on Bray-Curtis sample dissimilarities, warm-started + from a metric MDS solution, then rotated onto principal axes purely for + a stable/sensible orientation (this rotation is NOT a second ordination + of the original features -- it doesn't change the NMDS embedding's + inter-point distances, only its axis orientation). + + Returns: + (scores, explained_variance_ratio, stress): ``explained_variance_ratio`` + here is the variance of the *embedded* (already-reduced) NMDS + coordinates explained by each rotated axis -- not, unlike PCA's, a + measure of how much of the original feature-space variance is + captured. Callers should label this distinctly (e.g. "% of + embedding variance") rather than implying it's the same quantity as + PCA's explained variance. 
+ """ + similarities = pairwise_distances(x.values - x.values.mean(), metric='braycurtis') + + mds = manifold.MDS(n_components=n_components, max_iter=3000, eps=1e-9, + random_state=1, dissimilarity="precomputed", n_jobs=1) + pos = mds.fit(similarities).embedding_ + + nmds = manifold.MDS(n_components=n_components, metric=False, max_iter=3000, + eps=1e-12, dissimilarity="precomputed", random_state=1, + n_jobs=1, n_init=1) + npos = nmds.fit_transform(similarities, init=pos) + stress = nmds.stress_ + + pca = PCA(n_components=n_components) + rotated = pca.fit_transform(npos) + columns = [f'NMDS{i + 1}' for i in range(n_components)] + scores = pd.DataFrame(rotated, index=x.index, columns=columns) + return scores, pca.explained_variance_ratio_, stress + + +def nmds_loading_proxy(x, scores): + """Per-feature correlation with each NMDS axis, as a loadings-equivalent. + + NMDS has no linear feature loadings (it's a rank-based embedding, not a + linear projection of the original features) -- this is the standard + ecology "vector fitting" approach (cf. vegan::envfit): correlate each + original feature with each ordination axis and use that as the + loadings-plot substitute. + + Returns: + DataFrame (index=features, columns=same as ``scores``) of Pearson + correlation coefficients. + """ + return pd.DataFrame( + {col: x.corrwith(scores[col]) for col in scores.columns}, + index=x.columns, + ) + + +def run_plsda(x, y, n_components): + """PLS-DA: PLS regression of the samples x features matrix against + one-hot-encoded group labels. + + Args: + x: samples x features DataFrame. + y: Series of group labels, indexed the same as ``x``. + n_components: number of PLS components. + + Returns: + (scores, loadings, explained_variance_ratio): shapes match + ``run_pca``'s. 
scikit-learn doesn't expose an explained-variance + ratio for PLS directly, so it's computed manually here as each + component's X-score variance divided by the total variance of + (centered) ``x`` -- the standard approach for reporting %-explained + on a PLS biplot. + """ + y_dummies = pd.get_dummies(y) + # scale=False: PLSRegression's default scale=True standardizes X (and Y) + # to unit variance per column internally, so x_scores_ would otherwise + # live on a different scale than x_centered below -- comparing the two + # directly (as the explained-variance-ratio calc does) silently produced + # a near-zero, meaningless ratio until this was caught by running this + # against real data (see the scratch script / devnotes.md). + pls = PLSRegression(n_components=n_components, scale=False) + pls.fit(x.values, y_dummies.values) + x_scores = pls.x_scores_ + columns = [f'PLS{i + 1}' for i in range(n_components)] + scores = pd.DataFrame(x_scores, index=x.index, columns=columns) + loadings = pd.DataFrame(pls.x_loadings_, index=x.columns, columns=columns) + + x_centered = x.values - x.values.mean(axis=0) + total_variance = np.sum(x_centered ** 2) + component_variance = np.sum(x_scores ** 2, axis=0) + explained_variance_ratio = component_variance / total_variance + return scores, loadings, explained_variance_ratio + + +def top_loadings(loadings, n=25, always_include=()): + """Subset of ``loadings`` to actually draw on a loadings plot. + + High-dimensional data (thousands of features) can't all be plotted + legibly, so this returns only the top ``n`` features by vector magnitude + (Euclidean norm across all of ``loadings``'s columns) -- plus any + feature named in ``always_include``, even if its magnitude wouldn't + otherwise make the cut. That's what lets the app highlight a specific + (possibly tiny) feature on demand without changing the default view. + + Args: + loadings: DataFrame (index=features, columns=components). 
+ n: how many top-magnitude features to include by default. + always_include: iterable of feature names (must be a subset of + ``loadings.index``) to include regardless of magnitude. + + Returns: + DataFrame: subset of ``loadings`` (same columns), index order + preserved from ``loadings``, with at most ``n + len(always_include)`` + rows (fewer if there's overlap or ``loadings`` itself is smaller). + """ + magnitude = np.sqrt((loadings ** 2).sum(axis=1)) + top_n_index = magnitude.nlargest(min(n, len(loadings))).index + forced = [feat for feat in always_include if feat in loadings.index] + keep = top_n_index.union(forced, sort=False) + # Preserve loadings' original row order rather than magnitude-sorted order. + keep = [feat for feat in loadings.index if feat in keep] + return loadings.loc[keep] diff --git a/code/tests/test_ordination.py b/code/tests/test_ordination.py new file mode 100644 index 0000000..1813600 --- /dev/null +++ b/code/tests/test_ordination.py @@ -0,0 +1,195 @@ +"""Unit tests for the multivariate ordination backend (``ordination.py``). + +Covers data loading/replicate-collapsing (verified against a synthetic +3-header-row peak table with a known technical/biological-replicate +structure -- see the plan for why this is empirically checked rather than +trusted by inspection) and the PCA/NMDS/PLS-DA/top_loadings math against +small synthetic matrices. 
+""" + +import numpy as np +import pandas as pd +import pytest + +from ordination import ( + load_ordination_matrix, nmds_loading_proxy, run_nmds, run_pca, run_plsda, + top_loadings, +) + + +# --------------------------------------------------------------------------- # +# load_ordination_matrix / collapse_replicates +# --------------------------------------------------------------------------- # + +def _write_synthetic_filtered_csv(path): + """3 samples (S1, S1b in groupA; S2 in groupB), 3 technical-replicate + injections each (9 injections total) -- enough to tell "collapsed to + one row per Sample" apart from both "per Injection" (9) and "per + Biolgroup" (2, since there are only 2 biolgroups but 3 samples). + """ + with open(path, 'w') as f: + f.write(',,,groupA,groupA,groupA,groupA,groupA,groupA,groupB,groupB,groupB\n') + f.write(',,,S1,S1,S1,S1b,S1b,S1b,S2,S2,S2\n') + f.write('Compound,m/z,Retention time (min),inj1,inj2,inj3,inj4,inj5,inj6,inj7,inj8,inj9\n') + f.write('feat1,100.0,1.0,10,12,11,30,32,31,50,52,51\n') + f.write('feat2,200.0,2.0,5,6,4,15,16,14,20,19,21\n') + + +def _raw_header(path): + return pd.read_csv(path, sep=',', header=None, index_col=[0, 1, 2]).iloc[:3, :].transpose() + + +def test_uncollapsed_keeps_one_row_per_injection(tmp_path): + path = tmp_path / 'example_filtered.csv' + _write_synthetic_filtered_csv(path) + x, biolgroup = load_ordination_matrix(path, _raw_header(path), collapse_replicates=False) + assert x.shape == (9, 2) + assert len(biolgroup) == 9 + + +def test_collapsed_averages_technical_not_biological_replicates(tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) # 'averagepca.csv' lands here, not the repo + path = tmp_path / 'example_filtered.csv' + _write_synthetic_filtered_csv(path) + x, biolgroup = load_ordination_matrix(path, _raw_header(path), collapse_replicates=True) + + # 3 distinct Samples (S1, S1b, S2) -- not 9 (uncollapsed) and not 2 + # (the number of Biolgroups, which would mean biological replicates got + # 
wrongly merged too). + assert x.shape[0] == 3 + assert x.shape[1] == 2 + assert biolgroup.nunique() == 2 + assert sorted(biolgroup.unique()) == ['groupA', 'groupB'] + # Two of the three collapsed rows belong to groupA (S1, S1b). + assert (biolgroup == 'groupA').sum() == 2 + assert (biolgroup == 'groupB').sum() == 1 + + +def test_collapsed_values_are_the_mean_of_their_technical_replicates(tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + path = tmp_path / 'example_filtered.csv' + _write_synthetic_filtered_csv(path) + x, _ = load_ordination_matrix(path, _raw_header(path), collapse_replicates=True) + + # S1's feat1 replicates are 10, 12, 11 -> mean 11. + s1_row = x.loc[x.index.str.contains('S1') & ~x.index.str.contains('S1b')] + assert s1_row['feat1'].iloc[0] == pytest.approx(11.0) + + +# --------------------------------------------------------------------------- # +# run_pca / run_nmds / run_plsda / top_loadings +# --------------------------------------------------------------------------- # + +def _make_low_rank_matrix(): + # 12 samples, 5 features, but only 2 real underlying dimensions of + # variation -- PCA on this should recover ~100% explained variance in + # the first 2 components. + rng = np.random.RandomState(0) + latent = rng.normal(size=(12, 2)) + loading_matrix = rng.normal(size=(2, 5)) + x = pd.DataFrame( + latent @ loading_matrix + rng.normal(scale=0.01, size=(12, 5)), + index=[f's{i}' for i in range(12)], + columns=[f'f{i}' for i in range(5)], + ) + return x + + +def test_pca_recovers_known_low_rank_structure(): + x = _make_low_rank_matrix() + scores, loadings, expvar = run_pca(x, n_components=3) + assert scores.shape == (12, 3) + assert loadings.shape == (5, 3) + # Two real dimensions of variation + tiny noise -> first two components + # should capture almost all the variance. 
+ assert expvar[:2].sum() > 0.99 + + +def test_plsda_separates_two_groups_along_first_component(): + rng = np.random.RandomState(1) + group_a = rng.normal(loc=0, scale=0.5, size=(10, 6)) + group_b = rng.normal(loc=5, scale=0.5, size=(10, 6)) + x = pd.DataFrame( + np.vstack([group_a, group_b]), + index=[f's{i}' for i in range(20)], + columns=[f'f{i}' for i in range(6)], + ) + y = pd.Series(['A'] * 10 + ['B'] * 10, index=x.index) + + scores, loadings, expvar = run_plsda(x, y, n_components=2) + assert scores.shape == (20, 2) + assert loadings.shape == (6, 2) + # The groups are cleanly separated along PLS1: every A score should be + # on one side of 0 and every B score on the other (sign is arbitrary). + pls1 = scores['PLS1'] + assert (pls1[:10] > 0).all() != (pls1[10:] > 0).all() + # A real, well-separated signal should explain a meaningful share of + # variance -- catches the scale=True/scale=False bug (manually + # confirmed against real data: that bug produced ratios on the order of + # 1e-6 instead of comparable-to-PCA's ~0.7). 
+ assert expvar[0] > 0.1 + + +def test_nmds_smoke_test_on_clustered_data(): + rng = np.random.RandomState(2) + cluster_a = rng.normal(loc=0, scale=0.2, size=(6, 8)) + cluster_b = rng.normal(loc=10, scale=0.2, size=(6, 8)) + x = pd.DataFrame( + np.vstack([cluster_a, cluster_b]), + index=[f's{i}' for i in range(12)], + columns=[f'f{i}' for i in range(8)], + ) + scores, expvar, stress = run_nmds(x, n_components=2) + assert scores.shape == (12, 2) + assert len(expvar) == 2 + assert np.isfinite(stress) + assert stress >= 0 + + proxy = nmds_loading_proxy(x, scores) + assert proxy.shape == (8, 2) + assert proxy.values.min() >= -1.0001 and proxy.values.max() <= 1.0001 + + +# --------------------------------------------------------------------------- # +# top_loadings +# --------------------------------------------------------------------------- # + +def _make_loadings(n=30): + rng = np.random.RandomState(3) + return pd.DataFrame( + rng.normal(size=(n, 2)), + index=[f'feat{i}' for i in range(n)], + columns=['PC1', 'PC2'], + ) + + +def test_top_loadings_returns_n_rows_by_default(): + loadings = _make_loadings(30) + top = top_loadings(loadings, n=10) + assert len(top) == 10 + + +def test_top_loadings_includes_forced_feature_outside_top_n(): + loadings = _make_loadings(30) + top = top_loadings(loadings, n=5) + magnitude = np.sqrt((loadings ** 2).sum(axis=1)) + smallest_feature = magnitude.idxmin() + assert smallest_feature not in top.index + + top_forced = top_loadings(loadings, n=5, always_include=[smallest_feature]) + assert len(top_forced) == 6 + assert smallest_feature in top_forced.index + + +def test_top_loadings_forced_feature_already_in_top_n_is_not_duplicated(): + loadings = _make_loadings(30) + magnitude = np.sqrt((loadings ** 2).sum(axis=1)) + largest_feature = magnitude.idxmax() + top = top_loadings(loadings, n=10, always_include=[largest_feature]) + assert len(top) == 10 # already in the top 10, no duplicate row added + + +def 
test_top_loadings_n_larger_than_available_returns_everything(): + loadings = _make_loadings(5) + top = top_loadings(loadings, n=100) + assert len(top) == 5 From 9c15631fd4c3f541209c6f46f2d15fae1ee49cbb Mon Sep 17 00:00:00 2001 From: Robert Samples Date: Mon, 29 Jun 2026 13:26:03 -0400 Subject: [PATCH 06/20] Rework the mislabeled "PCA" plot into a multivariate ordination tab plot_PCA only ever ran NMDS (with a PCA rotation applied afterward purely to orient axes) -- renamed to plot_ordination and reworked to genuinely support PCA, NMDS, and PLS-DA, selectable via a combo-box switcher bar inserted above the plot canvas (same runtime widget-substitution pattern as searchtree.py's filter bar), plus a Scores/Loadings view toggle. The math moved to the new Qt-free ordination.py (previous commit); this is the Qt plumbing on top of it. - Axis labels now show percent-variance-explained where meaningful (PCA/PLS-DA: real feature-space variance; NMDS: labeled distinctly as embedding variance, since it isn't the same quantity). - Loadings view shows the top-25 features by vector magnitude as origin-anchored arrows (thousands of features can't all be drawn legibly) -- but whichever feature is currently highlighted elsewhere in the app is always included regardless of magnitude, via a second pre-created highlight artist (plot_ordination.highlight_loading(), called from MainWindow._refresh_highlight()) following the same convention every other plot's highlight marker already uses. - Restored "Collapse Technical Replicates": plotting.py previously had this hardcoded off (`parent.collapsereps = False#...isChecked()`); now reads the real checkbox via ordination.load_ordination_matrix(). - checkBox_pca's visible text/btn_pca's tooltip changed from "PCA" to "Multivariate" -- the underlying objectName/analysis_params.PCA attribute are unchanged for .mpct save-file compatibility. 
- Verified the view/method-switching lifecycle (Scores<->Loadings, method changes, the highlight-on-demand path for a feature outside the default top-25) against real example data with an offscreen Qt harness before considering this done -- in particular confirmed ui_plot.reset()'s mpl_disconnect(self.event) doesn't error when switching away from Scores view (where the pick-event connection lives) and back. devnotes.md documents all of the above plus the OPLS-DA deferral (unmaintained pyopls package, or a from-scratch implementation with no reference dataset to validate against -- both riskier than shipping PCA/NMDS/PLS-DA now). Co-Authored-By: Claude Sonnet 4.6 --- code/main.py | 29 +++-- code/plotting.py | 303 +++++++++++++++++++++++++++++------------------ devnotes.md | 69 ++++++++++- 3 files changed, 277 insertions(+), 124 deletions(-) diff --git a/code/main.py b/code/main.py index f34b82e..fc91309 100644 --- a/code/main.py +++ b/code/main.py @@ -41,7 +41,7 @@ from biogroups import compute_biological_groups from dbsearch import search_npatlas from searchtree import SearchTreePanel -from plotting import plot_abund, show_spectrum, show_featureplt, plot_heatmap, plot_mzrt, plot_samplecorr, kendrick, plot_volcano, plot_fc3d, plot_dendrogram, plot_PCA, prev_cv, gen_upsetplt, gen_treemap +from plotting import plot_abund, show_spectrum, show_featureplt, plot_heatmap, plot_mzrt, plot_samplecorr, kendrick, plot_volcano, plot_fc3d, plot_dendrogram, plot_ordination, prev_cv, gen_upsetplt, gen_treemap import getfragdb from indigo import Indigo @@ -70,10 +70,6 @@ -add bypass for plots based on checkmark. possibly use if check: ... else: button.hide() then pass - distribution of CVs on bottom of cvplt? 
-- allow visualization of key features (loadings/biplot) on multivar plt - (PCA itself already exists -- plot_PCA/goto_pca/checkbox field -- this is - specifically about showing which original features drive each component, - which plot_PCA doesn't do yet) #TODO# - in source spectra viewer in spectrum details tab plot with preexisting in source fragment deconvolution algoirthm @@ -245,7 +241,15 @@ def __init__(self): self.ui.setupUi(self) self.ui.label_credits.setText('v1.00.01 r26.06.29') - + + # "PCA" was a misnomer left over from when this checkbox/button only + # ran NMDS (see plotting.plot_ordination) -- the underlying + # checkBox_pca objectName/analysis_params.PCA attribute stay + # unchanged for .mpct save-file compatibility; only the visible text + # and tooltip change. + self.ui.checkBox_pca.setText('Multivariate') + self.ui.btn_pca.setToolTip('Multivariate Ordination (PCA/NMDS/PLS-DA)') + #initialize other dialog windows self.dialog = dialog() self.ftrdialog = ftrdialog() @@ -766,6 +770,13 @@ def _refresh_highlight(self): ) self.canvas['kmd'].draw_idle() + # Update the multivariate plot's loadings-view highlight (a separate + # concept from its scores view, which highlights a clicked *sample* + # via parent.pickedsample, not a feature -- so this only applies + # when self.pca exists and is currently showing loadings). 
+ if getattr(self, 'pca', None) is not None: + self.pca.highlight_loading(self.pickedfeature, self.highlightcol) + # Update feature plot with the selected feature self.highlight['featureplt'].set_data( [iondict.loc[self.pickedfeature, 'Retention time (min)']], @@ -1062,10 +1073,10 @@ def _generate_plots(self): stop_functime('dendrogram complete') if params.PCA: - self._create_or_reset('pca', 'PCA/NMDS plot', - lambda: plot_PCA(self, 'pca', self.ui.frame_pca, pltfile, '', ''), + self._create_or_reset('pca', 'multivariate ordination plot', + lambda: plot_ordination(self, 'pca', self.ui.frame_pca, pltfile, '', ''), lambda: self.pca.reset(pltfile, '', '')) - stop_functime('nmds complete') + stop_functime('ordination complete') if params.FC3Dplt: self._create_or_reset('fc3d', '3D fold-change plot', diff --git a/code/plotting.py b/code/plotting.py index b1b2e3f..046de0c 100644 --- a/code/plotting.py +++ b/code/plotting.py @@ -9,6 +9,7 @@ import pickle from csvcache import cached_read_csv, invalidate as invalidate_csv_cache +import ordination import matplotlib #matplotlib.style.use('ggplot') @@ -35,9 +36,6 @@ import scipy.cluster.hierarchy as shc from sklearn.preprocessing import normalize -from sklearn import manifold -from sklearn.metrics import pairwise_distances -from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler from matplotlib.patches import Ellipse from filter import listfilter @@ -833,152 +831,233 @@ def plot(self, parent, file, filtereddfs, groupsets): left=0.1, right=0.95, bottom=0.35, top=0.9, hspace=0.2, wspace=0.2) parent.canvas[self.currplt].draw() -class plot_PCA(ui_plot): - #plots NMDS data - # should include opion to allow user specified pca colors - # need to fix selection of samples on PCA plot - # should add PCA vs NMDS option +_ORDINATION_SWITCHER_STYLE = """ +QWidget { + background-color: rgba(70,70,70,25); +} +QComboBox { + background-color: rgb(50,50,50); + color: rgb(200,200,200); + border: 1px solid 
rgb(70,70,70); + border-radius: 2px; + padding: 2px; +} +QLabel { + color: rgb(200,200,200); + background: transparent; +} +""" + + +class plot_ordination(ui_plot): + """Multivariate ordination plot: PCA, NMDS, or PLS-DA, with a + scores-vs-loadings view toggle. + + A combo-box switcher bar (built once in ``__init__``, inserted above the + canvas the same way ``SearchTreePanel``'s filter bar is substituted into + a Designer placeholder -- see searchtree.py) lets the user pick the + ordination method and the scores/loadings view; both redraw onto the + same axes via ``self.plot(...)`` rather than rebuilding the canvas. + + The actual math lives in the Qt-free ``ordination.py`` (PCA/NMDS/PLS-DA, + technical-replicate collapsing, top-N loadings selection); this class is + just the Qt plumbing and rendering on top of it. + """ + + METHODS = ('NMDS', 'PCA', 'PLS-DA') + VIEWS = ('Scores', 'Loadings') + def __init__(self, parent, currplt, frame, file, filtereddfs, groupsets): ui_plot.__init__(self, parent, currplt, frame) self.parent = parent self.currplt = currplt + # Defaults match the plot's previous (NMDS-only, scores-only) + # behaviour exactly, so existing sessions see no change until they + # explicitly switch the new controls. 
+ self.method = 'NMDS' + self.view = 'Scores' + self.loadings_df = None + self._build_switcher_bar(parent, currplt) self.plot(parent, file, filtereddfs, groupsets) + def _build_switcher_bar(self, parent, currplt): + bar = QtWidgets.QWidget() + bar.setStyleSheet(_ORDINATION_SWITCHER_STYLE) + layout = QtWidgets.QHBoxLayout(bar) + layout.setContentsMargins(4, 2, 4, 2) + + layout.addWidget(QtWidgets.QLabel('Method:')) + method_combo = QtWidgets.QComboBox() + method_combo.addItems(self.METHODS) + method_combo.setCurrentText(self.method) + method_combo.currentTextChanged.connect(self._on_method_changed) + layout.addWidget(method_combo) + + layout.addWidget(QtWidgets.QLabel('View:')) + view_combo = QtWidgets.QComboBox() + view_combo.addItems(self.VIEWS) + view_combo.setCurrentText(self.view) + view_combo.currentTextChanged.connect(self._on_view_changed) + layout.addWidget(view_combo) + layout.addStretch() + + self.method_combo = method_combo + self.view_combo = view_combo + parent.pltlayout[currplt].insertWidget(0, bar) + + def _on_method_changed(self, method): + self.method = method + self.reset(self._last_file, self._last_filtereddfs, self._last_groupsets) + + def _on_view_changed(self, view): + self.view = view + self.reset(self._last_file, self._last_filtereddfs, self._last_groupsets) + def plot(self, parent, file, filtereddfs, groupsets): - """Plot principal component analysis (PCA) or NMDS data. - + """(Re)draw the ordination plot for the current method/view. + Args: - parent (QWidget): Parent widget. - currplt (int): Current plot number. - frame (QFrame): Frame to hold the plot. - file (str): Path to the file containing the PCA data. - filtereddfs (list): List of filtered data. - groupsets (list): List of group sets. - - Attributes: - highlightcol (tuple): Tuple containing RGBA values used for highlighting. - event (int): Identifier for the pick event used to select points on the plot. 
- - Methods: - plot(self, parent, file, filtereddfs, groupsets): Plot the PCA data. - plot_point_cov(self, points, nstd=2, ax=None, **kwargs): Generate an ellipse for the confidence interval. - lighten_color(self, color, amount=0.5): Lighten a given color by a given amount. - plot_cov_ellipse(self, cov, pos, nstd=2, ax=None, **kwargs): Generate an optimized ellipse for the confidence interval. + parent (QWidget): Parent widget (MainWindow). + file (str): Path to the ``_filtered.csv`` peak table. + filtereddfs, groupsets: unused here (kept for the shared + ``_create_or_reset``/``reset`` call signature every plot + class follows). """ parent = self.parent - parent.collapsereps = False#parent.dialog.ui.checkBox_collapsereps.isChecked() - - if parent.collapsereps: - # Average techreps if replicate collapse is selected - msdata = pd.read_csv(file, sep=',', header=[0, 1, 2], index_col=[0, 1, 2]) - try: - msdata = msdata.stack([0, 1, 2], future_stack=True) - except TypeError: - msdata = msdata.stack([0, 1, 2]) - msdata = msdata.groupby(level=[0, 1, 2, 3, 4]).mean().unstack(level=[-1, -2]) - test2 = msdata.columns.to_list() - msdata = msdata.reset_index() - header = [('','','Compound'), ('','','m/z'), ('','','Retention time (min)')] - for elem in test2: - header.append((elem[1], '', elem[0])) - msdata.columns = pd.MultiIndex.from_tuples(header) - msdata.to_csv('averagepca.csv', header=True, index=False) - - msdata_header = pd.read_csv('averagepca.csv', sep=',', header=None, index_col=[0, 1, 2]).iloc[:3, :].transpose() - pcadf = pd.read_csv('averagepca.csv', sep=',', header=[2], index_col=[0]).drop(['m/z', 'Retention time (min)'], axis=1).transpose().astype(float).reset_index().rename(columns={'index': 'File'}) - else: - msdata_header = cached_read_csv(parent.analysis_paramsgui.outputdir / (parent.analysis_paramsgui.filename.stem + '_filtered.csv'), sep=',', header=None, index_col=[0, 1, 2]).iloc[:3, :].transpose() - pcadf = pd.read_csv(file, sep=',', header=[2], 
index_col=[0]).drop(['m/z', 'Retention time (min)'], axis=1).transpose().astype(float).reset_index().rename(columns={'index': 'File'}) + self._last_file = file + self._last_filtereddfs = filtereddfs + self._last_groupsets = groupsets + + collapse_replicates = parent.dialog.ui.checkBox_collapsereps.isChecked() + raw_header = cached_read_csv( + parent.analysis_paramsgui.outputdir / (parent.analysis_paramsgui.filename.stem + '_filtered.csv'), + sep=',', header=None, index_col=[0, 1, 2]).iloc[:3, :].transpose() + x, biolgroup = ordination.load_ordination_matrix(file, raw_header.copy(), collapse_replicates) + + n_components = max(2, min(len(x) - 1, 10)) - components = len(msdata_header.index) - if components > 10: - components = 10 - msdata_header.columns = ['Biolgroup', 'Sample', 'Injection'] - msdata_header = msdata_header.set_index('Injection') colors = ['red', 'blue', 'black', 'grey', 'purple', 'orange', 'green', 'yellow', 'lime', 'plum', 'teal', 'olivedrab', 'sienna', 'maroon', 'navy', 'lightcoral', 'darkgoldenrod', 'seagreen', 'lightseagreen', 'aqua', 'lightsteelblue', 'slateblue', 'blueviolet', 'plum', 'burlywood', 'salmon', 'aquamarine', 'magenta', 'tan'] colorpos, biolgroupmap = 0, {} - for elem in msdata_header['Biolgroup']: + for elem in biolgroup: if elem not in biolgroupmap and elem != parent.analysis_paramsgui.blnkgrp: ###### delete blank clause OR CHANGE TO THE BLNKFILTER OPTION biolgroupmap[elem] = colors[colorpos] colorpos += 1 - - features = pcadf.columns.values[1:] - x = pcadf[features].values - y = pcadf[['File']].values - x -= x.mean() - similarities = pairwise_distances(x, metric='braycurtis') - - mds = manifold.MDS(n_components=components, max_iter=3000, eps=1e-9, random_state=1, - dissimilarity="precomputed", n_jobs=1) - pos = mds.fit(similarities).embedding_ - - nmds = manifold.MDS(n_components=components, metric=False, max_iter=3000, eps=1e-12, - dissimilarity="precomputed", random_state=1, n_jobs=1, - n_init=1) - npos = 
nmds.fit_transform(similarities, init=pos) - stress_value = nmds.stress_ - print("NMDS stress: " +str(stress_value)) - - pca = PCA(n_components=components) - nmdspc = pca.fit_transform(npos) - expvar = pca.explained_variance_ratio_ - pcadftest = pd.DataFrame(data=nmdspc) - - ncomplist = list(range(components)) - nmdspc = pd.DataFrame(data=nmdspc, columns=ncomplist) - nmdspc['File'] = pcadf['File'] - nmdspc['Biolgroup'] = '' - for i, elem in enumerate(nmdspc.iloc[:, components]): - nmdspc.iloc[i, components + 1] = msdata_header.loc[elem, 'Biolgroup'] - principalDf = nmdspc.set_index('File') - + + if self.method == 'PCA': + scores, loadings, expvar = ordination.run_pca(x, n_components) + axis_labels = [f'PC{i + 1} ({100 * expvar[i]:.1f}%)' for i in range(2)] + elif self.method == 'PLS-DA': + scores, loadings, expvar = ordination.run_plsda(x, biolgroup, n_components) + axis_labels = [f'PLS{i + 1} ({100 * expvar[i]:.1f}%)' for i in range(2)] + else: + scores, expvar, stress = ordination.run_nmds(x, n_components) + loadings = ordination.nmds_loading_proxy(x, scores) + print("NMDS stress: " + str(stress)) + # Labeled distinctly from PCA/PLS-DA's: this is the variance of + # the embedded 2D NMDS coordinates, not of the original feature + # space (see ordination.run_nmds's docstring). 
+ axis_labels = [f'NMDS{i + 1} ({100 * expvar[i]:.1f}% of embedding variance)' for i in range(2)] + + self.loadings_df = loadings + principalDf = scores.copy() + principalDf['Biolgroup'] = biolgroup + + if self.view == 'Loadings': + self._plot_loadings(parent, loadings, axis_labels) + else: + self._plot_scores(parent, principalDf, biolgroupmap, axis_labels) + + parent.fig[self.currplt].subplots_adjust(left=.1, right=.9, bottom=0.1, top=0.9, hspace=0.2, wspace=0.2) + parent.canvas[self.currplt].draw() + + def _plot_scores(self, parent, principalDf, biolgroupmap, axis_labels): for elem in biolgroupmap: scatterframe = principalDf[principalDf['Biolgroup'] == elem] - points = scatterframe.iloc[:,[0,1]].to_numpy() + points = scatterframe.iloc[:, [0, 1]].to_numpy() if np.shape(points)[0] > 2: self.plot_point_cov(points, nstd=2, ax=parent.ax[self.currplt], alpha=0.5, color=self.lighten_color(biolgroupmap[elem], 0.3)) - parent.ax[self.currplt].scatter(scatterframe.iloc[:,0], scatterframe.iloc[:,1], color=biolgroupmap[elem], marker='o', s=30, label=str(elem), picker=True) - + parent.ax[self.currplt].scatter(scatterframe.iloc[:, 0], scatterframe.iloc[:, 1], color=biolgroupmap[elem], marker='o', s=30, label=str(elem), picker=True) + parent.highlight[self.currplt], = parent.ax[self.currplt].plot([], [], 'o', markersize=12, color='yellow') - parent.ax[self.currplt].set_xlabel('NMDS1', **self.fcsfont) # (' + str(round(100*expvar[0], 2)) + '%)' - parent.ax[self.currplt].set_ylabel('NMDS2', **self.fcsfont) #(' + str(round(100*expvar[1], 2)) + '%)' - - #following two lines put a hard limit on the axis tick distance - #parent.ax[self.currplt].xaxis.set_major_locator(ticker.MultipleLocator(0.1)) - #parent.ax[self.currplt].yaxis.set_major_locator(ticker.MultipleLocator(0.1)) - + parent.ax[self.currplt].set_xlabel(axis_labels[0], **self.fcsfont) + parent.ax[self.currplt].set_ylabel(axis_labels[1], **self.fcsfont) + self.highlightcol = (0, 0, 0, 0) parent.pickedsample = pd.DataFrame(0, 
index=['empty'], columns=['empty']) - - def picksample(event): # fix this + + def picksample(event): if _is_duplicate_pick(parent, event): return ind = event.ind - coord = event.artist.get_offsets()[ind,:] - newsample = principalDf.loc[principalDf.iloc[:,0] == coord[0,0], :].loc[principalDf.iloc[:,1] == coord[0,1], :].reset_index() + coord = event.artist.get_offsets()[ind, :] + newsample = principalDf.loc[principalDf.iloc[:, 0] == coord[0, 0], :].loc[principalDf.iloc[:, 1] == coord[0, 1], :].reset_index() if newsample.empty: return - if newsample.iloc[0,0] == parent.pickedsample.iloc[0,0] and self.highlightcol != (0, 0, 0, 0): + if newsample.iloc[0, 0] == parent.pickedsample.iloc[0, 0] and self.highlightcol != (0, 0, 0, 0): self.highlightcol = (0, 0, 0, 0) else: self.highlightcol = 'yellow' - + parent.pickedsample = newsample - parent.ui.lbl_injname.setText('Injection/Sample: ' + str(parent.pickedsample.iloc[0,0])) - parent.highlight[self.currplt].set_data(coord[0,0],coord[0,1]) + parent.ui.lbl_injname.setText('Injection/Sample: ' + str(parent.pickedsample.iloc[0, 0])) + parent.highlight[self.currplt].set_data(coord[0, 0], coord[0, 1]) parent.highlight[self.currplt].set_color(self.highlightcol) parent.canvas[self.currplt].draw_idle() - + self.event = parent.canvas[self.currplt].figure.canvas.mpl_connect('pick_event', picksample) - parent.fig[self.currplt].subplots_adjust(left=.1, right=.9, bottom=0.1, top=0.9, hspace=0.2, wspace=0.2) - #x0,x1 = parent.ax[self.currplt].get_xlim() - #0,y1 = parent.ax[self.currplt].get_ylim() - #parent.ax[self.currplt].set_aspect(abs(x1-x0)/abs(y1-y0)) - #parent.ax[self.currplt].set_aspect('equal') parent.ax[self.currplt].legend() - parent.canvas[self.currplt].draw() - + + def _plot_loadings(self, parent, loadings, axis_labels): + """Loadings (biplot-style) view: origin-anchored arrows for the + top-N features by vector magnitude, plus -- regardless of + magnitude -- whichever feature is currently highlighted elsewhere + in the app 
(``parent.pickedfeature``), so a feature too small to + make the default cut is still visible on demand. + """ + always_include = [parent.pickedfeature] if getattr(parent, 'pickedfeature', '') else [] + subset = ordination.top_loadings(loadings, n=25, always_include=always_include) + + for feature, row in subset.iterrows(): + xcoord, ycoord = row.iloc[0], row.iloc[1] + parent.ax[self.currplt].annotate( + '', xy=(xcoord, ycoord), xytext=(0, 0), + arrowprops=dict(arrowstyle='->', color='steelblue', lw=1)) + parent.ax[self.currplt].annotate( + str(feature), xy=(xcoord, ycoord), fontsize=8, color='black') + + # Pre-created empty artist for the highlighted-loading marker, + # following the same convention as the scores view's + # parent.highlight[currplt] -- updated on demand by + # MainWindow._refresh_highlight() via self.highlight_loading(), + # even when the highlighted feature isn't in the default top-25. + self.loadings_highlight, = parent.ax[self.currplt].plot([], [], 'o', markersize=12, color='yellow', zorder=5) + self.highlight_loading(getattr(parent, 'pickedfeature', ''), getattr(parent, 'highlightcol', (0, 0, 0, 0))) + + parent.ax[self.currplt].axhline(0, color='grey', lw=0.5) + parent.ax[self.currplt].axvline(0, color='grey', lw=0.5) + parent.ax[self.currplt].set_xlabel(axis_labels[0], **self.fcsfont) + parent.ax[self.currplt].set_ylabel(axis_labels[1], **self.fcsfont) + + def highlight_loading(self, feature, colour): + """Update the loadings-view highlight marker for ``feature`` (a + no-op outside the loadings view or before it's been drawn once). + + Called from ``MainWindow._refresh_highlight()`` -- the same + pre-create-empty-artist/update-via-set_data convention every other + plot's highlight already follows, just driven by this plot's own + last-computed loadings instead of ``iondict``. 
+ """ + if self.view != 'Loadings' or self.loadings_df is None or not hasattr(self, 'loadings_highlight'): + return + if not feature or feature not in self.loadings_df.index: + self.loadings_highlight.set_data([], []) + else: + row = self.loadings_df.loc[feature] + self.loadings_highlight.set_data([row.iloc[0]], [row.iloc[1]]) + self.loadings_highlight.set_color(colour) + self.parent.canvas[self.currplt].draw_idle() + def plot_point_cov(self, points, nstd=2, ax=None, **kwargs): """Generate an ellipse for the confidence interval. diff --git a/devnotes.md b/devnotes.md index b99c9dd..981b1fb 100644 --- a/devnotes.md +++ b/devnotes.md @@ -107,8 +107,10 @@ that way. Required deps (gate startup): `epam.indigo`→`indigo`, `UpSetPlot`→ (shared save/restore schema for simple `analysis_parameters` checkbox fields), `biogroups.py` (`getgroups()`'s metadata-join/group-derivation core), `dbsearch.py` (`fulldbsearch()`'s NPAtlas ppm-window matching - core). Each corresponding `MainWindow` method is now a thin wrapper: - call the module function, then apply the result to widgets/`self`. + core), `ordination.py` (PCA/NMDS/PLS-DA + technical-replicate collapsing + + top-N loadings selection for the multivariate plot tab). Each + corresponding `MainWindow` method is now a thin wrapper: call the module + function, then apply the result to widgets/`self`. - **Runtime widget substitution into a Designer placeholder** is an established pattern here, not a one-off — `plotting.py` does it for every matplotlib canvas (inserted into a Designer-created `QFrame`), and @@ -159,7 +161,7 @@ python -m pytest code/tests -q ``` Covers `filter`, `stats`, `importdependencies`, `translators`, `groupsets`, -`searchtree`. Add tests here for any new Qt-free logic. +`searchtree`, `ordination`. Add tests here for any new Qt-free logic. 
`conftest.py` sets `QT_QPA_PLATFORM=offscreen` and provides a session-scoped `qapp` fixture: PyQt5 widgets/models/signals *can* be exercised headlessly via @@ -188,6 +190,67 @@ on load. UI uses `QListWidget` (generated, off-limits), so this is not a true `QAbstractListModel`/`QListView` setup — the "view" side is the existing hand-written widget-sync code in `ui_functions.py`, kept thin. +## Multivariate ordination plot (`plotting.plot_ordination`, `ordination.py`) + +What used to be called "PCA" (`plot_PCA`, `checkBox_pca`/`btn_pca`'s old +tooltip) actually only ran NMDS, with a PCA rotation applied to the NMDS +coordinates purely to orient the axes — not a second ordination of the +original features. `plot_ordination` now genuinely supports PCA, NMDS, and +PLS-DA, switchable via a combo-box bar inserted above the plot canvas (same +runtime widget-substitution pattern as `searchtree.py`'s filter bar — see +above), plus a Scores/Loadings view toggle. The math lives in the Qt-free +`ordination.py` (unit-tested in `tests/test_ordination.py`); `plotting.py` +only handles the combo boxes, axes, and pick events. + +- **Save-file compatibility preserved on purpose**: `analysis_params.PCA` + and `checkBox_pca`'s objectName are unchanged (still pickled into `.mpct` + saves) — only the visible checkbox text/tooltip changed (set at runtime in + `MainWindow.__init__`, same mechanism as `label_credits.setText`). Only + the hand-written class name (`plot_PCA` → `plot_ordination`) changed, + since that's never pickled. +- **"Collapse Technical Replicates" used to be dead** (`plotting.py` had + `parent.collapsereps = False#parent.dialog.ui.checkBox_collapsereps.isChecked()` + — hardcoded off, the real read commented out). Now wired for real via + `ordination.load_ordination_matrix(..., collapse_replicates=...)`. 
The + collapse logic itself (average technical replicates/Injections, keep + biological replicates/Samples distinct) was ported verbatim from the + original rather than rewritten — its header-relabeling-via-CSV-round-trip + is easy to get subtly wrong by inspection, so it's verified empirically + instead (`test_ordination.py`'s synthetic-replicate-structure test, cross- + checked against real example data with a scratch script during + development). +- **Loadings view and high-dimensional data**: thousands of features can't + all be drawn legibly, so only the top-25 by loading-vector magnitude are + shown by default (`ordination.top_loadings()`). Whichever feature is + currently highlighted elsewhere in the app (`MainWindow.pickedfeature`) is + always included regardless of magnitude — `plot_ordination.highlight_loading()`, + called from `MainWindow._refresh_highlight()`, follows the same + pre-create-an-empty-artist/update-via-`set_data()` convention every other + plot's highlight marker already uses. This is a *feature* highlight + (Loadings view), a different concept from the Scores view's existing + *sample* highlight (`parent.pickedsample`, set by clicking a sample point) + — the two views show different kinds of points and were never the same + selection concept. +- **NMDS has no linear feature loadings** (it's a rank-based embedding, not + a linear projection) — its Loadings view uses `ordination.nmds_loading_proxy()`, + per-feature correlation with each NMDS axis (the standard ecology "vector + fitting"/`envfit` approach), not real loadings. Its percent-explained axis + label is also captioned distinctly from PCA/PLS-DA's ("% of embedding + variance" vs. real original-feature-space variance), since the two + quantities aren't comparable. 
+- **PLS-DA's explained-variance gotcha**: `sklearn.cross_decomposition.PLSRegression` + defaults to `scale=True` (standardizes X internally), which silently + produced explained-variance ratios off by ~6 orders of magnitude when + compared against unscaled total variance — caught only by running against + real data, not by inspection. Fixed with `scale=False`, matching PCA's + plain-centered (not standardized) treatment. +- **OPLS-DA intentionally not implemented**: no native scikit-learn support; + the alternatives (the unmaintained `pyopls` package, or a from-scratch + orthogonal-signal-correction implementation) are both riskier than + shipping PCA/NMDS/PLS-DA without a reference dataset to validate against. + Logged here as the next ordination method to add if ever revisited, not + started. + ## Conventions - Don't edit the generated UI files (above). Put behaviour in `main.py` / From c7b6a0107ee0dcf07de0e4cb95010a98af7cdbcf Mon Sep 17 00:00:00 2001 From: Robert Samples Date: Mon, 29 Jun 2026 14:48:41 -0400 Subject: [PATCH 07/20] Fix ordination feedback: scaling, axis limits, NMDS %explained, bar styling All four issues caught only by checking against real data / the live GUI, not by inspection: - PCA/PLS-DA now autoscale features (mean-center + unit-variance) before fitting. Raw mass-spec intensities span a huge range across features (confirmed: feature std devs from ~1.8 to ~10,000 on real example data) -- without scaling, a handful of high-abundance features dominated both explained-variance and loadings, which is why loadings were showing up "in the thousands" while most were tiny, and why %explained looked unusually high. NMDS is deliberately left unscaled (Bray-Curtis dissimilarity is conventionally computed on raw/relative abundances). - NMDS axis labels no longer show percent-explained at all -- it's a rank-based embedding, not a linear decomposition, so it doesn't canonically have that quantity the way PCA/PLS-DA do. 
Shows stress (the conventional NMDS fit-quality metric) as the plot title instead. - Loadings-view axis limits are now set explicitly from the actually- plotted data: ax.annotate()'s arrows don't reliably drive matplotlib's autoscale the way ax.scatter()/ax.plot() do (confirmed empirically -- plotted points could fall outside the auto-picked view), which is what required manually rescaling each axis before. Also fixed top_loadings() being called against the full (up to 10-component) loadings instead of just the 2 displayed ones, which could let an irrelevant-to-this-view feature crowd out a genuinely prominent one. - Switcher bar: capped to a fixed max height so it doesn't eat canvas space, and restyled for page_pca's light background (white combo boxes, dark text) instead of searchtree.py's dark-theme styling, which was the wrong context here. Co-Authored-By: Claude Sonnet 4.6 --- code/ordination.py | 49 ++++++++++++++++++++++++++++++++++------------ code/plotting.py | 48 ++++++++++++++++++++++++++++++++++----------- devnotes.md | 46 ++++++++++++++++++++++++++++++++++--------- 3 files changed, 110 insertions(+), 33 deletions(-) diff --git a/code/ordination.py b/code/ordination.py index 1457168..c941453 100644 --- a/code/ordination.py +++ b/code/ordination.py @@ -100,8 +100,29 @@ def load_ordination_matrix(file, raw_msdata_header, collapse_replicates): return x, biolgroup +def autoscale(x): + """Mean-center and scale each feature to unit variance ("UV-scaling" / + "autoscaling" in chemometrics terminology -- the standard pre-treatment + for PCA/PLS-DA on mass-spec intensity data). + + Without this, raw intensities (which can span several orders of + magnitude between features -- confirmed on real example data: feature + standard deviations ranged from ~1.8 to ~10,000, a ~5800x spread) let a + handful of high-abundance features dominate both the apparent + explained-variance and the loadings, regardless of which features + actually separate the biological groups. 
NMDS is deliberately NOT put + through this -- its Bray-Curtis dissimilarity is conventionally computed + on raw (or relative) abundances, not standardized ones. + """ + std = x.std(axis=0) + std = std.replace(0, 1) # a zero-variance feature would divide by zero; leave it at 0 instead + return (x - x.mean(axis=0)) / std + + def run_pca(x, n_components): - """Plain PCA on the samples x features matrix. + """PCA on the samples x features matrix, after autoscaling (see + ``autoscale()``) so the result isn't dominated by whichever features + happen to have the largest raw intensity. Returns: (scores, loadings, explained_variance_ratio): ``scores`` is a @@ -110,8 +131,9 @@ def run_pca(x, n_components): contribution to each component; ``explained_variance_ratio`` is an ndarray of length ``n_components``. """ + x_scaled = autoscale(x) pca = PCA(n_components=n_components) - scores = pca.fit_transform(x.values - x.values.mean(axis=0)) + scores = pca.fit_transform(x_scaled.values) columns = [f'PC{i + 1}' for i in range(n_components)] scores = pd.DataFrame(scores, index=x.index, columns=columns) loadings = pd.DataFrame(pca.components_.T, index=x.columns, columns=columns) @@ -186,25 +208,26 @@ def run_plsda(x, y, n_components): ``run_pca``'s. scikit-learn doesn't expose an explained-variance ratio for PLS directly, so it's computed manually here as each component's X-score variance divided by the total variance of - (centered) ``x`` -- the standard approach for reporting %-explained - on a PLS biplot. + (autoscaled) ``x`` -- the standard approach for reporting + %-explained on a PLS biplot. 
""" + x_scaled = autoscale(x) y_dummies = pd.get_dummies(y) - # scale=False: PLSRegression's default scale=True standardizes X (and Y) - # to unit variance per column internally, so x_scores_ would otherwise - # live on a different scale than x_centered below -- comparing the two - # directly (as the explained-variance-ratio calc does) silently produced - # a near-zero, meaningless ratio until this was caught by running this - # against real data (see the scratch script / devnotes.md). + # scale=False: we already autoscaled x ourselves (above), consistent + # with run_pca -- letting PLSRegression's default scale=True scale it + # AGAIN (and scale the 0/1 dummy y columns, which doesn't make sense for + # group-membership indicators) would both double-scale x and distort y. + # (scale=False also previously fixed a bug where x_scores_ lived on a + # different scale than the unscaled total-variance denominator below, + # before autoscaling was added -- see devnotes.md.) pls = PLSRegression(n_components=n_components, scale=False) - pls.fit(x.values, y_dummies.values) + pls.fit(x_scaled.values, y_dummies.values) x_scores = pls.x_scores_ columns = [f'PLS{i + 1}' for i in range(n_components)] scores = pd.DataFrame(x_scores, index=x.index, columns=columns) loadings = pd.DataFrame(pls.x_loadings_, index=x.columns, columns=columns) - x_centered = x.values - x.values.mean(axis=0) - total_variance = np.sum(x_centered ** 2) + total_variance = np.sum(x_scaled.values ** 2) component_variance = np.sum(x_scores ** 2, axis=0) explained_variance_ratio = component_variance / total_variance return scores, loadings, explained_variance_ratio diff --git a/code/plotting.py b/code/plotting.py index 046de0c..9647767 100644 --- a/code/plotting.py +++ b/code/plotting.py @@ -831,19 +831,24 @@ def plot(self, parent, file, filtereddfs, groupsets): left=0.1, right=0.95, bottom=0.35, top=0.9, hspace=0.2, wspace=0.2) parent.canvas[self.currplt].draw() +_ORDINATION_SWITCHER_BAR_HEIGHT = 32 + +# Unlike 
searchtree.py's filter bar (a dark-themed tab), page_pca's +# background is light (rgba(225,225,225,255), see ui_main.py) -- dark text +# on a light/white combo box, not searchtree's light-on-dark scheme. _ORDINATION_SWITCHER_STYLE = """ QWidget { - background-color: rgba(70,70,70,25); + background: transparent; } QComboBox { - background-color: rgb(50,50,50); - color: rgb(200,200,200); - border: 1px solid rgb(70,70,70); + background-color: rgb(255,255,255); + color: rgb(30,30,30); + border: 1px solid rgb(150,150,150); border-radius: 2px; padding: 2px; } QLabel { - color: rgb(200,200,200); + color: rgb(30,30,30); background: transparent; } """ @@ -883,6 +888,7 @@ def __init__(self, parent, currplt, frame, file, filtereddfs, groupsets): def _build_switcher_bar(self, parent, currplt): bar = QtWidgets.QWidget() bar.setStyleSheet(_ORDINATION_SWITCHER_STYLE) + bar.setMaximumHeight(_ORDINATION_SWITCHER_BAR_HEIGHT) layout = QtWidgets.QHBoxLayout(bar) layout.setContentsMargins(4, 2, 4, 2) @@ -943,6 +949,7 @@ class follows). biolgroupmap[elem] = colors[colorpos] colorpos += 1 + plot_title = None if self.method == 'PCA': scores, loadings, expvar = ordination.run_pca(x, n_components) axis_labels = [f'PC{i + 1} ({100 * expvar[i]:.1f}%)' for i in range(2)] @@ -952,11 +959,12 @@ class follows). else: scores, expvar, stress = ordination.run_nmds(x, n_components) loadings = ordination.nmds_loading_proxy(x, scores) - print("NMDS stress: " + str(stress)) - # Labeled distinctly from PCA/PLS-DA's: this is the variance of - # the embedded 2D NMDS coordinates, not of the original feature - # space (see ordination.run_nmds's docstring). - axis_labels = [f'NMDS{i + 1} ({100 * expvar[i]:.1f}% of embedding variance)' for i in range(2)] + # NMDS doesn't canonically report percent-variance-explained the + # way PCA/PLS-DA do (it's a rank-based embedding, not a linear + # decomposition of the feature space) -- stress is the + # conventional thing to report for NMDS instead. 
+ axis_labels = ['NMDS1', 'NMDS2'] + plot_title = f'Stress: {stress:.4f}' self.loadings_df = loadings principalDf = scores.copy() @@ -967,6 +975,9 @@ class follows). else: self._plot_scores(parent, principalDf, biolgroupmap, axis_labels) + if plot_title: + parent.ax[self.currplt].set_title(plot_title, fontsize=10) + parent.fig[self.currplt].subplots_adjust(left=.1, right=.9, bottom=0.1, top=0.9, hspace=0.2, wspace=0.2) parent.canvas[self.currplt].draw() @@ -1016,7 +1027,22 @@ def _plot_loadings(self, parent, loadings, axis_labels): make the default cut is still visible on demand. """ always_include = [parent.pickedfeature] if getattr(parent, 'pickedfeature', '') else [] - subset = ordination.top_loadings(loadings, n=25, always_include=always_include) + # Rank by magnitude within the 2 displayed components only, not the + # full (up to 10-component) loadings -- a feature could rank in the + # overall top-25 purely from a large contribution to some other, + # unplotted component while barely showing up here, displacing a + # feature that's actually prominent in this 2D view. + subset = ordination.top_loadings(loadings.iloc[:, :2], n=25, always_include=always_include) + + # ax.annotate()'s arrows don't reliably drive matplotlib's autoscale + # the way ax.scatter()/ax.plot() do (confirmed empirically: points + # can end up outside the auto-picked view limits), so the axis + # range is set explicitly here instead of relying on autoscale. + # Symmetric around 0 since loadings/correlations are naturally + # origin-centered (a biplot convention). 
+ limit = subset.iloc[:, :2].abs().values.max() * 1.2 if len(subset) else 1.0 + parent.ax[self.currplt].set_xlim(-limit, limit) + parent.ax[self.currplt].set_ylim(-limit, limit) for feature, row in subset.iterrows(): xcoord, ycoord = row.iloc[0], row.iloc[1] diff --git a/devnotes.md b/devnotes.md index 981b1fb..9ece8b8 100644 --- a/devnotes.md +++ b/devnotes.md @@ -234,16 +234,44 @@ only handles the combo boxes, axes, and pick events. - **NMDS has no linear feature loadings** (it's a rank-based embedding, not a linear projection) — its Loadings view uses `ordination.nmds_loading_proxy()`, per-feature correlation with each NMDS axis (the standard ecology "vector - fitting"/`envfit` approach), not real loadings. Its percent-explained axis - label is also captioned distinctly from PCA/PLS-DA's ("% of embedding - variance" vs. real original-feature-space variance), since the two - quantities aren't comparable. + fitting"/`envfit` approach), not real loadings. **NMDS's axis labels don't + show percent-explained at all** (just "NMDS1"/"NMDS2") — NMDS is a + rank-based embedding, not a linear decomposition of the feature space, so + it doesn't canonically have a %-variance-explained quantity the way + PCA/PLS-DA do; the plot title shows stress instead, the conventional NMDS + fit-quality metric. +- **PCA/PLS-DA are autoscaled** (`ordination.autoscale()`: mean-center + + scale each feature to unit variance) before fitting — without this, raw + intensities (confirmed on real example data: feature standard deviations + ranged from ~1.8 to ~10,000, a ~5800x spread) let a handful of + high-abundance features dominate both the apparent explained-variance and + the loadings, drowning out features that actually separate the + biological groups but happen to have lower raw intensity. 
This is the + standard chemometrics pretreatment for PCA/PLS-DA on this kind of data; + NMDS is deliberately NOT autoscaled (its Bray-Curtis dissimilarity is + conventionally computed on raw/relative abundances). - **PLS-DA's explained-variance gotcha**: `sklearn.cross_decomposition.PLSRegression` - defaults to `scale=True` (standardizes X internally), which silently - produced explained-variance ratios off by ~6 orders of magnitude when - compared against unscaled total variance — caught only by running against - real data, not by inspection. Fixed with `scale=False`, matching PCA's - plain-centered (not standardized) treatment. + defaults to `scale=True` (standardizes X internally), which -- before + autoscaling was added -- silently produced explained-variance ratios off + by ~6 orders of magnitude when compared against unscaled total variance, + caught only by running against real data, not by inspection. Fixed with + `scale=False` and autoscaling `x` ourselves first instead (matching PCA's + treatment, and avoiding PLSRegression's `scale=True` also incorrectly + scaling the 0/1 group-membership dummy columns). +- **Loadings-view rendering gotchas** (both only surfaced by checking + against real data, not by inspection): (1) `ax.annotate()`-drawn arrows + don't reliably participate in matplotlib's autoscale the way + `ax.scatter()`/`ax.plot()` do — confirmed empirically that plotted arrow + tips could fall outside the axis' auto-picked view limits — so + `plot_ordination._plot_loadings()` now sets `ax.set_xlim`/`set_ylim` + explicitly from the actually-plotted subset's coordinates, symmetric + around 0. 
(2) `ordination.top_loadings()` must be called with only the 2 + displayed components (`loadings.iloc[:, :2]`), not the full (up to + 10-component) loadings — ranking by overall magnitude across all + components could let a feature into the "top 25" purely from a large + contribution to some unplotted component while barely showing up in the + actual PC1-vs-PC2 view, displacing a feature that's genuinely prominent + there. - **OPLS-DA intentionally not implemented**: no native scikit-learn support; the alternatives (the unmaintained `pyopls` package, or a from-scratch orthogonal-signal-correction implementation) are both riskier than From ede19688c7d36fc81f288f206d279af78d350a9b Mon Sep 17 00:00:00 2001 From: Robert Samples Date: Mon, 29 Jun 2026 17:41:48 -0400 Subject: [PATCH 08/20] Add dendrogram purity coloring: technical/biological replicate QC view New clusterpurity.py colors dendrogram branches green wherever a whole group's leaves merge together before meeting any other group, plus a Technical/Biological Replicates switcher on the dendrogram tab (mirrors plot_ordination's method/view bar) and a plot-title purity summary (n_pure/n_total). Applies to both the regular and bootstrap (PvClust) dendrogram paths. Co-Authored-By: Claude Sonnet 4.6 --- code/clusterpurity.py | 76 +++++++++++++++++++ code/plotting.py | 126 +++++++++++++++++++++++++------ code/pvclust.py | 16 ++-- code/tests/test_clusterpurity.py | 78 +++++++++++++++++++ devnotes.md | 67 +++++++++++++++- 5 files changed, 334 insertions(+), 29 deletions(-) create mode 100644 code/clusterpurity.py create mode 100644 code/tests/test_clusterpurity.py diff --git a/code/clusterpurity.py b/code/clusterpurity.py new file mode 100644 index 0000000..afcebbd --- /dev/null +++ b/code/clusterpurity.py @@ -0,0 +1,76 @@ +""" +MPACT +Copyright 2022, Robert M. Samples, Sara P. Puckett, and Marcy J. 
Balunas + +Qt-free dendrogram "purity" coloring: a branch is colored green if every +leaf beneath it shares the same group label -- i.e. that group is a +monophyletic clade, it clustered together before merging with anything +else -- and left at the default color otherwise. Used by the dendrogram tab +to make it visually obvious whether technical replicates of one Sample +cluster tightly together, and separately whether biological replicates of +one Biolgroup are well separated from other groups. + +This module is Qt-free and unit-tested (see ``tests/test_clusterpurity.py``). +""" + + +def purity_link_color_func(Z, leaf_labels, true_color='green', false_color='black'): + """Build a ``link_color_func`` for ``scipy.cluster.hierarchy.dendrogram``. + + Args: + Z: linkage matrix (``scipy.cluster.hierarchy.linkage`` or + fastcluster's drop-in) built on observations in the same order + as ``leaf_labels``. + leaf_labels: sequence, length == number of observations clustered by + ``Z``, giving each leaf's group label (e.g. its Sample or + Biolgroup), in the same order as the data passed to ``linkage``. + + Returns: + callable: ``link_color_func(k)`` as expected by ``dendrogram``'s + ``link_color_func`` argument -- for link index ``k`` + (``len(leaf_labels) <= k``), returns ``true_color`` if every leaf + descending from that link shares one label, else ``false_color``. + """ + n_leaves = len(leaf_labels) + leaf_label_sets = {i: {leaf_labels[i]} for i in range(n_leaves)} + colors = {} + for i, row in enumerate(Z): + a, b = int(row[0]), int(row[1]) + node_id = n_leaves + i + merged = leaf_label_sets[a] | leaf_label_sets[b] + leaf_label_sets[node_id] = merged + colors[node_id] = true_color if len(merged) == 1 else false_color + return lambda k: colors.get(k, false_color) + + +def purity_summary(Z, leaf_labels): + """Count how many distinct group labels form one pure clade each. 
+ + A label is "pure" only if *every* leaf carrying that label ends up + together in one clade before that clade merges with any other leaf -- + i.e. the group is exactly monophyletic in the dendrogram. (A node whose + descendants are a uniform-but-incomplete subset of a label -- e.g. 2 of + a Sample's 3 technical replicates -- does NOT count: the third + replicate clustering elsewhere means that Sample isn't really pure.) + + Returns: + (n_pure, n_total): number of distinct labels that are fully pure + clades, out of the total number of distinct labels in + ``leaf_labels``. + """ + n_leaves = len(leaf_labels) + leaf_index_sets = {i: frozenset((i,)) for i in range(n_leaves)} + target_sets = { + label: frozenset(i for i in range(n_leaves) if leaf_labels[i] == label) + for label in set(leaf_labels) + } + pure_labels = set() + for i, row in enumerate(Z): + a, b = int(row[0]), int(row[1]) + node_id = n_leaves + i + merged = leaf_index_sets[a] | leaf_index_sets[b] + leaf_index_sets[node_id] = merged + for label, target in target_sets.items(): + if merged == target: + pure_labels.add(label) + return len(pure_labels), len(target_sets) diff --git a/code/plotting.py b/code/plotting.py index 9647767..ff0e76f 100644 --- a/code/plotting.py +++ b/code/plotting.py @@ -10,6 +10,7 @@ from csvcache import cached_read_csv, invalidate as invalidate_csv_cache import ordination +import clusterpurity import matplotlib #matplotlib.style.use('ggplot') @@ -798,45 +799,126 @@ def plot(self, parent, file, filtereddfs, groupsets): class plot_dendrogram(ui_plot): """ - Dendrogram generation. - - A CSV file of data for clustering is read and code performs hierarchical clustering using the ward method - and the euclidean distance metric. The resulting dendrogram is plotted on the given frame using the parent object's - figure and canvas. The dendrogram can be either regular or bootstrapped depending on the value of the - parent.analysis_paramsgui.bootstrap parameter. 
The resulting plot is saved to the parent object's figure and - displayed on the canvas. + Dendrogram generation, with a combo-box switcher (same pattern as + plot_ordination's method/view bar) between two purity-colored views: + + - "Technical Replicates": every injection is its own leaf, colored + green wherever an entire Sample's injections cluster together before + merging with anything else -- a quick visual QC for whether technical + replicates are tight. + - "Biological Replicates": injections are first averaged per Sample + (same collapsing logic as the ordination tab's "Collapse Technical + Replicates" checkbox, via ordination.load_ordination_matrix), then + leaves are colored green wherever an entire Biolgroup's samples + cluster together -- a quick visual QC for whether biological groups + are separable at all, independent of technical noise. + + Either view can be regular or bootstrapped (PvClust), depending on + parent.analysis_paramsgui.bootstrap, same as before this rework. The + purity-coloring math lives in the Qt-free clusterpurity.py. """ + VIEWS = ('Technical Replicates', 'Biological Replicates') + def __init__(self, parent, currplt, frame, file, filtereddfs, groupsets): ui_plot.__init__(self, parent, currplt, frame) self.parent = parent self.currplt = currplt + # Default matches the plot's previous (injection-level) behaviour + # exactly, so existing sessions see no change until they explicitly + # switch to the biological-replicate view. 
+ self.view = 'Technical Replicates' + self._build_switcher_bar(parent, currplt) self.plot(parent, file, filtereddfs, groupsets) + def _build_switcher_bar(self, parent, currplt): + bar = QtWidgets.QWidget() + bar.setStyleSheet(_SWITCHER_BAR_STYLE) + bar.setMaximumHeight(_SWITCHER_BAR_HEIGHT) + layout = QtWidgets.QHBoxLayout(bar) + layout.setContentsMargins(4, 2, 4, 2) + + layout.addWidget(QtWidgets.QLabel('View:')) + view_combo = QtWidgets.QComboBox() + view_combo.addItems(self.VIEWS) + view_combo.setCurrentText(self.view) + view_combo.currentTextChanged.connect(self._on_view_changed) + layout.addWidget(view_combo) + layout.addStretch() + + self.view_combo = view_combo + parent.pltlayout[currplt].insertWidget(0, bar) + + def _on_view_changed(self, view): + self.view = view + self.reset(self._last_file, self._last_filtereddfs, self._last_groupsets) + def plot(self, parent, file, filtereddfs, groupsets): - heirarch = pd.read_csv(file, sep=',', header=[2], index_col=[0]).drop(['m/z', 'Retention time (min)'], axis=1) - data_scaled = normalize(heirarch, axis=0) # normalize features - data_scaled = pd.DataFrame(data_scaled, columns=heirarch.columns, index=heirarch.index) - textlabels = [elem for elem in data_scaled.columns.tolist()] - + self._last_file = file + self._last_filtereddfs = filtereddfs + self._last_groupsets = groupsets + + # PvClust (bootstrap path) expects "variables x objects" -- it + # bootstraps over the rows (features) and transposes internally + # before clustering the columns (the objects/leaves). shc.linkage + # (regular path) expects the opposite, "objects x variables" -- + # build both orientations from the same scaled data below. + if self.view == 'Biological Replicates': + # Collapse technical replicates first -- leaves are Samples, + # purity is judged against Biolgroup. 
+ raw_header = cached_read_csv( + parent.analysis_paramsgui.outputdir / (parent.analysis_paramsgui.filename.stem + '_filtered.csv'), + sep=',', header=None, index_col=[0, 1, 2]).iloc[:3, :].transpose() + x, biolgroup = ordination.load_ordination_matrix(file, raw_header.copy(), collapse_replicates=True) + data_scaled = normalize(x.values, axis=1) # normalize each sample's profile + data_scaled = pd.DataFrame(data_scaled, columns=x.columns, index=x.index) # samples x features + textlabels = data_scaled.index.tolist() + leaf_labels = [biolgroup[sample] for sample in textlabels] + data_for_linkage = data_scaled + data_for_pvclust = data_scaled.transpose() + purity_noun = 'biological groups separable' + else: + heirarch = pd.read_csv(file, sep=',', header=[2], index_col=[0]).drop(['m/z', 'Retention time (min)'], axis=1) + data_scaled = normalize(heirarch, axis=0) # normalize features + data_scaled = pd.DataFrame(data_scaled, columns=heirarch.columns, index=heirarch.index) # features x injections + textlabels = data_scaled.columns.tolist() + raw_header = cached_read_csv( + parent.analysis_paramsgui.outputdir / (parent.analysis_paramsgui.filename.stem + '_filtered.csv'), + sep=',', header=None, index_col=[0, 1, 2]).iloc[:3, :].transpose() + raw_header.columns = ['Biolgroup', 'Sample', 'Injection'] + sample_of_injection = raw_header.set_index('Injection')['Sample'].to_dict() + leaf_labels = [sample_of_injection[name] for name in textlabels] + data_for_linkage = data_scaled.transpose() + data_for_pvclust = data_scaled + purity_noun = "samples' replicates clustered together" + if parent.analysis_paramsgui.bootstrap: # bootstrap dendrogram - pv = PvClust(data_scaled, method="ward", metric="euclidean", nboot=1000, parallel=True) - dend = pv.plot(parent.ax[self.currplt], labels=textlabels) + pv = PvClust(data_for_pvclust, method="ward", metric="euclidean", nboot=1000, parallel=True) + link_color_func = clusterpurity.purity_link_color_func(pv.linkage_matrix, leaf_labels) + dend 
= pv.plot(parent.ax[self.currplt], labels=textlabels, link_color_func=link_color_func) + Z = pv.linkage_matrix else: # regular dendrogram - dend = shc.dendrogram(shc.linkage(data_scaled.transpose(), method='ward'), ax=parent.ax[self.currplt], leaf_rotation=90, color_threshold=0, above_threshold_color='black', labels=textlabels) # default leaf label size 16 + Z = shc.linkage(data_for_linkage, method='ward') + link_color_func = clusterpurity.purity_link_color_func(Z, leaf_labels) + dend = shc.dendrogram(Z, ax=parent.ax[self.currplt], leaf_rotation=90, above_threshold_color='black', link_color_func=link_color_func, labels=textlabels) # default leaf label size 16 + + n_pure, n_total = clusterpurity.purity_summary(Z, leaf_labels) + parent.ax[self.currplt].set_title(f'{n_pure}/{n_total} {purity_noun}', fontsize=10) parent.fig[self.currplt].subplots_adjust( left=0.1, right=0.95, bottom=0.35, top=0.9, hspace=0.2, wspace=0.2) parent.canvas[self.currplt].draw() -_ORDINATION_SWITCHER_BAR_HEIGHT = 32 +# Shared by plot_dendrogram's and plot_ordination's combo-box switcher bars +# -- page_dend and page_pca both have the same light background +# (rgba(225,225,225,255), see ui_main.py), unlike searchtree.py's filter bar +# (a dark-themed tab) -- dark text on a light/white combo box, not +# searchtree's light-on-dark scheme. +_SWITCHER_BAR_HEIGHT = 32 -# Unlike searchtree.py's filter bar (a dark-themed tab), page_pca's -# background is light (rgba(225,225,225,255), see ui_main.py) -- dark text -# on a light/white combo box, not searchtree's light-on-dark scheme. 
-_ORDINATION_SWITCHER_STYLE = """ +_SWITCHER_BAR_STYLE = """ QWidget { background: transparent; } @@ -887,8 +969,8 @@ def __init__(self, parent, currplt, frame, file, filtereddfs, groupsets): def _build_switcher_bar(self, parent, currplt): bar = QtWidgets.QWidget() - bar.setStyleSheet(_ORDINATION_SWITCHER_STYLE) - bar.setMaximumHeight(_ORDINATION_SWITCHER_BAR_HEIGHT) + bar.setStyleSheet(_SWITCHER_BAR_STYLE) + bar.setMaximumHeight(_SWITCHER_BAR_HEIGHT) layout = QtWidgets.QHBoxLayout(bar) layout.setContentsMargins(4, 2, 4, 2) diff --git a/code/pvclust.py b/code/pvclust.py index 353d301..1efc2dd 100644 --- a/code/pvclust.py +++ b/code/pvclust.py @@ -187,10 +187,11 @@ def _result(self): columns=['AU', 'BP', 'SE.AU', 'SE.BP', 'pchi', 'v', 'c']) return result - def plot(self, ax, labels=None): #added axis input + def plot(self, ax, labels=None, link_color_func=None): #added axis input """Plot dendrogram with AU BP values for each node""" plot_dendrogram(self.linkage_matrix, - np.array(self.result[['AU', 'BP']]), ax, labels) + np.array(self.result[['AU', 'BP']]), ax, labels, + link_color_func) def seplot(self, pvalue='AU', annotate=False): """p-values vs Standard error plot""" @@ -271,7 +272,7 @@ def find_clusters(self): return clusters -def plot_dendrogram(linkage_matrix, pvalues, axis, labels=None): #added axis input +def plot_dendrogram(linkage_matrix, pvalues, axis, labels=None, link_color_func=None): #added axis input """ Plot dendrogram with AU BP values for each node""" d = dendrogram(linkage_matrix, no_plot=True) xcoord = d["icoord"] @@ -280,13 +281,18 @@ def plot_dendrogram(linkage_matrix, pvalues, axis, labels=None): #added axis inp x = {i: (j[1]+j[2])/2 for i, j in enumerate(xcoord)} y = {i: j[1] for i, j in enumerate(ycoord)} pos = node_positions(y, x) - + plt.figure(figsize=(12, 8)) plt.tight_layout() set_link_color_palette(['c', 'g']) + # link_color_func, when given, takes priority over color_threshold/ + # above_threshold_color (scipy's own precedence rule) 
-- that's how the + # dendrogram tab's purity coloring (clusterpurity.py) reaches the + # bootstrap dendrogram too. d = dendrogram(linkage_matrix, labels=labels, above_threshold_color='black', - color_threshold=0, leaf_rotation=90, ax = axis) + color_threshold=0, leaf_rotation=90, ax=axis, + link_color_func=link_color_func) maxval = max(y.values()) ax = axis for node, (x, y) in pos.items(): #modifications added to scale y axis label shifts diff --git a/code/tests/test_clusterpurity.py b/code/tests/test_clusterpurity.py new file mode 100644 index 0000000..6cb74cb --- /dev/null +++ b/code/tests/test_clusterpurity.py @@ -0,0 +1,78 @@ +"""Unit tests for dendrogram purity coloring (``clusterpurity.py``).""" + +import numpy as np +from scipy.cluster.hierarchy import linkage + +from clusterpurity import purity_link_color_func, purity_summary + + +def _two_clean_groups(): + """6 points: 3 tightly clustered near (0, 0) labeled 'A', 3 tightly + clustered near (10, 10) labeled 'B' -- each group should merge with + itself long before the two groups merge with each other. + """ + data = np.array([ + [0.0, 0.0], [0.1, 0.0], [0.0, 0.1], + [10.0, 10.0], [10.1, 10.0], [10.0, 10.1], + ]) + labels = ['A', 'A', 'A', 'B', 'B', 'B'] + return data, labels + + +def test_purity_summary_both_groups_pure(): + data, labels = _two_clean_groups() + Z = linkage(data, method='ward') + n_pure, n_total = purity_summary(Z, labels) + assert (n_pure, n_total) == (2, 2) + + +def test_purity_link_color_func_roots_to_false_color_leaves_to_true_color(): + data, labels = _two_clean_groups() + Z = linkage(data, method='ward') + n_leaves = len(labels) + color_func = purity_link_color_func(Z, labels) + + # The final merge (root) joins group A's clade with group B's clade -- + # that link must NOT be the "pure" color. 
+ root_node_id = n_leaves + len(Z) - 1 + assert color_func(root_node_id) == 'black' + + # Every internal node strictly below the root is a within-group merge + # for this dataset (each group's 3 points cluster before the cross-group + # merge) -- those links must be the "pure" color. + for i in range(len(Z) - 1): + node_id = n_leaves + i + assert color_func(node_id) == 'green' + + +def test_purity_link_color_func_custom_colors(): + data, labels = _two_clean_groups() + Z = linkage(data, method='ward') + color_func = purity_link_color_func(Z, labels, true_color='cyan', false_color='grey') + n_leaves = len(labels) + root_node_id = n_leaves + len(Z) - 1 + assert color_func(root_node_id) == 'grey' + assert color_func(n_leaves) == 'cyan' + + +def test_purity_summary_one_mismatched_leaf_breaks_purity_for_its_group(): + # Same as the clean two-group case, but one of group A's points is + # actually closest to group B -- A should no longer be reported pure + # (its leaves don't all merge together before meeting a 'B' leaf), + # while B (unaffected) should still be pure. + data = np.array([ + [0.0, 0.0], [0.1, 0.0], [9.9, 9.9], # last "A" point sits with B + [10.0, 10.0], [10.1, 10.0], [10.0, 10.1], + ]) + labels = ['A', 'A', 'A', 'B', 'B', 'B'] + Z = linkage(data, method='ward') + n_pure, n_total = purity_summary(Z, labels) + assert n_pure == 1 + assert n_total == 2 + + +def test_purity_link_color_func_unknown_link_id_falls_back_to_false_color(): + data, labels = _two_clean_groups() + Z = linkage(data, method='ward') + color_func = purity_link_color_func(Z, labels) + assert color_func(99999) == 'black' diff --git a/devnotes.md b/devnotes.md index 9ece8b8..912e641 100644 --- a/devnotes.md +++ b/devnotes.md @@ -108,7 +108,8 @@ that way. 
Required deps (gate startup): `epam.indigo`→`indigo`, `UpSetPlot`→ fields), `biogroups.py` (`getgroups()`'s metadata-join/group-derivation core), `dbsearch.py` (`fulldbsearch()`'s NPAtlas ppm-window matching core), `ordination.py` (PCA/NMDS/PLS-DA + technical-replicate collapsing - + top-N loadings selection for the multivariate plot tab). Each + + top-N loadings selection for the multivariate plot tab), `clusterpurity.py` + (dendrogram branch-purity coloring for the dendrogram tab). Each corresponding `MainWindow` method is now a thin wrapper: call the module function, then apply the result to widgets/`self`. - **Runtime widget substitution into a Designer placeholder** is an @@ -161,7 +162,8 @@ python -m pytest code/tests -q ``` Covers `filter`, `stats`, `importdependencies`, `translators`, `groupsets`, -`searchtree`, `ordination`. Add tests here for any new Qt-free logic. +`searchtree`, `ordination`, `clusterpurity`. Add tests here for any new +Qt-free logic. `conftest.py` sets `QT_QPA_PLATFORM=offscreen` and provides a session-scoped `qapp` fixture: PyQt5 widgets/models/signals *can* be exercised headlessly via @@ -279,6 +281,67 @@ only handles the combo boxes, axes, and pick events. Logged here as the next ordination method to add if ever revisited, not started. +## Dendrogram purity coloring (`plotting.plot_dendrogram`, `clusterpurity.py`) + +The dendrogram tab has a combo-box switcher (same runtime-widget-substitution +pattern as `plot_ordination`'s method/view bar) between two views, both +purity-colored to make a QC judgment visible at a glance rather than read off +leaf labels one at a time: + +- **Technical Replicates** (default — matches the tab's previous, only, + behaviour): every Injection is its own leaf. A branch is colored green + wherever *all* of one Sample's injections merge together before merging + with anything else (i.e. 
that Sample is a monophyletic clade) — a + tight green clump means that sample's replicates agree; black means they + don't. +- **Biological Replicates**: technical replicates are averaged first (same + `ordination.load_ordination_matrix(..., collapse_replicates=True)` used by + the multivariate tab's checkbox), so leaves are Samples, and purity is + judged against Biolgroup instead — green means a whole biological group's + samples cluster together before meeting another group, i.e. the groups are + separable; black means they're not. + +The plot title reports `n_pure/n_total` (e.g. "7/9 samples' replicates +clustered together", "3/3 biological groups separable") using +`clusterpurity.purity_summary()` — the same Qt-free linkage-traversal logic +that drives the coloring, unit-tested in `tests/test_clusterpurity.py`. + +- **Purity is a strict, whole-group check, not "any uniform subset"**: a + label only counts as pure if *every* leaf carrying it ends up in one clade + before that clade touches a different label — 2 of a Sample's 3 replicates + merging together does NOT make that Sample pure if the third replicate + clusters elsewhere. An earlier version of `purity_summary()` got this + wrong (counted a label pure as soon as ANY uniform-label merge occurred, + which is right for `purity_link_color_func`'s per-branch coloring but wrong + for the whole-group summary count) — caught by a test built from a + deliberately "rogue" planted point, not by inspection. +- **PvClust orientation gotcha**: `pvclust.PvClust` expects "variables x + objects" (rows = the things bootstrapped over, i.e. features; it + transposes internally before clustering the columns) — the *opposite* + orientation from `scipy.cluster.hierarchy.linkage`, which expects "objects + x variables". 
`plot_dendrogram.plot()` builds both orientations + (`data_for_linkage`, `data_for_pvclust`) from the same scaled data rather + than reusing one array, since which one is "transposed" flips between the + Technical (features x injections is the natural read) and Biological + (samples x features, from `load_ordination_matrix`) views. +- **`link_color_func` threaded through the bootstrap path too**: + `PvClust.plot()` and the free function `pvclust.plot_dendrogram()` both + gained a `link_color_func=None` passthrough parameter into their inner + `scipy.cluster.hierarchy.dendrogram()` call, so the AU/BP bootstrap + dendrogram gets the same purity coloring as the regular one (`scipy`'s own + precedence rule: `link_color_func`, when given, overrides + `color_threshold`/`above_threshold_color`). +- **Multiprocessing safety note (re-learned, not new)**: validating the + bootstrap path's wiring during development used `parallel=False` and a + tiny `nboot`, never `PvClust(..., parallel=True)` in an ad hoc script — + `multiprocessing.Pool()` re-executes a script's top-level code in each + spawned child on Windows unless the call site is guarded by + `if __name__ == '__main__':` (the same class of hazard as the frozen-exe + fork-bomb bug elsewhere in this file, just without needing + `freeze_support()` specifically). The real app is fine — `main.py` already + guards its entry point — but throwaway test scripts need the same + discipline. + ## Conventions - Don't edit the generated UI files (above). 
Put behaviour in `main.py` / From 573cbfa383921c8f4c53ed9a21da67881b013a31 Mon Sep 17 00:00:00 2001 From: Robert Samples Date: Mon, 29 Jun 2026 18:04:08 -0400 Subject: [PATCH 09/20] Dendrogram: polyphyletic branches in red, add a no-coloring option Purity coloring now uses red (not black) for branches that mix more than one group, and a new "Color: None" mode reproduces the tab's pre-purity-coloring appearance exactly (plain black, no title) -- fixes a regression where dropping color_threshold=0 made "None" fall back to scipy's default multi-color palette instead of plain black. Co-Authored-By: Claude Sonnet 4.6 --- code/plotting.py | 80 ++++++++++++++++++++++++++++++++++-------------- devnotes.md | 54 +++++++++++++++++++------------- 2 files changed, 89 insertions(+), 45 deletions(-) diff --git a/code/plotting.py b/code/plotting.py index ff0e76f..faa3c92 100644 --- a/code/plotting.py +++ b/code/plotting.py @@ -799,35 +799,46 @@ def plot(self, parent, file, filtereddfs, groupsets): class plot_dendrogram(ui_plot): """ - Dendrogram generation, with a combo-box switcher (same pattern as - plot_ordination's method/view bar) between two purity-colored views: - - - "Technical Replicates": every injection is its own leaf, colored - green wherever an entire Sample's injections cluster together before - merging with anything else -- a quick visual QC for whether technical - replicates are tight. + Dendrogram generation, with combo-box switchers (same pattern as + plot_ordination's method/view bar) for which leaves to cluster and how + to color the branches: + + Views: + - "Technical Replicates": every injection is its own leaf -- branches + are judged for purity against Sample membership, a quick visual QC + for whether technical replicates are tight. 
- "Biological Replicates": injections are first averaged per Sample (same collapsing logic as the ordination tab's "Collapse Technical Replicates" checkbox, via ordination.load_ordination_matrix), then - leaves are colored green wherever an entire Biolgroup's samples - cluster together -- a quick visual QC for whether biological groups - are separable at all, independent of technical noise. - - Either view can be regular or bootstrapped (PvClust), depending on - parent.analysis_paramsgui.bootstrap, same as before this rework. The - purity-coloring math lives in the Qt-free clusterpurity.py. + leaves are Samples, judged for purity against Biolgroup -- a quick + visual QC for whether biological groups are separable at all, + independent of technical noise. + + Coloring: + - "Purity": green wherever a branch's leaves are entirely one group + (correctly clustered), red wherever a branch mixes more than one + group (polyphyletic). + - "None": plain black dendrogram, no purity coloring or title -- the + tab's original (pre-purity-coloring) appearance. + + Either view/coloring combination can be regular or bootstrapped + (PvClust), depending on parent.analysis_paramsgui.bootstrap, same as + before this rework. The purity-coloring math lives in the Qt-free + clusterpurity.py. """ VIEWS = ('Technical Replicates', 'Biological Replicates') + COLOR_MODES = ('Purity', 'None') def __init__(self, parent, currplt, frame, file, filtereddfs, groupsets): ui_plot.__init__(self, parent, currplt, frame) self.parent = parent self.currplt = currplt - # Default matches the plot's previous (injection-level) behaviour - # exactly, so existing sessions see no change until they explicitly - # switch to the biological-replicate view. + # Defaults match the plot's previous (injection-level, uncolored) + # behaviour exactly, so existing sessions see no change until they + # explicitly switch the new controls. 
self.view = 'Technical Replicates' + self.color_mode = 'Purity' self._build_switcher_bar(parent, currplt) self.plot(parent, file, filtereddfs, groupsets) @@ -844,15 +855,27 @@ def _build_switcher_bar(self, parent, currplt): view_combo.setCurrentText(self.view) view_combo.currentTextChanged.connect(self._on_view_changed) layout.addWidget(view_combo) + + layout.addWidget(QtWidgets.QLabel('Color:')) + color_combo = QtWidgets.QComboBox() + color_combo.addItems(self.COLOR_MODES) + color_combo.setCurrentText(self.color_mode) + color_combo.currentTextChanged.connect(self._on_color_mode_changed) + layout.addWidget(color_combo) layout.addStretch() self.view_combo = view_combo + self.color_combo = color_combo parent.pltlayout[currplt].insertWidget(0, bar) def _on_view_changed(self, view): self.view = view self.reset(self._last_file, self._last_filtereddfs, self._last_groupsets) + def _on_color_mode_changed(self, color_mode): + self.color_mode = color_mode + self.reset(self._last_file, self._last_filtereddfs, self._last_groupsets) + def plot(self, parent, file, filtereddfs, groupsets): self._last_file = file self._last_filtereddfs = filtereddfs @@ -895,17 +918,28 @@ def plot(self, parent, file, filtereddfs, groupsets): if parent.analysis_paramsgui.bootstrap: # bootstrap dendrogram pv = PvClust(data_for_pvclust, method="ward", metric="euclidean", nboot=1000, parallel=True) - link_color_func = clusterpurity.purity_link_color_func(pv.linkage_matrix, leaf_labels) - dend = pv.plot(parent.ax[self.currplt], labels=textlabels, link_color_func=link_color_func) Z = pv.linkage_matrix else: # regular dendrogram Z = shc.linkage(data_for_linkage, method='ward') - link_color_func = clusterpurity.purity_link_color_func(Z, leaf_labels) - dend = shc.dendrogram(Z, ax=parent.ax[self.currplt], leaf_rotation=90, above_threshold_color='black', link_color_func=link_color_func, labels=textlabels) # default leaf label size 16 - n_pure, n_total = clusterpurity.purity_summary(Z, leaf_labels) - 
parent.ax[self.currplt].set_title(f'{n_pure}/{n_total} {purity_noun}', fontsize=10) + if self.color_mode == 'Purity': + # Green = monophyletic (correctly clustered); red = polyphyletic + # (mixes more than one group). + link_color_func = clusterpurity.purity_link_color_func(Z, leaf_labels, true_color='green', false_color='red') + else: + link_color_func = None # plain black dendrogram, scipy's own default + + if parent.analysis_paramsgui.bootstrap: + dend = pv.plot(parent.ax[self.currplt], labels=textlabels, link_color_func=link_color_func) + else: + dend = shc.dendrogram(Z, ax=parent.ax[self.currplt], leaf_rotation=90, color_threshold=0, above_threshold_color='black', link_color_func=link_color_func, labels=textlabels) # default leaf label size 16 + + if self.color_mode == 'Purity': + n_pure, n_total = clusterpurity.purity_summary(Z, leaf_labels) + parent.ax[self.currplt].set_title(f'{n_pure}/{n_total} {purity_noun}', fontsize=10) + # "None" coloring intentionally leaves no title -- this tab had none + # before purity coloring was added. parent.fig[self.currplt].subplots_adjust( left=0.1, right=0.95, bottom=0.35, top=0.9, hspace=0.2, wspace=0.2) diff --git a/devnotes.md b/devnotes.md index 912e641..69eac0a 100644 --- a/devnotes.md +++ b/devnotes.md @@ -283,28 +283,38 @@ only handles the combo boxes, axes, and pick events. ## Dendrogram purity coloring (`plotting.plot_dendrogram`, `clusterpurity.py`) -The dendrogram tab has a combo-box switcher (same runtime-widget-substitution -pattern as `plot_ordination`'s method/view bar) between two views, both -purity-colored to make a QC judgment visible at a glance rather than read off -leaf labels one at a time: - -- **Technical Replicates** (default — matches the tab's previous, only, - behaviour): every Injection is its own leaf. A branch is colored green - wherever *all* of one Sample's injections merge together before merging - with anything else (i.e. 
that Sample is a monophyletic clade) — a - tight green clump means that sample's replicates agree; black means they - don't. -- **Biological Replicates**: technical replicates are averaged first (same - `ordination.load_ordination_matrix(..., collapse_replicates=True)` used by - the multivariate tab's checkbox), so leaves are Samples, and purity is - judged against Biolgroup instead — green means a whole biological group's - samples cluster together before meeting another group, i.e. the groups are - separable; black means they're not. - -The plot title reports `n_pure/n_total` (e.g. "7/9 samples' replicates -clustered together", "3/3 biological groups separable") using -`clusterpurity.purity_summary()` — the same Qt-free linkage-traversal logic -that drives the coloring, unit-tested in `tests/test_clusterpurity.py`. +The dendrogram tab has two combo-box switchers (same runtime-widget- +substitution pattern as `plot_ordination`'s method/view bar): + +- **View** — which leaves to cluster: + - **Technical Replicates** (default — matches the tab's original + behaviour): every Injection is its own leaf, purity judged against + Sample membership — a tight monophyletic clump means that sample's + replicates agree. + - **Biological Replicates**: technical replicates are averaged first + (same `ordination.load_ordination_matrix(..., collapse_replicates=True)` + used by the multivariate tab's checkbox), so leaves are Samples, purity + judged against Biolgroup instead — a monophyletic clade means a whole + biological group's samples cluster together before meeting another + group, i.e. the groups are separable. +- **Color** — how to render purity: + - **Purity** (default): green wherever a branch's leaves are entirely one + group (correctly clustered), red wherever a branch mixes more than one + group (polyphyletic) — a QC judgment visible at a glance rather than + read off leaf labels one at a time. The plot title reports + `n_pure/n_total` (e.g. 
"7/9 samples' replicates clustered together", + "3/3 biological groups separable") via `clusterpurity.purity_summary()`. + - **None**: plain black, no title — deliberately reproduces the tab's + appearance from *before* purity coloring existed (there was no title at + all previously), for anyone who just wants the dendrogram shape without + the QC overlay. Implemented as `link_color_func=None` with + `color_threshold=0` still set (dropping `color_threshold=0` here was a + real regression caught while testing: without it, scipy falls back to + its own default 0.7-of-max-height threshold and a multi-color palette + instead of plain black). + +Both views' purity math is the same Qt-free linkage-traversal logic in +`clusterpurity.py`, unit-tested in `tests/test_clusterpurity.py`. - **Purity is a strict, whole-group check, not "any uniform subset"**: a label only counts as pure if *every* leaf carrying it ends up in one clade From bab713d05d38d87e5a40f5838ffdd66732db1052 Mon Sep 17 00:00:00 2001 From: Robert Samples Date: Mon, 29 Jun 2026 18:16:19 -0400 Subject: [PATCH 10/20] Replace treemap/upset PNG round-trip with real canvas plots gen_treemap/gen_upsetplt used to savefig() a PNG to the repo root and load it into a QLabel via QPixmap -- no zoom/pan/save toolbar, and a flat raster rewritten on every run. plot_treemap/plot_upset now draw directly onto a persistent FigureCanvas, wired into _generate_plots() via the same _create_or_reset pattern every other plot uses, so they regenerate on both a fresh analysis and the Apply button (previously only on a fresh analysis). 
Co-Authored-By: Claude Sonnet 4.6 --- code/main.py | 23 ++--- code/plotting.py | 263 ++++++++++++++++++++++++++++------------------- devnotes.md | 49 +++++++++ 3 files changed, 217 insertions(+), 118 deletions(-) diff --git a/code/main.py b/code/main.py index fc91309..418e2bd 100644 --- a/code/main.py +++ b/code/main.py @@ -41,7 +41,7 @@ from biogroups import compute_biological_groups from dbsearch import search_npatlas from searchtree import SearchTreePanel -from plotting import plot_abund, show_spectrum, show_featureplt, plot_heatmap, plot_mzrt, plot_samplecorr, kendrick, plot_volcano, plot_fc3d, plot_dendrogram, plot_ordination, prev_cv, gen_upsetplt, gen_treemap +from plotting import plot_abund, show_spectrum, show_featureplt, plot_heatmap, plot_mzrt, plot_samplecorr, kendrick, plot_volcano, plot_fc3d, plot_dendrogram, plot_ordination, prev_cv, plot_upset, plot_treemap import getfragdb from indigo import Indigo @@ -1056,6 +1056,11 @@ def _generate_plots(self): dfs = self.filtereddfs grpsts = self.groupsets + self._create_or_reset('treemap', 'treemap', + lambda: plot_treemap(self, 'treemap', self.ui.frame_treemap, pltfile, '', ''), + lambda: self.treemap.reset(pltfile, '', '')) + stop_functime('treemap complete') + if params.CVfil: self._create_or_reset('prevcv', 'CV plot', lambda: prev_cv(self, 'cvplt', self.ui.frame_cvplt, 'none', 'none', 'none'), @@ -1119,6 +1124,11 @@ def _generate_plots(self): lambda: self.samplecorr.reset(iondictfile, dfs, grpsts)) stop_functime('samplecorr complete') + self._create_or_reset('upset', 'upset plot', + lambda: plot_upset(self, 'upset', self.ui.frame_upset, iondictfile, '', ''), + lambda: self.upset.reset(iondictfile, '', '')) + stop_functime('upsetplt complete') + def run_analysis(self): # Ignore re-clicks while an analysis is already running on the worker thread. 
if getattr(self, '_analysis_thread', None) is not None and self._analysis_thread.isRunning(): @@ -1171,12 +1181,6 @@ def _on_compute_finished(self): self.ui.btn_run.setEnabled(True) def _finish_analysis(self): - try: - gen_treemap(self) # move back to end - except Exception: - print("not generating tremap due to an error") - stop_functime('treemap complete') - # Used for point opacity based on abundance colouring iondict = cached_read_csv(self.analysis_paramsgui.outputdir / 'iondict.csv', sep=',', header=[0], index_col=None) self.analysis_paramsgui.maxval = iondict['logmax'].max() @@ -1224,11 +1228,6 @@ def _finish_analysis(self): self.fillfttree() self.dbsearchdone = True - try: - gen_upsetplt(self) - except Exception: - print("not generating upset plot due to an error") - stop_functime('upsetplt complete') self.ui.label_status.setText('Analysis Complete') stop_functime('analysis complete') print('') diff --git a/code/plotting.py b/code/plotting.py index faa3c92..06d1b7f 100644 --- a/code/plotting.py +++ b/code/plotting.py @@ -31,7 +31,7 @@ import platform from PyQt5 import QtCore, QtGui, QtWidgets from PyQt5.QtCore import (QCoreApplication, QPropertyAnimation, QDate, QDateTime, QMetaObject, QObject, QPoint, QRect, QSize, QTime, QUrl, Qt, QEvent) -from PyQt5.QtGui import (QBrush, QColor, QIcon, QPalette, QPainter, QPixmap) +from PyQt5.QtGui import (QBrush, QColor, QIcon, QPalette, QPainter) from PyQt5.QtWidgets import * from pathlib import Path @@ -1361,114 +1361,165 @@ def plot(self, parent, file, filtereddfs, groupsets): parent.canvas[currplt].draw() -def gen_upsetplt(parent): #need to do something to handle groups with names that are substrings of other group names +def _detach_placeholder_widget(frame, old_widget): + """Remove a Designer-placed placeholder widget (and the layout holding + it) from ``frame`` so a fresh layout can be installed in its place. 
+ + Most plot frames in this app start out empty in Designer, so + ``ui_plot.__init__`` can just call ``frame.setLayout(...)`` directly. + ``frame_treemap``/``frame_upset`` are the exception -- Designer gave + them a layout with a placeholder ``QLabel`` (the old static-image + target) already in it, and Qt refuses ``setLayout()`` on a frame that + already has one. Reparenting the old layout onto a throwaway widget + (the standard Qt trick for "delete this layout") detaches it from + ``frame`` without touching anything else -- same runtime + widget-substitution pattern as searchtree.py's filter-bar swap. + """ + old_layout = frame.layout() + if old_layout is not None: + old_layout.removeWidget(old_widget) + old_widget.setParent(None) + old_widget.deleteLater() + QtWidgets.QWidget().setLayout(old_layout) + + +class plot_treemap(ui_plot): + """Treemap of how many features each enabled filter removed. + + Drawn directly onto a persistent FigureCanvas (same runtime-widget- + substitution + ui_plot pattern as every other plot tab) instead of the + previous ``squarify.plot()`` -> ``savefig('treemap.png')`` -> ``QPixmap`` + round trip into a Designer-placed ``QLabel`` (``label_treemap``) -- that + PNG round trip meant no zoom/pan/save-at-resolution toolbar, and a flat + raster file rewritten at the repo root on every run. """ - Generate an upset plot to visualize sets of compounds in groups. This function also handles groups with names that are substrings of other group names. - Parameters: - parent (object): The parent object that the generated plot will be a child of. 
+ def __init__(self, parent, currplt, frame, file, filtereddfs, groupsets): + _detach_placeholder_widget(frame, parent.ui.label_treemap) + ui_plot.__init__(self, parent, currplt, frame) + self.parent = parent + self.currplt = currplt + self.plot(parent, file, filtereddfs, groupsets) - Returns: - None - """ - iondict = cached_read_csv(parent.analysis_paramsgui.outputdir / 'iondict.csv', sep=',', header=0, index_col=None) - - # Apply filters if required - if parent.analysis_paramsgui.relfil: - iondict = iondict[iondict['pass_relfil']] - if parent.analysis_paramsgui.decon: - iondict = iondict[iondict['pass_insource']] - if parent.analysis_paramsgui.blnkfltr: - iondict = iondict[iondict['pass_blnkfil']] - if parent.analysis_paramsgui.CVfil: - iondict = iondict[iondict['pass_cvfil']] - - # Prepare data for upset plot - iongroups = iondict['groups'].tolist() - freq = {} - biolgroups = [] - for item in iongroups: - if item not in freq: - freq[item] = 0 - freq[item] += 1 - - header = cached_read_csv(parent.analysis_paramsgui.outputdir / (parent.analysis_paramsgui.filename.stem + '_filtered.csv'), sep=',', header=None, index_col=[0, 1, 2]).iloc[0, :] - for elem in header: - if elem not in biolgroups: - biolgroups.append(elem) - - sets = [' ' + elem for elem in list(freq.keys())] - size = list(freq.values()) - setdf = pd.DataFrame({'groups': sets}) - for elem in biolgroups: #have to do this if one group is a substring of another, add space - setdf[elem] = setdf['groups'].str.contains(' ' + elem) - setdf['size'] = size - setdf = setdf.iloc[:, 1:] - setdf = setdf.set_index(biolgroups)['size'] - - # Plot and display the upset plot - with plt.rc_context({"font.size": 8}): - upsetplt = upsetplot.plot(setdf, show_counts='%d', show_percentages=True, sort_categories_by=None) - - figup = upsetplt['matrix'].figure - figup.set_size_inches(5, 4) - figup.set_facecolor((0, 0, 0, 0)) - upsetplt['intersections'].set_facecolor((1, 1, 1, .25)) - figup.savefig('test_upsetplt.png', dpi=150, 
bbox_inches='tight') - pixmap = QPixmap('test_upsetplt.png') - parent.ui.label_upset.setPixmap(pixmap) - -def gen_treemap(parent): - #generate treemap for visualization of filtering levels - #needed to refilter data and see how df row lengths change to avoid issues with one feature being in multiple filter lists - """ - The gen_treemap function generates a treemap for visualizing filtering levels. The function reads a CSV file containing the - filtered data and another CSV file containing information about the ions. The function then filters the ion data based on - various filter options and calculates the number of ions filtered by each filter. Finally, the function generates a treemap - to display the number of ions that passed each filter and saves it as a PNG file. The treemap is then displayed in a QLabel in the GUI. + def plot(self, parent, file, filtereddfs, groupsets): + msdata_filtered = cached_read_csv( + parent.analysis_paramsgui.outputdir / (parent.analysis_paramsgui.filename.stem + '_filtered.csv'), + sep=',', header=[0, 1, 2], index_col=[0, 1, 2]) + iondict = cached_read_csv(parent.analysis_paramsgui.outputdir / 'iondict.csv', sep=',', header=[0], index_col=[0]) - Args: - - parent: the parent widget where the treemap will be displayed + fltrcnt, color = {}, [] + current = len(iondict.index) + + if parent.analysis_paramsgui.relfil: + filteredsetsize = len(iondict[iondict['pass_relfil']].index) + fltrcnt['Mispicked'] = current - filteredsetsize + current = filteredsetsize + color.append('#0000ff') + + if parent.analysis_paramsgui.blnkfltr: + filteredsetsize = len(iondict[iondict['pass_blnkfil']].index) + fltrcnt['Blank'] = current - filteredsetsize + current = filteredsetsize + color.append('#00aaaa') + + if parent.analysis_paramsgui.CVfil: + fltrcnt['Nonreproducible'] = len(parent.ionfilters['cv'].ions) + current = current - fltrcnt['Nonreproducible'] + color.append('#ff0000') + + if parent.analysis_paramsgui.decon: + fltrcnt['Insource'] = 
len(parent.ionfilters['insource'].ions) + color.append('#00aa00') + + fltrcnt['High Quality'] = len(msdata_filtered.index) + color.append('#000000') + + sizes = list(fltrcnt.values()) + total_size = sum(sizes) + labels = [f"{label}\n{size}\n{round(100 * size / total_size, 1)}%" for label, size in fltrcnt.items()] + + ax = parent.ax[self.currplt] + ax.clear() + squarify.plot(sizes=sizes, label=labels, color=color, alpha=0.3, text_kwargs={'fontsize': 10}, ax=ax) + ax.axis('off') + parent.canvas[self.currplt].draw() + + +class plot_upset: + """Upset plot of how filtered features distribute across groupsets. + Drawn directly onto a persistent Figure -- ``upsetplot.plot()`` accepts + an existing ``fig=`` instead of always creating its own -- rather than + the previous ``upsetplot.plot()`` -> ``savefig('test_upsetplt.png')`` -> + ``QPixmap`` round trip into the Designer-placed ``label_upset``. + + Doesn't subclass ``ui_plot``, the same as ``plot_heatmap`` and for the + same reason: ``upsetplot`` lays out several axes (matrix, totals, + intersections, shading) on the figure itself via its own gridspec -- + there's no single "ax" to hand callers the way every scatter/line plot + here has, so ``ui_plot.__init__``'s single pre-made ``ax`` would just be + an unused, overlapping blank axes. 
""" - plt.clf() - msdata_filtered = cached_read_csv(parent.analysis_paramsgui.outputdir / (parent.analysis_paramsgui.filename.stem + '_filtered.csv'), sep=',', header=[0, 1, 2], index_col=[0, 1, 2]) - fltrcnt, color = {}, [] - iondict = cached_read_csv(parent.analysis_paramsgui.outputdir / 'iondict.csv', sep=',', header=[0], index_col=[0]) - total = len(iondict.index) - current = total - - if parent.analysis_paramsgui.relfil: - filteredsetsize = len(iondict[iondict['pass_relfil']].index) - fltrcnt['Mispicked'] = current - filteredsetsize - current = filteredsetsize - color.append('#0000ff') - - if parent.analysis_paramsgui.blnkfltr: - filteredsetsize = len(iondict[iondict['pass_blnkfil']].index) - fltrcnt['Blank'] = current - filteredsetsize - current = filteredsetsize - color.append('#00aaaa') - - if parent.analysis_paramsgui.CVfil: - fltrcnt['Nonreproducible'] = len(parent.ionfilters['cv'].ions) - current = current - fltrcnt['Nonreproducible'] - color.append('#ff0000') - - if parent.analysis_paramsgui.decon: - fltrcnt['Insource'] = len(parent.ionfilters['insource'].ions) - color.append('#00aa00') - - fltrcnt['High Quality'] = len(msdata_filtered.index) - color.append('#000000') - - sizes = list(fltrcnt.values()) - total_size = sum(fltrcnt.values()) - labels = [f"{label}\n{size}\n{round(100*size/total_size,1)}%" for label, size in fltrcnt.items()] - - squarify.plot(sizes=sizes, label=labels, color=color, alpha=0.3, text_kwargs={'fontsize': 10}) - plt.axis('off') - plt.savefig('treemap.png', dpi=150, bbox_inches='tight') - pixmap = QPixmap('treemap.png') - parent.ui.label_treemap.setPixmap(pixmap) \ No newline at end of file + + def __init__(self, parent, currplt, frame, file, filtereddfs, groupsets): + _detach_placeholder_widget(frame, parent.ui.label_upset) + self.parent = parent + self.currplt = currplt + + parent.fig[currplt] = Figure() + parent.pltlayout[currplt] = QtWidgets.QVBoxLayout() + parent.canvas[currplt] = FigureCanvas(parent.fig[currplt]) + 
parent.pltlayout[currplt].addWidget(parent.canvas[currplt]) + parent.toolbar[currplt] = NavigationToolbar(parent.canvas[currplt], parent) + parent.toolbar[currplt].setStyleSheet("background-color:rgba(225,225,225,0);") + parent.pltlayout[currplt].addWidget(parent.toolbar[currplt]) + frame.setLayout(parent.pltlayout[currplt]) + + self.plotbackground = (.89, .89, .89, 0) + self.plot(parent, file, filtereddfs, groupsets) + + def plot(self, parent, file, filtereddfs, groupsets): + # upsetplot.plot() lays out fresh axes via its own gridspec on + # whatever figure it's given -- clear the figure first so repeated + # calls (regenerate/Apply) don't pile up axes on top of each other. + parent.fig[self.currplt].clf() + + iondict = cached_read_csv(parent.analysis_paramsgui.outputdir / 'iondict.csv', sep=',', header=0, index_col=None) + if parent.analysis_paramsgui.relfil: + iondict = iondict[iondict['pass_relfil']] + if parent.analysis_paramsgui.decon: + iondict = iondict[iondict['pass_insource']] + if parent.analysis_paramsgui.blnkfltr: + iondict = iondict[iondict['pass_blnkfil']] + if parent.analysis_paramsgui.CVfil: + iondict = iondict[iondict['pass_cvfil']] + + iongroups = iondict['groups'].tolist() + freq = {} + for item in iongroups: + freq[item] = freq.get(item, 0) + 1 + + header = cached_read_csv( + parent.analysis_paramsgui.outputdir / (parent.analysis_paramsgui.filename.stem + '_filtered.csv'), + sep=',', header=None, index_col=[0, 1, 2]).iloc[0, :] + biolgroups = [] + for elem in header: + if elem not in biolgroups: + biolgroups.append(elem) + + sets = [' ' + elem for elem in freq.keys()] + setdf = pd.DataFrame({'groups': sets}) + for elem in biolgroups: # space-prefix handles one group name being a substring of another + setdf[elem] = setdf['groups'].str.contains(' ' + elem) + setdf['size'] = list(freq.values()) + setdf = setdf.iloc[:, 1:].set_index(biolgroups)['size'] + + with plt.rc_context({"font.size": 8}): + upsetplt = upsetplot.plot(setdf, 
fig=parent.fig[self.currplt], show_counts='%d', show_percentages=True, sort_categories_by=None) + + parent.fig[self.currplt].set_facecolor(self.plotbackground) + upsetplt['intersections'].set_facecolor((1, 1, 1, .25)) + parent.canvas[self.currplt].draw() + + def reset(self, file, filtereddfs, groupsets): + self.plot(self.parent, file, filtereddfs, groupsets) \ No newline at end of file diff --git a/devnotes.md b/devnotes.md index 69eac0a..9b496b7 100644 --- a/devnotes.md +++ b/devnotes.md @@ -352,6 +352,55 @@ Both views' purity math is the same Qt-free linkage-traversal logic in guards its entry point — but throwaway test scripts need the same discipline. +## Treemap / upset plot canvases (`plotting.plot_treemap`, `plotting.plot_upset`) + +These two tabs used to be the only plots in the app that weren't real +matplotlib canvases: `gen_treemap`/`gen_upsetplt` (free functions, not +`ui_plot` subclasses) drew with `squarify`/`upsetplot`, `savefig()`'d a PNG +to the repo root (`treemap.png`/`test_upsetplt.png`), then loaded that PNG +into a `QPixmap` on the Designer-placed `label_treemap`/`label_upset`. That +meant no zoom/pan/save-at-resolution toolbar, a flat raster rewritten from +scratch on every run, and files left sitting at the repo root. + +Both are now `ui_plot`-style classes (`plot_treemap`/`plot_upset`) drawing +directly onto a persistent `FigureCanvas`, wired into `MainWindow._generate_plots()` +via `_create_or_reset()` exactly like every other plot — so they're created +once and `.reset()` afterward, regenerating on both a fresh analysis run and +the dialog's "Apply" button (`regenerateplts()`), the same as every other +plot. They previously were NOT regenerated by Apply at all (`gen_treemap`/ +`gen_upsetplt` were only ever called once, directly from `_finish_analysis`) +— a small behavior change, but one that brings them in line with how every +other plot already worked, not a new inconsistency. 
+ +- **`frame_treemap`/`frame_upset` needed a different substitution trick**: + unlike most plot frames (empty in Designer, so `ui_plot.__init__` can just + call `frame.setLayout(...)`), these two already have a Designer-built + layout holding the old placeholder `QLabel` — Qt refuses `setLayout()` on + a frame that already has one. `plotting._detach_placeholder_widget()` + removes the old label and reparents the old layout onto a throwaway + widget (the standard Qt "delete this layout" trick) before the normal + `ui_plot.__init__`/manual canvas setup runs — same runtime + widget-substitution pattern as `searchtree.py`'s filter-bar swap, just + with an extra detach step first. Verified headlessly (offscreen Qt) that + this doesn't raise and the frame ends up with exactly the new layout. +- **`plot_upset` doesn't subclass `ui_plot`**, same as `plot_heatmap` and for + the same reason: `upsetplot.plot()` lays out several axes (matrix, + totals, intersections, shading) via its own gridspec on whatever figure + it's given — there's no single "ax" to hand callers the way every + scatter/line plot here has. Unlike `plot_heatmap` (which has to transplant + axes from a brand-new seaborn figure onto the persistent one, since + `sns.clustermap()` doesn't accept an existing figure), `upsetplot.plot()` + takes a `fig=` kwarg directly — `reset()` just `fig.clf()`s and re-plots + onto the same figure, no axes-transplant needed. +- **Verified against real data, not just import-checked**: a throwaway + headless-Qt script built fake Designer-style frames (QFrame + layout + + placeholder QLabel, matching `frame_treemap`/`frame_upset`'s actual + structure), ran both classes against the real example dataset, and + asserted (1) the new canvas/toolbar actually replaced the old layout, (2) + axes count is stable across `.reset()` calls (not growing — would mean + old axes/figures were leaking), and (3) no PNG got written to disk by + either plot anymore. 
+ ## Conventions - Don't edit the generated UI files (above). Put behaviour in `main.py` / From 3be44dc3db4088966ac78e3ae527663de7305bfb Mon Sep 17 00:00:00 2001 From: Robert Samples Date: Mon, 29 Jun 2026 18:34:35 -0400 Subject: [PATCH 11/20] Dendrogram: bridge-only red coloring; move bootstrap/collapse checkboxes to per-plot bars - clusterpurity.purity_link_color_func now distinguishes pure (green), bridge (red -- the specific merge where a new group first meets an existing one), and neutral (black -- combining two already-impure clades, no new information). Previously every ancestor of a single mixing event also rendered red, painting most of the tree's upper structure red regardless of how localized the actual mixing was. - "Bootstrap Analysis" and "Collapse Technical Replicates" moved off the global plot-config dialog (where each only ever affected one plot) onto that plot's own switcher bar: plot_dendrogram gets a "Bootstrap" checkbox, plot_ordination gets a "Collapse Replicates" checkbox. The now-orphaned dialog widgets are hidden at runtime (not edited out of the generated ui_plotparam.py); 'bootstrap' is dropped from paramfields.CHECKBOX_FIELDS since it's no longer pickled, consistent with the dendrogram/ordination tabs' other per-session-only view state. - Delete code/treemap.png and code/test_upsetplt.png: dead tracked files left over from before the canvas-based rendering change -- nothing reads or writes them anymore. 
Co-Authored-By: Claude Sonnet 4.6 --- code/clusterpurity.py | 34 ++++++++++++++++++---- code/paramfields.py | 1 - code/plotting.py | 48 +++++++++++++++++++++++++------ code/test_upsetplt.png | Bin 41302 -> 0 bytes code/tests/test_clusterpurity.py | 43 +++++++++++++++++++++++---- code/treemap.png | Bin 29414 -> 0 bytes code/ui_functions.py | 9 +++++- devnotes.md | 36 +++++++++++++++++++++++ 8 files changed, 150 insertions(+), 21 deletions(-) delete mode 100644 code/test_upsetplt.png delete mode 100644 code/treemap.png diff --git a/code/clusterpurity.py b/code/clusterpurity.py index afcebbd..acf8657 100644 --- a/code/clusterpurity.py +++ b/code/clusterpurity.py @@ -14,9 +14,25 @@ """ -def purity_link_color_func(Z, leaf_labels, true_color='green', false_color='black'): +def purity_link_color_func(Z, leaf_labels, true_color='green', false_color='red', neutral_color='black'): """Build a ``link_color_func`` for ``scipy.cluster.hierarchy.dendrogram``. + Three-way coloring, not just pure-vs-not: + + - ``true_color`` ("pure"/monophyletic): every leaf under this link + shares one label. + - ``false_color`` ("bridge"): this link is impure, but at least one of + its two children was itself pure (a single leaf counts as trivially + pure) -- this is the *specific* merge where a different label first + gets bridged in, i.e. exactly the "bridge sample"/"two groups meet + here" point. + - ``neutral_color``: this link is impure AND both children were already + impure -- i.e. it's just continuing an already-known mix further up + the tree, not new information. Without this third state, every + ancestor of a single bridge point would also render in + ``false_color``, painting most of the upper tree the "bad" color even + though only one merge actually caused it. 
+ Args: Z: linkage matrix (``scipy.cluster.hierarchy.linkage`` or fastcluster's drop-in) built on observations in the same order @@ -27,20 +43,26 @@ def purity_link_color_func(Z, leaf_labels, true_color='green', false_color='blac Returns: callable: ``link_color_func(k)`` as expected by ``dendrogram``'s - ``link_color_func`` argument -- for link index ``k`` - (``len(leaf_labels) <= k``), returns ``true_color`` if every leaf - descending from that link shares one label, else ``false_color``. + ``link_color_func`` argument. """ n_leaves = len(leaf_labels) leaf_label_sets = {i: {leaf_labels[i]} for i in range(n_leaves)} + is_pure = {i: True for i in range(n_leaves)} # every leaf is trivially pure colors = {} for i, row in enumerate(Z): a, b = int(row[0]), int(row[1]) node_id = n_leaves + i merged = leaf_label_sets[a] | leaf_label_sets[b] leaf_label_sets[node_id] = merged - colors[node_id] = true_color if len(merged) == 1 else false_color - return lambda k: colors.get(k, false_color) + pure = len(merged) == 1 + is_pure[node_id] = pure + if pure: + colors[node_id] = true_color + elif is_pure[a] or is_pure[b]: + colors[node_id] = false_color + else: + colors[node_id] = neutral_color + return lambda k: colors.get(k, neutral_color) def purity_summary(Z, leaf_labels): diff --git a/code/paramfields.py b/code/paramfields.py index fc76a77..eb0100e 100644 --- a/code/paramfields.py +++ b/code/paramfields.py @@ -23,7 +23,6 @@ CHECKBOX_FIELDS = ( ('PCA', ('ui', 'checkBox_pca')), ('Dendrogram', ('ui', 'checkBox_dend')), - ('bootstrap', ('dialog.ui', 'checkBox_bootstrap')), ('MZRTplt', ('ui', 'checkBox_mzrt')), ('KMD', ('ui', 'checkBox_kmd')), ('mdguide', ('dialog.ui', 'checkBox_mdguide')), diff --git a/code/plotting.py b/code/plotting.py index 06d1b7f..1467cd0 100644 --- a/code/plotting.py +++ b/code/plotting.py @@ -822,9 +822,10 @@ class plot_dendrogram(ui_plot): tab's original (pre-purity-coloring) appearance. 
Either view/coloring combination can be regular or bootstrapped - (PvClust), depending on parent.analysis_paramsgui.bootstrap, same as - before this rework. The purity-coloring math lives in the Qt-free - clusterpurity.py. + (PvClust) depending on the "Bootstrap" checkbox in this tab's own + switcher bar (formerly the plot-config dialog's global "Bootstrap + Analysis" checkbox -- moved here since it only ever applied to this + plot). The purity-coloring math lives in the Qt-free clusterpurity.py. """ VIEWS = ('Technical Replicates', 'Biological Replicates') @@ -836,9 +837,13 @@ def __init__(self, parent, currplt, frame, file, filtereddfs, groupsets): self.currplt = currplt # Defaults match the plot's previous (injection-level, uncolored) # behaviour exactly, so existing sessions see no change until they - # explicitly switch the new controls. + # explicitly switch the new controls. ``bootstrap`` defaults to True + # to match the checked-on-startup state the old global checkbox was + # forced to in ui_functions.py (its Designer default was actually + # False, overridden at runtime -- True is what users actually saw). 
self.view = 'Technical Replicates' self.color_mode = 'Purity' + self.bootstrap = True self._build_switcher_bar(parent, currplt) self.plot(parent, file, filtereddfs, groupsets) @@ -862,10 +867,16 @@ def _build_switcher_bar(self, parent, currplt): color_combo.setCurrentText(self.color_mode) color_combo.currentTextChanged.connect(self._on_color_mode_changed) layout.addWidget(color_combo) + + bootstrap_check = QtWidgets.QCheckBox('Bootstrap') + bootstrap_check.setChecked(self.bootstrap) + bootstrap_check.toggled.connect(self._on_bootstrap_toggled) + layout.addWidget(bootstrap_check) layout.addStretch() self.view_combo = view_combo self.color_combo = color_combo + self.bootstrap_check = bootstrap_check parent.pltlayout[currplt].insertWidget(0, bar) def _on_view_changed(self, view): @@ -876,6 +887,10 @@ def _on_color_mode_changed(self, color_mode): self.color_mode = color_mode self.reset(self._last_file, self._last_filtereddfs, self._last_groupsets) + def _on_bootstrap_toggled(self, checked): + self.bootstrap = checked + self.reset(self._last_file, self._last_filtereddfs, self._last_groupsets) + def plot(self, parent, file, filtereddfs, groupsets): self._last_file = file self._last_filtereddfs = filtereddfs @@ -915,7 +930,7 @@ def plot(self, parent, file, filtereddfs, groupsets): data_for_pvclust = data_scaled purity_noun = "samples' replicates clustered together" - if parent.analysis_paramsgui.bootstrap: + if self.bootstrap: # bootstrap dendrogram pv = PvClust(data_for_pvclust, method="ward", metric="euclidean", nboot=1000, parallel=True) Z = pv.linkage_matrix @@ -930,7 +945,7 @@ def plot(self, parent, file, filtereddfs, groupsets): else: link_color_func = None # plain black dendrogram, scipy's own default - if parent.analysis_paramsgui.bootstrap: + if self.bootstrap: dend = pv.plot(parent.ax[self.currplt], labels=textlabels, link_color_func=link_color_func) else: dend = shc.dendrogram(Z, ax=parent.ax[self.currplt], leaf_rotation=90, color_threshold=0, 
above_threshold_color='black', link_color_func=link_color_func, labels=textlabels) # default leaf label size 16 @@ -983,6 +998,10 @@ class plot_ordination(ui_plot): The actual math lives in the Qt-free ``ordination.py`` (PCA/NMDS/PLS-DA, technical-replicate collapsing, top-N loadings selection); this class is just the Qt plumbing and rendering on top of it. + + The switcher bar also has a "Collapse Replicates" checkbox (formerly the + plot-config dialog's global "Collapse Technical Replicates" checkbox -- + moved here since it only ever applied to this plot). """ METHODS = ('NMDS', 'PCA', 'PLS-DA') @@ -994,9 +1013,12 @@ def __init__(self, parent, currplt, frame, file, filtereddfs, groupsets): self.currplt = currplt # Defaults match the plot's previous (NMDS-only, scores-only) # behaviour exactly, so existing sessions see no change until they - # explicitly switch the new controls. + # explicitly switch the new controls. ``collapse_replicates`` + # defaults to True, matching the old global checkbox's Designer + # default. 
self.method = 'NMDS' self.view = 'Scores' + self.collapse_replicates = True self.loadings_df = None self._build_switcher_bar(parent, currplt) self.plot(parent, file, filtereddfs, groupsets) @@ -1021,10 +1043,16 @@ def _build_switcher_bar(self, parent, currplt): view_combo.setCurrentText(self.view) view_combo.currentTextChanged.connect(self._on_view_changed) layout.addWidget(view_combo) + + collapse_check = QtWidgets.QCheckBox('Collapse Replicates') + collapse_check.setChecked(self.collapse_replicates) + collapse_check.toggled.connect(self._on_collapse_replicates_toggled) + layout.addWidget(collapse_check) layout.addStretch() self.method_combo = method_combo self.view_combo = view_combo + self.collapse_check = collapse_check parent.pltlayout[currplt].insertWidget(0, bar) def _on_method_changed(self, method): @@ -1035,6 +1063,10 @@ def _on_view_changed(self, view): self.view = view self.reset(self._last_file, self._last_filtereddfs, self._last_groupsets) + def _on_collapse_replicates_toggled(self, checked): + self.collapse_replicates = checked + self.reset(self._last_file, self._last_filtereddfs, self._last_groupsets) + def plot(self, parent, file, filtereddfs, groupsets): """(Re)draw the ordination plot for the current method/view. @@ -1050,7 +1082,7 @@ class follows). 
self._last_filtereddfs = filtereddfs self._last_groupsets = groupsets - collapse_replicates = parent.dialog.ui.checkBox_collapsereps.isChecked() + collapse_replicates = self.collapse_replicates raw_header = cached_read_csv( parent.analysis_paramsgui.outputdir / (parent.analysis_paramsgui.filename.stem + '_filtered.csv'), sep=',', header=None, index_col=[0, 1, 2]).iloc[:3, :].transpose() diff --git a/code/test_upsetplt.png b/code/test_upsetplt.png deleted file mode 100644 index da8893e51d26552daeb18fb94e66721d2aa467af..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 41302 zcmd?RXIK>L)-75M3JL-?2&hCG5y=Vyl2KbwK+pyW5>=8!C1=`zHlQE~C_z9G5J{3H zVaF>XIp-K-mfux1B^s)uR0x95 zT)u=SAP8AA{4(yNfL9nUI=q6vBpom6IBM9LIlA1mH$|@8bi8YA=V*QB7N@hRy~7(?}sJh3JzQd<*xZvLttdeR&S3~fjAynL#hhAL@y(G0i%v+L1 zlJPvQ(v&oO_IT~RsLaf-BP*37g-%61UDG{9>e=lJbDuc6(kC3e(pS8SdR8{p^=Bp= z{0LNd^e5EAaGI7#f}fLRw?wX>FPZS1Xh#3WJO7_}A=DiA^E_UYDd@4;)UZj@i8kd} z-6e_c+y)^lxqqD5^hqwn`lHVB%kO=>2zypmZRABHQAa@fj*&%RrAZ`u@~mSY36^@a zHX=3_x9MX8@`Nfy=y2$d^y!Eb=-*0_$8ck~kmI;A(j^9d8#4T0D#?UY#fx3j#QU2! z^=aTjaJcr1$8X?coXlnmzW@EW{UbaQi~QQxk0bISy~xLFcqzUF@6(-Ar`Z#w#)px| zhy^!pDU48U=CzF&a}#}#P_jn4atRzkmv<@sMQqR&Jnzu-Ve||~X^-PBVkXJ&UgmZ3 z5uYkNZ)C9=Nf;ksUDv0LV8dg}qJxRcBkeJ1cHB=)L5`X= zR}6ZLj{8$Cu*Q*}`g>L8^aJK8-K zvqrDaYRn!C1Op{ zsKT4uZ1G|bCwsY7b&;R;`p5r*Ki-Sax>k%IgW~}MZoEuY@1UC3O&R!Q! 
zzz=>T6zfquswPnAy|BE6T|uwd+SI`X!Ov zP(pj!q?I`FhjQ|8$9EIz-W*@jNuPamR8)Ki;(Uw9kG+-FPP>u2&LQP^lKa@P-?*m@ zEIvLyJ8v%rxW`I5RP?=d+dF_kNdEqOzo({T`DJ80*FHa_EIVc8^364;%C@hK!j+4Q zi^H(MKEbZYdCFVD#wbNAZF_ER&ZnZXat1@Je=Xf$lxx|lfb@}36fb{Ilv0MhTi{h-GhP+FtRfCi=mC*~j5I`?=)Kojac)5{d-dU(5TH z<^=^c@CgfNIGsCp&LQ&5oq4>y{k)*tpdSN|<;ME@37hUurv=WQUA2Pu7;SBChIczO z$Yy;1_U%x;jf|1!+{FTwmqM-SI+=ggkBV8odllZEs_Wq6>FK#8(dU_65h-Tnzdc_g zY;fVig&i_tPQm>73amY`Y)0^m(o9Lrx|f7Zq*JxcT+5L%wa#`okvi zM7M?Eu_$pH_q8x>!(<7E;aUrp+jD`>p4qS*I^^j2<oQ%*oTIPs^M-HS2im)*lDu%H5dVg;6F?Tug25Kt)JB#{R?l8%FD~~386Ev zy#&ptK0PBSBjq(U6T`m#Z{EC7O3SUQtu0QoQS;tfYNy48h}(955|hl8^;*xFfqP*X zL?W%G8_{7x=G#aJ*d?DaMbSM4kzVi*dIz|JwmSe*E|X`Djud zk|mk@%-`RV9D$`P)_u^FWmJ3aLu z*<^9DLzDt(%P{iEc8qrzsZaayQdmCmc4goc1ug5ULJ+%bAkR!B@_n^U(L7{uc&U*ku|giL2CUP{*cq?nw_1MC@wDc zel1~Vc&FxUug6SDo-*&u@>G|(?w^|?Uh9h=U%h(8PEJW1O~|omI)IG!6s^c_Z*R|S zZLDyJ7?%^>;YJ=N<76ur52#}yBVAKAs2@^sqVB~6lTLCR4Bi50@XgRKm`^Rbje@9! z#VAi{X=!_XjO+#zQoPirCuUX`*(!YcbmpxaH*S0+y<_Gj46Z(;ll$cU-Jh{Wa3nr3 zAYj2`GP81>Rm$;w>(WH~1!_h<$2T`}Ehn_nZiwYucVyj3bPnU!W@l$+Zik>On`%nY z&d472+CxF%of0~KMOk?~xMz}MY;5eLLE$0~w@SDS6AOz+&BYK{%rq&d@s`h*FJBI$ zW)|{X{}IN&6-FqFYzw$_km!oc@FmFosIjVmvgRa*F%)F!vx6_x#q~ZKe|~>4fXQj* zXM#$MrJroA;(|Q>9S2bz>8=aDgL_Lxgq&G$-rZdaKu9cJGnw#2K*?jF;8~&({u^aE z25|@@3@mh>9{F21AnvpyST03X!%ugA?+`RJxbyQF)z#H2o8pxmqG`TizkL0w;o!h< zSk#4lmA(9s(znm(K|G0E&G=9bRvFjEGWVtN6obqR$OOx+AFiv^6>uW~B6LcBtar1- z|C!qH|HGr~4I$+m#D;y3e{->`5SOKf54m{!9R4l-ojuc5DGry3vp2y1!JCAVRttx9 zp;Sp(`y`Fva1tySj~%2c7ubV67#-w;=*Mk%*xv09>tE_GPB9xPe?aw5^wookHh^K z+5Joi!BT?QV-^6*#ofqN^8(qt)hkn!K56mNKwm6PkU4k@HjXb;HnF)095|fIS=ZS( zd17n&FYT$$T%%OjpC9;_!`K3$kUSDf*pFx-?*(xfMCfQr>N7s(Cu+|bo|Xxd%`yjn z(ap&fFNA()jr9dYwqC^e)*)g%LO@!wex@@KxfDyz%o?RF3ff$JcYsKLD#{jse3}(4 zO%C7)2i%tf4<~4TAn>u3+am_}cO>1SQ=?K7C$sB(D4tWr1zpVdiep$vP@|3}k-8s? 
zHRClo{i^5?HL_oO`^ktO5JSqVNApj7i$^B+d_0do@uw^*Paiq;z2y?G(`9@w`6)HL z&~JF{DaEYV$g2~$LQ&{q-afDtCQc)Udk*d~Xf8gYAe_*mrookZXG&y+_PMqf{ggZsi*jbU+jHuRJ zuz)r&=%d+gBUPpLxBMbae*4>5{r~8Nc^Yi;3EZ=b#~XVx4`Q(xSP3+xnBxZz5y=s8 z%zHBdBTigHk=R)rj;|5Qs$eCC;7Uv_W|oKe5<}`3m&eHbe*6)NR|L(d4J%B<4&`Je z#Pd$(v>hU*&p{7Bfg-z1PYL2(d#i^Vc_Z}H7kWb`I8;@$0r|m@?}he=P|+zS9Kq%{N1_il z;hz}yuad%EW(wYe^`j5|p}V-?Px|Ix<%IQP2)>5;HgWBS<Hu_|OIPysw{{<6DarYdrQ`P69HCA`p2^FMJHSsfYcoP&4ciEm%Oz2|>PF7APrwxpPtwVx*GuVVbu8MzvDDXOI z@-98$4Q9d`VfQp83m6iQ-1HQk$HPc%X%OU5f)A3VWL?kOD`%iz>8SD@x zRrv|5+pqmzJ{0|�cF~iMrcf)jtYV8oBDf*LE6eD;w0k6&(2n+x+WY2#)!OhR}EG z9K%6b;K0_DOmf`YxWAlSIXPNUB1h*r@pqg<VV^#w${TArF!P0XAGXGOF^9!F$5WyTc80>?2DkdaT zTeHt;|7gTHLkae9A#xoG*=!{$;tj3N7P7nF-oK>SP6!l=+7a?5%rvm;NQ z{E}~}=zw$fkXM}2$Cty$3~{)51$1}zT~n^Z24^Eo!*7B#bqW{@NQ;=~Z$d?kB&f>O zMkA;E8k_r`?Y)axR}@K<6ZWg}Kq~yIxObE9siWoX(YGT>qxE0W9T`dZF`mn&oE&@c z9TSmCh;&Q~e*;-X%5lgT(nfyxL#UKplXEdiA9L#9`ePw;jOrtDI}bIW3m!t>JcuhPeCoePT4w5J-A_V8Eif-=#Qk5v8#g`y-w9H(EcPWkf3-o@4 zG?0Df@eatDM=_SRYP2?t2-VKAQX&c0tw@fa{Hh;mZ6ye3TG{l zbmm|%^Tk0!m&)^a{9YC%I7+-+|@AeVBk-U=$Kc6}cVWJ!CRp6_6 z2KJKRvx&$mR)KC{rn0szm+WJED+XGHh<{+vjgz;zhr~qpV7SM zslb~{v(tJxN|MKWy?HAw00E2_WWqpf;o9md3c|`W>7|tA{pQDozN?0k96M!{AS948> z>{I{0zKZ<{Z|j$y(}3}&zHlV;$N3LHh?@+Sm)3Y6QLAyOnHfnXKXskgspEAf7JKb; z1(fCU7AR83BE1v`$@-5I-y*$xXOl>wPDYM+yk_IyvyvkMpB+jN3nS3+Cs>kM7z_CH zr8U7f+~3Vd2GYpP$pq&};WOLp`{PRV-ogozp^@o%3<)&P6sp%z|7TD(f5(iz`d+wP z;STS~?}u`KaLkPS)k&NoGE8=syk7`WCwJXjepjMG5n>W8uI3#%$o5griV*Lc)$A>Q zshnJG>*o&y4;pV@c@qv1nel^Mh~S%%h|+z=C4}+kdJnmvp$UQ|KoN%)IwEc5XA<7S z0R6#IxGVx*&Hs`=NuNGA4L6??mgjWQQqwU)C=HdUORXFP`Ny!h>RlrM@DrolrjXns6NBLlOjGtHW3+2oG{S?*%ZQY*GV4= z&c}F?aZ)MhA+blc8t#sZ!tKnz89|A4W5re@jhA?HFW^Nm@2w;G6ExH1YeESuZ(f)< z@Z;an6St9G(Qp7AHidTtzG{;2(ej(f1x)cB0qVW8v_uF1uBSE0dq+xH&UKmTZJ^_j zvu-w4F)=aWHvj$^jcE&<2m$}YEOS~2dnBULz>|WLrd$%?Aa}jayX2cG)UxVU{locX zT+#@?S=(|UVce3*=0chP59Ej&jZ!L3G~zV@bAQlQf2{Chc#LvsF6m7@@tec-jqaUjOJrCqQ-xA29APA z*A)mJXbzBV`@7K)V;B@Lc*YMHlwwAk`clW=Y9MC|C*N=rRgub2!Wb@#Re>3MBr&v* 
z4B9%goQ+@BMX`XRqvY8povV5no$-FOj@qU6)fc&mjE&sv(s2xCHNia-NFZMVVY1^s z$#Xwv^{70%`{L?7qzB{q9@5eow!OwYi>xgSZ<5MbsAYY74{7))C#&G**@^dX_WR3^ zB1@S6Xe4k!@u5=W{Rf~3Q$d=PlT9dR%=jHcdRf+ON!7?QGMW&&-YXEUgS~_4Cr1L1 zcjP*B#_x5%*jP#%;?t4C`@Al^%tlU}ZxDs{$%S)w;b|2bI`384`;xlJ`$Guh=WV60 zE7w&_N$6C6usK6)6VH21VSK}O;27@hCQ9zR6&^w8F*0_haA^-Pi+As!SVw`9;4%ic zkusCaX=db#{$Xl>VZmJ(=D)u)?2+eqMBy@X1t75__3lys-1LeuKU|ZooC8TQI&k-k zV==`x`qr*5NSb?RnTXf_LQOFeOp@ud08M1szls7$LPDRE-ubb2)5U=qQ0(OZ3bJPs zTJ9ipkTc`N7mFKnF`jgsU;e@#^vk#)^W(K!-E5Rl!E-|ry;gb**NT6aN!n8=4YfKB z&3{AWxcR4~=m*eooj_Wj4pHO3b=K*Vs?;`?r>NgwPido0P!3yfY-q4cCRw}Aex}^e z$x~5RR|;SfEZnnquh+!TP;TPYH|GOp3Pxg%>E+wfh}bb7Intqq7dKh5XnEhHA>?6? zS1Si;3O3;US)wGgw>mUlO86JhcwqYZ+m#d@&HerS_WM11Hg$-e-u}3Nfb{}UI?^|+ zw3F0cV>VW1X0(8yUVC;_%pdsMv`?lr;pScWc1ByHP4U9Mj~|;MKg(cA_jD2sa=2h{hd$!cd(K)z;jeJ1CCV+v>A{c9RH$2 zns$|wlbfBZ_D;3={9vzGD8cX5 zENP6BxQ#YZ(?^NV1=_+MmL{CV7Bs@=JdyIgzMaQw^Ut3@KY;K7NoeofW!IIL25QDi zYqCa?t)pQmKK(OVjrA}g!T}^5-FC`y#g~c~@b8Vvw)Z1Ifh!3v41suY?h0Z*Q1Prk zHSe{Ym;0}wnwK*vz)P<4UH_o(V*ehmhDw(*Jvs;tW)hj@L-xi6kOU|j50tw?>eS>c z0jg37gou?*V6b~)UH|wEOFK{QJ1TCiQzz*#Tzmo-fqR?PxeQA;i6F4-#(tDNSah=} zX(&&THL@!lB=RDA%J(5F$Y@;Xdd;)nSVkTB&DpP`^xVpuxp#kmX#y%$N6u}2FepVc zdEVph-B~P1Ow?_81+H^%SKeHB(s>BD(a~3@Z_^e^nc>WLll>gzI;5K)_Zv|r{{%GC z;P6KhRgSn#=kaJ+59ifg4F%5lJBRn`@74LC8Y5xt^zL{@KF!s-PH~i>k4G7LtL{&8 zF6n_Ut8~%CT>2(dimLx4LL!wB_*(0ZVHsU*I&&^tG$&pmLx2W;s{)#d542Y=b*zd& zmzn(2PLnTd2iuD zUc4Yje-TbOxzh71(>+n-<*Rc8`PY*PL++ryNRugq@m*IlDs<$v7^#0v6UZ*x`38<< z6vQey|IsuSlp~sFK|w*EHl~Z(B`jM`cH}mYs@!(biG|Aeen%e%*-46i2BaizBB}50 z1%13rD!K-_EiElS6U)?JOSRsD02M?yp*K{5L)G&o3k3z&3{=~gKfBrn}%4Ekv z&rI2lcR;QjkLs&NHQpfzIvzvUOD>0Y0gsP-mAWy34e2Cvy#_4=iS?0MUW6)xZv9H8z7^DvAC9Edo> zFm!%veJOT76;7~xIX>OlZ2u}VGy-}`JY0eT*`({2S5~wEo6>>T?@B!HOXY(?C+4T?Gy!T9Q*95a&aI ztdc)FYijsQKwjv|vXpS0?eCmu&)_(O#b$m4@o1nSR^Dg}(DJBpZ{md~v_a-3CWnBD z{k{_3MR)jAzb2>`?V#@2#HD3W2WQVx`vbTHAu;z{4&v9ogD_jPzPft6;kC3215&*D zCtw3rh+4oQV(zt2C(%^~FO7o6;sX?FZY*43oA>tS8uyhhkY@5fuPuylOtvI9S6a2D 
z>TZuE#a6IN+MC!{hj6>3frNO`imPK7WWAh`if1fAa0pwASUE2yvsPQU%QOJ!}c3u8GxHT8PFPNHwqtTzu-)RYXPGQ(SwzcN?M@_m}+974jwherkmURbV6 zpFDYz8${5C#g&gYzZ#~1VEkBJe(TJM6gkBpcCV8lTz&(Imm||O7Q%%3AaHkHR+|Ep zalT z|Lxm1O|p;j8TVk79duZ;DMI?6W=o>i zd9~B^RrU4t0~Qw@S65FZ99uzoC%3|INg3C=;$l9k@~5R%*x;h;~-euBzFTcArjnZUgt2?+^#z=jk!kjg*utHDBIEvHo}J7vDI0c7x1j~NkKt$5P#zLdJ3IH z2mbtd07{cnC)``#Cr`?-P}MkgWSLs=T#3}Y18VX^dIpBC`cte zWKhr?b=;X7sN{$|b@NLlv#{YJJLF>z$ay45$I+|pKotx%C#iFkfsT$F?xNhYryC)H zV#s7~qK_}wEhb_}#nk<0h^>ucEn!cG%KWSDx>P6*UIvYyA&gK;NyomJuMZOaCqAes zqUaAEJc#OVZalz4QyevP@M>k@p%&YSE&98- z`gMM7_dP&hW8|b>o^?3a9KL>+9f%|q?=e0^)J3Gu_f>s2@y&cnLLT}0zTQKGg$@-n zlJnkN-952CGc!Kk0Dl))#4JOtZ8jjfwqk;Vv9sXA5EC^jg=#a;mx{6Ql>J~;*=wu| z)THk2Zf*&HA?|XBiP`x5r58Nv6i7Mj{}NGA(XCKIeoI6&2&O_vM3BtkOk|<7@UHe&majf1iNc>wlM}f;X(v(`*HN#xUlTe z6~_362^>8SE*}4_J?5~VeF_BOY2>5#*80&zQwZx%9#A!EAaNjmXKQ5`8f>TK_kOtP zo~oBG^sU7icBQm(xIlpw1mJ~EFl@{Xy);VKfL+i&FfhOcF|*?9wsr|MJV1d`Rv#si zk>hrtr^v;wx5Q&*81{}Af&k7F>T+st)Ivni1^D@moxa;&&YP%v2-Eq#tm5Ouc8tuvxHN)gNmIHMk*#sTKz&a? 
zov{G$1mhZ6v{#EJJ5i^Py|a#y-?{RB-P>6lWt;*Ez79#9z-*bn9D@iRnF)6i^v zYj0oa1W!yK8L_IW%9&p~-BUDWnN8Yx(-?A)eAKqw-F^jycG%VMTB<3i?7?8;ctHoX zCmp0rL%ULgp&wySD!+fv-2(O8V{yE7BK1xI$-2tYNI5xt*e59|Dc*7HXKbi|?!1n* z^(4>mL`5K*9?0g&4_}0Y@G%SNuP%aZNOmB*L2jd2rXB$D;?|7M&c-k}ks9VHP7mHd zw@e14CQI!}esS?UFStnr2V1DmM^n$D5d_MOD{>)}Ag^;d(S!VOQs|GR9331%nb)cQ zmhM3!7qU%I&J__a?u8fx^P}#drl$c!pn}95tmqMLf$AP&rl$@If4 zl68Qs`Gs~jc{BBBe8Ccw8FqX(ax5lEST1*HN6jx^x-{FHb4McxFrw}@?#gt98x&>D z&-Q#|kg7Hy^Q!#%5+j3f9w=uB>uXt;m_ZvmZ&T{0Rh1x4A%> zDZ73`%MomWah_b#;jF?Ueb>8pKeqcb@(a9hdfWB%sMrseiMF&lpY^dMDMP~o76}dI zV`5^hrx}z-pp!D{5OrA^Yfkc#avVLl2tI~9!^~S3`FS2GIxAnke6QMr5?P+^*;xDi z^@$B%Ch4zsynA;vl-WQkL*&S2=@(DVh&FP4l?@{V7W?~95EYPtXJr?j#lB3-XBaRIkU5OixLA=u3QP@iE(}VApUli8C2gqWMpLa26i{{ ztUDHAPx72ScKoeA+6uBayce_O!TH9$_*tWzd`M~f0=@z>y1$<>K{Lm6Ux%-`Qc)@k z@Ac+Rdu&H+XsG5{5s?tF=0VL0)>ffpw={#I<$!|sZV8@ooTWZGz>Bcgy(&$o2QO7R ze}Wp^KWs85uJ5cdM?6wY-fxr8l4O=5bW}d=ADVP+=lZ{EEq{p&xyI%;T@5mYnPNxV z<3HmuPqmaw>@wF+5)UD{sn$2B%dzo>M^`hn*2D8~gbLC^M)5b4Nkr+DA@lDT{d||( z1L8yRnBt_+qqhqTdBbRpC5b{f!Z&Nss>kNv)<*6e;AE6qlW3=O-{X;7a*rMxJc2Uo zKyluoi1Y1uJA!}r`eHVM1f6e~zu6#X_@s30M+2hef9b-)l5(<5MMiv_IfL#o+^~dA z0b=a0yYXM7(@sy0=^0|if=DG0^=?9F@8WmkJWnQ(z12%7z?la*kmJDd`f2Rc(ZSr~y(n%WI8QQID1 zuLgt9MnRWz20OhTZo*#p9^H4L;4S1Os7>pDzFE8y-er?-HaBF41_doBj>(o zp=33cs2mDT6?0S?Y*lwq^PfYSUcTBM;4~;4w}q> zk&t;Ea?lLg!ZF;ud9w^Ib|Bg+o(Jau9WaTcKl#m%;WKg-*bDyf38>>sGFw3WBUQguktm+@HwH{s@& z4$S}~GyMrp&9K;Y?p8V2Q>_%0lvdt@&u$jlir%G_6?5K0!BM-mfgI|**2@7+_0ma? 
z@iFh9LA{LJh04sh#tD3xG!${*7Y(y6Er!ol0~sh~v3aB!ZhkZ5XZ8+ARuw2wWHzF7 z1>d&&b&IJ*}+SWK#Mui z0<_~bHLu!O`5g=brT3Sz`oNc@EOf__sy?qTO)v*XN3%YMUj%{9kFgNc304Fpg3`eC z*52nj@J#Z;;{$8OP-rr+do4C`fI(?9T-d0$y`e$)9@vo=u6X6%X-;g0rL=^jTbV;S zT5nKb&ut8hQ10?fUta>WBtBl-Ya@EyOEP{WKvQh+*R9#FbQuYWiQ(c~UQmTFQ8Nix zV36S8U|HC{_RC<4m2`u``E#t-atAf|V~=p*jzI43m4*Elkyyk*rtn(IDG)m8j3%fO zW6SOhD3VAfU>DrhgCfNZUF-*7?Y57(gOi|7aK)U$1}8E6mzVL2yS@Dflo5qn=atL9 z%m|!A>0fc5x!^HnnUA;>KdrF2lPb!hJ0_v|41WHOaSYD!X{XERqZG31L|VQjqJYxP zhxoN7MdjtoOrcJ-f}A+~#Mk%P=4gBnU*Q_>!6r6;W=_Zkl^z>Y`DLJJI z+I;!)MUes=nf$m3axyaSTFWl@x;-uLuML<2&3Uo6x7X^y!-r+low*Yi@+KvKmNU>w z5P))4z_$0l!y1*?P`vt2)~E!B4`oEXXV*#@*x1-y=Lf5Yp+(xvhB!4BZ0F(O@!;m> zc7r{#nhu`z<&}knA{poqqY@`>6gu7+1z7NKoa%IXTUJ&^Muc$52e%uqVU|htuZ58Y z4^zOVwXF^al+2gHM(Mp^2Np)P0v~S(*S2fduFW`4ck6~j)tn-D+HvyN+(QVG*Jq5{ z;EqH&sME2_d|HNOq7<{~T+D_IvsDBn@O^dJr4}NWQlO`WCAuvtMMXrY(tz>ssM)4G zaQN={S2}rp37})|!+Th*RaEgNY_P3yKU^NuarE+uie_I0`YJ6@`N^Y4o6!*wW7LQ* zHS@MP+M9zf(7QLh6h7C{T3N~C2E;{_YK>Zq%-Un#%+iVUq7%oDAJ@#vvjH$*iI(@- z{tTcN6yWcF!%iTO_xcCTvDOsrvRBd3#h*1pLS93X`2OsO=&-RC5MxGts6)D`3|fd= zD6}5XM~4}j*8z{iJ4(GgW-#Ow#n++GJ7eV~vrsFlQt|Z2!gp8+OEs7r@KHOB4R%6@ zZQgpC1KrE!>E-nTGURqq?h-IaN6&h$&fW($Vbs;gs4o;~;48VNib4NA`e3dcgDKx! znY8ZANlr#*9bklDA!=H0=H~`XoEs)YgOf&`@TM= z@kj$Qk_Dv=U!|@e2EyPC7|j__Pw+SFX)L2Annq3eC3x z&o5Kzt<9Z3kel#>+Hh&HkbWeb;q&>{+CUy}0je&+b<`b&-N%a>yB%qucU0MuCq0@x(ub5w8if7jcjOP!p0reqd4+5e7xx>aNQ8hvZI=!`R=%hC9!bZOU zHOG1#Fajx{LvLp9XE^aFs&i{~AczGvxOdcHuJ zvF~TcB2{O<5%mZUP#HdU1%#> zw>Z`u>jf~|i$ORzI9R2ft^dH~npJ%Nz8AQfi5%G1RCIJcT*r?Oj>7J3gHTL{jq-wm z`^1UIloS*((_Q&9NgR;5>7Z8`RRja$#2W_iF)}jZlB^mbavIo=qmm|7&lx0_I1V3v^*(f77R z9AtOkcD7r?E)j2b29DwY1rhE~r!2ktUkA-w&jE;RJCodW)cp^D77_h&`|Fo4YSSIr zlt6IvN&{frF6b!*38sug(*F27;IdU7(7QB=LvU$#^kB`Y0YNvV^SuD%@@r9wo8O?& z(NW6=9SCTKqSyvvWC6n|szH|;E8`{-hi)H~O{_;Jrb6bUpj>uhCe|vkyFKD&|A+V8 zWE7Z~TG*MHD2X*(y;#ZNqkj|hEJh3`T&OFEo^K$nNKg_%k!6>7=H38TTnX=Yffcj9 z1aG8o5uREj~z2s<+tlxQxu^GoXo`p)KgqfXe$RB@yc3E6Grm#du5? 
zjOf6Q$=HTf3xjfT`#2&7x5_T{yUagFH~`^TZXKnWLFYFBQT=E7%66W^shf;J1*GTy zKhau0p^i1FEDgc$r9{&U?gA8K4P}czRX{QB={j6a)jSXMY3|d&z-gOaXXxeQG5rvS z4o?mo5vk1a1YVR2%7IQh_zy4Op7f5NJUMCtJbQ&8WZUqchY~{b2|xHnMHhxG?tyL* z^u3}YuN%~ymD<|c^WNUxFKa@1vo}z6z`2}=NfH>i0^<;H0Hcf>x#J%Ib-GnUK>>Qb z-3C)H(&z|>k=IhI2jn`rEg*t}ATEaL>;usf<6C8A`&Qr1wz=449$XhNb#)iOK4p6x z_g`F^u1lqTlE-{CPn4j}N8=XA=!Gt0KZhO^4crR{Lfu6NE`=Z9bz@+dtN9;PR^gR= z!-f=e-eS2W@x{n||4M@**l~c`d0$Q+jKhgs#!b ziWyN*lpmG8^^FuW9r#@rM#PFC)wje0AG@IQ=Uk`+B%$}5)O#B_6pkS(*_Ov~t#6dX z{=+OK*jY*gs^>M(Bv%iB+APY-E-UE^9KPby!>1(y=+0YOS}rqO7}0=PIDTCSg7q2%><0i(#cWb*Mb5BI_zLgRCm^ zihcRz9DkZ`D~6GM8-);EHEyypn z2iT;})H`atYHVy&SAHqvwF*UTZ0Ed8b93{(#nEd%dY!{C;pX}wNPW8O2ZX9u9@Kh$ zQot*bJpr_4)cNr^KoKTz6pj<^uK49-PVMK4m^i={LD5JHb0NEf8K>CGZzEJDA|(Ky zjldSKj6y|Ol7l7Ske zPZen9`KL!j!)aL0&c;UHl5$^spVao0K9~V*0Dl;s1qY!=T58w7d&s$} z9qR&qRokh5fS~?69`ZIu7|IBV?@(cT!FXPWIZ(r1NZh@Fiu{~w$~S>Z9|D!G88*`5 zPhc5Roty$de95ik1a4i5H0Jfu2FpQ^;O#nfkfRqnb zzfjo$-d6=kmaUYw>A|@|D+k&q+Td^2gP>RX!OZL z`xX|>NNELiRu;46X0eGW81l*DUg>sCC?}`r=FsG3Z3ey!4+kI5Wp5^vi+WVI1!(E6eriivr9I-vngd zV*nnQz|=P%=^87cf%Eq!u$Vec3bFaam^@MzL7RKICuwqL+NJDl zBk2zKZc8!+7$v1OZ#An7WIG3{kEIs$&7+`_{R+9n1tui}2-SSF-tUHjEC9xMGrxYg zp3)1ti*1Me&gKFO@?JCf=QCKr&96Wd(AhP{eQt){^$u_rF;M8_FTjW0(6Z_m=(M3W z;Fts+KvKqg(=`_=)*DFH6#z10loAXr`8(y4^{*+P|M2pi$AmtAewB=9*_m_amX;0L zgOER=&MrXhCl0md)%$2@>SpKWE~}}k3Ix|A zspDPFq7(6^7yiZF=IG)H?^wkz{0D$FX^ry=Aztqyw|k-nbC;T!wByBwhKu1 z(mgOgkkk+@sUgb8H@*hGr_yC0?3_Foo0PhAm#3ztytrg#ORgIJ;d(2LMgU(U{#^;l zk-V{52Q=KmBSBv(vaTUi zDpT>|7i!^67Iya1o4_B^2Ee%n41p%XRP~p@`bjxhF7Ntpk{NCmqdr;RsD(xRLRN7m_%_1?9|HccP+xGHRsPxeE~ zcR`|D1_qE#@Y>sNVNjKJ=R$lWp+J`nHh?Xg2%gvWL1_B2P)_z2&kOk1teB2VC|pT} zfKe$cG~pRHBUGVO{W$~3oCYwy+zz=&?M9b3^ldBZv2t7Ss07smJ$XK{^!9IHOZdQA zIMDzz+|lw5!>UbSSaCY*GOfRbz7G&}$cC#5#%2$TS+-1|9j`15Ye$*y2XjG=1Jr>A zF`o0)8r9X+=>CCiZ|halLknPRM2>*O^sx(@vHzNFEufwy_&w-rU$ULOYg|cACv(aU zNJ{mnkTKX6l1x$4Kr*S*9$=SoorW3h8+5E^PZ`x~%#5KoSyN%MD_s_DO8w-uU>Zz#$7B0W_yXEtn_+`4tE0OdFq!BeA#50?ytxn=`&PX>zPzHqgQf{+iw+@b)*#UKS# 
zWkMbFeQ0@&VJOTLpb!@G5}2jryrI7c>9EoJCv8@*v7gcO8HgPCy?EfJFIih#CxP*i z3+U`9Xo@?VZWIC6odJfonG&ekT0X#7{f?MLqp=Tx5CSUo#>{0^)tnnE(J!8Gc)QL$ z*h>wjri)>tKND*9HqWEP7gc}zWZrAg3~F=kG~b9679Do4Z$0D$-Uq|I#$iMA5Q7Qq z;3N_uR=4GW7|y#1D)$+y)(@>oZEXw%7aScOjnAL|B8wVlo9zwk?X$Gu&?|qw2h2nQ zG2#O^rv|)GPSxEFdr=$UTe7!GLG0!pi=)`Ffq{n*&yQpl8!u?R7z5~Vmh@P*{x-Fyb4cOz+-Pb3NI-ul7x+7(p0I|tpU~`)P z85l@GIjbwcB_wNS^map4Qg=G{vYOhmC6FHNa5Amxs;a4AEzvs8$;l}P59X-RhECxE zFtS-Jv0rRpPfDHT;`+J+dw-|b-r&-us)mS&h*IP#z?BdBY=loxMvmCc__E$dpEI%d z5!BMPBWm?OplvLA{P^*7?xlAN#WU#iHtqiX1REUv|zXh>2u zHx&^T^@1KXRjmwVBG1EinPOmN^;~XJ;@ENoO#I3L^eZD+b4k{);)|a?ot4+Fd>sai z#j@Z4@b<{hA*>Xa5bkXw!#81WMDV35!~ zL;&P`ntl7u?g0JmjXrcB1;R5>9~=*}P`a#d!UI9vV!jATLtrfdK~ktPE`9swHAji4Mcj5h^;lRpS`|9XZ%~QQ4qkx#d}F zfLSdRchNsuRav>Pt7Gmj)i;{>^yvW%5$M4(*aM!f;Eo!=0K4XeKEi+*Xih!5QI@uY zjQe0IQ>YOjxMZj6&yKfe^zlnc6*d6K%^GNlw&`TU-hU2+;SVU9S8F2X7)?+jaWoMi z_c%u3>ea@}Fi$cXTmp>PRCl(ydV`o%+o4?X>BPw?@5_y*ni22i=Io8u`N_F^7S zxy0`L9Q1=QUjmR>41^`I^+1J$R~fP3cDfJORt{Ksr4*V~pEM}al7YpXfWE8H19~k3 zF!6YS>)g(POsxR5@sVuDJF-W{U`MoCc03$QCqj*)+xsDER4o%VM?vcru@6DpmBY~g zy`!g^xvpxgitQ+i3$tbA`SfFD1Bh>H68H>iM zV;3Gr-}M^YOAerbD@=y`egqtIee)>1>;RtqKusqCal%w+iHJv&gI-Y)XVYsFIO%f$ z717a=kvHtYrgk9)3MkR(rz7C^d+1=PffCBMOC9Y`gK~)9d+GnpS-cK(ZJFELcV-rF z^cepH&UOS`TKZc+rAeLO=Ju7J`^Ip)d*c8p+6p}~G(1uE@JEqiAnRHu?6;r-=@awT z=pCeb{{QSIUY=;5S%hmedM#qAyaNJQNqXo!^n{Z0wW1BhvXEo*Tc6QOM(N@VoWGH< z7f@aYo^j+S!?BB-$GN$OF6(t3aX4Q#H0esx=T6EN0jll;ge(V~=yjXk;IuRQhVQ13F-adn;|IW^Rgb0&Oa_`=FK#^l2^m)#5e1_KsALj&HC_I}51$IsWfigB@z?P&u4~iAh`pphk~_ zBlbF+c}l<`UJ+?HvvvRry8@3|fGN!AfaKqNFM<29Dzi=UgqlJLW>}YIEf#;7&*ic^(j73+V&D{0``%B*p zY{e>Q%V%^UzdqlXDf6*V+X7y99Z`TDkr}0ROdx6YB?F(72>VI48FJ!WqVqrdc5o8L z{(=hJMJ4p)J*ZFQj+8+Q$*BoPMn6Oy=XV~q16_2ftZ{}dk2ZpU?uhfM<{q+K`if5!fjx0{Dnd6 zisvex!^!oT)xOH?>MTWtJXusqkOwW!>lUONjrsK(-G1mWGMwmf*wg=V?m7;<1m=QU z@AyxI2#w!14dS!CW`5aYnUR$>dLPgnh7M99P@5zp2ag_w$GP1>{R>DV=&VnK z3<@=E;K4evrtNmnbG3Z~Z&@#lwGd_R_3SLeKzFdPvSKs{#eYR#pU+Kz|B8tGn|0j5TXOK6CJ#_6A8_Gtsezl3U1dH# 
z2H5J+Uzr2Edh2R1!Gl`Kp@8}gb%W3?P?OZbcCpn7nau&21&$_macSwwIQad7AbR%5 zLO~K_YGSeqcWA92n%9T$@SZh$2pY39R&4^jMO$dea{9=TBipd!Z?Z+jIAT!fJn1tF zQWEv)R(&$mblat0>RE6Ln9SQynnh3W7M%cYd#uAec55EWqcCus$uG2O8?hZda%3$6 z`W;3MmNtF*p}K!-$J^6=cDCt2oRIYdGqB=NXm_OJOz+kk(52Em`tMU})klg=JcN+8 z>&rH8*wefY`@Y8&9z-R-IgwE&B>|P-njK^&0zg+1*yyJ`AwN<8zOimVIomdz6ALiR zN)ZS250HPKsCFNyw07Y=$t2QJ2-6Q6(hkyW!VFkv`{21E+dLAl6Oxi{1Jl95P2sZ5`Qgo?ycT`Xe78m6 zBRrFCfAHmL**6h4X;D#7<>yy`{USnzD-3F%_UiijwkCAyV~K7_bwf+Fs!(idBNn@D z|J+kH>AZ3DAzjR%l$S$JEX!JQ11`|95&+$Z70&EAgnfoOI2lqQa&N0f2i{l5h~CyV3z_wQ@hLB)qI%a;j^z_mW;9N4?0q`s_JpcoGT zcnw1Grq8R;dXDmL^LKH7DC6D-1*vD<(L9RO`ZT0LBfhl_)0uqGI~iy~KFUe$%p4Ap zxj$rmSfRJRRzUyL$8A_r7BH}SX+Tk`Bn$HA45Q$U#lF4NOiw|0+ZIi94m>NW0at&4 zD0eECspiwYfe*!?=)6F!)L>_Gc6Hw25Cj-J=?0EiO0>Ov1$uM?`dp+$5Kj{Cp^|M> zKH9l<@@Ev=XQ6%sbWg+CElohtwzK=xD~M`H>7wSA&>6_e!%b=52BIMl^=V7jc)=7$ z-z9Z*VG}Fsy{K0N*U*j-^Fd7g5HGbk5F~F&t^j>jdf5L(-Frqg)qdZiJ5-Tg6i|u= zkSYi$y%UOvpjZF_L8M6Uoj?$UAWam6NbfI7Q9ycAkzPWVj?~ZvLML#ZyubfBRqgXW#D2D3CYUZw z_s{D;U{5#^xOz3SAFL^*_5Ac8A-XS%?-b@fs(`>yx#9+W9H)IDVGz$rpn{JZLV*DA#KDOG zAl?n49Qi$Q!B`DA!gU05^75vlh~og$5fqMEr~MT>Vq6jb>$a2%vBVCvCv%E=I>uPD#n zH323I<`B@d$x4+!0^#9%C<9FJ%0RmYpf}J0L*=v8g^_&JotO-EGDU+IPd$tQ6x9HB zCh+6PX@D);3Lw(_y#a)(oV@@28`Y+tmMB?f>=GizX1R~LlQk*j>O};yS%*I0y+TU)pX^A4VdlR zz~}DBR)zaOD7Ox%94S>+2Mas^`HhSwR#u#V_(l!E%L78AX(GsS3=qnTEMPOJkS!$A zSXX6bt0zFprzyX$l1GB2A_<>FY+RE>|V(rH}P?D6>$AZP}4^|&~{m@+6BVKjDn-n{}K7nqTCbscfj zB9gf@7Ny_T!P-1e)U|bktGjm2dxmenbcKR@{6zw$Yu--^6@Udd`~*DhEfDvG9BzY6 z;Bh@ra%~aif~arnMjp3|Km4bpB(Kj>D*%XcR(rwTX9|We8$?jMY2Ti#)b)gI5e3^! 
zad@ys4c%HiT=PKKu;B()?E#pDg}tsQgN}V5n=wy*08jl7{%~YF(sG~ZcvlqNoIDPqa=2mhyL4h+ z)4?rjV`VkLdEvsNEU*SyAqyfZ2X(dEsXN?CyhfxZbRc0TD%7pSc{(CVZrl(k3k(#^ ze=1;lIx!ee0P@>NsgNHC>5Yz{@t~s#!s%;v&9@*M}V$&6KKn)F5 z2V&NkfV^+2FPrcmMC;_wo5aMCT>$ss*8s4_zd?6R8q74mXI)`m!FE<34J*8Hy%&7f zbkK3pQ=)n`K0aOoux1Cp-P6Hw#wh`q2(}$-1xa<|fB6r67NLhYH2^!kPO%qNT)!#q zASzfwTtdHqs*!EsA4$myXNMSD8v$YBzQ0Z7;<8yzJo-BeQu9-<*>v+Q3=#ca?SQi8 zEBMf~eN<5^j~+|80PacNeK&yUBk9_5PP;?g%o6Demel> zvdm+JhmTLg-F{90B!PqzkeM6z96HdqxX8Bz%eT1HzGMmhFMAB)w)nO5*rK;HaX1@8sz=)6&M+Cg& zzy4p~0oQK*0R$YuU^6(L!gLWs9?cnwQF#P#{y%0FT3CYHyS=@YR7PGRl z>Y(08VKL&HjhrO_(|Ym)qRQ>@G;tFKuTrAp0rBcb9k8O@ z9T4Pq7*G;e(kMeH#mz;g`XWsR3j}*hc4eym&-|175MT5(GnC?E!b>z&Hx^vO(E(`e zmoc(7a?}Ifv00bd7$q6474&|3wuCl=9(4x$C&r|gg`ea%DWK1|qZ$=c1=HKaN`lp) z`l$3r8PoFD5}^|+3wG3NETv3Zo**h8hJemh8u`HFP6g)TR>rXIywDR8994!A>dnQ~ zIT(V}h=$gtr?gq1zlc>q=w3A2dg<2xp;b;+A8-bvyKw zMDNwoHTS2`8`W0xbCs>>B%A^7R@FO)k&x0$z1dPGk_^li8|};;Xj5I8U;hs&;fOUP zh|o<>;jGg75xO$E5NBP&h-!!Itq1aCP{S>Aioi=f>HL++(eI-fvMBmY`leC?5|zty zTS~yGz@|x(&V7s|2|_~e6Z#=JwpjY8J@cQTy~9Lj!hdE2sWL91Y9X@OVFU{lnE5_u zTeCK+n5Ks!H}idj1eWq_c$S{RRT>o#H9dJ3q9Of4HOk-rfdrx<`9iK@#^0P^@)XM_ zxUZ6Uk!)tq^auSB3fMq>!fuND?1bob1ZnO?l;<))Jg~y>Akhtjb8?yHF>B0rcYeX5 zD8Eot#zM6Q&!BYmSiEhbwuul7dE*Py5ps!GlEQd_qXBDfm1}Xl)JP8uJ%6~a7P$Hd zkfmbd+cZQPu(!PqO&#Z6kjy-QHk(c*e>{Q;Uvd?BCqg_imR|_p@S%FM8}0_W6)iXD zn^bJQ#&b7Kdq+;6n~jdYk&7%sLP%8OJqashN%o!&S%t; z9t*>eAHF~GS0AC1*D=PqGa9l(8Lvr?KcBP6B0csv7kfKTBPRk~$Q0K}G~f@{m^X`* zg2PMRXcwYQS>q((5};VuxSX0Lj$30s18oY7R^g@Q2w*at>`;lAXJ?dksWB@@KV)G{ z=$^BUTt{jEOI8l)+2{*#hVEa_C=pf##IwbrWsgTA=Zi+IXp6^Prnn1m%k@r67)q>A zMC|+wH^k`_9f{sf?f(*t7%{kk7bp=FU`Y073!=VzSJDuikA4#2lrP}3PD zbwLzoX5a1nHfJUGLUrVXXJf-gXA$VAFVD_yK_@UIi_~x52IWh=f6ltbUSIBH zCBab;RBw%B)%IX_4TkrUEt>baIQFeI=JgaaZC$lra$P9D)?(Dy5pqnRZqhAlOrB>5 z{d}O3p5~p?Pq!f^AB4SC6Z5d;JHeWdspjS{_NOl$l*&jmm!5gEgse^}>wn+cL^2z4 zbm2pG=uVz`88j}Yg{KL&v77qc!0=S+N83ha1yRjV=MLVn`7C8*Cw;DoCU-EoW%e7~ zoz9;NoM{cfTmK=6vvN_H9^yQV?HW8G!JyUsT#+>D9&Xvl0v`a3orQa!k>C~L7jHY? 
zg$PhxQT!3ssO~;O4OYhkPZ!P=wJtl$&zOE?mSCV2ZM~{7FA5Pfa*_;IB=CdMZaM`% zQG{!xzeV7AJZB8hd#H|f#`5CvM{;Ds;dAV5*}l}7^Z4(FHjxW=p@&k#@&jB@5!5W_ zrz|T-*E%AbY;)|o=WtGjpM5)2kEuj1)5W7#_UmRpw!LQIN=&xw+%EmAQ)$<|lkYu& z1P8U=4tEp1nJZ7*#>vBFLH~rX=jrc{_Crrkygod}rq9VPxQqUw;u+{wJ!?I>67Ef# zsV`|9!&`JC$otzf^Ss)X-x7xN5Y!pntZ=8L&D8^$fd?D-;ZMRtHu_- zoZXzuU*IUA!g^?YRC;ydw#uqj`qg|uL-RR|<5dH6MR+MQpRk;$u8TOl&4M=;?2Jpb zg&ZkeJKB#MNi6tH-@Dqh8uPIi3+=iN2c;;QJ%vy$dQ)1${QLAWE>?r1o9Q=Xf`#O5 z8%~F2PL6_XTg)kYh(z!jA;w_7oK0tfxq3FNZ_)ALN|xhq&E(b1ar3(lkqi4UnUeU@I@R{4 zz)cuKohsAI64wC-IwUx(4pNkJ(aAW2TpQ5}aQH7+9c;#B?n=g06#ujFUt-D8d&R}C zdEGA#7=LiK>3o^VytY|bzW@64jF}cm`BesIP;@WSTM0{9pp~&X_>DqEQJ7#XyyFWU zp^n6$rKt4jCciQc-|ueU`ahW7-(MkJN>%mROEq%%emMQrD`g@WN|6qTESpx|m#+?P zsxCPr)mt0S(NkIX~apQx% zJH-@K68&>jV>gPWxGGY-57M`Gde<%i9~4hn?Lq_l$X_e>aG|eE*lz!g7MZ=};=@tH zb*uzwWj4kqs`HPAc8xiOr=f!cJ=wK{A*4n#ebY^x+)j3zF%AAWvw_wrVmnkAdnVSE z&$~*2wBhf_DpD~I_~;?`p1c?>4$>$>blP$I#OTe zqqQMz+|lIpOv-(Zfc)=eugl(fdh~p%Bun9L6n6gmXH7OV_3+fqW*X(N_`rKnY5Vw? zzp*u4*trA~rZ;l}i@L9a$&()yfJ5v$pI+FBcYPO0R6`l#m80U%;F&v_jAODT~QS`L(YrR)6B*kjVS0(;V*HsPd zQ>>4b{P`IwT(6{-`yMKysk-vOYcXZGC&($q@5;<4q|DAO_r1V{L&C0)#z~JthTAWU zur2r7(Jp4qyt}Nw2$o+1zikbi>=neSPaAl_P@PzF%lp{uPiIfO3*&1Fvm zxmALpSe5j}?wIAfZ$GNn24Q)or=J-6Fh15yk1jYov9KN-`+>kc=yysmh)gg@j-qC# zRF74dmI$de5CVR=SnZt8YQ6AnAy4GN2)5nb3rEdpJGVFP_p!s)^p-VaoUmh9gGw$d z{=1K&k=WJ?O0&n`E**{03_KUZ!<-&ur!H@JP>WQ=zH}O~BvVU7H>(7P@j)`6IfBj4 zz3PQ7gmmNn=GyO6q#LJez-V^%!qUj}Xi0S0{+~Ddv6B07)8=QUTd(4l>#^TzKIm!{ zNo&1LFj#^-8R87E`I`EWc1RlhMI7z%*t3(a<-sQMIa@c}@?U3qy;fjV$j>8s=NM8D zgjREeJCFAYWx21!#I|>iTQ8I;$n5O)Zf~ua-NeS&j^euSOloGo z9dpPr0GD8@IW0{v=(stiaN%gS;2V5f9RI>k9k(4#dQd_`VMB<2)!^%aG7##k@_1gl z6*v{jMe9J5D&tcAyH3e7>kD^JV@GORLJxF6b`&*8Z(M%Jel$kr0@ao-ulmd_F<0$| z)=|5kDm=nce*M6cRsW}|w$s>yT-Nk&N_ddjRjMA@-`XA3x6@2tpH9TO&MKA4$uA3> zKay`|ADELp-c_cU;l*n-_LTNTAh}VZy&}uaU~v=5E+0+*#yq$ucT8}RtmGdE+rYMa z1kWwm;o_3TFRqi-*i@0=(VfmvYJ$Os_AMnGQxp}}hfFtIss30A6&r6bIxHoW>x^!L zJuqr_B4+#5;EKDcumm+04|}SEiqNtC37sRZka>(~BFp?!>Cf-xis% 
zX~{;bDD;HxNF|UDuTdMGVM|~;N2o4^)m%J&G@rs@8q%=%oB^+Wa2sd0lNM5(>iyx4 zYoJ7G%=l81&%bYxq?P8cR%ps*>a32KQi6Y890=k_)y;3J=HtCq<{H5DM<5*L=x#8L zQsR-MG}d>0YZ&mB(4_h_S1tVl$4t&?Wm9NRP5?rpC|)N-?^5^@cH<7NmTz%zn!D{7 zxs(t4EeNNLT3!-4?%mR{%6)hV)z<=wFdmNs5Addn>f*cW0oGqvWbijljqT_%6r$v{ znJ%bcu#~%Es`@_ra-KhYFCD>KNNt=b)q1D;m}|Nm#ZdWf$Fv^{m#L9j{chOw#_sNF4Zr*1sOjzroVOZk$fFl(-d0s>rq!wJNCOp_cxk;U!m{hi0+tPb%bCH_tme{hatQJ~%a0 zN3Ga_feap+_ZgRd!Zhn5y{lqeeQUlvhLOu_BFG|B_IChAqmy z+3RuQ%*l9Fvgxa|@Ae-!WITW0 zOIW`O>U&>Lrzp2xI9}lYG7D>d)FXF66S0v=inP-HFKgLGs zOp=wwYSWuP5^UWhVG6G~HrgDs5m4%%G1A$axX4=i`9UlNYi5bHhcyOs_dgkS0XF}s z*!R8rExNeAW*Hfmk8&otDy`|s67Okn2eChqlqAd81x3jntd;_**mZMx-3o`0kd}1I zQIEAZN$nZZC{{Cdd(){-r*tsh_H#CSm4@tIS+kWI zqg3Si$n(xUoQPaM{33vx-)6uMnvA#wSf)~?R;9sQ87d~$+MLo+QU&_|?r%mv+~3RLp7i zdEKQYk|~g1r_N^55QFpQ@i1c-oWm8zg!78>PoConTqYMJ@c*1fwoPLGI$&*kO`1YW zR%gl4R^7L71t%PzK*>>_?A2u3@f+OYz>ECwt{i^3-ve95e8{ex>X*%`r4!0k6s>jo zCRh?*cX~aelm3(vNvY?7!43o!y=uUA4|z>hRMDjlroQG%+IbyBkhBOEj1dO##8oHi z)Fi3qb;-#bRsOU)nX8kE1vKtds0A!D5B_Dt+p%r6rOQ~J>ibyM=K(*y=d{yf%UBi9 zEAjv*1*P#0FI>U#MOAZwWh%<&qeQ%`dJ#ugUstD_t@S^!ro5StcEc%rjlW7VY#-jZ zi}Ytj)n+!nFHws%t(~QNAM-BaWEZen{6n8JmMmh+jDopMgEOt1bJY2d?!ERtj;r|Du zw`q~)U4{Xir!`3$tKm}-lM0a?J%1%2@`^wom>37%Y78uv4C-wHhTH!$8u-7!gZ~FE zypIKT3rmb~Dn6|iqM)GVUx-o{^mZ`4g9wCql3_Vi6rlv@fTZfYg)-^|)R|xqTywom zJ_&t5>Pw@-j2mfG@YHi7Z&HG4A(2Nqg!N*!O)IzonTjEFy{Pci08s%M<;2Tv^fx&Y z+Y$oh{*R6o47`+G?({)%ROeIOyiLdU4Z5a@f(O!P`o0fM{hsZmptm`BuAZm3$A;p@ zk9O#S`h#Ma_CuZAYOx>V?Bb1dGklPB*^9^QD*RBkh7~0dVa_d%z*<)|ROPQLCKogWivLsSQL2BSomY=EYQ@Cuea4oV; zA8G;i^;H6hS|ZXeWD&)ibrV(C!*ob0a6a`RFH9yozUEKX-%?(nkM4rqQ)ErnfQ;a9 zY8VZ0!SvB|tbWFu1CYfR4kQo@>AlAYCMae1Rp=!~JJJ~k$Y;s2>;v@VO&cpTbq|2W zIgX?g8C<&JFswf%ssKBZsl*n!Xhp9>9E$hpm zNr*2+(?eYEMH63A98Jc1bRFdkQvV>>f|iIcpFh{vp&C0eX5h>RsX-Gw@QR=id0>xB zH{&xe8Yi_cgIb3hjJn-lh(Y^CYO|q0yN|dAYi3;Y1{V*^6Eu0CY7%QRgF9`ILyebt zK09B9E5Qw@j$~O%pD~N!`u6QxU~f&>&RkkalIX{o?L3}BQ)GW5?{m!>Xu=iAk5sDu 
z;WRsd*k~;<=-=CCEero{!w`Iy+|1B%eK1h3E<oUrs1<&9<-p`7x|J%mE>r)?>^-8Yh}N{tkNV6k$5=o&BT4D)r;Li__-!5&McUqh9U-6B@H6B-2O57u zo3dXzKR{D%7Qv$uI-z0mUpix4buBmU5vi^XS51<|e|R&>BlW)pQLK6u`SrnSf(9Px zz*i%S+&>q7*1QnY_Z9MKjXr1@?^$mhFTZ)S?yhwsZ#kxq>vqWf++jPPw#=y6^2 zeS__9%r#v6@SzlJ$t1@+-N-glcVCpIFD ziO_z7PGN{M)~v@Axhe9%m%1HZA#&cL>#o9sBibG@MzF>W!M7Vlj;iz#7cUV2LOKVx z+__(a27K5fxcjIt*MAUHGXwxjc_Z4+JLDTgFM#~>sV;&E^lLJC4s4BnYNI)fhp#}U z1{+u5-!g6aT*G{#M_LM^wqN$st@ZbvrH%Trsi^bsuA@s|bFfne?Zr!3|6=uTVKVe( z^0#5p+6fcobgNMM(%7OT{~DqJl2Px@9m)vwAoQ0KUr_cPYZ!BSPSLCL%(`q@1$V^f zfV@cZg-XQ-dY`#!H5JY^b@uZEi9b(VFn1H>MML^vatk;r7yE#p_7&HygS!a0$k7Wb z3rak?ClVS1>2%IwdGK@4Zu5lV3Zp+u4k?P>!aH6N{s&t!qK1=Y1v&$VzFuZ5_lpvf z4Rt{;_NG=y$y1$^<-$Gq_FHt|a}F8v3vt03Rv7eP*$e4b8H z$=;3zZg698iE*^-Bhcq-UZ0VDu|@9AdrMc-h*!6tu}16ByIM~F!KN?wTs+?zD?(9= zJNtD=`M?aRDljBd`|H`TK$G(AXyd}LZ)k6={)HQSc6c*?+vsd+SfBR?bm{t9Ry zVlQGMR?}0|h-Q&wYDh-pz)@6KDx3HOrDQekXvO^akJcjN0{(CExS?{2FTCMZ{_1D^ zHGXftYxk#<{0%i&;VfG)fl?^2%yNEGd~-WcAtm;;Ewbl zMP!~6)Qp*BjI)B0HF#A76X5=qJNgh!w~bDIZ>B>=jcTJ3qDKlkIve4K=v27hHqcol zqszY~J&+=AQolH!Z{#ficOhk?M1HBIP(k9Jd?&UkJED>Sma_C_Z(JFg!m%acVkpW? z`X857^*855dp?q9$)JK$2P>C5tCGu(q&-Rhmc8Hq6E!k@v$Iu}A7A8?(MT#U_c>Ty z8VX1qs@~!fT6tAmpJ7{Fo$^3EfZg`bs{H3*+{xyBNHTRWnD)2wPR#X_OiF=9pIIfx zMYPq}%_Z}!U)yB3_OqhyyGMMeZ#o5hmV`@h*N(^>FSBz&$owZ}ESoi#ON;tm#o?a{JBd zA1c7h&wL)fEs5g6N_*bW&y!3r7%yGbMW|)cpKCPGiN0~s<)-HJ)PI^3#ZXU+BXSt= ziG5F?SmUlgHIET_(2J!HGp>>t*Wq)e+4iV1CW_-cSZB1qtn}CPty3>>waonN_|Jjl zr4K1bN9w_iat>tGH&T*x)w!eOlO@fA4S#JiwwLovS$Sl>pTK}l!iYEMv=$sWBnhzL z2smP17IzzO-jmmR9b5U5^E!N2gI61|qL%0c=s+`XScJA8^vy#2yrb?OfgLpk-%di4 zUCy32SLOe0j=AMxbArDZ*9~|7`A@*9@@4(#o20H(DxfVCW1J_WEI&NGv&fM23^o(Q z=+`V`y+z@p4a2_-y6#cRsPgmEUaI*37wRWEtzle_qJD1CwhN-K+exGs?UDhJ9N_Mqb;`rVjV}vxJd(e?d zE&EG@70_daKT@VPvh8uU!Zl+=5aW^=%{+Z;BHP58Nhuqx8-rmyG{N6fpVWoPz0o6D z=JsoKa}D`-*JoO3JK&`H-ICQ<)n)(>1Tg0FiXF8aa_wAJQ^NwtBpM4q40uW*^VIdH z-DzspX3u-Nr$RH&E^nZo*ChSb7hVlh(eEr{lM9I<#j-W2>lEn^YjBr)wzrdsD`Z7^u|{WbG?5! 
z_N@JjpSn){f+Il4!T@=})+&uIT!Gta@Tw#9A{xH@E*)N1np;xZd970*35g~ZRko_& z&gMund)Xd^pGJKoH%sehJ+k~T-1aMBNt0=h0retxP+7`ZlH};gJ5neOrDP;Zf|QKB zRIwq?nt~BHpOW|wX-qPbR~{hX96hsL0MW~CDfT6#*3&nfZj5UM#VQ+an=U?!S`(#Y zpf@}7c7OPA`}%a_3eTMl+^p z=zTq{g#doiGbsGD)przE8Es%++1KNlA#N_Xo40jL!6-|edZ6@rtQRYhy`g|6=Fr z0un}iz)glG-!kifs8;iq>F!MW7bX$f$mcgNV|PbJuJ>B8i(YkMW#m&yw6G#U;p7kR zk~4gc_a7Y8#s}7fYVaBXt4{Ssz6|=LIY%lr#0JF;il%&E6x;owgxr;_NdOmFeA{Da zJvp_SJ3xm_G1H~#R!QTyOW*qU?c>$->ABtuT7$a!uk^H6laP$gOmWmu17n;E=+7CU zEsc9neHVIVl){OZ>|idwl-)zfu>Jc_N&J{~`Jov?NI?=9tNM37 zn=C|sb9l_bH#zI&bu-^=;>98pYP`dPqYYEiD0xg_x^o0z#z4YkW$X!8aIs95O${B2 z^s#2V_a)^`$>U{Gi_4yU4a+c`*ih%eX2%~ntQJRQyq!R9j4FSDZ0dGBksV10RL0x2 zr!G=QlLf_UG|HMiOBqPiemtHW11}vvrG}=3ax2i<40sr^7tXERK_Oi8*|SI|-QHai zw5c{iQ#JjTKe+etl&!}iAq}|UI^JctCvEY!?SfhC?bv#e%KEsU?k4)j=8!AxQ6c8K zt_SVrtA@6*lRi$ps^5BoUZg#j>#LnKFH(-vF2y!zj{AjF7a9B)kSe!yjjNH~wm}oz zi=&1DksC17Et*0UYL`((&Ntto_)R&}-0CNdZnhZEUD%t() zvNPY~rmSKke7v96^&>M9!qwkXS+RRqDz}OO$;)1ft+9`bBHjOX2Q0iZJ+H0G)L zq&?VRW)f~tO(6_4HPb1G{y?7{8ywwu*IcSB^A6X=$hJs$zX9Ti9clKuxW1>y8<(xw z8My_f|M}>PapLfP9Yl>vujCV|CYA^4xerA=Xjds@h$APdCkb>GG1Aoza1$uwilxGkC}bNnhcPFgjzfh z$voKn@!aqugtYjZU{r1QPW-tx|6H#*XlrxMP-ne#&($D`)x-In1IBfu+~Qi9yR)S< zxnkR#V~;Ok!0ni&gw8tZcko=ZnQ3T)RL5w_)=|EV8#4SS8|8atIX70@nP%DH=s;&o z*-^OC@A3KOQxCr0TLYkx93tleUY9jpMc?X_-m~Bb*<9O=O5`aId^dA5>);wbuy+f4 zdjaIH-#IE8ufiiU_<6aXRVX|_aD}A>ZMXs2=k9=?*FBx03Y`57{EJ-==xGR-@eyOw zxyYpja>aMfvQH*BkFv5(TY3kux3Y}+xVT}z^WvYp_IU(7~+8-}Pu$ z0>{NA+w_ZkFVS^$NWyobiEG<+%3`94>+3RQm9unNlGM7J;itjhxNTVff*y73lyBw` z^Gv6;cAYOEbCWJ{0w>-xZaT8f5NrE6OtqJH-T*X}gG)gE9o;V&F;s+!Kwv3M2tE_T z$!^9*wrLaFMo8cteAsJFcu0;u#@*LLZ+$|W{wlJq++KMnF0tu2twyh6ly>O(>Oza- z!=4Z46o0%}cYIUdQ;RD*rN(XEvF$S5GNWUrlIh^X83%Ba2w#G=B=ze~WpFBxsSQ2oF|jM0z5pQR>K9CYgGlu0rs&Bznaa(=RL|EvnE4N|D? 
zXyvnPd6zO!l7|^S_4|bx2=(Lb+fk72wXZkOLFBOE+h$3zRN2xfW-Q;sO_O6dkdb8O zfBAe4IU@twZFSNk8T}}Zpav#9%g5&#cm8=IkSlb_Nl2}?d@Jl1dF%CzrHo)x!EbGj ze%#N47g?3Y5r~cOwA>s%APw`vhG+EZAwP9(JQae+s_9>TLNE7Sj+$V+TW!#>9ra$F zE&JfnpQN%lX^6vph!umbqp_$7t-T9#ZHyj>S;V+qSGhpx`S@tkbn!2f7oT|C5}73ndA^s*jvY2jk`hvU4fUm9kbWg$x&oflxdB?R1g<` zZ23qiLK(&1stJ<|F?aL8wMEi%4|Oz%-qB2Pv`~ydZd#=ganwLsJkcss^>rysZc!qW zS=QD((CaAO8?w`Z@-sW|eB6x+6C^CuV- z3v5;0#&-`Fni<9=TOCp0SJOAJGo{bEJS*t^rNp%EK~IEogFNm9k1`>73L7(e>15O> zP?|agsj?NGS8XS41|HPWS>zMddRKZZ165Bc6oaKeva3_jiM4jvR3fafH#?KVA1>4T=9LjnNP%zs+`c4S$t_so>7_4*WbjMrVC7*1#?^2ShptQ;tdiL z46+jv#^ssgkcei0$CUPsclIWS0qb2R=(pI678sJY_%F+};0Y^=VYx)*txxEzqqE|GHSV zr{fxqUGPKIu|-#kn}s{9iJbAP6g5^H%2)Wit55Pb3*7|)YjIt3<8IIFjC|%(nx@_j zB{ov$H1^f&@N|{dGnUc@2ajub@(D>;;cj^^{EiQv98I2zmT?y(`CJEtCIODgh$));RLAwn*B<%*1iJ$vO|W!;nn!kF_NQ+A~#-g z(brpG?#I1OiFSiU$-llr&z+AyVmHC5FL{kmWxCI=GBS^=JlW07;Q{1u{jnFxUy~rA zzJ)eCXV-;+sreS*UBWWs=eF>AS(Y(wl|B0rWi(}Vl7WyJ!m_CF=(;j*|6!_|_=@vf z+$OLf2fa9iKUXZ}BQ9C|SL4jdf3?|+jH5)i@c&KO?En6MX>Xl`z1=};GyVs??x6l# zSG5fWsblqDK)ssfg%bvyQ=`CiJimXiBcE7*s#72wvoY|BXhi{DM2Rf%-~Tw+{(GU@ z^ZUc!obtf*gvDoLn*Cq3>O&S4GL#%71p7<=_B?2<&*yYAM83T3W}Cnue{x-Y)l}&wqFWJ$Yh$N`|@%?bY7=s9+&s_Lqgibr?9z zfR-7{hLvm6?5BLOU|2NamLjR~UX=e5(ORpGOP4L8EpAGODh`z$2LuDy~s6 zH+V=p;I-6uTsfQ0i3$W5@itvKIM+7&@)^}Jo$6I;Hf=R$-=)~fv2nqgV{gNef{o-8 zrQQL3b)q}IuvhJroL5i4O2hj$*KO~a$R_@f1(Wz>(i4sY^O>#lV~?60Li(Bs+jR^l3S_Jr*(@AcI#Z3~XRhQ~`^$q@|F`ENA6EMzIhl}k0M4-v;pkFj~) zg>)(B0}H#iSD|S#I*L=~s+)Jq#-zBCXx0X84tFD{LqkJKAVWTT!~74Fk4Ycr?Yd5~ zh8U}qid^LM=lQE{u`1QsNOd6k^$ZAz^#+TJ?Yhd*ZKcnUv%_AsA)fW(2$wz$d{EMG zN0TPPLi%Ai_QmYuQ?jh3O71AWmv4q%aB8cs?R%kU?U0n2=QkneY=TP7l+27N{O$Uq ziLbP-!t0d%yTc?wq$%2SX$L|@dK}8n+5OFn0ghJKiEVviUkge11zj<7QMb6{raGL+ zq6_-55uWPfKh=Nj^jCsRwaE(Vzg*lhlcS~4WQKW_c>E|yRg`l&W6L5$yKg`Kv)*Kl zxHl0Uf#|gwf-Mg>@ilXVI0r`H1lP%VTG&cdmyTWd`~y_0sl^{Y*G=5zAU!eM?-opd zX~!6sJFcB&_;;6YsPek2+HEf{%eytyi&41D)ki9FX}G&0_GEpXznxV23lc(h1UA*I z^k*{Ot=PBPBaS=AT#lKUMLt(>O>)*|qiY{zhY1;#(nh%LSnoy!GyndYQ=^k@@dRPD 
zyk|X=$~STdS~9;|b+jgj3#@!#{CVw@(tbUi(YIIrnhdNOkM`EmRPcF;%K};X<>#lf zkGXd1m?P%?5t>6?xbu%K%TUEQTAtC`97$v$(?L2)ASA7Rqwkv+OT3q zIV1zxyPR*8RkXbGsxtSMEITemy1&NMuh3r|4{{ETeM0X0vg#Kc-kFvV@{?%@8JOjN zSx_?RvIgGW@RvX(p_94t==0x;{rDd-u!gvdsk$PRKJPz1eEkHog#=_>v5Jq6w%iuu zN82@Ai|>pw)1Zmb%IBTuA^SId6|{1?s_To_#1Tii<7m&j%n^7M!=zE?-Gq;$j7`f; zznZOHa+8FUD!<6TZ?pGg*8X_D=G&O9l%2zMjZQwV&1vX>M|=Pu*_4m64Y||3_TK4c zW)Z>Ch8bl}$_kz7;k#1aAB(16YYFA-pYd1(Tb|6P#&0|Z&F7nLHxZhr2k$m-sx}@o zh^sDNwcI;--n4*d7<_7TcP!GAnPOU#Ot_#(?eP8P7^d#gqHO|_CMx&i)Up_F;qhEf zqrh+FXOro9AE)}<;?G>nTFiiocjAnBIlJoGa%^3#G9tzF^jB8}|LN6OkNqR7jdV(9 z&;8Jqj;D9#d8fk9rHGfgHD6fAYy|l}#)lOe9?3|`cTaAzXW1PyP~wceoXpNmyc2t2 z!_Ra`+xvMukj?GoSo>spN5@^#4Klr3Wp^g(?Xvdi_Bvylcs5R>3%e9%G$k+3ZR~VA zUYO0Az2_5L(cYR$dm!iM@|2P`q-A*-6{VO>(94S6<%;={R z{BgViZEE1@W3Dn!Wp=O9&vrJ}N@s)c0L|TIHvZzLYSY4oQstw}LR;T}nLwZ5%#GZA zlHkwe{l+G&%w-EBe2-aHA7&FIKDIF%OU53=d}FWXCzYHa*j`^jUOX^++^DTQ9(Uk& za%pS-gwnEZa}!-FOps}v`;54PRA6LF(VthVe^X^r!T0sNsaO<+)ZG(y>)2n;yhE>Q zC0a7*tjU%o(sRiSzep*PdkAR~xCRF1@S%zSl+)KBsbQ6b6^o_VR|^E?4kN8eNT0E?pge zvAq;Avs*Z^w_sP-?CdnY=rm|Hoo92rU^C-nvnMWTlAui``EzpV<7hy{S4YZ&g{@ki}!ShjP`)R&fyGcrHs;tRV^c!f(a4rPsna*saG2_VAiT1@rkRml5&gQ*ZY**N4dLek-0L^ z{`;!qO!h|51G7H2CN-a6adWN5LI1oraJq3~{^Wv0SzNR0+W?PrdRO`{p_n4v#?DHG zi^)if-pvWIjUWf9u6cvmR}##>ySqN{whnl*~rA8m_3G&=g)K401Ky2doBVNOlXiQ0UZQ_d zyaF@%+$P#abfo1$?B*9U_@gHYEzeb!2J4?T-?U8owToZ~xFF6uC*4d!Bh3KY^R$fe zuC@0|=5{JOac-iFxNuItM`?el`4|2gLf;?rNhf_c_NPluz#|T2!PwM4Q>TLlo9Wi} zXNJQJlH(~j>$KDCo?1abCte&ZC6`^!FyIP5NLdvLgabHKiF5IJS))7U#n+2zPHdvPDO z(SWWZKlPby4ia!p!#JgU&U)-J)PAZVH`M*wU2?$vEZBT6CB4d>`|AxgA$ewY?_#)r zihb;I^Yq_iozps@Ya{h)G`Y8}Es}=Ke%a1%&ct6|m~RnM%6>DaysM;h=#p}Dz`fC` zuk|Tv#esQddoK2>L05Fn_LZS)FI1PXa^(oX0w?Z5$X5 z8O1#I!xVpS1krshUpGNTPY{Z}C88r9tlNJxzjR3DT3kCFzy97SWMG28=W}*loNOVq zdVkR7eSXaaVS}*Io2RvfoQ)d6GCP|A)f+q8_+P^SBXq!tEw+oW!QbJe05* z>w2o*<=fajd6~RcALxo6NS|W6M8>7UzZYH{Ky$}Mh{y)H!?&^nmKnOMr32ESiOHO< zCG?hObeQi65NJV?r4_}x6Ah^=8?m%~6s7w#-^@YNOTL!q;V}0>=j_4W4zoDJ>8GPt 
z0vcH3PQB>G8GU%~?6nPwKD4#!rMBb1fB5>tl>bzXtfiLWCn8KjSa{lzF~aX|1?#QY zB#y(pcdJDs?i;tMQV#a@#Erl7)g_l+zb4vmV3nXfdCJ~gEwzB?j&JStv^>k_@JW$- z&`UDAq5mePH{NdR3c*GFQF`BpPyTs6e0rX=0p^ngH+U=_X5$S)lovMKeA^ujtTMkQ z6@}%Kle~g6Fq|$nD#qR7`r0V57AE!=2kmVj@r!?%& zQ!=amMZoX03Nu%F9lN{b8s~< zNhag!yC%b3^}1K>@8`ydojdA5t61PC)CF1-JYMmVM?7Jpoj}YFiFW>K{-vHUoSBlF zkbcF9;+@SfpD0P~pXM9+gO#||BB}|^>dvg-Wc!zPXd1=JdmA^t@DwN|C9CYujj%FP zYviWfOsSN0Q+=?QiXlV{=5e-%wOs3ymOb*HVaW24+jTFfFF)Tm=cOAveidF{hGL_8 zJp6mE_V&PlVthVM>0D6FE4erRFE?IV@b9T6gxo}w95pn*&5KsriN>;e&pDVn&vv(Ky+V~-QI*Umb;4w5f4HEmPNd}W$z3W=GGqONJ`nwgf0wLY>!EhMs%7MFZY z1!WSO8KgC4QkxGd{h#s-)g8SA#J6V3VYd^_OESr_xyKf&OPUz^L^*c zy}!Bl9FngCjkfJ~WYR^WCUta(iL^0Ne-JUpZ_T+letH%rYVpY}X3hDWzZ}(x2v&Wz ze^&MDzII+)YH}Cm(5&SmOHs|c0U~_!PMkjRr>zcvofS}>}b}W>3-THxh8wKjqmq%*#SGIuZB_`u8HkStvn%! zzi-#H)<&v9s@^FYbeW-4BS-1uuwk81q9gh9hxf!H3I@?uKuAMOEf@^3(TvciTLKq= zHc&HC-AEAt8^&X%I#_Hn~8^g*S97oTxezhwnH1Sy?@W z&~n)8epz9e_BlSVY~pLC#-RVRELLeTTVjGAjkMh{_L!J~*QDX!i3wbGA_2e5`}#}= zKD(an?!^lgD^_8a#Xb>C6tU~l@z>arUOc~5tVHv=-Zv#`*Ig&448W|Po9J`9GKDqM zc6h!a2|?0VWu$ni+#ypu|ClgfNBpz_goB)*r73QZPs(yAZ}(8e?NfAbTRoYn!mvCI zzd|-2j|I>r2;Pky^oVvbLEn+4E)gd8SIR!M&gL5`)j|ABDg=kNku?;4rj5!8Z1U`I zGwS>FJjzybi^~(i>vA^L>b7}*?*KSbA6=-51H1ENS+{$yqDn0ZX1wcfYBU;{f8e~QeQp0>#7MeY zw6`Ao1dlueEzCH(l{&1N(1%KWQ+cBia<)Q|wt99g`ACUId1CKdXT*`1gChXGX z!o1h<*0k0tI2%PbYs}$eGKD4$e&vbcR^(&v30);8!Ap|_A^@~%WCkzZ6v1AdlMOkf z_}@q<%qoMMHq90&G*h)we;iw~y>)LNM>=Ui*4f|bobbHnR?E;!H((4MI3tE7!8zu% zFIP6(mz9&(-zZ~?xLg^f&1dq|jT9Bh*~)|Y7@MnyhXrFy{Q7WlKJ&88|Z(muJn zIZ%D+DJ$bOi#?A#V)N00nYn8PWR?PO+>VF81gxJaTgxN$tx?*RdI)K4YjYE%waJI~w{7(-tv(>lxY_vG96@zC=xZ5$`lTPU8%m*o@gh}twh+r|>KmG0oNJ^=#u~4Xk}MX#s~Fi$`n365ztH1v zPeZ3qPjBxn74r?Rb2c6w*g4UTO*^#;ptp(+?6#wWwghN{226PG(qUhymW9WJkDkJ7 z`t?t9R_t5vue%!)VPBt_)V;sK>(&5QNK@HVE zc#KzzGbrN|61W_;4Hx?|WYg7)%uy&ttum)vvymU}SI8(F(uD#CZ2D4^2;qRhyAm|@ zg*4CE2ptgEA^YdVN%(iBs&MbFj(8^~GUgfgG2FiG25)kqR@L>}Vx*UP(;|fJxoBu; 
z2ptNqaYjhpxc_!O>*~nAd)|Dp)DefrUVhAFzd3E+{v^v{p_6{GC#4RqW7wBY4d1+vhn$ghuR{Lz^z1w)ocZ~0)d=(MO8C9wiuj|HLF4J^>9jjw0K>n8LZQ6K*jrI!&K?6M z`5#cn`xeEhHJOI*^c$O-bP|44!K`Ybs8M^5lcT+%?y*9PyR2&Y_c%DN!-4o@Y*Gbg zBm6dt!O|l`X%npKg_$jf<8VuGo%!x05*YnYtGU+cHZ(068QE7tTC*h7azCfT_WWZ{ zV*2&<_1iGPjA*CtcbxF?jU~|L$|2+x+~uw6wHJN=xlG8yF)vv_FlsO#b zclxw8*4Jb0w`PS;wz~?8i@D5(`FYKT4HaBBryH>Eu_$M{eW#b)ob+a&TwQIQ?}$r! z6ORv}^;JVcO6qE5WhIMh?l;)|ow*<6OG--IjJ4Cm-SLk%o7jIRCU#}(NgDO0HWXRS z?ZX-P9d}dX(O@@669WV{k!Vc*o0fqU*-wQF$yDBiys>vGfB>BW^u;3Q+$0F1B7!>vcYu34>^XP3*ok*CyoTyK6F->$x}PigrHQef{g{Wz{^RXRzZ` z@=eIwM2}d?&rXl1_h3mZ1Z|fuxfK-fQ487T9%sNh7Gf}6SvnHLBqRq*8Rg5noG!<9 z2)Kx4-IQHyGej}rQQ@hHf)ndAy$m;Fe;F%+xI==`J5s?-)Hn!OdS4fo`yE;3YOC5H&aA(KI-=&Ch z9isUyevK?Kz5Nr=Vw0*_I%r9qrkcma$iSdVU2Z<{Nq~c+{c1Q|meI<=p}1!wAmI8| zjW-$V7;0s(=W4i?;|foTvA>@muTqBQ$klK%Hg!Xk)7i7#Ro_UxWA($GC4++FS+r8s zSArI|gVMcG?!@?bmBzE6SNt*H%WQ9&S|pCuSeY|* zaRPhU{@<(NV-8EK9*blT6q~PxE2XP{U`|mhNGIo@6?N)QaH%biF1A@x4WSjZt}k&p zafmz|DnRlXw8e}p&&?Uw$VRlBVOD!ml+;2DY+WkU|H$3J^kuo8xp2Qrf1C8ZC9A(* zImg4pL*Vx9im2dV-GY-U8E(M*P4xKmboKtmR6_)eIu%y*q3K{SKw75FlA@%&we>>; z0mZ}2g+#fZ;x}Mqn1tzdgJ1S|n~8%);+@?u1N zDEa)t!*9j7p4!8n)XPczEfXym%2U_i&xDH(SQChDWPnQQ9Us)XO%=$A8~s3u&0`Vo;>C03tHS~wiv|fV0_nPw6`B%CEN2?6*G8iY3;7i@ zG@X^^3PTuU>^{K)=IGR2%Qqd;w=sdJxJOUl0h@JrWMpmN)++r9DeTX)%aDj>JsD#% zncJ9zoDXy3Q;m9(5!ka;e#YAZfS;Ote1z z$TX<)qyFOIA=KUWm4FffVN55P|DPS0IDw4}_Z!#@BBg30ds8CciPE%F& z$CDi-Ju9nk$OENp?W&8dQ36liydgF>O=|W%bIm9@TE##F{IHm+RXHvFo2SKvBZyjc zl5KNkX9iAC?<#(Je6SK~&^*Y%$f(vRXtP+It=AMT>MRJ`prWa%iL2=Z7DBbc0+sIVJNfcKN$Y!$oGHjazH5bJ&$gL?> z!!jv~Y4y=UeDVoW+Wnc@c8-oYdQGIZtHUo>2J?`ZiznI%ScYb(D?xe+ zU|VOj#2$OU(viFyYvpH~Eqr5{ka{9j=7)CBAzO)wi3SeKw(K|-)57H3VK8S20K(V& zB~A2p?Z?b`2fp!nDyAwERp@{CA_8$gzCKZffYjRi^OPGyD1=&-va{od=%quu95T~O zhF16WkufqdLi%VDS}UVo7fLp2DbG&u$TKi0=A{*}W0TLG zP6HlR-cBKnQhj$IsN>7tY>wK1T4y48N#ILZlpi`0yc0 z)M{{6&^CJLLdC{|x)n_i2{yuSZp@L1*O$pm_N*GNuQ;0~**tA$VXTph87Gem(Pk z&lRzMdbGB&yQ>e83%g=!W@e^+4wb=!96k0MT=@8$h{&GB?!aN=&>%x=CS8;SNqXhqibEcZ-FEg;XRj 
zfmg|nt*wf6ke{zQn)lVz%;fLs%gJ2_ zdI5Ozsr>ZNySn=6VsF|J+BtypGnSqRTPn7xw{(1qkK{yQT|>XEIt60l+-Dv?K{RIxsL0NTYm;BI~H*nn>iX&0@DUL=fn< z_B1uVSHVn`FxlU{yng95g*HcU*Cm$$LIMSP4>7a7(B*-D^ZxkKQq$?lQIXxoubJYt z-XK}ghQIDt4#qzdGAShAFf%ioZVK}!({k!a)-*Tgz`?N;tV}mY2Jx=t{j~A#EsY3cn+cC~FD#y4dtTrPeBjo%xw;vd!>b9ivPz|8I z)MviP#?xi|V}89wh%Zs5No}p?xUdN=CMI^gjthx~abjWuJEy=&+6$0clT%YFC3c); zWo3I#RF5A&_PNFR3ATHV*@)n;U%$Tj`xi7P6c~2Ub)_u;O&}Mx;~-&_(FHbQGEyA! z>B5!GjXGL+Ai|O61nv2zYL>Ij4aWyt0|iK0n8{#f`LEZg1tFg04Y&t!kc5i zr4NiyLRFPwrZFVvavxv=R>HZSo$LjM=UwE8Sevwv6KQ7xs@xJ}W{DKh7n_Cs5CTbO z9WI7tAYuWWSb~l$KOt38Qo2Y;BL(D~ww`}A9*8Go@}~|Cf-p=FKueDa8^j48dVyHg zef;W`mmZl9rD=cWN4P}@LFqgrLNgUB_iHDQ-$`>oMc^|f1 zD2GLmXM;0kD~H5oY*cP9wS(5%W&qIEN2^#)jG13gyp9sG@hj}-A08fIlKX|Q(Mpx< z>iaRdJZX|)d_d^s<@FvI+|%dJiOI!c^+EkWx)BRY5P++k74{sQVhAUODGzR3WSM)AWmwSoiyglqnf% z$*lR4UtnwI-hE-O1ut{@3%?B6R(ai<8Y67P86H9+V%<*P0 zwq-(OlAdLSs~=dsj(v)AeD3(05N%yq+EIVGvsmQmz|*tx!>P&Sv-4v33DS|Zvn`Qp zg(r<64=xjy?}&ulVPdLZeG&>mVY}o{Effm3H90x?-hOEcxSvY7iwG>^FIlwcc|5$a zjs6IT-xqOkOo=lAJOY7Vd$NNtrxb)_GQo^1%gQyC}j(lpW3Iwez zkY3h0xY4>Vmq~7M>-+rO5sBr&$|GfE0cU#Vm&|XI{0cYLdW8q$+er+ z8sBn;T%5``?lCfKtp=cw_A#;Ik9jL8D#Y5+y(4P4nN}yQm_lOvaIBLBA%C5U$|NUh z%C^>@RuoR|d{y<6O7ms$Jdi;py(EOY19@Id$RcsEa#rsO>8(7NB`5LoHadjZV~& z;j`Kpenyx7JRm&aMUYaYT!F%fPzk>M4XGI6i1~TLveUyQ7WbiV-!1_{r8Yu>tDZx< z%77Y3YuuV`X`)1yRKi-@0lcQs7z$fVp0`=*jaTD4Y9p&3!J43fb>agQdY277^df~2?qM!-jVC^ok z$o-?l^2=%@wQfnc^%}PeI>ML;i?B!eQn-VMgDhLv7g0!6<^b({xMyy2LVhaY!?9j!o(a3_W4EltZG47KCZN~ za(GXcLW&}%>#37b_n*h$eAWYb1vglDqngBNr$_1h`STp`OPMHsi@vGr#y^U%tN(nW z)BS*FTB(;^UADWGsWUy{^=~Pn#oXkoZK5t5rv0an5#mZ;Kz=Cm=-f;(NY7sca3Xa> z3c7ywj=YlLvUy)GCfB*BuEpqK(#OOC#vw)IT^@QKN#nwqa`uPYha!1KH&c|xtYvF7 z%M;5EE<*Owbly?Gu=^CL)N%#-?2>aGW+f4}On#g67nt@ysbfKlqO&CmYsyb& zO|CA;!1gn2Pxz={zdvYf{HCera7@uzKPLh>pzfejb-poSm|i~LZi^HHiRggy3*m?t z$MJT@xH5tN79Q@nDS4rqAwss(%2q zyfRuC`5ANVO{VVS#`9W{tW;c9FA{cyl-8uSJq;P$rT>bBP2n6gcW^^SyvDATfl)Q0 zJClC$b$BvnaTpts4AMV>0wJtw_@}|VW3h2k9<&vi6O`nc(rriT=>osODmO#j=T%q;`t{S!Wt7^ 
z`Lp;#ej%Z;oucWDOx*_J$Ox01foAYv!^K=h;XoLGGIhr5xQphdvP<>#FH&?se(J7` zmI6I_Q)D^gD70S}p#N`XIm^!M{3K`+UkuRg&#d1)w0jt+9T*-ZWqp)Rc{3*2yl-yb zpi}I4jPFchg3G-@o}UorJUob#YS=ztd)vCPm#Jw9U2ay7o(a&}&`Q-6>Crww@*OO^eB4Vc@tX%kNpuU}c8S_qZx zS{zZl!K&09U{!LskYb{i=lf@!>96?T7cXB`GGYjCF1`IN(&j>au>P-#P=;u1cAS2?;`6?)T@<=+z;;#PS<0qd=`Yh;tE|GUs{u#y5?(t@HuQ{6-mv-_eFceqvI^&^lg$7 zz2sC7-U9)Fh|Ek5U|)Yf;^KYt^J5SZi59lslmn@yns4F>_EG4Oh|S`S9D_D}pnBD{ zwUfYPAPu2O+n9^Q~)C{Maf7)PQDZ!nlk2#Kmv%n%si&OyaAXBgT7P z{}K2J1nAn?_{g8x_9r(>$vAQa+&WQW6=YgE#SNFfHxllDY{C| z^#yjC(C}QXFC{q)11!iJtg68fES3kGdVVw_jquOZzdLZ){7?ZqFzmPK>JbpFo4}T) zz}Z~tfFV^{C%`f3rVd;HRjvIX1AlxozoKxm@BtYz_^%dL~%RRboMq}1(rb&t5 zBjj5wk@ML1-gt`smftNzWb^4@YHsU|e^`BpSf2=a+^`y0sIlM$L;deIROqnEs3H#v zZryJ^4k^L>hU!8*{J^g(8X6jAdFDK>QU8VT8qCWWGd42=YUQ!NZ}%$>S=c8N^$s## z$#$pYbYr~ZiY9jQi5h-!*R@{H;yFjgM<)5L^|~uF4P^Zko($0er*vw}EhIDLUJt+J zY(s;$Mp|EcW86d1}cso3a`N3g{=MHVkM=>;y&=_>$tF9KGfg_ zQ~N?4pEQYsipw?xZH;(qtW1Dj7FDb}w{?-2l=wO17s)@OHuK$iItREo{h4;S+IO_K zPvY2Np7xOAmSb>|i-q~t;FRv=?^IP);rvm~qJx!4)xd;;+YgxvczJ^D%AlRSy-MEY z|5!;d6zo|SgJ*74fE9k}$`x!quCQo96GQtDtMa}-yJIDe7^1&}P@q(#7h#R|{LDb? z${7Q0R_6Odd@m&?Ki62X2G53L(Q#BVPwMk^Zod|}M%OkF>xe&4nM+SkFGzERDklO2 z{lY*FBh)kBb^Ln9qFxvPfwTctjvU85vlL#S67&E9pm2$4Y1?8%9F?=)Uw~>+-Pjl! 
zbhgXc2`X73Wp>@9a9E@TD>HQ;1114)tOmU8p-{{b&@8pEx!;%8yqVObB5EUmNMB7a z={aLX===S6X(Gk)Ie{QM7o1Wsos3A2h_av%1CFSlxA&Edl1;BCra+q!*pvn@r_+PE zw}0e5f(?reOcG+^zX5li+~P610hLw+wh)L7TdZ;6bg<*t(i2iJTI`b167|x=gyr+a z5$lZj1h@pvn}56KlMf!bF<*vnO5_Q7Ls`&awo&c zoh)Z(6CSjXZS@dPB|eys^M@+*4JcZ^+nB5=Ib2K$wS~fxUQer~YxSV%`rySi|LAJ3nx=)l z&8FZ7Z@(e+Tc;!Y@T|rQ#_Fw{ZQq;<$wUcB_@AD5`uopSg|d?O{&KG}owIYu#;GT} zUZK?YIZt7XfU+kkcbe;P;TWFJJ-&8iY(;| znv|&!y6r!(JsEhcSFYSW?4t2@9JI_^;;S$zoTd8fU}Pvw6sAC1g>BpiithrllkNmdk2q&1`?jv~J{D z4n^~HxiV2%$!)+-L!x8l)Up#To_ynh zZIg?2hKL=ugJZLwGSB=@e6j7!*2^HSG;YVK=jrzt<>Pc;z8catgZLq1` z6dK%&9($uHjMN;~(1<8AqfS!FILCqNVj%whok=ZoJX*_Vm=9!8?%4#M;@V(MmtG?W zMwWKiEN4YifA2>aMrciBj|vXzwAnUFuq!q4^HmB?9LyY0JX{oPx5x!#rf$AXb~%JfI(S4qKEJe!N}dZQ|u~4{$3(%Q=J* zZAaI&F|X;!kL?4hzstzL=GPJ!sHmr(Gmy+Odq|-u5mjdXQ9%ebxi~3~f%@9mc=1gk zTMpx@iwRKm_D@V~*?<$>1b#iuWBhF}-#59&duu8Z`Tc>Ttw-A(LYZZPr8sW!e36r( zf`TtgY8&QOIz8i+SEIQ-8t&XV&M~E=frQ4c6+@_+n|aJ-2%E*jqrrqXaxHh`(^5wN z4Nzc8Tc&4$rTdU*e%Wsg^<_Q5dUf#nFI$oR7j!fIpHuv|y!?M*e_8bmw6RqivTn}g zFk4yJ&^tnFRod&%y(1X;LhBEag3g)_o+FfeTxMMimSjxkVqc9ccA*&xlz!0<9&#?L zS!a;~tko+qp-Q7dFqoU=C{?QZL-z90BK9e;ux%leHU-aDge3^5#R`Ir* z#638HKZU!oeX1_u)m*3On9y0dNRddstsye4b^!@O?B8Al6BQE*k8JsQi? z#)TWxvCGg@_w$|UgEk`_EC~O;Btr4<-NBfv!G~rHuSu*%!}r_=D&OHY^^?-zW`?y? zN$LDU{zMDmtTzSXmQx8(8F5>*-@djsBKh+>J@$5fGcZ~i?wRSIY+m{KKykh5(*-*R z$Dv%wZ{AZuq^vIrogx^a(8zh`BaDxGGtQp_nvM#ecnrljc6G*IUFju|MIsNSW283l zZ)V$8!i`yKF{?L$E6wItPc3xosD=!YYriJqF;sS3i3cUdo9N{MR=uQ#1|O*O#k-!= zlw|3Et|BxUG}SB)R&DvwVO)Jl_+OuseSIX7%XEKz=<8BQR|4nelz&5@5Fc)$4ET&- zZL2_x(_DpF)EyZ5&aT!P<+R||z~WPVCIpOb|%xy})FG=jJ+}&MP!Au;Um7Ro=-<5ci>STX-wnH{>?i>|OBCM3KGJfA? 
zeX;!mQMb<+R9B*n@nPvEetD|O6S>My1uF)t;hfNKQJ^mUvZ3<;cXK*RZPmir*)}=) zbl3fBHsrEntC`6qa_;rbot91kz`yCrPP03k%FK%CRMMKxA>Gruq|#A#Lx)5|`5Q+S z9}6Arz#Bu~D!1p(xkml|v>~Wj%>FwSbchZ1>T>DXJ6(7c9BaPNS^ZpsTW`LdrGSix zC=gE)Vnc{rl2e98*lwseGL!{pJVaLXr}j|(bR(Kp357y@Cn53l_2acR5go|kwmY=O z54m4+yfJm)JpSm(5%wI8|IT6HM~$25(9Xe4exy9t9N9P9f9QS%C}8b1KjWeE*M9K& zexjG;LN(uFARvFgF90h&Jt|iiTGv=Pf>Al_(?Yy8Z6D9y*}4gQ!YfI>=*JHuW-(H7 zd)X(*-6#QaVP`=Y|GQrHa*hMc+Uh_NoS>Q?EJYfXD#x8{Cubg>ka-fuZ^LELV8g^X z&5|#cZ+Ds=8TRM1TGjbzH*L;Xr^$N|-+W+sF!>DH*`Dqgpl2$eL<1dSvbAWU%%hTR z;x5-yaz+CK!R=LN*;abk`n51biQZ&#ZWb*e0fC8?If;SAo}|4_$%iIGq>PC}XA`yT z64$=6s#gEL^Q6Dj;;vdKJ0t4jIZ%y}ics5?NmJe1Bxb>Ub=v5ZJU;1?g*Jkr5qUN2 zM!&|erNKg>VZIm~r~}h@7CSqXFEW1z04u|OOUNhzTZMqvf6IU!BhSlwu61H=O*UT8#_+5|Vwz2^BEbb6 zrulxa*0+Glqe1umC;@wjXSrRVN}L2qo(^8(HQL$EgDu1~n{vSTb6>kGW2Iy8c{hu& zNYQh+gTy*JL`7#e(q{rWO!v)s*)9OsRP@yrI#9cZj`O;!=A-N60LCKU2|@SE!vgU9_`PPy2au4a|E-s<#kgLY2lP4;V*Txd~&>n=thnV zCJP&kmK^W5iGh>Wnw$~*`<~0eG>Kx{kDA{Z<+D>AaVxt=wMtq8(a*4v;1TpUPncL0 zwYF+L;{5X`)j$CuVB>=DJ2!e|++3$4)6-}^e_SEk>&5Rc^icAZnYt!kqo(6xxe9R> zQ%sZ1ZFh9=xnmRd=}XOFZ4+C7 zYV{}a=B)nFCO9~Vr(>BZb-wS@!_Xow$NA3BOhy_b?n(ecmanW*i@9zSEkK-{2}{)|$>mtvbjba++KNiRidE@F3u zStE_&w8+w8wVN4Id^KDvoXeU?v#cz4e)wRQ>;Xj7UPv$GGe$@`-A27W02gI0Qv|l@ zTC(rO&w1Fc>5ngESOHN$q>P9x0}eG7gHbS~A-){pxO{61dV>8|h7PQoZ?Q!QR2>me zW;?e~L>6Z_9dque6@a^SXIMm}K`Hf?DRj%SyPQ$GB~^V5j0z90dGHS`L8j73ihR8m z@6_@mO4RZT`%r%T2#IX_(tv>{KH*$mn8-`NFT8e*4xz4O4#6{hTt-ZkYY&g%|LG5p8Rx~1i_@0qn}`!p`l%V zBUHzyQEV>28_K&@D$!bE&uxm%x;Jf0$dw)=5LBv`-yf_6e_TdtcI(D|3MhHl`>1Za@0k7g(^= z;|<7%xD+82Hc)>0YQyX1w&OZVS?6OVs2vAWOz2*;mmOy+s{^7-=@HX7zT;1LPQ$e`D+efHO%SWdS} zkN$eLL3nj~c%h4GIL{k!2S1eMr9Z7p{hloZM`}s}?d+m|pZ( zUe>~{jpiEkM(QfxiqX^zPScEL$S8f@k|2$=Pv^0kh5obFAOcF&(b@2bT57@ZiJ6+R zb~#`i->;K-cogtmkPN-IA7NmZF3%XLlpyvx9$#9^H5xXBpTzTQ_J=nJ+1b5N@{( z51h=?b8Wr|7t+7#_~eN|F_IQwoRMCD{H)?pV^8YKC9@GG-;+6T%3PEb7x(f7|Ko^X4=vf$nRb4+WJ5!^}m6ng&)F3P_c1xq#|O)U_940=Y5&v z&tU9FX-sEjS9Kbgp355YTi{#}diqjyo|C 
z{71Rt4GJK|L?z!x_PKv;RZ~-D7Pbj!1D1v6vfii9rof?d{HPn`3pvPIqb8;+KNG&w z-K8EZrF6S!I;7f2p+R`yvA3KR9G~!;BOR%AUwG$6o?32U`R5yu`B=J|^Mp5}(%Td< zg~g)dfyc7P@sfyr-6nnJt%ewCw60Vv+M1gXo(#^O4Dzd7jWW_868y-J^LG}AI|7} z!h?+kHe9^P<;`+evl-*w>Xxd{Nn)ojes;zK)s>>=y^4~RZPd`p5z($dlsGxtA5;mHikQ8K8b_&q(Ww-EDXbIRhqTUA0$a zqPZ+t=)QWETYCa+!*bOx85&I~P<>Rx%g@4?@3O4%nA8J+mqiMN!Cpu4_+jZ?doD91 zVxOVHj%=!XRrx(B1I3o~P;(tOi1}$5E%;lbwDG7-_;W_N*jQ@}K?Nv>2u{m;VXe`3 zLEq&r4Ft`NIc=raGQ1D|f6?C&@#(xpA$C$u#W-i~L`x{L#7FE9GB@z3^Ko8jTB=gYfDq;$PO0ytFemh1vm}yw%(-SE&BN^TB~xA!q2=A=63`K130H! z=GT@7hI)N{{qzUdCmt^K2LjJTaO@R~VWC~~?sK_P3mu1Rv@)xoE<|uU)?;^f@JJ@v)Zg{^kVE{b^~g%Sc(-kUhGxyLuIcLCCWGCJc92>$mbxdJC`Xeo znbY0(l9mkn9vZ5z;IeRR?=s)(Yn3Pi9TViQlu|+$cz(ynbIy9RGpWSm6CfAM$4O)} zLED1LCa5^*N*pNtKmsYpdb|rJD)8tfci$!&HCs>g+-d)gi-4tEs`!QD;}cJe`A67m z>imwRe1wD!C)+`KiGI~z&VSuotm!XvZqO>>|9G$&c%7`SQUg;+Md^*oH*pfQO>GH9 zHF$bP3EQ@YZuo@ZHc*_HgtTu z^p#c9L_Wi2Z}sY8e<0xx-guxWN8(Rj0#gWc*yflg0n{UkgOzpF9QeWWVQbWJc{JBE z?r4_?HF1tDJ$?GVoSSN!NZl5)?1TNE>lJ-t&=!WtaXqq?p%rbOD0Q^3w4?(R105L} zs!@DBfDy7)n_4ats~}4MYH5~E3(Ep7R0fQUY%Z3-#jnN9*i z{xtLgJR!sobq=2|Zkvo!DUp$3re2`={Ncp!nmgmE1g*Xzw6s+Fl4~liQJd6ueAqW< zNK>s@%qVAoBasFzbj$5`*y!(F7t+HYaCR#E5n4O%dOG1B@?gMq^BzJE|He3!!t9R* z3@q}wDV?U5A&GCvhOYEKhKkose6m?@Ueop6^~|G)v3>g+tL-&x4nb^%XqF+w#rRxVwgid<}T^_GlQVZ|JP zhS*R(QCRqkh2w*To?jO-_zd3_xaj*ui8Mh8D*GXePZt^6_}t915zCqVWDTMFH<-W2 zmsML}Th54z>NYQ&Z;!XoVPjItS|5&VIxL}=Y>+ycz-M0@kqoCZz0QDb#Mww~%J5OH7$`%aGIqKA|Z_v61h~0Jz z?MVp=jcKiSzmhbRNEnu(o;9TZ>cu5#ysNjNCYhj+d|mz5dC2t8FsSNs!n5X2E!n-$ z^)OjE@cGRL#L!nc;&SB6JLYV_?e{rRR=yO4Jk`X6M!C2=B)o=grkVYW$L%~KDB*JT z+L7Vk^FIh~ReWC=Vvb;^W$#UW88}>|Qff>l)*LZqE(YDYNgKds2g;%yW}3FFBe<3Y z9fJL;Jh;aX*@gU+3sCGg9d~t?JLC85?Aj+A7_oJ;96qz(wRN;P4s)$SMN$@S*aoks ztDoDn96UFg!niFb#6_W*1R++g*1NXldRCYJhlxTVwLVJJd3|I1{B3v+ZPDda;4Udk zER;tdrYN=~tV{=7=Ov_m760oQ4nH)Lw1C)H87)m6Eeg7L^TA;5fz^(YBACgr#;J`S ztr1l|J4^NnZ|F}C=kK1&(Zj{g+wK#O>~~G1&(F39N6?O~32kR;IbVID5xHlw2Irfs z)NgDp)M!rTP&&HNA6cr`r9YglcYLx@m+N@=A!QRnG}CTLv61Y4wb$d%^&uic=BA+q 
zr({|j23-s;YS~}RT7IYuS|6?O!N&Oo;p+poQ&d!X3$q`@EnD34fwS?ois_z|1B+4N z&AH}=UuDi|`tBs(zxS88mRC8m_Gh~r+*f!ngBA@P*lEYbGhCkuyBfvo3%fzhB|X9u zW9bfSD)IgmKDX#ea9E5(qoqlWAGI)_`R$C8Z4!@KznYdF9&e=w1#uBj-VQvvjeLYa ziXE|pk&uu(L`F1!`F)RpMawTJh@>2zT)y0EZd$q;wlau=#S7aZ!o~K=`))H(+cfS< zKpbNX{6~0^JVv@CsTm3J8&y#P`wk9V@j#U_FGBpjQ{P)H#1wPsHxOf-ckNaMuy1c{ z>vjvR!2|7BtZ7I};x&zd#bZ#$JuWf7gQyZh<)1fp6Be*sucAb%r&v|ZS*&k=2z7U# zSc8u6Xh9VaLeLzQq>vmu!a3%7r?22O)o4*pi{%w0At9!`S+`b~i*s|OAT2cQd>GT? zJ^dDl9D{AKv*CRIu`u>IDHE1bExXG5%D4}c-+IDwSZp5KlYadwD9mO7svU%g^QOvD zuU%mgr_sjy)|whURixV68=lO}!YaSc({Ik^37o2yIv`U0zhz>Yv88!lJ=y6dyo!Z> zWz3w(>(XLrmHwYkqP)H1Y$Ld)bGf@>hQ`pu_FLIU{hY)>L8gA?CXDyO*yP;2+^>+J zX7Gjlsj{f+f@BASsi8waS?cQJ(^K2ei(8eIx_i6&Py?%0FFs#!=zN4QRLn~r+3DHs zO&#-;pKHrm+qYv)cewOkB5`@R-Kz1=Ge4aC+Dk){(T+9UPoe@8aM1D7nPFm(KOYO?DPiRVpG@M zMgGoJeHz=|(F6_GN10DYOA0iePsF(AVIoA{ULK#A4PD^$5cQ92wO~X))9*xbnoYG@ z?s2)8nZ`KaXGFS86usU74XX9TA@yHvbXWrB-x?mZrk!h7@O&Q^K#24$>7_O1$-%}6*{({v@J8eAJHqTbkL$!H&F$F`leE5JwFi| zTrc3_3Xo7XB`+;`B6FcVqJQ7+)ohc2{rqN9XFQH_i7Dk}+fSQgXQ#${>h{McINwUj z{C9mJCI^CN2rk81e>@je?i4XV$;)poQjk5hu8oM}F{3y|UyAQcTMhScNu{NMDM;m>8M8F_#>Ck3dFQE~sQ7pwZ&MQBVe@ zq~uV(evXWU*zlx<&(`20ibk|KH_vz_pEl2sNE?q(wM_QKtI1MM9q?{S2L8Mb4gIdG zzJy1@piHBBo*!BpTEqR!#90+$O8u_rnh;Z5Sfnw$zy1%RTT((l=yeKv$ zGrJd7fR_%e#1#Yv9el5WYP2JOLOF}34*=!b z!w;52qN5)=zf_nE+~U%?NTQUQsq!@_Ah;TxqG&-EG~iey1a)C3Vk4a3dnbHO=V&oG z$%wI%Q^82OyDDWmc+WD*Z*trbh=9(H@4?^y9<@ZOmF}!*E&vmB^rtmgEI)gbZL9?S zHP0(69swv>%r}!?LyJf@E%vOKq4$hNir=2?>re!<%R=;?j^uc7XJrMmDhJ?D3+jHL zqRlm|OpX>hFc{)}RdU2e&=^*CHT5dtoaK8u_U=F017;jm^XVG)i(RthFr!VV<=9tQ z@2Nx`L!tKs3l`-A1LB#^(SlB5j=Yp~bW9^@l$G?a%#Vi!@}EE3fc_oZg4a%^k8#Bb zsor(}diK`o%-Ja9{H75FWWoS#I&d+wp&uNaj%zlN(ueO>4xup-C zk0X&hIx7WZgUjZYLht}!^Vz24bL$214q!nPq_x?Z91t&c>}zFB&9Y*7gH06aBHxM@ zJZKHoFuc&-#R(^A*5|f5+Ri#^LbxSjgdX4#CcuQM%Sh>N^rpchqTDM}{>XzZii1|6 zW_a!sx~s7HLf^E%Vj{=jgYwUFXifBh?Nhf5cFOXz7+`SO4t;FB%cIf}=Prcz=P-WO zbncmOiMcQTRYUBoS%SU9EisZZ1!{5w(ECi!u<2yZ(ItBDh3f)7|1g@rRky6?E~X%> 
zqE(Os#AeCE|9HVPT<$`sSwriGvr?h2;X79zB3&d}{&jKwe||*~2K${l-+ub}(4NJ5 z+}^DCCtv=b7IFtR=MY)cLpK?KMHcWX*dgODtRH?@Sh&#^AThbNsYmibqmn zD!ezxM^07mi&1yT?D?CW#M+JTUq8Ep{xJSqi=bNBry6J4(CD2(0E2M7v&l zROW_&&Fv0fcod3@cPmHtvb?ys@^A)iUQ=+EU|YH4?t0T?&5gzoZQqa6 zD>_}LC(Q%7p7bY2Q;VDBx8#!yUkmaYzC#T&wxJE4+3^FYzhzczHj9QynV6c;mXBe- zeY*)ABs}`h38=nf#>R$CON6TVzahzmZh~N0AFlZLg<9}w2Do;e7(N8BeGm-^vXQ)s zIW<77N{*H?j?bcB>dC*gSyio}d*)`G(>Vv{w{eqs9=7qkTj#8G?z(H8wa&fg+;#6-_peq9-|zP`z3=l5&-2-I{ll2M zyiA!OLhqMGC2lH6^qK>`8UCvesqf>S0i_2V+J$RTLDyFB<1i`L&!7L(CM0e9<&XjE zY6Fou$(;f-UvSZL4t^7^&7G(ivJhZP&dQgJ`Egg~{8{hfpv@QxtjF2s#u~qg%*p zL~zv8*E_$~a1*+qMcD%kdWXjZe@F!?Rc-Z$(8v9J9;2xpF_knv+iOr|%8T6)T`ja8 zRg*koMnU(*k-g+q7l$sgrBzfat0T33e6e*IG4;O4M+_pqsZ0)aV&kGk2wkCAtlY6f z1+pTOop6?^ZWiBaYyMTKbY&!}H%?wo;ZwK3usw7~(m(I|F|a&t4txE#s4tPJ23@U2 z(C$dNwPS}2c%5rq;WxOj5D8Vj$JD%3rPHx5UIkIRDEf&v;KLwD}&gP=~i4a zyrRc?VeQ^$Vh${lcB-ZQi?6To_&DJQRFGSiKtV}i)(&k1UKU9(y ziN$ir#E_8jxE%Bt;B31))^%+H!=+|>P-8aOTb6|ZsOHMi(0LK?OwGs2?Y(nmd_hOT znqblT(|J}7XhKlkRvV#WQEKXQTynCWV?xDZpr-&vU|Rn_HQ81@Wf|spxo6t%t!L)s zxef9I#>s^QQFZiL$E~d;rJCTMIqHGEGrhpo-Deptl$L=^85{IsxBqg)>l&y)H1QFF zW9#rxBChy8Tu2Rp>5X$7lF%RU34So}QYEj+6iu+Zto>u8i{lCgaN)uI=0Y3KHmTDz zJO-0`3?s@L`(^}gV)g8R`k4k>!-67Bu4TrUB$V=H!)!* zzmd0~D61n%V7pT6bU835=OMPrqUBMVEVaj7yB1It6BgyCA@X2pB1@J=U!8y`81WUS zvN|~mn{vD%2G+s|glae`C`@Y8lRLhdKMZW4&+`;$-Iyi6m;3S3Of0v`bDfI}S6I&j z9QS?w>c`}44o93sxn>DfRX{FCmk66R8iUGTsrjj?v9YamU*818+C_;ZNLzXJXD3RT zocTB785uJ=s-%MoE==}D<|@XT>)2_!2}(34$Pyux@74>5$dB~m2A%*gz-FkNxp}&o68LD^54i;OdDRo9xV?3)1|SwL zKF+e$cI_*5CxITUEa*V$;?xXwP2k6a(JBX_yf$|Rjqi2QxUwwBcmS$r-=veyis{Hlb^l|?JOPHgM~LAZGB3JI1IHo|e(c*gDFULObKJxVbvp@O~9FQA8uJ@#K>HUbt)MCh2Mv zd5^A`DfYJsAq2fPIIj%*>||qDBe?%Q`CTF~=&Tp?aL@YQ3ga8FdA1-Z!4SXh zWHX2Qj}iTd2h4_lB_XI3RaDX&xy#>T2NM=WoQrMt07j5ZP#^!{IW$=1KZnCr*J2Qa zhI0V^7u>8NB5UgYc?a@u(vcr8DY}=f;Fp~cpAWh+qPaDW=l3feWT*pcBOSMHNG1N^ zHmL+R_jcLt3RAM-KZh`jjY5V?uMSx!2eU2jmpp7Ui@A1{bOgVUWXS}_n2Vk5baJ#4 
zW>k1?u`=#Uurt>wl<)p_r-;KX;=M#NkR$T;(vwNvml=_j)kaKpd#pcO?NV7`rWv|3lTCYJw7g-~gqzo_71XIG3{qpOACj0|kgczAE?Wdd_iJShnoctYia zgk51PCnrL-)kX(oC^+I+w zWYnsI5L4DJl>I(Smq?)C(9$6Yv)MjbviPLLl^dlHW4tkyJU%%SpDHe%BuGOIsx2;m zo^pf;I=qC~OGmO8!hQf47WkG#?HOHW!^0HS*lDQ0xHLw55L1lg(kH%)25`0~6puej z9^LOR2#cHT?aVY&%pB(vuqGKHj6B6)jQBgDctU4`RY+xh@MNu9OWP-5thmgRXyMAs z55)WJJorp&d1EL~)YqG$jrFSI4=aR`o-5N}dPV+{w^dJ#i{`zKGNXzcHDG6=S@GY%^Q5EiQqO_8a8$4_mJggn7rs0*!Z z^CF|BT4})^o1r#Qf^vsND@T>PkS9%QPT({CedNaaTs>dRjY!A!8;~WqgteZ`i4bQ6 zKcJa#v%Y=~8k)a!X=JXrC-Cz?{+{IIL{mM6_S%5hlm&O0bO!WOGc)Y9N)U?!ZJ-qC z!bAA(Y?Z`v=gl`3uL|{4MPdL5^qsBU)Yyf!6+F-1U9>s0#HIgL`z37)9I-d76R~UmXj>`S>)Jy-kC7h?1 zh!q`$q!h>5^L2$xA_H^0tnV&F5xDqc_3W#XT_7{H+~qY`KrZ$V^aR``M!x?WEm41W z;D03J+o+GiA$va>K?=jS7v4$&{iy+mHIV;Lqya$vzp8KiUthh(zp!ktG$j~HSJx`d zf@l|3EeA1G`54oZBqkX`$G*Lm;c7?jN&C6QYB7ExmP38J^V92~n41S~m!_bj)cq{< zr%4$#$CukWSg&|961zO=5)tlgWN$GEnVBN@@7Fvjw7sZbOo{35-$sUph_r6ZD_(#6 z{UkZHZJ{O0>9-MvzSzK$lHR?GD`a9h@aLjKkdmR+!xrb9rgsg84w(T)U3i36PMTT{ zF4JYO_}8y@622Tq$S82LM&f6u&H)+=On`<&`}LW!m-VK3CA%ig@%3&V+mFgT6iaOE zP#f+$8>wrvo1Au~k;(LiPU-6DrC!^XHfhRCPc;{7QJ9@gyEMac3(H|~k@*2ad*LFV zw6t|YhdU`NEB^}C)n>YjV{siZ^%;IgyKEr$Dwk};0vgv7hO$c6ZctZO2UhGqEI#Lz zN?g!CEytT>S;Xq|%xEE9>#{RYCh|($d#1f0i#n(DOvT&FS%MC%l_?G|P`RFG+HXYd ziyY5=a+*%>%QF)X3R;HGCb6GvGJ-d&mG94L8~7}@N|f_>IBDtWV>#3(EgO(oRjGCEUd&)%`A*r<+FtMQ2m$biYngv}^vXk5n@@0oY^rafHU+8FGCET8H(ja#rTjw2Ox z9-G<9_Vx&JiHF1#Su`xq*_m$77t2Nf!&v_Z#7o{`o4@d18bw*TI43YaU@Sl-R_j{^@Pz}5$183yFWA}YU_@}mK@+W zhb5{zJ7c-g?_~)q>zo)hb@e`!)%vYLplV0kmD_~n=iekd#`#$(E}av@uDtsmz5*uK z3W)GINJB|3{a~ZEkCU9*ya6L4lmC!QkXOXqjJ` zz!-Bw%;ZE5NlOct)530)CJRJP#zbP%^{uTf^(-B}$||ih44wgG>l7~C24{m@kMdJ&y6l7kRXHHn5 z%X|UYXNVxO_of{FgHHlZQUX8TwcO1pz zXP2I|CW`$2)2{D;ScSHFtwAPPr!^XqeS*cEgIzx1PBaMA-ZHcZ7073L__LE}yOyC( zGKGe5R}_cq*Dl*ufu}xraU_-F@%IL%b+%s2O!z!- zr#3*TJRIM6jkt+2x7*ug=Qn6+5tM$6LMylxFCQNv812MZO+M&w_mB2l(bEf{cO6N| z$i*=Mr}zlVvKHdIOWc`w&{8CjUN=ND+m)?y>~ zBH&9u>r2|EKK;#m$Et@31nyn?89~^c4$@=%c&pz1Q}?DjRQef9BJ=a}@t14O8f(og 
ztrr%OK;lTUAttjC=IFfmt}|$1+``<{_!FdtSa2b}2`;EnhZ%w+_3WS>=*PB?>o@`44b!3{^3KTpM}NA^dRlf%a!iQ5>$H*}|}r!U3{E}MuS zlv1gPF0+9L{pG|F59bnu-_3WouEZ5=E-it(ssYRVsyp+5`r0)MobQ1v1Wus9$Y;^Q zI5zB?e&%BFTR3|(r{9A7Dc=WirW(TPHITcx*8S}+8R#iTEMPG!dz&_d{T&z1Crf-U zUK|Yy9g@`3qja7{8>7}!;4ln{fz6lzkF>NZDZ@x6uzJc|?5hSGi?9Y*n#o?wGWV(W2 zaY3{Rn$YDk)b854XM6AP-}c^q3O}yy&g3|I-F_Z5bBe*G>E}baIt?TzA?%85rM!Xy z(a^(e!le?)YNc-7C1_@_8ty=(2YLW(b8DPv>*zKEf0J?LE9yM2q%el5B|t8>6FNC` z3#h;-oS-(We?yZf=*ae=5Vzf)*4&r7m~THS2m6os8*tJYCD5@TDgVQ=lW}`3PRf31 z)zw)Hw)l!Yk%lMiUeV|f4=66$TOiu3C@p~;F2=hH4fiIz|sv8s}BooPcjf;V>p$T~-lD&Ke5i=lW!5yB{21L& zAPDXG`t32vl$LtIH5DhIOK=Xim(wmqOtF$f0kiu4YYIb8x#f}H^Zb}>gNOLOTr7Cm zDDL5djeK$6eVvhwm6iNK3mJE>-J_oaiTpZhKPUzGf7A4&@;k*~KEOzEN?%Sv0jRh9 z)y>USYDr={D1nl+LDf$8boF*gO!h5kf}-A<(`e*#ngvESK8 zDMA^8>k{Gp)juu$IfrFsY@vBe^;2~eGS3B+T^YS>HVBokJ1H-8G&s&}Etj#e8>Bt` zo#*DJ=Xoht2Q}qJz@r~e_5U!U8}=Qh1LR)h=)An%Ebq2;^2`~oj#9#o(=>18Hy;?C zq4zqa=Vo?QeSOfot5)x>$Qpz*4OT6W6C1tLELwxWnGfjeN7VUuG45gRNi*MXP8e3@ zn@7O8`t|EP>lFdr7*OTQ%!B_l_GV<%^RJV_1560QzWZwY7_08*&2KLs@cw%<^(fxF zxw!LdI|KhZc1ZO6oo81==)#P##g>dhc~NzB&8CfXa&V}YD=AnYEC~BIdv#buUkHXw zz(k}|AYbplD&ziVZV1{a0&Aqv#Se%<*>r*qJ{~bz$pPU}+})cTKr?Phe7e2%8{GH% z5c<}!Df)Y<3SpMRXo(^Oeap)EaF$`r+O&Uv5#k&OZh;<_=s)BF>E1g&0Caw4u+Y0E zIgH~h8j(j2=~;P4^@~F*dIbU>V)-nnIzP)^h!Sz7J3`^t|}-!wf<-atMgx??wfMh{zu7AM~wB_;_*e?mBj4V`Hfz z-l7oL5xi>#c1@4nTNVmGP$N~zM_7MA6pNpqv>{9?J z6o1D(&=%`M_O$CO7pE?N5*c{`y!Eak)@Be1^-V8vg6TzOW}NHFGN6b$?@xDu*zyd! ztuE|z{vNm$FxF@@toKe20_>=l*Y5O?`!6Cz*_uEK-UR`kTvCc&*l7WdFJgI2dj)^9 zzLDG4jKkH@@skP0j%ZyVi6H3ZmX3#3L?egW$5^_zbMk zv#?P3MMRXRa*oe}eC(5k$)*bp{rQ3JJM)oLVbI76j8d5LIk>Tdr04gBjJS9&^L0^2 zwxPHCyv&bEbIU6#UIHo1yF$%n#gvT%0>2~KV)k$e&}rB{^1ol2FJNx3x_o;vC7>4bVvtCybIY^8NYN?RCB!U+?~ueO_cFoQyZW zejy%X0G#Gp^*&DkROJ6`a6|FarvkXr$EeteqY|dIbk|X0cMn0!j1C4=(3a6j{a8`! 
zT@-D;8;ZE|B><%Zi+TTD*SGF?B8z#y_^OC&Haibbl2o9}ZAc_Q{BfcpB~g;(hNefr zSWRZ+`|~Og+zHuVimtA91H;+sN7@Vkh4gmZ_Z_Rj0_wf(y(&cO`Uqia*rdYG?5FMC z2}?_@3nCk04bkrL9_t3AAU;P^x}jq`@N;<#)$5pVpVIRbCQ>x?C#N*HSJzGWt$-yAOR{Nkz_vM5rduVFn=4+l(wsxZM2LHII5zbP__@rZjT@qK z<~FoY&uPCswlUV^_uW6PoHW#`B4IJ|2Q=wM`X{L8X8`8I$y(0d5pkX& zfrxE+{nSVEW_mkq*F~H&+0V%4kWMKF^}*at@=rJ(Rmvwymi`~vTnbCBs< znoOv&cin7q5``cWu%JJj%C;Xy*D_6xQXtJX7w*C(0B8JGr&RJhUEfGGDiL@7e#aqk zI)IB9^*uI&r?i#0o@(5jA6UOCxFtj)DVcWLb7S)?w?UYnpI0@;@M&uj3U2-AfaeaN zW~~t6FpLkPMaf-VYzZd$76%oR%?J*ipC|YC?^o9*nZtTPP^VDh;%(Wt)RR8~8H9L< zg?X%>fI94Ys1pElo@Z^VUQ72i6=HpY!MsI9M^`9vdoL_a5hKOTz(dmXnDLv?A!|)@ z{)DUskT2&n)cV@yR~6`Lzcj)Rt_<|(rTX(!I=|GHbbar?(KwZ(I)2pHq}6^xPH<-< zt7>pC`I}|}=mX{qW@E!cL@?1DhV2ko9->iE7$?rt?nu75w<1Iwuu0o<#|!zbYQSPm z{N@7>znaKz@S~y*qPsd@Rm2(#8IAit|NX^)!`_Vli*nvX1OET8oEK5AjAiW-aS+nX kM-MqX|EGCz69=RX7jnbO$q^lJ=Ol^z17+!K$)_*>1saPm+5i9m diff --git a/code/ui_functions.py b/code/ui_functions.py index 1019600..0a49837 100644 --- a/code/ui_functions.py +++ b/code/ui_functions.py @@ -83,7 +83,14 @@ def uiDefinitions(self): self.ui.frame_plts.hide() self.ui.checkBox_fc.hide() self.ui.checkBox_ttest.hide() - self.dialog.ui.checkBox_bootstrap.setChecked(True) + # "Bootstrap Analysis" and "Collapse Technical Replicates" moved + # from this global plot-config dialog onto their one relevant + # plot's own switcher bar (plot_dendrogram's "Bootstrap" checkbox, + # plot_ordination's "Collapse Replicates" checkbox) -- hide the + # now-orphaned dialog widgets rather than editing the generated + # ui_plotparam.py. + self.dialog.ui.frame_bootstrap.hide() + self.dialog.ui.frame_2.hide() # Top bar functions self.ui.btn_maximize.clicked.connect(lambda: UIFunctions.maximize_restore(self)) diff --git a/devnotes.md b/devnotes.md index 9b496b7..c0c842d 100644 --- a/devnotes.md +++ b/devnotes.md @@ -221,6 +221,14 @@ only handles the combo boxes, axes, and pick events. 
instead (`test_ordination.py`'s synthetic-replicate-structure test, cross- checked against real example data with a scratch script during development). +- **The checkbox itself later moved off the global plot-config dialog**: + `checkBox_collapsereps` only ever affected this one plot, so it's now + `plot_ordination`'s own "Collapse Replicates" checkbox (`self.collapse_replicates`, + default `True`) in its switcher bar, same move as `plot_dendrogram`'s + "Bootstrap" checkbox (see the dendrogram section). The dialog checkbox's + containing frame (`frame_2`) is hidden at runtime rather than edited out + of generated code. This field was never in `paramfields.CHECKBOX_FIELDS` + (wasn't pickled before either), so no save/load behavior changed. - **Loadings view and high-dimensional data**: thousands of features can't all be drawn legibly, so only the top-25 by loading-vector magnitude are shown by default (`ordination.top_loadings()`). Whichever feature is @@ -316,6 +324,34 @@ substitution pattern as `plot_ordination`'s method/view bar): Both views' purity math is the same Qt-free linkage-traversal logic in `clusterpurity.py`, unit-tested in `tests/test_clusterpurity.py`. +- **Red is "bridge" coloring, not "any impure ancestor"**: an earlier + version colored every impure merge red, including every ancestor above a + single mixing event all the way to the root -- since almost any real + dataset has *some* mixing somewhere, this painted most of the tree's + upper structure red regardless of how localized the actual problem was. + `purity_link_color_func()` now distinguishes three states per merge: pure + (`true_color`/green), a *bridge* -- impure, but at least one child was + itself pure, i.e. this is the specific merge where a different group + first gets bridged in (`false_color`/red), or *neutral* -- impure with + both children already impure, i.e. 
just continuing an already-known mix + further up the tree with no new information (`neutral_color`/black, the + same color as "no coloring" so it visually recedes). Verified with a + hand-built 4-group linkage matrix (deterministic merge order, no + clustering ambiguity) asserting the root of two already-mixed clades is + black, not red, while the two actual group-meeting points are red. +- **Bootstrap is now a per-tab checkbox, not a global one**: the + plot-config dialog's "Bootstrap Analysis" checkbox (`checkBox_bootstrap`) + only ever affected this one plot, so it moved into `plot_dendrogram`'s own + switcher bar (`self.bootstrap`, default `True` -- matching the effective + startup default the old checkbox was forced to in `UIFunctions`, which + differed from its own Designer-set default of `False`). The dialog + checkbox's containing frame (`frame_bootstrap`) is hidden at runtime in + `UIFunctions` rather than edited out of the generated `ui_plotparam.py`. + `('bootstrap', ...)` was also dropped from `paramfields.CHECKBOX_FIELDS`, + so it's no longer saved into `.mpct` files -- consistent with the + dendrogram's other per-tab state (View, Color), none of which persist + across save/load either. + - **Purity is a strict, whole-group check, not "any uniform subset"**: a label only counts as pure if *every* leaf carrying it ends up in one clade before that clade touches a different label — 2 of a Sample's 3 replicates From 5ca9baff7535c8759f5786b4e736cb89098f96c3 Mon Sep 17 00:00:00 2001 From: Robert Samples Date: Mon, 29 Jun 2026 18:45:43 -0400 Subject: [PATCH 12/20] Fix dendrogram coloring: red = proven non-monophyly (label-set overlap) The previous "bridge vs neutral" heuristic still mis-colored real data: it could mark a high-level merge red just because one side was a single freshly-introduced pure clade, and could miss genuine tangles where two already-impure children share a label without either side being trivially pure. 
purity_link_color_func now classifies each merge by comparing the two children's label sets directly: disjoint sets (no label in common) -> neutral/black, a clean join even if one side is impure from an unrelated tangle further down; overlapping sets -> red, definitive proof some label's leaves are split across this exact merge. Verified against the real dataset's bootstrap dendrogram: only the actual scattered-replicate merges render red, and every higher-level merge joining that region with cleanly-resolved samples stays black. Co-Authored-By: Claude Sonnet 4.6 --- code/clusterpurity.py | 47 +++++++++-------- code/tests/test_clusterpurity.py | 90 +++++++++++++++++--------------- devnotes.md | 51 ++++++++++++------ 3 files changed, 111 insertions(+), 77 deletions(-) diff --git a/code/clusterpurity.py b/code/clusterpurity.py index acf8657..5e8ebee 100644 --- a/code/clusterpurity.py +++ b/code/clusterpurity.py @@ -17,21 +17,28 @@ def purity_link_color_func(Z, leaf_labels, true_color='green', false_color='red', neutral_color='black'): """Build a ``link_color_func`` for ``scipy.cluster.hierarchy.dendrogram``. - Three-way coloring, not just pure-vs-not: + Three-way coloring, classified by comparing the two children's label + sets (not by simply asking "is the merge result impure", which would + paint every ancestor of a single mixing event red all the way to the + root): - - ``true_color`` ("pure"/monophyletic): every leaf under this link + - ``true_color`` ("monophyletic"): the two children's label sets are + identical and contain exactly one label -- every leaf under this link shares one label. - - ``false_color`` ("bridge"): this link is impure, but at least one of - its two children was itself pure (a single leaf counts as trivially - pure) -- this is the *specific* merge where a different label first - gets bridged in, i.e. exactly the "bridge sample"/"two groups meet - here" point. 
- - ``neutral_color``: this link is impure AND both children were already - impure -- i.e. it's just continuing an already-known mix further up - the tree, not new information. Without this third state, every - ancestor of a single bridge point would also render in - ``false_color``, painting most of the upper tree the "bad" color even - though only one merge actually caused it. + - ``false_color`` ("polyphyletic"): the two children's label sets + *overlap* (share at least one label) without being identical-and- + singleton -- this is definitive proof that some label's leaves are + split apart by this exact merge (some of that label is on each side), + i.e. genuinely non-monophyletic, not just "still impure from before". + - ``neutral_color``: the two children's label sets are *disjoint* (no + label in common) -- this merge simply joins two regions that don't + contradict each other; it's a clean bridge even if one or both + children are themselves impure from a *different* label's tangle + further down. This is what keeps a single low-level tangle from + cascading red all the way up the tree: once a tangled label's clade + stops growing (nothing more of that label to fold in), every merge + above it only ever joins disjoint regions, so it reverts to + ``neutral_color``. 
Args: Z: linkage matrix (``scipy.cluster.hierarchy.linkage`` or @@ -47,21 +54,19 @@ def purity_link_color_func(Z, leaf_labels, true_color='green', false_color='red' """ n_leaves = len(leaf_labels) leaf_label_sets = {i: {leaf_labels[i]} for i in range(n_leaves)} - is_pure = {i: True for i in range(n_leaves)} # every leaf is trivially pure colors = {} for i, row in enumerate(Z): a, b = int(row[0]), int(row[1]) node_id = n_leaves + i - merged = leaf_label_sets[a] | leaf_label_sets[b] + set_a, set_b = leaf_label_sets[a], leaf_label_sets[b] + merged = set_a | set_b leaf_label_sets[node_id] = merged - pure = len(merged) == 1 - is_pure[node_id] = pure - if pure: + if len(merged) == 1: colors[node_id] = true_color - elif is_pure[a] or is_pure[b]: - colors[node_id] = false_color - else: + elif set_a.isdisjoint(set_b): colors[node_id] = neutral_color + else: + colors[node_id] = false_color return lambda k: colors.get(k, neutral_color) diff --git a/code/tests/test_clusterpurity.py b/code/tests/test_clusterpurity.py index 7138761..87cc3de 100644 --- a/code/tests/test_clusterpurity.py +++ b/code/tests/test_clusterpurity.py @@ -19,6 +19,30 @@ def _two_clean_groups(): return data, labels +def _scattered_pair_linkage(): + """Hand-built linkage (not derived from real coordinates, so the merge + order is exact and unambiguous) reproducing the real-data pattern that + motivated the overlap-based coloring rule: labels P and Q are each + split across two separate leaves that DON'T merge with each other + first (P0, Q0, Q1, P1 -- interleaved, not P0+P1 then Q0+Q1), so neither + P nor Q is monophyletic -- plus an unrelated label R that cleanly joins + in afterward and should NOT show as part of the tangle. + + Leaves: 0=P, 1=Q, 2=Q, 3=P, 4=R. + Merge order: (1,2)=Q+Q pure; (0, that)=P+{Q} disjoint bridge; + (3, that)=P+{P,Q} overlap -- the actual tangle; (4, that)=R+{P,Q} + disjoint again (R was never part of the P/Q mixing). 
+ """ + Z = np.array([ + [1, 2, 0.1, 2], # node 5: Q+Q -> {Q} (pure) + [0, 5, 1.0, 3], # node 6: P + {Q} -> {P,Q} (disjoint) + [3, 6, 2.0, 4], # node 7: P + {P,Q} -> {P,Q} (overlap!) + [4, 7, 3.0, 5], # node 8: R + {P,Q} -> {P,Q,R} (disjoint) + ]) + labels = ['P', 'Q', 'Q', 'P', 'R'] + return Z, labels + + def test_purity_summary_both_groups_pure(): data, labels = _two_clean_groups() Z = linkage(data, method='ward') @@ -26,66 +50,50 @@ def test_purity_summary_both_groups_pure(): assert (n_pure, n_total) == (2, 2) -def test_purity_link_color_func_root_bridges_two_pure_clades(): +def test_purity_link_color_func_clean_disjoint_groups_stay_neutral_even_at_root(): data, labels = _two_clean_groups() Z = linkage(data, method='ward') n_leaves = len(labels) color_func = purity_link_color_func(Z, labels) # The final merge (root) joins group A's whole clade with group B's - # whole clade -- both children are themselves pure, so this is exactly - # the "bridge" merge (the one and only point the two groups meet) and - # must be the false/bridge color, not the neutral one. + # whole clade -- their label sets are disjoint ({A} vs {B}, no overlap), + # so this is a clean join, not evidence either group is non- + # monophyletic. It must be the neutral color, NOT the polyphyletic one + # -- two cleanly-resolved groups simply existing in the same tree isn't + # itself a problem. root_node_id = n_leaves + len(Z) - 1 - assert color_func(root_node_id) == 'red' + assert color_func(root_node_id) == 'black' # Every internal node strictly below the root is a within-group merge - # for this dataset (each group's 3 points cluster before the cross-group - # merge) -- those links must be the "pure" color. + # for this dataset -- those links must be the monophyletic color. 
for i in range(len(Z) - 1): node_id = n_leaves + i assert color_func(node_id) == 'green' -def test_purity_link_color_func_does_not_cascade_red_up_the_whole_tree(): - # Hand-built linkage (not derived from real coordinates, so the merge - # order is exact and unambiguous): 4 groups of 2 leaves each -- - # A=(0,1), B=(2,3), C=(4,5), D=(6,7). Merge order: each group merges - # with itself first (pure), then A+B bridge, then C+D bridge, then the - # root merges the two already-impure (A+B) and (C+D) clades together. - # Z columns: [child1, child2, distance (unused), count (unused)]. - Z = np.array([ - [0, 1, 0.1, 2], # node 8: A+A (pure) - [2, 3, 0.1, 2], # node 9: B+B (pure) - [8, 9, 5.0, 4], # node 10: A+B (bridge -- both children pure) - [4, 5, 0.1, 2], # node 11: C+C (pure) - [6, 7, 0.1, 2], # node 12: D+D (pure) - [11, 12, 5.0, 4], # node 13: C+D (bridge -- both children pure) - [10, 13, 50.0, 8], # node 14: (A+B)+(C+D) -- root - ]) - labels = ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'] +def test_purity_link_color_func_overlap_is_the_only_red_and_it_does_not_cascade(): + Z, labels = _scattered_pair_linkage() color_func = purity_link_color_func(Z, labels) - assert color_func(8) == 'green' # A+A - assert color_func(9) == 'green' # B+B - assert color_func(10) == 'red' # A+B bridge - assert color_func(11) == 'green' # C+C - assert color_func(12) == 'green' # D+D - assert color_func(13) == 'red' # C+D bridge - # The root combines two ALREADY-impure clades -- no new bridge event, - # so it must NOT also render red (that's the "entire tree turns red" - # behaviour this function is specifically built to avoid). 
- assert color_func(14) == 'black' + assert color_func(5) == 'green' # Q+Q, monophyletic + assert color_func(6) == 'black' # P + {Q}: disjoint, clean bridge + assert color_func(7) == 'red' # P + {P,Q}: OVERLAP -- the actual tangle + # R joining afterward is disjoint from {P,Q} -- R was never part of the + # P/Q mixing, so this must NOT also render red just because it's above + # (contains) the node-7 tangle. This is the specific behaviour this + # rule exists for: a real, low-level tangle must not paint every + # ancestor red all the way to the root. + assert color_func(8) == 'black' def test_purity_link_color_func_custom_colors(): - data, labels = _two_clean_groups() - Z = linkage(data, method='ward') - color_func = purity_link_color_func(Z, labels, true_color='cyan', false_color='grey') - n_leaves = len(labels) - root_node_id = n_leaves + len(Z) - 1 - assert color_func(root_node_id) == 'grey' - assert color_func(n_leaves) == 'cyan' + Z, labels = _scattered_pair_linkage() + color_func = purity_link_color_func(Z, labels, true_color='cyan', false_color='magenta', neutral_color='grey') + assert color_func(5) == 'cyan' + assert color_func(6) == 'grey' + assert color_func(7) == 'magenta' + assert color_func(8) == 'grey' def test_purity_summary_one_mismatched_leaf_breaks_purity_for_its_group(): diff --git a/devnotes.md b/devnotes.md index c0c842d..a521abb 100644 --- a/devnotes.md +++ b/devnotes.md @@ -324,21 +324,42 @@ substitution pattern as `plot_ordination`'s method/view bar): Both views' purity math is the same Qt-free linkage-traversal logic in `clusterpurity.py`, unit-tested in `tests/test_clusterpurity.py`. -- **Red is "bridge" coloring, not "any impure ancestor"**: an earlier - version colored every impure merge red, including every ancestor above a - single mixing event all the way to the root -- since almost any real - dataset has *some* mixing somewhere, this painted most of the tree's - upper structure red regardless of how localized the actual problem was. 
- `purity_link_color_func()` now distinguishes three states per merge: pure - (`true_color`/green), a *bridge* -- impure, but at least one child was - itself pure, i.e. this is the specific merge where a different group - first gets bridged in (`false_color`/red), or *neutral* -- impure with - both children already impure, i.e. just continuing an already-known mix - further up the tree with no new information (`neutral_color`/black, the - same color as "no coloring" so it visually recedes). Verified with a - hand-built 4-group linkage matrix (deterministic merge order, no - clustering ambiguity) asserting the root of two already-mixed clades is - black, not red, while the two actual group-meeting points are red. +- **Red marks proven non-monophyly (overlap), not "any impure merge"**: two + earlier attempts both got this wrong in opposite directions. First, every + impure merge was colored red, including every ancestor above a single + mixing event all the way to the root -- since almost any real dataset has + *some* mixing somewhere, this painted most of the tree's upper structure + red regardless of how localized the problem was. The second attempt + ("impure but at least one child was pure = bridge = red, both children + already impure = neutral") fixed the worst of the cascading but still + mis-colored real data: it could still mark a high-level merge red merely + because one side happened to be a single freshly-introduced pure clade, + *and* it could miss real tangles where two already-impure children share + a label without one side being trivially pure. + + `purity_link_color_func()` now compares the two children's label sets + directly at each merge: + - identical and a single label -> monophyletic (`true_color`/green). + - **disjoint** (no label in common) -> neutral (`neutral_color`/black) -- + a clean join of two regions that don't contradict each other, *even if + one or both children are themselves impure from a different label's + tangle further down*. 
This is what stops a low-level tangle from + cascading: once a tangled label has nothing more of itself left to fold + in, every merge above it only ever joins disjoint regions, so it goes + back to black. + - **overlap** (share >=1 label, without being identical-and-singleton) -> + polyphyletic (`false_color`/red) -- definitive proof that some label's + leaves are split across this exact merge (present on both sides), not + just "still mixed from an earlier merge". + + Verified against the real example dataset's bootstrap dendrogram (the + case that exposed both earlier bugs): only the two merges that actually + re-unite a scattered sample's replicates (e.g. one sample's reps split + into two non-sister sub-clades that only meet again higher up) render + red; the higher-level merges joining that region with cleanly-resolved, + unrelated samples stay black, same as a hand-built synthetic linkage + (`tests/test_clusterpurity.py`'s `_scattered_pair_linkage`) reproducing + the same pattern deterministically. - **Bootstrap is now a per-tab checkbox, not a global one**: the plot-config dialog's "Bootstrap Analysis" checkbox (`checkBox_bootstrap`) only ever affected this one plot, so it moved into `plot_dendrogram`'s own From 66b3e0240a8a9d1aadda6fc237315cc55b23d713 Mon Sep 17 00:00:00 2001 From: Robert Samples Date: Mon, 29 Jun 2026 19:04:35 -0400 Subject: [PATCH 13/20] Dendrogram: add Use Sample/Group Names labels; fix AU/BP label scaling - ordination.replicate_label_components() numbers each Injection's biological and technical replicate rank (BioRep#/TechRep#) within its Biolgroup/Sample, unconditionally (works fine when either count is 1). A new "Use Sample/Group Names" checkbox in the dendrogram's switcher bar swaps the raw file/injection names for <Biolgroup>_b<BioRep#>_s<TechRep#> (or <Biolgroup>_b<BioRep#> alone in the Biological Replicates view) -- useful when the real file names are long or uninformative. 
- pvclust.plot_dendrogram's AU/BP annotations used a fixed icoord-unit x-shift that shrank to an ever-smaller pixel gap as leaf count grew (icoord-to-pixel ratio shrinks with more leaves in the same plot width), eventually merging "AU"/"BP" into illegible overlapping text. Fixed with ax.annotate(..., textcoords='offset points', ha='right'/'left'), which keeps a constant pixel gap regardless of leaf count, plus leaf-count-scaled fontsize. Also removed a plt.figure()/plt.tight_layout() pair that created and abandoned an unused Figure on every redraw. Co-Authored-By: Claude Sonnet 4.6 --- code/ordination.py | 47 +++++++++++++++++++++++++++++ code/plotting.py | 36 ++++++++++++++++++++-- code/pvclust.py | 56 +++++++++++++++++++++++------------ code/tests/test_ordination.py | 40 +++++++++++++++++++++++-- devnotes.md | 44 +++++++++++++++++++++++++-- 5 files changed, 198 insertions(+), 25 deletions(-) diff --git a/code/ordination.py b/code/ordination.py index c941453..5356010 100644 --- a/code/ordination.py +++ b/code/ordination.py @@ -100,6 +100,53 @@ def load_ordination_matrix(file, raw_msdata_header, collapse_replicates): return x, biolgroup +def replicate_label_components(raw_msdata_header): + """Derive (Biolgroup, BioRep#, TechRep#) for every Injection, for + building short ``biolgroupname_b<BioRep#>_s<TechRep#>``-style display + labels as an alternative to raw (sometimes long/uninformative) file + names -- used by the dendrogram tab's "Use Sample/Group Names" toggle. + + BioRep# is the 1-based rank of an Injection's Sample among all distinct + Samples sharing the same Biolgroup (first-seen order in the header); + TechRep# is the 1-based rank of the Injection among all Injections + sharing the same Sample (first-seen order). Both are always assigned + starting at 1, so a Biolgroup with only one Sample still gets "_b1", and + a Sample with only one Injection still gets "_s1" -- there's no minimum- + replicate-count special case to handle. 
+ + Args: + raw_msdata_header: the peak table's 3 header rows, read raw + (``header=None, index_col=[0,1,2]).iloc[:3,:].transpose()``) -- + same format ``load_ordination_matrix`` takes. + + Returns: + DataFrame indexed by Injection, columns ``['Biolgroup', 'Sample', + 'BioRep', 'TechRep']`` (``BioRep``/``TechRep`` are 1-based ints). + """ + header = raw_msdata_header.copy() + header.columns = ['Biolgroup', 'Sample', 'Injection'] + + samples_seen_per_biolgroup = {} + biorep_of_sample = {} + for _, row in header.drop_duplicates('Sample').iterrows(): + biolgroup, sample = row['Biolgroup'], row['Sample'] + samples_seen_per_biolgroup.setdefault(biolgroup, []) + samples_seen_per_biolgroup[biolgroup].append(sample) + biorep_of_sample[sample] = len(samples_seen_per_biolgroup[biolgroup]) + + injections_seen_per_sample = {} + techrep_of_injection = {} + for _, row in header.iterrows(): + sample, injection = row['Sample'], row['Injection'] + injections_seen_per_sample.setdefault(sample, []) + injections_seen_per_sample[sample].append(injection) + techrep_of_injection[injection] = len(injections_seen_per_sample[sample]) + + header['BioRep'] = header['Sample'].map(biorep_of_sample) + header['TechRep'] = header['Injection'].map(techrep_of_injection) + return header.set_index('Injection')[['Biolgroup', 'Sample', 'BioRep', 'TechRep']] + + def autoscale(x): """Mean-center and scale each feature to unit variance ("UV-scaling" / "autoscaling" in chemometrics terminology -- the standard pre-treatment diff --git a/code/plotting.py b/code/plotting.py index 1467cd0..80133c9 100644 --- a/code/plotting.py +++ b/code/plotting.py @@ -826,6 +826,13 @@ class plot_dendrogram(ui_plot): switcher bar (formerly the plot-config dialog's global "Bootstrap Analysis" checkbox -- moved here since it only ever applied to this plot). The purity-coloring math lives in the Qt-free clusterpurity.py. 
+ + A "Use Sample/Group Names" checkbox swaps the leaf labels from the raw + file/injection names (which can be long or uninformative) to + ``<Biolgroup>_b<BioRep#>_s<TechRep#>`` (Technical Replicates view) or + ``<Biolgroup>_b<BioRep#>`` (Biological Replicates view, no TechRep# + since replicates are already collapsed) -- see + ``ordination.replicate_label_components()``. """ VIEWS = ('Technical Replicates', 'Biological Replicates') @@ -844,6 +851,7 @@ def __init__(self, parent, currplt, frame, file, filtereddfs, groupsets): self.view = 'Technical Replicates' self.color_mode = 'Purity' self.bootstrap = True + self.use_sample_names = False self._build_switcher_bar(parent, currplt) self.plot(parent, file, filtereddfs, groupsets) @@ -872,11 +880,17 @@ def _build_switcher_bar(self, parent, currplt): bootstrap_check.setChecked(self.bootstrap) bootstrap_check.toggled.connect(self._on_bootstrap_toggled) layout.addWidget(bootstrap_check) + + use_names_check = QtWidgets.QCheckBox('Use Sample/Group Names') + use_names_check.setChecked(self.use_sample_names) + use_names_check.toggled.connect(self._on_use_sample_names_toggled) + layout.addWidget(use_names_check) layout.addStretch() self.view_combo = view_combo self.color_combo = color_combo self.bootstrap_check = bootstrap_check + self.use_names_check = use_names_check parent.pltlayout[currplt].insertWidget(0, bar) def _on_view_changed(self, view): @@ -891,6 +905,22 @@ def _on_bootstrap_toggled(self, checked): self.bootstrap = checked self.reset(self._last_file, self._last_filtereddfs, self._last_groupsets) + def _on_use_sample_names_toggled(self, checked): + self.use_sample_names = checked + self.reset(self._last_file, self._last_filtereddfs, self._last_groupsets) + + def _display_labels(self, raw_header, textlabels): + """Build short ``Biolgroup_b#[_s#]`` leaf labels in place of the raw + file/injection names, when "Use Sample/Group Names" is checked.""" + components = ordination.replicate_label_components(raw_header) + if self.view == 'Biological Replicates': + per_sample 
= components.drop_duplicates('Sample').set_index('Sample') + return [f"{per_sample.loc[sample, 'Biolgroup']}_b{per_sample.loc[sample, 'BioRep']}" for sample in textlabels] + return [ + f"{components.loc[injection, 'Biolgroup']}_b{components.loc[injection, 'BioRep']}_s{components.loc[injection, 'TechRep']}" + for injection in textlabels + ] + def plot(self, parent, file, filtereddfs, groupsets): self._last_file = file self._last_filtereddfs = filtereddfs @@ -945,10 +975,12 @@ def plot(self, parent, file, filtereddfs, groupsets): else: link_color_func = None # plain black dendrogram, scipy's own default + display_labels = self._display_labels(raw_header, textlabels) if self.use_sample_names else textlabels + if self.bootstrap: - dend = pv.plot(parent.ax[self.currplt], labels=textlabels, link_color_func=link_color_func) + dend = pv.plot(parent.ax[self.currplt], labels=display_labels, link_color_func=link_color_func) else: - dend = shc.dendrogram(Z, ax=parent.ax[self.currplt], leaf_rotation=90, color_threshold=0, above_threshold_color='black', link_color_func=link_color_func, labels=textlabels) # default leaf label size 16 + dend = shc.dendrogram(Z, ax=parent.ax[self.currplt], leaf_rotation=90, color_threshold=0, above_threshold_color='black', link_color_func=link_color_func, labels=display_labels) # default leaf label size 16 if self.color_mode == 'Purity': n_pure, n_total = clusterpurity.purity_summary(Z, leaf_labels) diff --git a/code/pvclust.py b/code/pvclust.py index 1efc2dd..6943b83 100644 --- a/code/pvclust.py +++ b/code/pvclust.py @@ -282,9 +282,6 @@ def plot_dendrogram(linkage_matrix, pvalues, axis, labels=None, link_color_func= y = {i: j[1] for i, j in enumerate(ycoord)} pos = node_positions(y, x) - - plt.figure(figsize=(12, 8)) - plt.tight_layout() set_link_color_palette(['c', 'g']) # link_color_func, when given, takes priority over color_threshold/ # above_threshold_color (scipy's own precedence rule) -- that's how the @@ -295,25 +292,46 @@ def 
plot_dendrogram(linkage_matrix, pvalues, axis, labels=None, link_color_func= link_color_func=link_color_func) maxval = max(y.values()) ax = axis - for node, (x, y) in pos.items(): #modifications added to scale y axis label shifts + + # AU/BP labels used to be positioned with a fixed x-shift in icoord + # units (e.g. "x-7"). icoord spacing is always 10 units per leaf + # regardless of leaf count, but the AXES' actual pixel width is not -- + # with more leaves squeezed into the same plot width, each icoord unit + # maps to fewer pixels, so a fixed icoord offset shrinks to a fixed + # *fraction* of leaf spacing but an ever-shrinking number of *pixels*, + # eventually overlapping (e.g. "AU"/"BP" merging into "AUBP" once there + # are enough leaves). Anchoring with ha='right'/'left' plus a constant + # offset in POINTS (not data units) keeps a fixed pixel gap regardless + # of leaf count or icoord scale -- no more digit-width-dependent x-shift + # hack for AU values of 100 vs not. Per-node fontsize is similarly + # scaled down as leaf count grows, so neighbouring nodes' labels (which + # do have a fixed minimum icoord-and-therefore-pixel separation) don't + # run into each other either. 
+ n_leaves = len(d['ivl']) + value_fontsize = max(5, min(8, 140 / n_leaves)) + header_fontsize = value_fontsize + 3 + gap_points = 2 + + for node, (nx, ny) in pos.items(): #modifications added to scale y axis label shifts + y_offset = ny + maxval / 200 if node == (len(pos.items())-1): - ax.text(x-6, y+maxval/200, 'AU', fontsize=11, fontweight='bold', - color='purple') - ax.text(x+1, y+maxval/200, 'BP', fontsize=11, fontweight='bold', - color='black') + ax.annotate('AU', xy=(nx, y_offset), xytext=(-gap_points, 0), + textcoords='offset points', ha='right', va='bottom', + fontsize=header_fontsize, fontweight='bold', color='purple') + ax.annotate('BP', xy=(nx, y_offset), xytext=(gap_points, 0), + textcoords='offset points', ha='left', va='bottom', + fontsize=header_fontsize, fontweight='bold', color='black') else: - if pvalues[node][0]*100 == 100: - ax.text(x-10, y+maxval/200, f' {pvalues[node][0]*100:.0f}', fontsize=8, - color='purple', fontweight='bold') - ax.text(x+1, y+maxval/200, f'{pvalues[node][1]*100:.0f}', fontsize=8, - color='black', fontweight='bold') - else: - ax.text(x-7, y+maxval/200, f' {pvalues[node][0]*100:.0f}', fontsize=8, - color='purple') - ax.text(x+1, y+maxval/200, f'{pvalues[node][1]*100:.0f}', fontsize=8, - color='black') -# plt.savefig('dendrogram.pdf') + au_significant = pvalues[node][0] * 100 == 100 + ax.annotate(f'{pvalues[node][0]*100:.0f}', xy=(nx, y_offset), xytext=(-gap_points, 0), + textcoords='offset points', ha='right', va='bottom', + fontsize=value_fontsize, color='purple', + fontweight='bold' if au_significant else 'normal') + ax.annotate(f'{pvalues[node][1]*100:.0f}', xy=(nx, y_offset), xytext=(gap_points, 0), + textcoords='offset points', ha='left', va='bottom', + fontsize=value_fontsize, color='black', + fontweight='bold' if au_significant else 'normal') def node_positions(x, y): diff --git a/code/tests/test_ordination.py b/code/tests/test_ordination.py index 1813600..f8dfc48 100644 --- a/code/tests/test_ordination.py +++ 
b/code/tests/test_ordination.py @@ -12,8 +12,8 @@ import pytest from ordination import ( - load_ordination_matrix, nmds_loading_proxy, run_nmds, run_pca, run_plsda, - top_loadings, + load_ordination_matrix, nmds_loading_proxy, replicate_label_components, + run_nmds, run_pca, run_plsda, top_loadings, ) @@ -76,6 +76,42 @@ def test_collapsed_values_are_the_mean_of_their_technical_replicates(tmp_path, m assert s1_row['feat1'].iloc[0] == pytest.approx(11.0) +# --------------------------------------------------------------------------- # +# replicate_label_components +# --------------------------------------------------------------------------- # + +def test_replicate_label_components_numbers_bio_and_tech_reps(tmp_path): + # Reuses the same fixture as the collapse tests: groupA has 2 Samples + # (S1, S1b -- BioRep 1 and 2), groupB has 1 Sample (S2 -- BioRep 1, the + # "only one biological replicate" edge case), every Sample has 3 + # Injections (TechRep 1-3). + path = tmp_path / 'example_filtered.csv' + _write_synthetic_filtered_csv(path) + components = replicate_label_components(_raw_header(path)) + + assert components.loc['inj1', ['Biolgroup', 'Sample', 'BioRep', 'TechRep']].tolist() == ['groupA', 'S1', 1, 1] + assert components.loc['inj3', ['Biolgroup', 'Sample', 'BioRep', 'TechRep']].tolist() == ['groupA', 'S1', 1, 3] + assert components.loc['inj4', ['Biolgroup', 'Sample', 'BioRep', 'TechRep']].tolist() == ['groupA', 'S1b', 2, 1] + # groupB has only one Sample -- still BioRep 1, not skipped/blank. + assert components.loc['inj7', ['Biolgroup', 'Sample', 'BioRep', 'TechRep']].tolist() == ['groupB', 'S2', 1, 1] + + +def test_replicate_label_components_single_technical_replicate(tmp_path): + # Edge case: a Sample with only one Injection should still get + # TechRep=1, not be skipped or raise. 
+ path = tmp_path / 'single_techrep_filtered.csv' + with open(path, 'w') as f: + f.write(',,,groupA,groupA,groupB\n') + f.write(',,,S1,S1b,S2\n') + f.write('Compound,m/z,Retention time (min),inj1,inj2,inj3\n') + f.write('feat1,100.0,1.0,10,30,50\n') + components = replicate_label_components(_raw_header(path)) + + assert components.loc['inj1', ['Sample', 'BioRep', 'TechRep']].tolist() == ['S1', 1, 1] + assert components.loc['inj2', ['Sample', 'BioRep', 'TechRep']].tolist() == ['S1b', 2, 1] + assert components.loc['inj3', ['Sample', 'BioRep', 'TechRep']].tolist() == ['S2', 1, 1] + + # --------------------------------------------------------------------------- # # run_pca / run_nmds / run_plsda / top_loadings # --------------------------------------------------------------------------- # diff --git a/devnotes.md b/devnotes.md index a521abb..ece3cf0 100644 --- a/devnotes.md +++ b/devnotes.md @@ -291,8 +291,10 @@ only handles the combo boxes, axes, and pick events. ## Dendrogram purity coloring (`plotting.plot_dendrogram`, `clusterpurity.py`) -The dendrogram tab has two combo-box switchers (same runtime-widget- -substitution pattern as `plot_ordination`'s method/view bar): +The dendrogram tab has a switcher bar (same runtime-widget-substitution +pattern as `plot_ordination`'s method/view bar) with two combo boxes (View, +Color) and two checkboxes (Bootstrap, Use Sample/Group Names -- both +documented further down, formerly/newly local to this tab respectively): - **View** — which leaves to cluster: - **Technical Replicates** (default — matches the tab's original @@ -408,6 +410,44 @@ Both views' purity math is the same Qt-free linkage-traversal logic in `freeze_support()` specifically). The real app is fine — `main.py` already guards its entry point — but throwaway test scripts need the same discipline. 
+- **"Use Sample/Group Names" leaf labels** (`ordination.replicate_label_components()`): + swaps the raw file/injection names for `<Biolgroup>_b<BioRep#>_s<TechRep#>` + (Technical Replicates view) or `<Biolgroup>_b<BioRep#>` (Biological + Replicates view -- no TechRep#, since that view already collapsed + technical replicates), for when the real file names are long or + uninformative. BioRep# is the 1-based rank of a Sample within its + Biolgroup (first-seen order); TechRep# is the 1-based rank of an + Injection within its Sample -- both numbers are assigned unconditionally, + so a Biolgroup with only one Sample still shows `_b1` and a Sample with + only one Injection still shows `_s1` (no special-casing needed for either + edge case, verified in `test_ordination.py`). This only changes the + `labels=` argument passed to `dendrogram()`/`PvClust.plot()` -- the + underlying data orientation, clustering, and purity-coloring lookups all + still key off the raw names internally. +- **AU/BP label scaling, regardless of leaf count**: the bootstrap + dendrogram's per-node AU/BP annotations used to be positioned with a + fixed x-shift in *icoord* units (e.g. `x-7`, with a separate `x-10` for + 3-digit "100" values). icoord spacing is always 10 units per leaf no + matter how many leaves there are, but the axes' actual pixel width isn't + -- with more leaves squeezed into the same plot width, each icoord unit + maps to fewer and fewer pixels, so that fixed icoord offset shrinks to an + ever-smaller *pixel* gap, eventually merging "AU"/"BP" into "AUBP" (and + every node's AU/BP pair into illegible overlapping text) once there are + enough leaves. Fixed by switching to `ax.annotate(..., xytext=(±2, 0), + textcoords='offset points', ha='right'/'left')`: a constant gap in + *points* (real pixels-at-a-given-DPI) stays a constant gap regardless of + icoord scale or leaf count, and `ha='right'`/`ha='left'` anchoring makes + the old digit-width-dependent branching (-7 vs -10) unnecessary entirely. 
+ Per-node fontsize is also now scaled down as leaf count grows + (`max(5, min(8, 140 / n_leaves))`) so neighbouring *different* nodes' + labels -- which do have a fixed minimum icoord (and therefore pixel) + separation -- don't run into each other either. Verified by rendering + both a 6-leaf and a 27-leaf synthetic tree (matching the real 9-sample + x3-techrep dataset's leaf count) and visually confirming no overlap in + either. Also removed a `plt.figure(figsize=(12,8))`/`plt.tight_layout()` + pair that created and immediately abandoned an unused Figure on every + call -- it never affected the actual target `axis` and was a real (if + small) per-redraw resource leak. ## Treemap / upset plot canvases (`plotting.plot_treemap`, `plotting.plot_upset`) From 9697aa38e4147d1d4cfd4ee776e6be72a5737c21 Mon Sep 17 00:00:00 2001 From: Robert Samples Date: Mon, 29 Jun 2026 19:23:33 -0400 Subject: [PATCH 14/20] Docs: update mkdocs guide for ordination rework and dendrogram improvements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - multivariate.md: rewrite from NMDS-only to cover the full PCA/NMDS/PLS-DA ordination tab — method switcher, scores/loadings view, collapse-replicates checkbox, stress metric for NMDS, %explained for PCA/PLS-DA. - group-analysis.md: document all four dendrogram switcher-bar controls (View, Color, Bootstrap, Use Sample/Group Names). - changelog.md: add 2026 entries for ordination rework, dendrogram purity coloring/switchers/label options, AU/BP annotation fix, and canvas-plot UpSet/treemap. - development.md: add ordination.py, clusterpurity.py, csvcache.py to the hand-written-code list; add ordination to the test-coverage list. - index.md: expand the "mid-2026 updates" note to mention ordination and dendrogram reworks. 
Co-Authored-By: Claude Sonnet 4.6 --- docs/changelog.md | 27 ++++++++++++++++-- docs/development.md | 9 ++++-- docs/index.md | 12 ++++---- docs/plots/group-analysis.md | 51 +++++++++++++++++++++++++++++---- docs/plots/multivariate.md | 55 +++++++++++++++++++++++++++--------- 5 files changed, 125 insertions(+), 29 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index b58901b..c9475ef 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -9,8 +9,31 @@ Metaboscape) and automatic GNPS2-compatible re-indexing of MSP/MGF fragment databases on export. - MVC refactor of "Plot Feature Sets" (`groupsets.py`). -- `fastcluster` optional acceleration for hierarchical clustering/bootstrap - dendrogram. +- Multivariate ordination rework: the former NMDS-only tab now supports + **PCA**, **NMDS**, and **PLS-DA** via a method switcher bar, plus a + scores/loadings view toggle and a loadings highlight synced to feature + selection elsewhere in the app. PCA/PLS-DA features are autoscaled + (mean-center + unit variance) before fitting; NMDS stays on raw + abundances (conventional for Bray-Curtis). A new Qt-free backend + (`ordination.py`) handles all ordination math and is covered by + headless unit tests. +- Dendrogram rework: + - Per-plot switcher bar for **View** (Technical vs Biological + Replicates), **Color** (Purity / None), **Bootstrap**, and + **Use Sample/Group Names**. + - Purity coloring: green branches are cleanly within-sample/group; + red branches mark the exact merge point where two groups' leaves + overlap (proven non-monophyly). A title reports how many + samples/groups are fully correctly clustered. + - "Use Sample/Group Names" checkbox: replaces raw injection/file names + with `<Biolgroup>_b<#>_s<#>` (Technical Replicates view) or + `<Biolgroup>_b<#>` (Biological Replicates view). + - Bootstrap can be toggled off for a faster undecorated dendrogram. 
+ - Fixed AU/BP annotation alignment: labels now stay a constant pixel + gap apart regardless of leaf count, with leaf-count-scaled font size. + - `fastcluster` optional acceleration for bootstrap linkage. +- UpSet and treemap plots are now rendered directly on a Qt canvas + (replacing a PNG round-trip). - Headless unit test suite (`code/tests/`). - Hardened dependency installer to prevent NumPy 2.x environment breaks. - Numerous bugfixes: Spearman double-colorbar on re-run, highlight not diff --git a/docs/development.md b/docs/development.md index 246d38e..0879d35 100644 --- a/docs/development.md +++ b/docs/development.md @@ -15,6 +15,9 @@ in the repo root. - **Hand-written app code:** `main.py` (`MainWindow`, run/save/load, database search), `plotting.py` (plot classes), `filter.py`, `stats.py`, `MSFaST.py` (analysis driver), `pvclust.py` (bootstrap dendrogram), + `ordination.py` (Qt-free PCA/NMDS/PLS-DA backend), + `clusterpurity.py` (dendrogram branch-purity logic), + `csvcache.py` (cached CSV reads for the ordination data path), `translators.py` (import/export framework), `mzmineimport.py` (format conversion), `getfragdb.py`, `mspwriter.py`. - **Canonical peak table** format (what `MSFaST` consumes internally; @@ -59,9 +62,9 @@ Headless unit tests live in `code/tests/` (pure-logic only — no Qt): python -m pytest code/tests -q ``` -Covers `filter`, `stats`, `importdependencies`, `translators`, and -`groupsets`. Add tests here for any new Qt-free logic. GUI behaviour can't -be tested headlessly — verify it by running the app. +Covers `filter`, `stats`, `importdependencies`, `translators`, +`groupsets`, and `ordination`. Add tests here for any new Qt-free logic. +GUI behaviour can't be tested headlessly — verify it by running the app. ## Conventions diff --git a/docs/index.md b/docs/index.md index 7d36644..f61cc62 100644 --- a/docs/index.md +++ b/docs/index.md @@ -25,8 +25,10 @@ shows. 
If you're contributing to MPACT itself, see [Development](development.md) This documentation is adapted from the original MPACT user guide (2022) and updated to reflect the current codebase (mid-2026), including the import/export translator framework for MZmine/MS-DIAL/ - Metaboscape peak tables, the background-threaded analysis run, and the - groupset (Plot Feature Sets) editor. Some screenshots referenced in - the original guide have not been re-captured yet — see - [Development](development.md) if you'd like to contribute updated - images. + Metaboscape peak tables, the background-threaded analysis run, the + groupset (Plot Feature Sets) editor, the multivariate ordination + rework (PCA/NMDS/PLS-DA with scores and loadings views), and the + dendrogram rework (purity coloring, view/bootstrap/label switchers). + Some screenshots referenced in the original guide have not been + re-captured yet — see [Development](development.md) if you'd like to + contribute updated images. diff --git a/docs/plots/group-analysis.md b/docs/plots/group-analysis.md index f41eeb9..732a98b 100644 --- a/docs/plots/group-analysis.md +++ b/docs/plots/group-analysis.md @@ -31,12 +31,51 @@ For datasets where biological/treatment differences should exceed technical noise, technical replicates should cluster together after filtering. -Bootstrap analysis (1000 iterations) can be enabled in the plot options -dialog to annotate the dendrogram with approximately-unbiased (AU) p-values -and bootstrap probabilities (BP). AU values above 95 are generally -considered statistically significant. Bootstrap computation uses -`fastcluster` if it's installed (falling back to SciPy's hierarchical -clustering otherwise) for substantially faster linkage on large datasets. 
+A settings bar above the plot controls how it's drawn — all four options +are local to this tab (not the general plot options dialog) and redraw +immediately when changed: + +**View** — which leaves to cluster: + +- **Technical Replicates** (default): every injection is its own leaf, + letting you see whether each sample's individual injections agree with + each other. +- **Biological Replicates**: technical replicates are averaged together + first (one leaf per sample), so the plot reflects clustering of + biological/treatment groups without technical noise. + +**Color** — how branches are colored: + +- **Purity** (default): a branch is colored **green** if every leaf beneath + it belongs to the same sample (Technical Replicates view) or the same + treatment group (Biological Replicates view) — i.e. it's correctly, + unambiguously clustered. A branch is colored **red** if it's the specific + point where two different samples/groups' leaves are proven to overlap + (some of that sample's/group's replicates are on each side of the + split) — a real sign of poor clustering, not just "still mixed from + somewhere lower in the tree." Every other branch (a clean join of two + unrelated, already-resolved regions) stays black, even if it sits above a + red branch elsewhere in the tree — so a single tangled sample doesn't + paint the whole tree red. The plot title reports how many + samples/groups are *fully* correctly clustered (e.g. "7/9 samples' + replicates clustered together"). +- **None**: a plain, uncolored dendrogram with no title — useful if you + just want the clustering shape without the QC overlay. + +**Bootstrap** — when checked (default on), runs bootstrap resampling +(1000 iterations) and annotates the dendrogram with approximately-unbiased +(AU) p-values and bootstrap probabilities (BP) at each branch point. AU +values above 95 are generally considered statistically significant. 
+Bootstrap computation uses `fastcluster` if it's installed (falling back to +SciPy's hierarchical clustering otherwise) for substantially faster linkage +on large datasets. Uncheck for a faster, unannotated dendrogram. + +**Use Sample/Group Names** — when checked, leaf labels switch from the raw +injection/file names (which can be long or uninformative) to +`<Biolgroup>_b<#>_s<#>` (Technical Replicates view) or `<Biolgroup>_b<#>` +(Biological Replicates view), where `b<#>` numbers each biological +replicate (sample) within its group and `s<#>` numbers each technical +replicate (injection) within its sample. ![Dendrogram](../images/dendrogram.png) *MPACT dendrogram after filtering, showing correct clustering of most diff --git a/docs/plots/multivariate.md b/docs/plots/multivariate.md index 30aa180..013e37f 100644 --- a/docs/plots/multivariate.md +++ b/docs/plots/multivariate.md @@ -1,17 +1,46 @@ -# Multivariate Analysis (NMDS) +# Multivariate Analysis -Nonmetric multidimensional scaling (NMDS) reduces the high-dimensional -metabolomics dataset to a low-dimensional view that's easy to interpret -visually. Samples are plotted by biological group, with 95% confidence -ellipses shown per group — samples that are closer together are more -similar in overall metabolome. +Ordination reduces the high-dimensional metabolomics dataset to a low-dimensional +view. Three ordination methods are available: **NMDS**, **PCA**, and +**PLS-DA** — all operating on the same samples × features intensity +matrix. -By default, technical replicates are averaged so NMDS runs at the -sample level. To instead evaluate clustering of individual technical -replicates, uncheck replicate averaging in the plot options dialog -(the small square button in the plot's toolbar). +A settings bar above the plot controls how it's drawn; the plot redraws +immediately on any change: + +**Method** — which ordination to run: + +- **NMDS** (default): nonmetric multidimensional scaling using + Bray-Curtis dissimilarity.
A rank-based embedding — samples that are + closer together are more similar in overall metabolome. The plot + title shows the NMDS stress (the conventional fit-quality metric); + axis labels are NMDS1/NMDS2 (no percent-explained, since NMDS is + not a linear decomposition of the feature space). +- **PCA**: principal component analysis on mean-centered, + unit-variance-scaled features. Axis labels show percent of + total variance explained by each component. +- **PLS-DA**: partial least-squares discriminant analysis, supervised + by biological group. Axis labels show percent of explained variance + per component. Useful when NMDS/PCA show overlapping groups but + there's a genuine biological difference you want to maximize the + separation of. + +**View** — which aspect of the ordination to plot: + +- **Scores** (default): each sample (or injection) as a point, + coloured by biological group, with 95% confidence ellipses per + group. +- **Loadings**: the top 25 features that most drive the separation, + shown as arrows from the origin. Selecting a feature in another + plot highlights it here (yellow marker at its loadings position). + +**Collapse Replicates** — when checked (default on), technical +replicates are averaged together before running ordination, so each +point represents one biological sample. Uncheck to treat every +individual injection as its own point, which can be useful for +diagnosing injection-level outliers. 
![NMDS plot](../images/nmds-plot.png) -*MPACT NMDS plot with technical-replicate averaging, showing differences -between samples and biological groups, with shaded ovals denoting 95% -confidence intervals.* +*MPACT multivariate ordination (NMDS scores view) with technical-replicate +averaging, showing differences between samples and biological groups, +with shaded ovals denoting 95% confidence intervals.* From 52c1d37488618acdf32b0622f563d277ab178e09 Mon Sep 17 00:00:00 2001 From: Robert Samples Date: Tue, 30 Jun 2026 01:21:42 -0400 Subject: [PATCH 15/20] correlation matrix control improvements --- code/clusterpurity.py | 17 ++- code/ordination.py | 33 +++++ code/plotting.py | 206 +++++++++++++++++++++------ code/tests/test_clusterpurity.py | 16 +-- code/tests/test_ordination.py | 46 +++++- code/ui_functions.py | 13 +- devnotes.md | 113 ++++++++++++--- docs/changelog.md | 12 +- docs/development.md | 122 ---------------- docs/index.md | 9 +- docs/plots/group-analysis.md | 73 +++++++--- docs/troubleshooting.md | 4 +- docs/user-guide/analysis-settings.md | 6 +- mkdocs.yml | 1 - 14 files changed, 444 insertions(+), 227 deletions(-) delete mode 100644 docs/development.md diff --git a/code/clusterpurity.py b/code/clusterpurity.py index 5e8ebee..a6971df 100644 --- a/code/clusterpurity.py +++ b/code/clusterpurity.py @@ -10,17 +10,22 @@ cluster tightly together, and separately whether biological replicates of one Biolgroup are well separated from other groups. +Default colors are green/magenta rather than the more conventional +green/red -- red-green colorblindness (the most common form) makes the two +indistinguishable; magenta stays distinguishable from green under all +common forms of color vision deficiency. + This module is Qt-free and unit-tested (see ``tests/test_clusterpurity.py``). 
""" -def purity_link_color_func(Z, leaf_labels, true_color='green', false_color='red', neutral_color='black'): +def purity_link_color_func(Z, leaf_labels, true_color='green', false_color='magenta', neutral_color='black'): """Build a ``link_color_func`` for ``scipy.cluster.hierarchy.dendrogram``. Three-way coloring, classified by comparing the two children's label sets (not by simply asking "is the merge result impure", which would - paint every ancestor of a single mixing event red all the way to the - root): + paint every ancestor of a single mixing event false_color all the way to + the root): - ``true_color`` ("monophyletic"): the two children's label sets are identical and contain exactly one label -- every leaf under this link @@ -35,9 +40,9 @@ def purity_link_color_func(Z, leaf_labels, true_color='green', false_color='red' contradict each other; it's a clean bridge even if one or both children are themselves impure from a *different* label's tangle further down. This is what keeps a single low-level tangle from - cascading red all the way up the tree: once a tangled label's clade - stops growing (nothing more of that label to fold in), every merge - above it only ever joins disjoint regions, so it reverts to + cascading false_color all the way up the tree: once a tangled label's + clade stops growing (nothing more of that label to fold in), every + merge above it only ever joins disjoint regions, so it reverts to ``neutral_color``. Args: diff --git a/code/ordination.py b/code/ordination.py index 5356010..7829e9c 100644 --- a/code/ordination.py +++ b/code/ordination.py @@ -241,6 +241,39 @@ def nmds_loading_proxy(x, scores): ) +def similarity_matrix(x, method): + """Pairwise similarity between samples (rows of ``x``, a samples x + features intensity matrix) -- backs the sample-correlation heatmap's + "Method" switcher. + + - ``'Spearman'``: rank correlation of abundance profiles. 
The + established default for metabolomics QC (robust to the non-normal, + heavy-tailed abundance distributions typical of LC-MS data); values + in [-1, 1]. + - ``'Jaccard'``: 1 - Jaccard distance on which features are detected + (abundance > 0) in each sample, ignoring relative abundance + entirely -- useful when what matters is which compounds were + detected at all rather than how much; values in [0, 1]. + - ``'Bray-Curtis'``: 1 - Bray-Curtis dissimilarity, the standard + abundance-weighted similarity measure in ecology/metabolomics, + computed on raw abundances (same convention as ``run_nmds``'s + dissimilarity, unlike PCA/PLS-DA's autoscaled features); values in + [0, 1]. + + Returns a samples x samples DataFrame. + """ + if method == 'Spearman': + return x.transpose().corr(method='spearman') + x_filled = x.fillna(0) + if method == 'Jaccard': + dist = pairwise_distances((x_filled > 0).values, metric='jaccard') + elif method == 'Bray-Curtis': + dist = pairwise_distances(x_filled.values, metric='braycurtis') + else: + raise ValueError(f'Unknown similarity method: {method!r}') + return pd.DataFrame(1 - dist, index=x.index, columns=x.index) + + def run_plsda(x, y, n_components): """PLS-DA: PLS regression of the samples x features matrix against one-hot-encoded group labels. diff --git a/code/plotting.py b/code/plotting.py index 80133c9..137f2a5 100644 --- a/code/plotting.py +++ b/code/plotting.py @@ -579,57 +579,169 @@ def plot(self, parent, file, filtereddfs, groupsets): # abundance tied opacity u class plot_samplecorr(ui_plot): """ - The plot_samplecorr class generates a heatmap plot of the Spearman or Pearson correlation between samples. 
- - Parameters: - - parent: the parent widget for the plot - currplt: the index of the current plot within the parent widget - frame: the parent frame for the plot - file: a path to the file containing the ion dictionary - filtereddfs: a dictionary containing filtered dataframes for each group in the plot - groupsets: a dictionary containing GroupSet objects for each group in the plot - Methods: - - __init__(self, parent, currplt, frame, file, filtereddfs, groupsets): initializes the plot by calling the plot() method with the given parameters - plot(self, parent, file, filtereddfs, groupsets): generates the plot with the given data. Reads the ion dictionary from a csv file and reads the filtered data from a csv file generated by the program. Calculates the Spearman correlation matrix and generates a heatmap plot using the Seaborn library. Adjusts the layout of the plot and draws it on the parent canvas. + Sample-correlation heatmap, with a Method/View/label switcher bar + inserted into the *shared* Group Analysis nav bar (frame_12 / + horizontalLayout_25, alongside the pre-existing "Sets"/"Sample + Correlations" buttons) rather than this plot's own canvas -- unlike + plot_dendrogram/plot_ordination's per-canvas bars, these controls only + apply to this page, and the nav bar is shared with the UpSet plot page. + UIFunctions.switch_grpanalysis_tab (ui_functions.py) greys the controls + out when the UpSet page is active. + + Views (same collapsing semantics as plot_dendrogram/plot_ordination, + via ordination.load_ordination_matrix): + - "Biological Replicates" (default, matches this plot's previous, + checkbox-less behaviour): technical replicates averaged per Sample, + biological replicates kept separate. + - "Individual Injections": no averaging, every injection is its own + row/column. 
+ - "Biological Groups": both technical and biological replicates + averaged together, one row/column per Biolgroup -- "see only + biological groups" with technical-replicate averaging implied. + + Methods (ordination.similarity_matrix): + - "Spearman" (default): rank correlation of abundance profiles. + - "Jaccard": presence/absence (feature detected or not), ignoring + abundance. + - "Bray-Curtis": abundance-weighted, the standard ecology/metabolomics + similarity measure. + + "Use Sample/Group Names" checkbox: same nomenclature as the + dendrogram's -- ``_b_s`` (Individual + Injections), ``_b`` (Biological Replicates), or the + raw Biolgroup name (Biological Groups -- nothing left to shorten). """ + + VIEWS = ('Biological Replicates', 'Individual Injections', 'Biological Groups') + METHODS = ('Spearman', 'Jaccard', 'Bray-Curtis') + def __init__(self, parent, currplt, frame, file, filtereddfs, groupsets): ui_plot.__init__(self, parent, currplt, frame) self.parent = parent self.currplt = currplt + self.view = 'Biological Replicates' + self.method = 'Spearman' + self.use_sample_names = False + self._build_grpanalysis_controls(parent) self.plot(parent, file, filtereddfs, groupsets) - + + def _build_grpanalysis_controls(self, parent): + bar = QtWidgets.QWidget() + bar.setStyleSheet(_SWITCHER_BAR_STYLE) + bar.setMaximumHeight(_SWITCHER_BAR_HEIGHT) + layout = QtWidgets.QHBoxLayout(bar) + layout.setContentsMargins(4, 2, 4, 2) + + layout.addWidget(QtWidgets.QLabel('Method:')) + method_combo = QtWidgets.QComboBox() + method_combo.addItems(self.METHODS) + method_combo.setCurrentText(self.method) + method_combo.currentTextChanged.connect(self._on_method_changed) + layout.addWidget(method_combo) + + layout.addWidget(QtWidgets.QLabel('View:')) + view_combo = QtWidgets.QComboBox() + view_combo.addItems(self.VIEWS) + view_combo.setCurrentText(self.view) + view_combo.currentTextChanged.connect(self._on_view_changed) + layout.addWidget(view_combo) + + use_names_check = 
QtWidgets.QCheckBox('Use Sample/Group Names') + use_names_check.setChecked(self.use_sample_names) + use_names_check.toggled.connect(self._on_use_sample_names_toggled) + layout.addWidget(use_names_check) + + self.method_combo = method_combo + self.view_combo = view_combo + self.use_names_check = use_names_check + self.controls_bar = bar + + # Pushes the new controls to the right of the pre-existing + # Sets/Sample Correlations buttons within the same shared bar, + # rather than editing the generated horizontalLayout_25 itself. + parent.ui.horizontalLayout_25.addStretch(1) + parent.ui.horizontalLayout_25.addWidget(bar) + + def set_controls_enabled(self, enabled): + """Grey the Method/View/Use-Names controls out when the UpSet Plot + page (the nav bar's other tab) is active -- they don't apply there.""" + self.controls_bar.setEnabled(enabled) + + def _on_method_changed(self, method): + self.method = method + self.reset(self._last_file, self._last_filtereddfs, self._last_groupsets) + + def _on_view_changed(self, view): + self.view = view + self.reset(self._last_file, self._last_filtereddfs, self._last_groupsets) + + def _on_use_sample_names_toggled(self, checked): + self.use_sample_names = checked + self.reset(self._last_file, self._last_filtereddfs, self._last_groupsets) + + def _load_matrix(self, parent): + pltfile = parent.analysis_paramsgui.outputdir / (parent.analysis_paramsgui.filename.stem + '_filtered.csv') + raw_header = cached_read_csv(pltfile, sep=',', header=None, index_col=[0, 1, 2]).iloc[:3, :].transpose() + x, biolgroup = ordination.load_ordination_matrix( + pltfile, raw_header.copy(), collapse_replicates=(self.view != 'Individual Injections')) + if self.view == 'Biological Groups': + x = x.groupby(biolgroup).mean() + return x, raw_header + + def _display_labels(self, raw_header, leaf_names): + """Build short ``Biolgroup_b#[_s#]`` labels, mirroring + plot_dendrogram's ``_display_labels`` -- nomenclature switches on + the active View the same way.""" + if 
self.view == 'Biological Groups': + return leaf_names # already bare Biolgroup names + components = ordination.replicate_label_components(raw_header) + if self.view == 'Biological Replicates': + per_sample = components.drop_duplicates('Sample').set_index('Sample') + return [f"{per_sample.loc[sample, 'Biolgroup']}_b{per_sample.loc[sample, 'BioRep']}" for sample in leaf_names] + return [ + f"{components.loc[injection, 'Biolgroup']}_b{components.loc[injection, 'BioRep']}_s{components.loc[injection, 'TechRep']}" + for injection in leaf_names + ] + def plot(self, parent, file, filtereddfs, groupsets): - iondict = cached_read_csv(self.parent.analysis_paramsgui.outputdir / 'iondict.csv', sep=',', header=[0], index_col=None) - msdata = cached_read_csv(self.parent.analysis_paramsgui.outputdir / (self.parent.analysis_paramsgui.filename.stem + '_filtered.csv'), sep=',', header=[0, 1, 2], index_col=[0, 1, 2]) - try: - msdata = msdata.stack([0, 1, 2], future_stack=True).groupby(level=[0, 1, 2, 3, 4]).mean().droplevel(level=3, axis=0).unstack() - except TypeError: - msdata = msdata.stack([0, 1, 2]).groupby(level=[0, 1, 2, 3, 4]).mean().droplevel(level=3, axis=0).unstack() - msdata.index = msdata.index.droplevel([1, 2]) - pmatrix = msdata.corr(method='spearman') + self._last_file = file + self._last_filtereddfs = filtereddfs + self._last_groupsets = groupsets + + x, raw_header = self._load_matrix(parent) + pmatrix = ordination.similarity_matrix(x, self.method) + + leaf_names = pmatrix.columns.tolist() + display_labels = self._display_labels(raw_header, leaf_names) if self.use_sample_names else leaf_names + fig = self.parent.fig[self.currplt] - ax = self.parent.ax[self.currplt] - # Remove any axes left over from a previous run (notably the colorbar - # that sns.heatmap appends). Without this a new colour-legend bar is - # stacked onto the figure every time the plot is regenerated. 
- for extra_ax in list(fig.axes): - if extra_ax is not ax: - extra_ax.remove() - ax.clear() + # clf() + add_subplot: sns.heatmap permanently shrinks the axes each + # call to make room for its colorbar; removing the extra colorbar axes + # afterwards doesn't restore the original size. Starting fresh each + # call keeps the axes at a consistent width. + fig.clf() + ax = fig.add_subplot(111) + ax.set_facecolor(self.plotbackground) + ax.set_axisbelow(True) + self.parent.ax[self.currplt] = ax + # Spearman is mathematically capable of going negative, but real + # sample-vs-sample correlations in practice cluster tightly positive + # (e.g. 0.7-1.0) -- a -1..1 scale would compress all of that + # meaningful variation into a sliver of the colour range. 0..1 for + # every method keeps the full range informative. sns.heatmap(pmatrix, ax=ax, cmap=self.parent.analysis_paramsgui.colorscheme, vmin=0, vmax=1) ax.tick_params(axis='both', which='both', labelsize=10) ax.set_xticks(range(len(pmatrix.columns))) - ax.set_xticklabels(pmatrix.columns, rotation=90) + ax.set_xticklabels(display_labels, rotation=90) ax.set_yticks(range(len(pmatrix.index))) - ax.set_yticklabels(pmatrix.index, rotation=0) + ax.set_yticklabels(display_labels, rotation=0) ax.axes.get_xaxis().get_label().set_visible(False) ax.axes.get_yaxis().get_label().set_visible(False) + ax.set_title(self.method, fontsize=10) self.parent.fig[self.currplt].subplots_adjust(left=.1, right=.95, bottom=0.15, top=0.9, hspace=0.2, wspace=0.2) self.parent.canvas[self.currplt].draw() - - + + class kendrick(ui_plot): """ The purpose of this class is to plot the mass defect versus the nominal mass of compounds based on the input files and parameters provided. @@ -816,8 +928,9 @@ class plot_dendrogram(ui_plot): Coloring: - "Purity": green wherever a branch's leaves are entirely one group - (correctly clustered), red wherever a branch mixes more than one - group (polyphyletic). 
+ (correctly clustered), magenta wherever a branch mixes more than one + group (polyphyletic). Green/magenta rather than the more conventional + green/red since red-green colorblindness can't distinguish the latter. - "None": plain black dendrogram, no purity coloring or title -- the tab's original (pre-purity-coloring) appearance. @@ -969,9 +1082,11 @@ def plot(self, parent, file, filtereddfs, groupsets): Z = shc.linkage(data_for_linkage, method='ward') if self.color_mode == 'Purity': - # Green = monophyletic (correctly clustered); red = polyphyletic - # (mixes more than one group). - link_color_func = clusterpurity.purity_link_color_func(Z, leaf_labels, true_color='green', false_color='red') + # Green = monophyletic (correctly clustered); magenta = + # polyphyletic (mixes more than one group). Magenta rather than + # the conventional red -- distinguishable from green under + # red-green colorblindness, the most common form. + link_color_func = clusterpurity.purity_link_color_func(Z, leaf_labels, true_color='green', false_color='magenta') else: link_color_func = None # plain black dendrogram, scipy's own default @@ -1010,10 +1125,21 @@ def plot(self, parent, file, filtereddfs, groupsets): border-radius: 2px; padding: 2px; } +QComboBox:disabled { + background-color: rgb(220,220,220); + color: rgb(150,150,150); + border: 1px solid rgb(195,195,195); +} QLabel { color: rgb(30,30,30); background: transparent; } +QLabel:disabled { + color: rgb(150,150,150); +} +QCheckBox:disabled { + color: rgb(150,150,150); +} """ diff --git a/code/tests/test_clusterpurity.py b/code/tests/test_clusterpurity.py index 87cc3de..490544e 100644 --- a/code/tests/test_clusterpurity.py +++ b/code/tests/test_clusterpurity.py @@ -72,18 +72,18 @@ def test_purity_link_color_func_clean_disjoint_groups_stay_neutral_even_at_root( assert color_func(node_id) == 'green' -def test_purity_link_color_func_overlap_is_the_only_red_and_it_does_not_cascade(): +def 
test_purity_link_color_func_overlap_is_the_only_false_color_and_it_does_not_cascade(): Z, labels = _scattered_pair_linkage() color_func = purity_link_color_func(Z, labels) - assert color_func(5) == 'green' # Q+Q, monophyletic - assert color_func(6) == 'black' # P + {Q}: disjoint, clean bridge - assert color_func(7) == 'red' # P + {P,Q}: OVERLAP -- the actual tangle + assert color_func(5) == 'green' # Q+Q, monophyletic + assert color_func(6) == 'black' # P + {Q}: disjoint, clean bridge + assert color_func(7) == 'magenta' # P + {P,Q}: OVERLAP -- the actual tangle # R joining afterward is disjoint from {P,Q} -- R was never part of the - # P/Q mixing, so this must NOT also render red just because it's above - # (contains) the node-7 tangle. This is the specific behaviour this - # rule exists for: a real, low-level tangle must not paint every - # ancestor red all the way to the root. + # P/Q mixing, so this must NOT also render false_color just because it's + # above (contains) the node-7 tangle. This is the specific behaviour + # this rule exists for: a real, low-level tangle must not paint every + # ancestor false_color all the way to the root. 
assert color_func(8) == 'black' diff --git a/code/tests/test_ordination.py b/code/tests/test_ordination.py index f8dfc48..992434e 100644 --- a/code/tests/test_ordination.py +++ b/code/tests/test_ordination.py @@ -13,7 +13,7 @@ from ordination import ( load_ordination_matrix, nmds_loading_proxy, replicate_label_components, - run_nmds, run_pca, run_plsda, top_loadings, + run_nmds, run_pca, run_plsda, similarity_matrix, top_loadings, ) @@ -186,6 +186,50 @@ def test_nmds_smoke_test_on_clustered_data(): assert proxy.values.min() >= -1.0001 and proxy.values.max() <= 1.0001 +# --------------------------------------------------------------------------- # +# similarity_matrix +# --------------------------------------------------------------------------- # + +def test_similarity_matrix_spearman_self_correlation_is_one(): + x = pd.DataFrame( + [[1.0, 2.0, 3.0], [3.0, 2.0, 1.0], [1.0, 5.0, 2.0]], + index=['s1', 's2', 's3'], columns=['f1', 'f2', 'f3'], + ) + sim = similarity_matrix(x, 'Spearman') + assert sim.shape == (3, 3) + assert np.allclose(np.diag(sim.values), 1.0) + # s1 and s2 are perfectly rank-anticorrelated. + assert sim.loc['s1', 's2'] == pytest.approx(-1.0) + + +def test_similarity_matrix_jaccard_identical_presence_is_one(): + # s1/s2 detect exactly the same features (different abundances); + # s3 detects none of them. 
+ x = pd.DataFrame( + [[5.0, 0.0, 2.0], [50.0, 0.0, 20.0], [0.0, 0.0, 0.0]], + index=['s1', 's2', 's3'], columns=['f1', 'f2', 'f3'], + ) + sim = similarity_matrix(x, 'Jaccard') + assert sim.loc['s1', 's2'] == pytest.approx(1.0) + assert np.allclose(np.diag(sim.values)[:2], 1.0) + + +def test_similarity_matrix_braycurtis_identical_profiles_is_one(): + x = pd.DataFrame( + [[1.0, 2.0, 3.0], [1.0, 2.0, 3.0], [10.0, 0.0, 0.0]], + index=['s1', 's2', 's3'], columns=['f1', 'f2', 'f3'], + ) + sim = similarity_matrix(x, 'Bray-Curtis') + assert sim.loc['s1', 's2'] == pytest.approx(1.0) + assert sim.loc['s1', 's3'] < sim.loc['s1', 's2'] + + +def test_similarity_matrix_unknown_method_raises(): + x = pd.DataFrame([[1.0, 2.0]], index=['s1'], columns=['f1', 'f2']) + with pytest.raises(ValueError): + similarity_matrix(x, 'Pearson') + + # --------------------------------------------------------------------------- # # top_loadings # --------------------------------------------------------------------------- # diff --git a/code/ui_functions.py b/code/ui_functions.py index 0a49837..148629f 100644 --- a/code/ui_functions.py +++ b/code/ui_functions.py @@ -124,8 +124,8 @@ def uiDefinitions(self): self.ui.btn_cvplt.clicked.connect(lambda: self.ui.stackedWidget_review.setCurrentIndex(2)) self.ui.btn_datasummary.clicked.connect(lambda: self.ui.stackedWidget_review.setCurrentIndex(3)) - self.ui.btn_upsetplt.clicked.connect(lambda: self.ui.stackedWidget_grpanalysis.setCurrentIndex(0)) - self.ui.btn_samplecorr.clicked.connect(lambda: self.ui.stackedWidget_grpanalysis.setCurrentIndex(1)) + self.ui.btn_upsetplt.clicked.connect(lambda: UIFunctions.switch_grpanalysis_tab(self, 0)) + self.ui.btn_samplecorr.clicked.connect(lambda: UIFunctions.switch_grpanalysis_tab(self, 1)) #feature info bar functions self.ftrdialog.ui.btn_close.clicked.connect(lambda: self.ftrdialog.hide()) @@ -230,6 +230,15 @@ def goto_review(self): self.dialog.ui.checkBox_applyfilter.hide() + def switch_grpanalysis_tab(self, 
idx): + """Switch the Group Analysis sub-tab (UpSet Plot=0, Sample + Correlations=1) and grey out plot_samplecorr's Method/View/Use-Names + controls -- shared with btn_upsetplt/btn_samplecorr in frame_12 -- + whenever the UpSet Plot tab is active, since they don't apply there.""" + self.ui.stackedWidget_grpanalysis.setCurrentIndex(idx) + if getattr(self, 'samplecorr', None) is not None: + self.samplecorr.set_controls_enabled(idx == 1) + def goto_upset(self): self.ui.stackedWidget_infobar.setCurrentIndex(1) self.ui.stackedWidget_plot.setCurrentIndex(9) diff --git a/devnotes.md b/devnotes.md index ece3cf0..cb6689e 100644 --- a/devnotes.md +++ b/devnotes.md @@ -309,8 +309,8 @@ documented further down, formerly/newly local to this tab respectively): group, i.e. the groups are separable. - **Color** — how to render purity: - **Purity** (default): green wherever a branch's leaves are entirely one - group (correctly clustered), red wherever a branch mixes more than one - group (polyphyletic) — a QC judgment visible at a glance rather than + group (correctly clustered), magenta wherever a branch mixes more than + one group (polyphyletic) — a QC judgment visible at a glance rather than read off leaf labels one at a time. The plot title reports `n_pure/n_total` (e.g. "7/9 samples' replicates clustered together", "3/3 biological groups separable") via `clusterpurity.purity_summary()`. @@ -326,18 +326,19 @@ documented further down, formerly/newly local to this tab respectively): Both views' purity math is the same Qt-free linkage-traversal logic in `clusterpurity.py`, unit-tested in `tests/test_clusterpurity.py`. -- **Red marks proven non-monophyly (overlap), not "any impure merge"**: two - earlier attempts both got this wrong in opposite directions. 
First, every - impure merge was colored red, including every ancestor above a single - mixing event all the way to the root -- since almost any real dataset has - *some* mixing somewhere, this painted most of the tree's upper structure - red regardless of how localized the problem was. The second attempt - ("impure but at least one child was pure = bridge = red, both children - already impure = neutral") fixed the worst of the cascading but still - mis-colored real data: it could still mark a high-level merge red merely - because one side happened to be a single freshly-introduced pure clade, - *and* it could miss real tangles where two already-impure children share - a label without one side being trivially pure. +- **`false_color` marks proven non-monophyly (overlap), not "any impure + merge"**: two earlier attempts both got this wrong in opposite directions. + First, every impure merge was colored `false_color`, including every + ancestor above a single mixing event all the way to the root -- since + almost any real dataset has *some* mixing somewhere, this painted most of + the tree's upper structure regardless of how localized the problem was. + The second attempt ("impure but at least one child was pure = bridge = + `false_color`, both children already impure = neutral") fixed the worst + of the cascading but still mis-colored real data: it could still mark a + high-level merge `false_color` merely because one side happened to be a + single freshly-introduced pure clade, *and* it could miss real tangles + where two already-impure children share a label without one side being + trivially pure. `purity_link_color_func()` now compares the two children's label sets directly at each merge: @@ -350,18 +351,26 @@ Both views' purity math is the same Qt-free linkage-traversal logic in in, every merge above it only ever joins disjoint regions, so it goes back to black. 
- **overlap** (share >=1 label, without being identical-and-singleton) -> - polyphyletic (`false_color`/red) -- definitive proof that some label's - leaves are split across this exact merge (present on both sides), not - just "still mixed from an earlier merge". + polyphyletic (`false_color`/magenta) -- definitive proof that some + label's leaves are split across this exact merge (present on both + sides), not just "still mixed from an earlier merge". Verified against the real example dataset's bootstrap dendrogram (the case that exposed both earlier bugs): only the two merges that actually re-unite a scattered sample's replicates (e.g. one sample's reps split into two non-sister sub-clades that only meet again higher up) render - red; the higher-level merges joining that region with cleanly-resolved, - unrelated samples stay black, same as a hand-built synthetic linkage - (`tests/test_clusterpurity.py`'s `_scattered_pair_linkage`) reproducing - the same pattern deterministically. + `false_color`; the higher-level merges joining that region with + cleanly-resolved, unrelated samples stay black, same as a hand-built + synthetic linkage (`tests/test_clusterpurity.py`'s + `_scattered_pair_linkage`) reproducing the same pattern deterministically. + + `true_color`/`false_color` default to green/magenta, not the more + conventional green/red: red-green colorblindness (the most common form) + can't distinguish red from green, while magenta stays distinguishable + from green under all common forms of color vision deficiency. (Changed + from an original green/red default after user feedback; see + `clusterpurity.py`'s `purity_link_color_func()` default args and + `plotting.py`'s `plot_dendrogram.plot()` call site.) 
- **Bootstrap is now a per-tab checkbox, not a global one**: the plot-config dialog's "Bootstrap Analysis" checkbox (`checkBox_bootstrap`) only ever affected this one plot, so it moved into `plot_dendrogram`'s own @@ -498,6 +507,68 @@ other plot already worked, not a new inconsistency. old axes/figures were leaking), and (3) no PNG got written to disk by either plot anymore. +## Sample correlation matrix (`plotting.plot_samplecorr`, `ordination.similarity_matrix`) + +Used to be a hardcoded Spearman-only heatmap with technical replicates +always pre-averaged and no way to relabel the raw injection/sample names. +Now has a Method (Spearman/Jaccard/Bray-Curtis) switcher, a View +(Biological Replicates/Individual Injections/Biological Groups) switcher, +and a "Use Sample/Group Names" checkbox — same nomenclature and +`ordination.replicate_label_components()` reuse as `plot_dendrogram`'s. + +- **`ordination.similarity_matrix(x, method)`** is the new Qt-free backend + (covered by `test_ordination.py`): `x` is samples x features, same + convention as `run_pca`/`run_nmds`/`run_plsda`. Spearman is + `x.transpose().corr(method='spearman')`; Jaccard/Bray-Curtis go through + `sklearn.metrics.pairwise_distances` (`metric='jaccard'`/`'braycurtis'`) + and return `1 - distance`. Jaccard is computed on `x > 0` (presence/ + absence of detection, ignoring abundance) — deliberately *not* derived + from the groupset query-dict machinery the user floated as a possible + source, since that's per-feature-list bookkeeping for the UpSet/treemap + tabs, a different concept from per-sample/group detection. + Pearson/Kendall were considered and rejected: Pearson assumes + normally-distributed abundances (the wrong fit for heavy-tailed LC-MS + intensities, same reasoning that makes Spearman the established choice + here), Kendall is a slower, largely redundant rank-correlation + alternative to Spearman. 
+- **Controls live in the *shared* `frame_12`/`horizontalLayout_25` nav + bar** (the one holding the pre-existing `btn_upsetplt`/`btn_samplecorr` + buttons that switch `stackedWidget_grpanalysis`), not in this plot's own + canvas frame — unlike `plot_dendrogram`/`plot_ordination`'s per-canvas + switcher bars, these controls are specific to the Sample Correlations + page but the nav bar is shared with the unrelated UpSet Plot page. + `plot_samplecorr._build_grpanalysis_controls()` appends a stretch then + its own control widget onto the existing layout (no `ui_main.py` edit) + so the new controls sit to the right of the two buttons and the bar + stays a single row regardless of window width. +- **Greying out on the UpSet Plot tab**: `ui_functions.py`'s + `btn_upsetplt`/`btn_samplecorr` click handlers used to call + `stackedWidget_grpanalysis.setCurrentIndex()` directly; they now route + through `UIFunctions.switch_grpanalysis_tab(self, idx)`, which also calls + `self.samplecorr.set_controls_enabled(idx == 1)` (guarded by + `getattr(self, 'samplecorr', None) is not None`, since this can fire + before any analysis has run and created the plot object). The Designer + default for `stackedWidget_grpanalysis` is already index 1 + (Sample Correlations), so the controls start enabled, matching the + default active page. +- **View → row/column construction**: "Biological Replicates" and + "Individual Injections" reuse `ordination.load_ordination_matrix()` + exactly like `plot_dendrogram` does (`collapse_replicates=True`/`False`). + "Biological Groups" takes the collapsed (Biological Replicates) matrix + and does one more `x.groupby(biolgroup).mean()` to average across + biological replicates too — deliberately not a third mode inside + `load_ordination_matrix` itself, since it's a trivial one-line reduction + of an already-correct, already-tested intermediate result. +- **Heatmap `vmin` is `0` for all three methods**, including Spearman. 
+ Spearman is mathematically capable of going negative, but real + sample-vs-sample correlations in this kind of data cluster tightly + positive (e.g. 0.7-1.0) — a `-1..1` scale (tried first) compressed all of + that meaningful variation into a sliver of the colour range, making the + heatmap look uniformly dark/uninformative. `0..1` keeps the full colour + range usable for the variation that actually occurs. +- Dropped the dead `iondict = cached_read_csv(...)` read that was never + actually used by the old `plot()` body. + ## Conventions - Don't edit the generated UI files (above). Put behaviour in `main.py` / diff --git a/docs/changelog.md b/docs/changelog.md index c9475ef..82b796d 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -22,8 +22,10 @@ Replicates), **Color** (Purity / None), **Bootstrap**, and **Use Sample/Group Names**. - Purity coloring: green branches are cleanly within-sample/group; - red branches mark the exact merge point where two groups' leaves - overlap (proven non-monophyly). A title reports how many + magenta branches mark the exact merge point where two groups' leaves + overlap (proven non-monophyly) — magenta rather than the more + conventional red, since red-green colorblindness (the most common + form) can't distinguish red from green. A title reports how many samples/groups are fully correctly clustered. - "Use Sample/Group Names" checkbox: replaces raw injection/file names with `_b<#>_s<#>` (Technical Replicates view) or @@ -32,6 +34,12 @@ - Fixed AU/BP annotation alignment: labels now stay a constant pixel gap apart regardless of leaf count, with leaf-count-scaled font size. - `fastcluster` optional acceleration for bootstrap linkage. 
+- Sample Correlation Matrix rework: a **Method** switcher (Spearman / + Jaccard / Bray-Curtis), a **View** switcher (Individual Injections / + Biological Replicates / Biological Groups), and a "Use Sample/Group + Names" checkbox, all in the nav bar shared with the UpSet Plot tab + (greyed out while that tab is active). The heatmap scale is fixed to + 0-1 for all three methods. - UpSet and treemap plots are now rendered directly on a Qt canvas (replacing a PNG round-trip). - Headless unit test suite (`code/tests/`). diff --git a/docs/development.md b/docs/development.md deleted file mode 100644 index 0879d35..0000000 --- a/docs/development.md +++ /dev/null @@ -1,122 +0,0 @@ -# Development - -This page is for people contributing to MPACT itself, not end users. For -the authoritative, most up-to-date version of these notes (kept alongside -the code), see [`devnotes.md`](https://github.com/BalunasLab/mpact/blob/main/devnotes.md) -in the repo root. - -## Architecture - -- **Generated — do not edit:** `ui_main.py`, `ui_main1.py`, - `ui_featureinfo.py`, `ui_plotparam.py`, `files.py`, `files_rc.py`. These - are Qt Designer output and get overwritten on regeneration. (Despite the - name, `ui_functions.py` is hand-written and fully editable — it's the - `UIFunctions` controller class.) -- **Hand-written app code:** `main.py` (`MainWindow`, run/save/load, - database search), `plotting.py` (plot classes), `filter.py`, `stats.py`, - `MSFaST.py` (analysis driver), `pvclust.py` (bootstrap dendrogram), - `ordination.py` (Qt-free PCA/NMDS/PLS-DA backend), - `clusterpurity.py` (dendrogram branch-purity logic), - `csvcache.py` (cached CSV reads for the ordination data path), - `translators.py` (import/export framework), `mzmineimport.py` (format - conversion), `getfragdb.py`, `mspwriter.py`. 
-- **Canonical peak table** format (what `MSFaST` consumes internally; - Progenesis is the native/baseline format): CSV with 3 header rows, row 2 - = `Compound,m/z,Retention time (min),`, col0 = `RT_mz` id, - col1 = m/z, col2 = RT. - -## Threading model - -`run_MSFaST` is Qt-free and runs on a `QThread` worker (`AnalysisWorker` in -`main.py`), so the GUI stays responsive during the heavy compute. -`MainWindow.run_analysis` reads widgets on the main thread, starts the -worker, and `_finish_analysis` does all matplotlib/Qt plotting back on the -**main thread** (matplotlib is not thread-safe). Never create Qt/matplotlib -objects on the worker thread. - -## Importer/translator framework (`translators.py`) - -Qt-free and unit-tested: `detect_peaktable_format`, `parse_msp`/ -`parse_mgf` (→ `FragmentEntry`), `reindex_fragments` (matches fragments to -peak-table rows by compound ID first, then m/z+RT — Progenesis MSP stores -neutral mass, not adduct m/z), `filter_source_peaktable` (row-subsets the -source peak table to surviving features). `mzmineimport.format_check` -delegates detection to this module. - -## Groupsets MVC (`groupsets.py`) - -`GroupSet` (data) + `GroupSetModel` (collection, bounds-safe selection, -CRUD) + `build_query_dict()` replace what used to be a bare list + -selected-index pair. `MainWindow.groupsetmodel` is the live state; -`ui_functions.py`'s `addgroup`/`removegroup`/`updatesets`/`updategroups`/ -`writegroups`/`colour_picker1` are thin view-sync controllers over it. -`main.py`'s `query` class still exists, but **only** as the unpickle target -for old `.mpct` files — `GroupSet.from_legacy`/`GroupSetModel.from_legacy_list` -convert on load. - -## Testing - -Headless unit tests live in `code/tests/` (pure-logic only — no Qt): - -``` -python -m pytest code/tests -q -``` - -Covers `filter`, `stats`, `importdependencies`, `translators`, -`groupsets`, and `ordination`. Add tests here for any new Qt-free logic. 
-GUI behaviour can't be tested headlessly — verify it by running the app. - -## Conventions - -- Never edit the generated UI files listed above. -- Plot generation goes through `MainWindow.safe_generate`, so one failing - plot doesn't abort the rest. -- `.mpct` saves are atomic (temp file + `os.replace`), with per-component - guards (`write_save`). -- `loadsession` restores each saved parameter independently — a bad/missing - field can't cascade and abort restoration of the rest. Add new analysis - parameters to **both** `enumerate_inputs` (save) and `loadsession` - (restore). -- Plot objects (`self.ftplt`, `self.kmd`, `self.spec`, ...) are created the - first time they're needed and `.reset()` afterward, via - `MainWindow._create_or_reset()` / `_generate_plots()` — never gate - create-vs-reset on a whole-session flag like `self.analysisrun`, since an - optional output can newly turn on mid-session for a dataset that didn't - have it before, and the object would never get created. -- Use `MainWindow._refresh_highlight()` (not `highlight_feature()`) to - redraw the current selection without changing it (e.g. on a tab switch). - `highlight_feature(newfeature)` is for real selection events and toggles - the highlight off if the same feature is clicked twice — calling it with - the already-selected feature re-triggers that toggle, which is a bug, not - a refresh. -- Every matplotlib `pick_event` handler must call - `plotting._is_duplicate_pick(parent, event)` first and bail if it - returns `True`. Matplotlib fires one `pick_event` per artist that - registers a hit, not one per click — a feature plotted in more than one - groupset/colour layer otherwise fires the handler twice per click. -- `importdependencies.checkdep()` should stay silent when nothing needs - installing — it runs on every launch, including every Spyder "Run File" - (which re-executes `main.py`'s top level). Only report actual - installs/failures. 
- -## Building the docs site locally - -``` -pip install mkdocs mkdocs-material -mkdocs serve -``` - -Then open `http://127.0.0.1:8000`. See [Hosting](#hosting-this-site) below -for deployment. - -## Hosting this site - -This site is plain static HTML generated by MkDocs — there's no backend, -database, or server-side logic, so it needs essentially no infrastructure. -**GitHub Pages is the right fit here**: it's free, MkDocs has built-in -support for deploying to it (`mkdocs gh-deploy`), and a low-traffic docs -site for a research tool doesn't need a dedicated host. A `gh-pages` -deploy workflow is included in this repo (`.github/workflows/docs.yml`) — -once GitHub Pages is enabled for this repo (Settings → Pages → Source: -`gh-pages` branch), the site builds and publishes automatically on every -push to `main` that touches `docs/` or `mkdocs.yml`. diff --git a/docs/index.md b/docs/index.md index f61cc62..da22288 100644 --- a/docs/index.md +++ b/docs/index.md @@ -11,7 +11,9 @@ plots, heatmaps, and per-feature spectral/database-match lookup. This site covers installing and running MPACT, the file formats it expects, the analysis and filtering options, and what each plot/tab -shows. If you're contributing to MPACT itself, see [Development](development.md). +shows. If you're contributing to MPACT itself, see +[`devnotes.md`](https://github.com/robertsamples/mpact/blob/main/devnotes.md) +in the repo root. ## Where to start @@ -30,5 +32,6 @@ shows. If you're contributing to MPACT itself, see [Development](development.md) rework (PCA/NMDS/PLS-DA with scores and loadings views), and the dendrogram rework (purity coloring, view/bootstrap/label switchers). Some screenshots referenced in the original guide have not been - re-captured yet — see [Development](development.md) if you'd like to - contribute updated images. + re-captured yet — see + [`devnotes.md`](https://github.com/robertsamples/mpact/blob/main/devnotes.md) + if you'd like to contribute updated images. 
diff --git a/docs/plots/group-analysis.md b/docs/plots/group-analysis.md index 732a98b..46eef57 100644 --- a/docs/plots/group-analysis.md +++ b/docs/plots/group-analysis.md @@ -12,14 +12,51 @@ combination of groups (top bar chart + dot matrix). ![UpSet plot](../images/upset-plot.png) *MPACT UpSet plot showing the distribution of features across sample sets.* -## Spearman Correlation Matrix - -Pairwise Spearman correlation between every group, useful for evaluating -overall metabolomic similarity at a glance. Colour scheme is configurable -in the plot options dialog. - -![Spearman correlation matrix](../images/spearman-matrix.png) -*MPACT Spearman correlation matrix.* +## Sample Correlation Matrix + +Pairwise similarity between samples/groups, useful for evaluating overall +metabolomic similarity at a glance. Colour scheme is configurable in the +plot options dialog. + +A settings bar shared with the UpSet Plot tab (the same bar holding the +"Sets"/"Sample Correlations" buttons) controls how it's drawn, and redraws +immediately on any change. These controls are greyed out while the UpSet +Plot tab is active, since they don't apply there. + +**Method** — which similarity measure to compute: + +- **Spearman** (default): rank correlation of abundance profiles, robust + to the non-normal, heavy-tailed abundance distributions typical of + LC-MS data. Mathematically ranges -1 to 1, but the heatmap scale is fixed + to 0-1 since real sample correlations cluster tightly positive in + practice — a -1-to-1 scale would compress that variation into an + unreadable sliver of the colour range. +- **Jaccard**: presence/absence similarity — based only on which features + are detected in each sample/group, ignoring how much. Useful when + detection (not relative abundance) is what you care about. Ranges 0 to 1. +- **Bray-Curtis**: abundance-weighted similarity, the standard measure in + ecology/metabolomics (same convention as the Multivariate Analysis tab's + NMDS). Ranges 0 to 1. 
+ +**View** — which rows/columns to correlate: + +- **Biological Replicates** (default): technical replicates are averaged + together first (one row/column per sample), so the matrix reflects + biological/treatment-group similarity without technical noise. +- **Individual Injections**: no averaging — every injection is its own + row/column. +- **Biological Groups**: both technical and biological replicates are + averaged together — one row/column per treatment group, for "see only + biological groups" at a glance. + +**Use Sample/Group Names** — same nomenclature as the dendrogram's: when +checked, labels switch from the raw injection/file names to +`_b<#>_s<#>` (Individual Injections view), `_b<#>` +(Biological Replicates view), or the bare group name (Biological Groups +view, nothing left to shorten). + +![Sample correlation matrix](../images/spearman-matrix.png) +*MPACT sample correlation matrix.* ## Dendrogram @@ -49,16 +86,18 @@ immediately when changed: - **Purity** (default): a branch is colored **green** if every leaf beneath it belongs to the same sample (Technical Replicates view) or the same treatment group (Biological Replicates view) — i.e. it's correctly, - unambiguously clustered. A branch is colored **red** if it's the specific - point where two different samples/groups' leaves are proven to overlap - (some of that sample's/group's replicates are on each side of the + unambiguously clustered. A branch is colored **magenta** if it's the + specific point where two different samples/groups' leaves are proven to + overlap (some of that sample's/group's replicates are on each side of the split) — a real sign of poor clustering, not just "still mixed from - somewhere lower in the tree." Every other branch (a clean join of two - unrelated, already-resolved regions) stays black, even if it sits above a - red branch elsewhere in the tree — so a single tangled sample doesn't - paint the whole tree red. 
The plot title reports how many - samples/groups are *fully* correctly clustered (e.g. "7/9 samples' - replicates clustered together"). + somewhere lower in the tree." (Magenta rather than the more conventional + red, since red-green colorblindness — the most common form — can't tell + red and green apart; magenta stays distinguishable from green.) Every + other branch (a clean join of two unrelated, already-resolved regions) + stays black, even if it sits above a magenta branch elsewhere in the tree + — so a single tangled sample doesn't paint the whole tree magenta. The + plot title reports how many samples/groups are *fully* correctly + clustered (e.g. "7/9 samples' replicates clustered together"). - **None**: a plain, uncolored dendrogram with no title — useful if you just want the clustering shape without the QC overlay. diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 8482fd4..d2cdef7 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -90,4 +90,6 @@ python -m pytest code/tests -q ``` GUI behaviour itself can't be tested headlessly and needs to be checked by -running the app — see [Development](development.md). +running the app — see +[`devnotes.md`](https://github.com/robertsamples/mpact/blob/main/devnotes.md) +in the repo root. diff --git a/docs/user-guide/analysis-settings.md b/docs/user-guide/analysis-settings.md index 1500af0..f7ba592 100644 --- a/docs/user-guide/analysis-settings.md +++ b/docs/user-guide/analysis-settings.md @@ -59,6 +59,6 @@ presence/absence in these groups, as indicated by Venn diagrams.* Internally, each Plot Feature Set is a `GroupSet` object managed by a small model/collection class (`GroupSetModel`) rather than a bare list + selected-index pair. This is purely an implementation detail (see - [Development](../development.md)) — old `.mpct` save files still load - correctly, with their saved feature sets converted into the current - representation automatically. 
+ [`devnotes.md`](https://github.com/robertsamples/mpact/blob/main/devnotes.md)) + — old `.mpct` save files still load correctly, with their saved feature + sets converted into the current representation automatically. diff --git a/mkdocs.yml b/mkdocs.yml index 3900d26..5b76a22 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -59,5 +59,4 @@ nav: - Heatmap: plots/heatmap.md - Feature Info: feature-info.md - Troubleshooting: troubleshooting.md - - Development: development.md - Changelog: changelog.md From 6a289020873ae25effcc7ac1d56d7457a2616b70 Mon Sep 17 00:00:00 2001 From: Robert Samples Date: Tue, 30 Jun 2026 01:37:03 -0400 Subject: [PATCH 16/20] Update tests.yml sklearn not added to tests, caused ci build test failure --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 3990924..bbf1c86 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -25,7 +25,7 @@ jobs: # against a few system libraries the base image doesn't ship. 
- if: runner.os == 'Linux' run: sudo apt-get update && sudo apt-get install -y libgl1 libxkbcommon-x11-0 libxcb-cursor0 - - run: pip install "numpy<2" pandas scipy tqdm pytest PyQt5 + - run: pip install "numpy<2" pandas scipy scikit-learn tqdm pytest PyQt5 - run: python -m pytest code/tests -v lint: From 03e6d22268286d146e09b399e0ae5329a4bb8cc1 Mon Sep 17 00:00:00 2001 From: Robert Samples Date: Tue, 30 Jun 2026 02:13:54 -0400 Subject: [PATCH 17/20] bugfixes, docs update --- code/MSFaST.py | 14 +-- code/main.py | 4 + code/ordination.py | 18 +++- code/stats.py | 29 ++++--- code/tests/test_ordination.py | 8 +- devnotes.md | 157 ++++++++++++++++++++++++++++++++++ docs/index.md | 4 +- docs/troubleshooting.md | 8 +- 8 files changed, 212 insertions(+), 30 deletions(-) diff --git a/code/MSFaST.py b/code/MSFaST.py index 049654c..2207180 100644 --- a/code/MSFaST.py +++ b/code/MSFaST.py @@ -10,7 +10,6 @@ from groupsets import normalize_graphfilters from datetime import datetime import time -from pathlib import Path #---Classes--- @@ -187,6 +186,11 @@ def run_MSFaST(params): # Filtering and error propagation print('Filtering data') ionfilters = {} + # Initialise here (not only inside `if analysis_params.grpave:`) so the + # unconditional groupionlists[...] writes further down (and the blank + # filter, which reads it) can't raise NameError if grpave is ever off. + # The GUI currently forces grpave=True, but loaded sessions/tests need not. 
+ groupionlists = {} if analysis_params.relfil: ionfilters = filter.relationalfilter(analysis_params, ionfilters) if analysis_params.merge: @@ -254,7 +258,7 @@ def run_MSFaST(params): msdata_filtered = pd.read_csv(analysis_params.outputdir / (analysis_params.filename.stem + '_filtered.csv'), sep = ',', header = [0, 1, 2], index_col = [0, 1, 2]) analysisrec = open(analysis_params.outputdir / 'analysisinfo.txt',"w") analysisrec.writelines(['Analysis Date: ' + str(datetime.now()) + '\n', - 'Runetime: ' + str(round(runtime, 2)) + ' seconds\n', + 'Runtime: ' + str(round(runtime, 2)) + ' seconds\n', 'Input file: ' + str(analysis_params.filename) + '\n', 'Sample list: ' + str(analysis_params.samplelistfilename) + '\n', 'Extract metadata file: ' + str(analysis_params.extractmetadatafilename) + '\n', @@ -280,10 +284,10 @@ def run_MSFaST(params): text = '' if analysis_params.relfil: text += 'Features failing peak correction filtering: ' + str(len(ionfilters['relfil'].ions)) + '/' + str(len(msdata_unformatted.index)) + ' ' + str(round(100 * len(ionfilters['relfil'].ions) / len(msdata_unformatted.index), 2)) + '%\n' - if analysis_params.blnkfltr: #FIX THIS REF TO "BLANKS" + if analysis_params.blnkfltr: text += 'Features failing blank filtering: ' + str(len(groupionlists[analysis_params.blnkgrp])) + '/' + str(len(msdata_unformatted.index)) + ' ' + str(round(100 * len(groupionlists[analysis_params.blnkgrp]) / len(msdata_unformatted.index), 2)) + '%\n' if analysis_params.decon: - text += 'Features failing blank filtering: ' + str(len(ionfilters['insource'].ions)) + '/' + str(len(msdata_unformatted.index)) + ' ' + str(round(100 * len(ionfilters['insource'].ions) / len(msdata_unformatted.index), 2)) + '%\n' + text += 'Features failing in-source/deconvolution filtering: ' + str(len(ionfilters['insource'].ions)) + '/' + str(len(msdata_unformatted.index)) + ' ' + str(round(100 * len(ionfilters['insource'].ions) / len(msdata_unformatted.index), 2)) + '%\n' if analysis_params.CVfil: 
text += 'Features failing CV filtering: ' + str(len(ionfilters['cv'].ions)) + '/' + str(len(msdata_unformatted.index)) + ' ' + str(round(100 * len(ionfilters['cv'].ions) / len(msdata_unformatted.index), 2)) + '%\n' text += 'Features failing any filters: ' + str(len(msdata_unformatted.index) - len(msdata_filtered.index)) + '/' + str(len(msdata_unformatted.index)) + ' ' + str(round(100 * (len(msdata_unformatted.index) - len(msdata_filtered.index)) / len(msdata_unformatted.index), 2)) + '%\n' @@ -310,7 +314,7 @@ def run_MSFaST(params): 'RT/mz/FC: ' + str(analysis_params.FC3Dplt) + ' ' + str(analysis_params.statstgrps) + '\n', 'KMD/mz ' + str(analysis_params.KMD) + '\n', #'KMD/mz/RT ' + str(analysis_params.___) + '\n', - 'PCA unfitlered: ' + str(analysis_params.PCA) + '\n', + 'PCA unfiltered: ' + str(analysis_params.PCA) + '\n', 'PCA filtered: ' + str(analysis_params.PCA) + '\n', 'Dendrogram (ward) unfiltered: ' + str(analysis_params.Dendrogram) + '\n', 'Dendrogram (ward) Filtered: ' + str(analysis_params.Dendrogram) + '\n', diff --git a/code/main.py b/code/main.py index 418e2bd..899c22d 100644 --- a/code/main.py +++ b/code/main.py @@ -82,6 +82,10 @@ - potentially consider other database options like HMDB etc - fix up analysisinfo file output with better and more useful log ingo - add other ordination options like pca, pls-da, etc etc + ~DONE: the multivariate tab now offers PCA / NMDS / PLS-DA via a method + switcher (ordination.py, plotting.plot_ordination). Next candidate is + OPLS-DA (intentionally deferred -- no native sklearn support; see + devnotes.md "Multivariate ordination plot"). - add custom keyword arguments for each plot to make calling them easier - make it so groups can be reordered in the groupsets widgets? 
~model-layer support done: GroupSetModel.move() (groupsets.py), tested in diff --git a/code/ordination.py b/code/ordination.py index 7829e9c..43d21fd 100644 --- a/code/ordination.py +++ b/code/ordination.py @@ -15,6 +15,8 @@ This module is Qt-free and unit-tested (see ``tests/test_ordination.py``). """ +from pathlib import Path + import numpy as np import pandas as pd from sklearn.cross_decomposition import PLSRegression @@ -75,11 +77,19 @@ def load_ordination_matrix(file, raw_msdata_header, collapse_replicates): for elem in collapsed_columns: header.append((elem[1], '', elem[0])) msdata.columns = pd.MultiIndex.from_tuples(header) - msdata.to_csv('averagepca.csv', header=True, index=False) - - msdata_header = pd.read_csv('averagepca.csv', sep=',', header=None, + # Round-trip the collapsed matrix through a CSV so its relabeled + # 3-row header reads back the same way the uncollapsed path reads + # the real file. Write it next to the input peak table (the run's + # output directory) rather than the process's current working + # directory, which in the deployed app is code/ -- this file is an + # internal scratch artifact, not something the user should find in + # the source tree. 
+ avg_path = Path(file).with_name('averagepca.csv') + msdata.to_csv(avg_path, header=True, index=False) + + msdata_header = pd.read_csv(avg_path, sep=',', header=None, index_col=[0, 1, 2]).iloc[:3, :].transpose() - pcadf = (pd.read_csv('averagepca.csv', sep=',', header=[2], index_col=[0]) + pcadf = (pd.read_csv(avg_path, sep=',', header=[2], index_col=[0]) .drop(['m/z', 'Retention time (min)'], axis=1) .transpose().astype(float).reset_index().rename(columns={'index': 'File'})) else: diff --git a/code/stats.py b/code/stats.py index 1237416..3b90501 100644 --- a/code/stats.py +++ b/code/stats.py @@ -72,7 +72,6 @@ def groupave(analysis_params): # Initialize lists to collect results from each chunk sum_values_list = [] - sum_squares_list = [] counts_list = [] # Process data in chunks @@ -90,33 +89,29 @@ def groupave(analysis_params): # Set index names according to your data structure chunk_stacked.index.names = ['Compound', 'm/z', 'Retention time', 'Group', 'Sample', 'Injection'] - # Compute sum, sum of squares, and counts per group + # Compute sum and counts per group group_levels = ['Compound', 'm/z', 'Retention time', 'Group', 'Sample', 'Injection'] sum_values_chunk = chunk_stacked.groupby(level=group_levels).sum() - sum_squares_chunk = (chunk_stacked ** 2).groupby(level=group_levels).sum() count_chunk = chunk_stacked.groupby(level=group_levels).count() # Append results to lists sum_values_list.append(sum_values_chunk) - sum_squares_list.append(sum_squares_chunk) counts_list.append(count_chunk) pbar.update(1) # Concatenate all results all_sum_values = pd.concat(sum_values_list) - all_sum_squares = pd.concat(sum_squares_list) all_counts = pd.concat(counts_list) # Aggregate over the entire dataset sum_values_df = all_sum_values.groupby(level=group_levels).sum() - sum_squares_df = all_sum_squares.groupby(level=group_levels).sum() counts_df = all_counts.groupby(level=group_levels).sum() - # Calculate mean and variance per injection + # Calculate mean per injection. 
(A per-injection variance/stddev was + # computed here previously but never used -- the technical/biological + # RSDs below are derived from the grouped means, not from it.) mean_values = sum_values_df / counts_df - variance_values = (sum_squares_df / counts_df) - (mean_values ** 2) - stddev_values = variance_values ** 0.5 # Calculate technical RSDs and counts # Group over technical replicates within each sample @@ -251,9 +246,14 @@ def runttest(analysis_params, statstgrps, groupsets): msdata_teststats.loc[msdata_teststats['p'] <= minval, 'p'] = minval msdata_teststats['logp'] = np.log10(msdata_teststats['p']) - # Save msdata_teststats + # Save msdata_teststats. Previously written to the current working + # directory as 'msdata_teststats_test.csv' (a debug-named file that + # littered code/ and was never read back); now written into the run's + # output directory under a descriptive name alongside the other outputs. msdata_teststats = msdata_teststats.reset_index([1, 2]) - msdata_teststats.to_csv('msdata_teststats_test.csv', header=True, index=True) + msdata_teststats.to_csv( + analysis_params.outputdir / (analysis_params.filename.stem + '_teststats.csv'), + header=True, index=True) # Update iondict with -logp iondict['-logp'] = -msdata_teststats['logp'] @@ -301,7 +301,12 @@ def runttest(analysis_params, statstgrps, groupsets): # Save results to CSV files iondict = iondict.set_index('Compound') - iondict.to_csv('qdata.csv', index=True, header=True) + # Written into the run's output directory rather than the current working + # directory (was a bare 'qdata.csv' that landed in code/ and was never + # read back); the canonical -logq still goes into iondict.csv below. 
+ iondict.to_csv( + analysis_params.outputdir / (analysis_params.filename.stem + '_qvalues.csv'), + index=True, header=True) iondict2 = pd.read_csv(analysis_params.outputdir / 'iondict.csv', sep=',', header=[0], index_col=[0]) iondict2['-logq'] = np.nan iondict2.loc[iondict.index.tolist(), '-logq'] = iondict['-logq'] diff --git a/code/tests/test_ordination.py b/code/tests/test_ordination.py index 992434e..89de181 100644 --- a/code/tests/test_ordination.py +++ b/code/tests/test_ordination.py @@ -47,8 +47,9 @@ def test_uncollapsed_keeps_one_row_per_injection(tmp_path): assert len(biolgroup) == 9 -def test_collapsed_averages_technical_not_biological_replicates(tmp_path, monkeypatch): - monkeypatch.chdir(tmp_path) # 'averagepca.csv' lands here, not the repo +def test_collapsed_averages_technical_not_biological_replicates(tmp_path): + # averagepca.csv (the collapse round-trip scratch file) is written next + # to the input peak table, i.e. into tmp_path here -- no chdir needed. path = tmp_path / 'example_filtered.csv' _write_synthetic_filtered_csv(path) x, biolgroup = load_ordination_matrix(path, _raw_header(path), collapse_replicates=True) @@ -65,8 +66,7 @@ def test_collapsed_averages_technical_not_biological_replicates(tmp_path, monkey assert (biolgroup == 'groupB').sum() == 1 -def test_collapsed_values_are_the_mean_of_their_technical_replicates(tmp_path, monkeypatch): - monkeypatch.chdir(tmp_path) +def test_collapsed_values_are_the_mean_of_their_technical_replicates(tmp_path): path = tmp_path / 'example_filtered.csv' _write_synthetic_filtered_csv(path) x, _ = load_ordination_matrix(path, _raw_header(path), collapse_replicates=True) diff --git a/devnotes.md b/devnotes.md index cb6689e..29d3e71 100644 --- a/devnotes.md +++ b/devnotes.md @@ -695,3 +695,160 @@ mid-session. 
Most recent follow-ups: heatmap W/S selection had no bounds clamping (`mv_heatmap`, could crash or silently wrap past either end of the feature list); the six per-plot dicts were consolidated into `PlotSlotRegistry` (`plotslots.py`). 65 passing tests. + +## Code review pass (dev branch, 2026-06-30) + +Full read-through of every hand-written, Qt-free module plus the docs and +TODO block. The codebase is in good shape; findings were modest. Test count +is now **159 passing** (the count above is stale). + +### Fixes applied on this branch (low-risk, test-validated) + +- **`MSFaST.py` `analysisinfo.txt` decon label was a copy-paste bug**: the + `if analysis_params.decon:` branch wrote "Features failing **blank** + filtering" (a verbatim copy of the blank-filter line above it). Corrected + to "Features failing in-source/deconvolution filtering". Confirmed against + `main.py`'s parallel data-review summary writer (`_finish_analysis`, + ~line 1208), which already labels the same quantity correctly as + "in-source ion filtering" — so the two writers now agree. Also fixed two + user-facing typos in the same file: "Runetime" -> "Runtime" and + "PCA unfitlered" -> "PCA unfiltered". These are pure string-label changes, + not the risky re-read logic the analysisinfo backlog item warns about. +- **`MSFaST.run_MSFaST` latent `NameError` on `groupionlists`**: it was + only initialised inside `if analysis_params.grpave:`, but referenced + unconditionally further down (the `groupionlists['cv'/'relfil'/'insource']` + writes and the groups-column loop) and inside the blank-filter block. + The GUI hardcodes `grpave = True` (`main.py:~1335`), so this never fired + in practice, but a loaded session or a test with `grpave=False` would + crash. Added a defensive `groupionlists = {}` next to `ionfilters = {}`. + Behaviour unchanged when `grpave=True` (`parsionlists` reassigns it). 
+- **Stray debug CSVs written to the current working directory** (which is + `code/` in the deployed app, per `run.bat`): `stats.py` wrote + `msdata_teststats_test.csv` (a debug-named file, never read back) and + `qdata.csv` (never read back — the canonical `-logq` goes into + `iondict.csv`), and `ordination.py` wrote `averagepca.csv` (an internal + collapse round-trip scratch file). All three now write into the run's + output directory: `<stem>_teststats.csv`, `<stem>_qvalues.csv` (where + `<stem>` is the input peak table's file stem), and + `averagepca.csv` next to the input peak table respectively. The + pre-existing leftover copies sitting untracked in `code/` + (`qdata.csv`, `msdata_teststats_test.csv`, `averagepca.csv`) are now + obsolete and safe to delete — they will no longer be regenerated there. +- **Dead code removal** (`stats.py` `groupave`): a per-injection + `variance_values`/`stddev_values` was computed but never used (the + technical/biological RSDs are derived from grouped means, not from it). + Removing it made the entire sum-of-squares accumulation chain dead too + (`sum_squares_list`/`sum_squares_chunk`/`all_sum_squares`/`sum_squares_df`), + so that's gone as well — a small but real per-chunk optimization (drops a + `(chunk ** 2).groupby(...).sum()` on every chunk of the formatted table). + Validated by `tests/test_msfast_pipeline.py`, which runs the real + `groupave` against the bundled example dataset. Also dropped an unused + `from pathlib import Path` in `MSFaST.py`. + +### Findings NOT changed (need a decision or live-GUI validation) + +- **The "Or Groups" Plot-Feature-Set control is functionally inert.** The + groupset editor has three lists — And (`listWidget_andgrps` -> `incl`), + Or (`listWidget_orgrps` -> `src`), Exclude (`listWidget_allgrps` -> + `excl`). 
`src` is edited, persisted to `.mpct`, and joined into the + descriptive name, but **`MSFaST.groupset.__init__` only filters on `incl` + and `excl` — it never applies `src`.** So a user can add groups to the + "Or" list and it silently changes nothing about which features are + selected/coloured. This is the most significant finding. It's *not* fixed + here because implementing the "feature present in at least one of `src`" + semantics changes which features plot — and `enumerate_inputs`'s default + "Features not in blanks" groupset already populates `src` with every + non-blank group, so turning `src` on would retroactively add a filter to + the default view. Needs the GUI run against real data to validate. Sketch + of the fix (in `groupset.__init__`, after the `excl`/`incl` passes): + ```python + if self.src: + pattern = '|'.join(' ' + str(g) for g in self.src) # leading-space convention + iondict = iondict.loc[iondict['groups'].str.contains(pattern), 'groups'].to_frame() + ``` + Decide first whether "Or" should be an independent constraint or whether + the default groupset should stop pre-filling `src`. +- **`mspwriter.convert_to_msp` num-peaks loop is fragile.** `for frags in + sources: numpeaks = len(frags)` overwrites rather than accumulates, and + assumes `sources` is a list-of-one-list. It happens to be correct for the + only live caller (the decon path, where `ionmerge.sources == [[frag,...]]`), + but would silently miscount if ever called on a `relationalfilter`-shaped + merge (flat list of id strings) — `len(frags)` would then be a string + length and the inner `for fragment in frags:` would iterate characters. + Left as-is (single caller, wrapped in try/except), but worth hardening if + the MSP writer is ever reused. 
+- **Docs repo-URL inconsistency.** `mkdocs.yml` `repo_url` and `docs/index.md` + link to `github.com/robertsamples/mpact` (the `origin` fork), but + `docs/installation.md`'s `git clone` line uses + `github.com/BalunasLab/mpact` (the `upstream`/lab repo). Pick one canonical + public URL and make all three consistent. Not changed because which one is + the intended *published* home is a call only you can make (both remotes + exist locally). `docs/index.md`'s stale "multivariate analysis (NMDS)" + feature blurb *was* updated to "(PCA/NMDS/PLS-DA)". + +- **Two orphaned/broken scratch scripts in `code/`.** + `npatlassearch.py` reads `npatlas.csv` (the real file is `npatlas.tsv`) at + module top level and references an undefined `indigo`/`renderer`, so it + would crash if ever imported/run — but nothing imports it. + `masstdriver.py` is referenced only by a commented-out import in + `ui_functions.py`. Both are dead leftover dev scratch, not part of the + running app. Flagged rather than deleted (pre-existing files; the auto-mode + classifier has blocked deleting UI-adjacent files before). Safe to remove + once you confirm you don't want them as references. + +### Already-logged items re-confirmed still open (see backlog above) + +- `exportgnps()` duplicating `translators.reindex_fragments` matching logic. +- `iondict.csv` read-modify-write chain across `filter.py`/`stats.py`. +- `run_MSFaST`'s blank-filter `_formatted.csv` re-read (the "risky kind"). +- Lazy per-tab plot updates; generalizing the `ui_plot` subclasses. + +### Test-suite assessment + +The suite is well-targeted and not redundant — each test guards a specific +behaviour or a previously-fixed bug (the PLS-DA `scale=` regression, the +replicate-collapse structure, the dendrogram purity edge cases, the +end-to-end pipeline). No tests are recommended for removal. 
Gaps worth +filling when convenient (all Qt-free, so headless-testable): +- `translators.reindex_fragments` / `filter_source_peaktable` end-to-end on + the bundled MSP/MGF + peak tables (currently only smaller-unit coverage). +- `getfragdb.importfrag` format auto-detection (Progenesis vs MS-DIAL MSP). +- A `run_MSFaST` variant with `grpave=False`/minimal filters to lock in the + `groupionlists` defensive-init fix above. +- `stats.runfc`/`runttest` numeric outputs (FC clamping, q-value monotonicity) + against a tiny synthetic `iondict.csv`. + +## Future feature dev plan (post-review, 2026-06-30) + +Candidate features, ordered roughly by value-to-effort. None started; all +need the GUI runnable against real data to validate. Several already appear +in `main.py`'s TODO block — this is the triaged version. + +1. **Wire up the "Or Groups" groupset constraint** (see finding above). + Smallest, highest-impact correctness item — a visible UI control that + currently does nothing. Backend change is a few lines; the work is + deciding the default-groupset interaction and validating in the GUI. +2. **Data-quality score / summary** (TODO: "overall data quality score, AUC + on CV plot"). The pieces already exist (`average CV`/`median CV` columns + in `iondict.csv`, per-group RSDs in `_summarydata.csv`, the dendrogram + purity `n_pure/n_total` summary). A single headline QC number + a small + summary panel could be assembled Qt-free in a new `qualityscore.py` + module (testable) and surfaced on the Data Review tab. +3. **OPLS-DA ordination method** (next item after the PCA/NMDS/PLS-DA rework, + already deferred — see "Multivariate ordination plot"). Needs either the + unmaintained `pyopls` or a from-scratch OSC implementation plus a + reference dataset to validate against. +4. **Status-bar terminal/log viewer** (TODO). Replace the static status + strings with a live log line + an expandable full-output pane. 
Mostly a + Qt plumbing task (route the existing `print()` progress through a + `QPlainTextEdit`/signal); no scientific risk. +5. **Additional databases beyond NPAtlas** (TODO: HMDB etc.). `dbsearch.py` + is already a clean Qt-free ppm-window matcher taking an `atlas` DataFrame + — adding a second source is mostly a loader + a column-name adapter, and + the matching core is reusable as-is. +6. **`exportgnps()` migration onto `translators`** (backlog). Correctness + + maintenance win, not a new feature: replace the ~210-line hand-rolled + O(n·m) MGF matcher with the tested `reindex_fragments`/ + `filter_source_peaktable` path. +7. **Specificity/sensitivity & comparison-mode plots** (TODO, "likely items + that need more thought"). Larger scientific-design questions; needs spec + work with the lab before implementation. diff --git a/docs/index.md b/docs/index.md index da22288..d89785f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -6,8 +6,8 @@ table (Progenesis QI, MZmine, MS-DIAL, or Bruker Metaboscape), a sample list, and a metadata file, and turns them into a filtered, statistically annotated dataset with a full suite of interactive plots: data-quality review, group-level set/correlation analysis, hierarchical clustering, -multivariate analysis (NMDS), m/z-vs-RT and mass-defect views, volcano -plots, heatmaps, and per-feature spectral/database-match lookup. +multivariate analysis (PCA/NMDS/PLS-DA), m/z-vs-RT and mass-defect views, +volcano plots, heatmaps, and per-feature spectral/database-match lookup. This site covers installing and running MPACT, the file formats it expects, the analysis and filtering options, and what each plot/tab diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index d2cdef7..e004608 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -81,9 +81,11 @@ version (`plotting._is_duplicate_pick`). ## Still stuck? 
-Check `code/tests/` — the pure-logic modules (`filter`, `stats`, -`translators`, `groupsets`, `importdependencies`) have headless unit -tests you can run to rule out a logic bug: +Check `code/tests/` — the pure-logic modules (filtering, statistics, +import/export translators, groupsets, ordination, dendrogram purity, the +feature-search tree, and an end-to-end analysis-pipeline run on the bundled +example dataset) have headless unit tests you can run to rule out a logic +bug: ``` python -m pytest code/tests -q From 6a6537394b35df8398732717b25289888f327e94 Mon Sep 17 00:00:00 2001 From: Robert Samples Date: Tue, 30 Jun 2026 02:44:12 -0400 Subject: [PATCH 18/20] implement npatlas fetch, bug report, update mgmt --- code/crashreport.py | 185 +++++++++++++++++++++++++++ code/main.py | 116 ++++++++++++++++- code/mpactupdate.py | 173 +++++++++++++++++++++++++ code/npatlasupdate.py | 144 +++++++++++++++++++++ code/plotting.py | 64 ++------- code/qualityscore.py | 140 ++++++++++++++++++++ code/tests/test_crashreport.py | 156 ++++++++++++++++++++++ code/tests/test_getfragdb.py | 114 +++++++++++++++++ code/tests/test_mpactupdate.py | 144 +++++++++++++++++++++ code/tests/test_msfast_grpave_off.py | 74 +++++++++++ code/tests/test_msfast_pipeline.py | 34 +++++ code/tests/test_npatlasupdate.py | 144 +++++++++++++++++++++ code/tests/test_qualityscore.py | 157 +++++++++++++++++++++++ code/tests/test_translators.py | 18 +++ devnotes.md | 155 +++++++++++++++++----- 15 files changed, 1734 insertions(+), 84 deletions(-) create mode 100644 code/crashreport.py create mode 100644 code/mpactupdate.py create mode 100644 code/npatlasupdate.py create mode 100644 code/qualityscore.py create mode 100644 code/tests/test_crashreport.py create mode 100644 code/tests/test_getfragdb.py create mode 100644 code/tests/test_mpactupdate.py create mode 100644 code/tests/test_msfast_grpave_off.py create mode 100644 code/tests/test_npatlasupdate.py create mode 100644 code/tests/test_qualityscore.py diff 
--git a/code/crashreport.py b/code/crashreport.py new file mode 100644 index 0000000..9d5640f --- /dev/null +++ b/code/crashreport.py @@ -0,0 +1,185 @@ +""" +MPACT +Copyright 2022, Robert M. Samples, Sara P. Puckett, and Marcy J. Balunas + +Qt-free crash/error reporting. Installs a ``sys.excepthook`` that, on any +otherwise-unhandled exception: + +1. formats a full report (traceback + environment: MPACT/Python/platform + versions, timestamp, optional context such as the tail of the run log), +2. writes it to a timestamped file under a crash-log directory (so there's a + durable record even if the user dismisses the dialog), and +3. hands the report to a GUI callback that asks the user whether to send it. + +The "send" path is deliberately backend-free: it builds a pre-filled GitHub +*new issue* URL (title + body) for the MPACT repo, so reporting is one click +in the browser and nothing leaves the user's machine until they choose to +submit it. That satisfies "prompt the user before sending" without any cloud +egress, DSN, or account. + +Why not Sentry (the obvious off-the-shelf option): ``sentry-sdk`` is excellent +for hosted/web services but (a) sends events to a Sentry project by default -- +exactly the silent-egress this tool should avoid for a desktop research app, +(b) needs a DSN/account to be provisioned, and (c) still needs a custom +``before_send`` hook + dialog to honour "ask first." For a single-user desktop +tool the local-log + pre-filled-GitHub-issue flow gives the same practical +benefit (a complete traceback in the maintainer's hands) with no infrastructure +and no privacy surprise. If MPACT ever ships to many non-technical users and a +central error feed becomes worth it, Sentry with ``before_send`` gating is the +documented upgrade path. + +This module is Qt-free and unit-tested (see ``tests/test_crashreport.py``); the +GUI dialog is injected as a plain callback. 
+""" + +import os +import platform +import sys +import time +import traceback +import urllib.parse + +DEFAULT_REPO = 'robertsamples/mpact' +# GitHub rejects extremely long issue URLs; keep the prefilled body well under +# the practical limit so the link always opens (the full report is always in +# the log file regardless). +_MAX_ISSUE_BODY = 6000 + + +def _app_version(): + try: + from mpactupdate import __version__ + return __version__ + except Exception: + return 'unknown' + + +def format_report(exc_type, exc_value, exc_tb, context=None, now=None): + """Build the human-readable crash report text. + + Args: + exc_type/exc_value/exc_tb: the ``sys.exc_info()``-style triple. + context: optional extra text appended under a "Context" heading + (e.g. the last lines of the run log, the current dataset name). + now: epoch seconds for the timestamp (injectable for tests). + + Returns: + A multi-section plain-text report. + """ + now = time.time() if now is None else now + stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(now)) + tb_text = ''.join(traceback.format_exception(exc_type, exc_value, exc_tb)) + lines = [ + 'MPACT crash report', + '==================', + 'Time: ' + stamp, + 'MPACT version: ' + _app_version(), + 'Python: ' + sys.version.split()[0], + 'Platform: ' + platform.platform(), + '', + 'Traceback:', + tb_text.rstrip(), + ] + if context: + lines += ['', 'Context:', str(context).rstrip()] + return '\n'.join(lines) + '\n' + + +def write_log(report, log_dir, now=None): + """Write ``report`` to a timestamped file under ``log_dir``. + + Creates ``log_dir`` if needed. Returns the path written, or ``None`` if the + write failed (reporting must never raise from inside an excepthook). 
+ """ + now = time.time() if now is None else now + try: + os.makedirs(log_dir, exist_ok=True) + fname = 'mpact_crash_' + time.strftime('%Y%m%d_%H%M%S', time.localtime(now)) + '.log' + path = os.path.join(log_dir, fname) + with open(path, 'w', encoding='utf-8', errors='replace') as handle: + handle.write(report) + return path + except Exception: + return None + + +def one_line_summary(exc_type, exc_value): + """A concise ``TypeName: message`` for use as an issue title.""" + name = getattr(exc_type, '__name__', str(exc_type)) + message = str(exc_value).strip().splitlines()[0] if str(exc_value).strip() else '' + return (name + ': ' + message).strip().rstrip(':').strip() if message else name + + +def build_issue_url(report, title, repo=DEFAULT_REPO): + """Build a GitHub 'new issue' URL with a prefilled title and body. + + The body is the report wrapped in a code fence and truncated to + :data:`_MAX_ISSUE_BODY` so the URL stays openable. The full untruncated + report always lives in the on-disk log. + """ + body = report + if len(body) > _MAX_ISSUE_BODY: + body = body[:_MAX_ISSUE_BODY] + '\n...\n[truncated -- see attached crash log]' + body_md = ('**Describe what you were doing when this happened:**\n\n\n' + '---\n```\n' + body + '\n```\n') + query = urllib.parse.urlencode({'title': title, 'body': body_md}) + return 'https://github.com/' + repo + '/issues/new?' + query + + +def make_excepthook(report_handler, log_dir=None, repo=DEFAULT_REPO, + context_provider=None, prev_hook=None): + """Build (but don't install) an excepthook. + + Args: + report_handler: callable ``handler(report, log_path, issue_url)`` that + shows the user the report and offers to send it. Exceptions raised + by the handler are swallowed (an excepthook must not itself raise). + log_dir: directory for crash logs (skipped if None). + repo: GitHub repo for the prefilled issue URL. 
+ context_provider: optional zero-arg callable returning extra context + text to embed (called defensively; failure is ignored). + prev_hook: a previous excepthook to chain to (defaults to the standard + ``sys.__excepthook__`` so the traceback still reaches the console). + + Returns: + A function with the ``(exc_type, exc_value, exc_tb)`` signature. + """ + prev_hook = prev_hook if prev_hook is not None else sys.__excepthook__ + + def _hook(exc_type, exc_value, exc_tb): + # Always let the default hook print to stderr first (and never let our + # own reporting suppress that or raise over it). + try: + prev_hook(exc_type, exc_value, exc_tb) + except Exception: + pass + try: + context = None + if context_provider is not None: + try: + context = context_provider() + except Exception: + context = None + report = format_report(exc_type, exc_value, exc_tb, context=context) + log_path = write_log(report, log_dir) if log_dir else None + title = 'Crash: ' + one_line_summary(exc_type, exc_value) + issue_url = build_issue_url(report, title, repo=repo) + if report_handler is not None: + report_handler(report, log_path, issue_url) + except Exception: + # Reporting failed -- the default hook already printed the real + # traceback, so just give up quietly rather than masking it. 
+ pass + + return _hook + + +def install_excepthook(report_handler, log_dir=None, repo=DEFAULT_REPO, + context_provider=None): + """Install the crash excepthook as ``sys.excepthook``; return the previous + hook (so callers can restore it).""" + prev = sys.excepthook + sys.excepthook = make_excepthook( + report_handler, log_dir=log_dir, repo=repo, + context_provider=context_provider, prev_hook=prev) + return prev diff --git a/code/main.py b/code/main.py index 899c22d..fc0e883 100644 --- a/code/main.py +++ b/code/main.py @@ -44,6 +44,11 @@ from plotting import plot_abund, show_spectrum, show_featureplt, plot_heatmap, plot_mzrt, plot_samplecorr, kendrick, plot_volcano, plot_fc3d, plot_dendrogram, plot_ordination, prev_cv, plot_upset, plot_treemap import getfragdb +import webbrowser +import npatlasupdate +import mpactupdate +import crashreport + from indigo import Indigo from indigo.renderer import IndigoRenderer indigo = Indigo() @@ -315,6 +320,12 @@ def moveWindow(event): UIFunctions.uiDefinitions(self) self.show() + # Deferred (post-show) best-effort startup checks: prompt to refresh a + # stale NPAtlas database and to install a newer MPACT release. Run via + # singleShot so the window paints first; fully guarded so they can + # never block or break launch. GUI-only path -- verify by launching. + QtCore.QTimer.singleShot(0, self._run_startup_checks) + #---Methods--- @@ -538,7 +549,81 @@ def update_mgf_feature_id(entry_lines, new_id): def error(self, message): self.ui.label_status.setText(message) self.ui.label_status.setStyleSheet('color: rgb(150,0,0);') - + + # ---- Startup checks (atlas freshness + app self-update) ---- + # All best-effort and fully guarded: any failure is logged to the console + # and otherwise ignored so a check can never block or break launch. The + # Qt-free logic lives in npatlasupdate.py / mpactupdate.py (unit-tested); + # these methods are only the dialog/wiring layer and need a live launch to + # verify end to end. 
+ def _run_startup_checks(self): + try: + self._check_atlas_freshness() + except Exception as exc: + print('Atlas update check skipped:', exc) + try: + self._check_app_update() + except Exception as exc: + print('App update check skipped:', exc) + + def _check_atlas_freshness(self, atlas_path='npatlas.tsv', max_age_days=30): + """Offer to re-download the Natural Products Atlas if the local copy is + missing or older than ``max_age_days``.""" + if not npatlasupdate.is_update_due(atlas_path, max_age_days=max_age_days): + return + age = npatlasupdate.atlas_age_days(atlas_path) + age_msg = 'missing' if age is None else ('about %d days old' % int(age)) + reply = QtWidgets.QMessageBox.question( + self, 'Update Natural Products Atlas?', + 'Your local Natural Products Atlas database is %s.\n\n' + 'Download the latest copy from npatlas.org now (about 30 MB)?' % age_msg, + QtWidgets.QMessageBox.Yes | QtWidgets.QMessageBox.No, + QtWidgets.QMessageBox.No) + if reply != QtWidgets.QMessageBox.Yes: + return + # The download is large and runs on the main thread (wait cursor); a + # failed/partial transfer never clobbers the existing atlas (atomic + # replace in npatlasupdate.download_atlas). Threading this is a future + # improvement -- see devnotes.md. + self.ui.label_status.setText('Downloading Natural Products Atlas...') + QtWidgets.QApplication.setOverrideCursor(Qt.WaitCursor) + QtWidgets.QApplication.processEvents() + try: + n = npatlasupdate.download_atlas(atlas_path) + self.ui.label_status.setText('Natural Products Atlas updated (%.1f MB).' 
% (n / 1e6)) + except Exception as exc: + self.error('Atlas update failed (kept existing copy): ' + str(exc)) + finally: + QtWidgets.QApplication.restoreOverrideCursor() + + def _check_app_update(self): + """Check GitHub for a newer MPACT release; offer a git-pull update.""" + info = mpactupdate.check_for_update(timeout=5) + if not info.available: + return + notes = (info.notes or '').strip() + if len(notes) > 800: + notes = notes[:800] + '...' + reply = QtWidgets.QMessageBox.question( + self, 'MPACT update available', + 'A newer MPACT release is available.\n\n' + 'Installed: %s\nLatest: %s\n\n%s\n\n' + 'Update now (git pull)?' % (info.current, info.latest, notes), + QtWidgets.QMessageBox.Yes | QtWidgets.QMessageBox.No, + QtWidgets.QMessageBox.No) + if reply != QtWidgets.QMessageBox.Yes: + return + repo_dir = Path(__file__).resolve().parent.parent + ok, output = mpactupdate.apply_git_update(repo_dir) + if ok: + QtWidgets.QMessageBox.information( + self, 'Update complete', + 'MPACT was updated. Please restart the application.\n\n' + output) + else: + self.error('Automatic update failed; opening the release page instead.') + if info.url: + webbrowser.open(info.url) + def getgroups(self): """ Get biological groups on input of all input files, fills comboboxes with these. @@ -1383,5 +1468,34 @@ def mousePressEvent(self, event): if sys.platform != 'win32': app.setStyle('Fusion') app.setStyleSheet("QFrame { border: 0px; }") #QToolTip { color: #999999; background-color: rgb(0, 255, 0); border: 1px solid grey; }") + + # Crash reporting: on an unhandled exception, log a full report and offer to + # open a prefilled GitHub issue (nothing is sent without the user clicking + # through). Installed after the QApplication exists so the dialog can show. + # See crashreport.py for the design (and why not Sentry). 
+ def _crash_dialog(report, log_path, issue_url): + try: + box = QtWidgets.QMessageBox() + box.setIcon(QtWidgets.QMessageBox.Critical) + box.setWindowTitle('MPACT encountered an error') + text = 'An unexpected error occurred.' + if log_path: + text += '\n\nA crash log was saved to:\n' + log_path + text += ('\n\nReport this on GitHub? Your browser will open a ' + 'prefilled issue — nothing is sent automatically.') + box.setText(text) + box.setDetailedText(report) + box.setStandardButtons(QtWidgets.QMessageBox.Yes | QtWidgets.QMessageBox.No) + box.setDefaultButton(QtWidgets.QMessageBox.No) + if box.exec_() == QtWidgets.QMessageBox.Yes and issue_url: + webbrowser.open(issue_url) + except Exception: + pass + + crashreport.install_excepthook( + _crash_dialog, + log_dir=str(Path.home() / '.mpact' / 'crashlogs'), + repo=mpactupdate.DEFAULT_REPO) + window = MainWindow() sys.exit(app.exec_()) diff --git a/code/mpactupdate.py b/code/mpactupdate.py new file mode 100644 index 0000000..cc2810c --- /dev/null +++ b/code/mpactupdate.py @@ -0,0 +1,173 @@ +""" +MPACT +Copyright 2022, Robert M. Samples, Sara P. Puckett, and Marcy J. Balunas + +Qt-free self-update checker. Queries the GitHub Releases API for the MPACT +repository (Robert Samples' fork, ``robertsamples/mpact``, by default), +compares the latest published release tag against the locally-running version, +and -- if newer -- hands the GUI the information it needs to ask the user +whether to update. The actual update of a git checkout is a best-effort +``git pull --ff-only``. + +Why not an off-the-shelf updater framework: the established Python option, +``pyupdater``, targets *frozen* (PyInstaller/cx_Freeze) apps and needs its own +patch-server + signing infrastructure -- heavyweight for a tool that's run +from a git clone (and the portable PyInstaller build is a separate, infrequent +artifact). 
For a source checkout the meaningful "update" is ``git pull``, and +"is there a newer release" is one GitHub API call + a version compare. That's +what this module does, with no third-party dependency beyond ``packaging`` +(already present; a tuple-based fallback covers its absence). + +This module is Qt-free and unit-tested (see ``tests/test_mpactupdate.py``); the +network and the git call are both injectable so the tests never touch either. +""" + +import json +import subprocess +import urllib.request + +#: The version of MPACT currently running. Bump this when cutting a release, +#: and create a matching GitHub release/tag (e.g. ``v1.0.1``) on the fork so +#: this checker can see it. Kept here as the single in-code source of truth; +#: keep it consistent with main.py's ``label_credits`` display string +#: (currently shows ``v1.00.01``). +__version__ = '1.0.1' + +DEFAULT_REPO = 'robertsamples/mpact' +_RELEASES_LATEST = 'https://api.github.com/repos/{repo}/releases/latest' + + +class UpdateInfo: + """Result of an update check. + + Attributes: + available: True if the latest release is newer than the running version. + current: the running version string. + latest: the latest release's tag (raw, e.g. ``v2.1.0``) or None. + url: the release's html_url (for "view release" / manual download). + notes: the release body/changelog text (may be ''). 
+ """ + __slots__ = ('available', 'current', 'latest', 'url', 'notes') + + def __init__(self, available, current, latest=None, url=None, notes=''): + self.available = available + self.current = current + self.latest = latest + self.url = url + self.notes = notes + + def __repr__(self): + return ('UpdateInfo(available=%r, current=%r, latest=%r)' + % (self.available, self.current, self.latest)) + + +def _normalize(tag): + """Strip a leading 'v'/'V' and surrounding whitespace from a tag.""" + tag = str(tag).strip() + if tag[:1] in ('v', 'V'): + tag = tag[1:] + return tag + + +def is_newer(latest_tag, current_version): + """True if ``latest_tag`` represents a newer version than ``current_version``. + + Uses ``packaging.version`` when available (PEP 440 aware), falling back to a + dotted-integer tuple comparison so a missing ``packaging`` never breaks the + check. A tag that can't be parsed at all is treated as "not newer" (fail + safe -- never nag about an unparseable tag). + """ + latest = _normalize(latest_tag) + current = _normalize(current_version) + try: + from packaging.version import parse as _parse + return _parse(latest) > _parse(current) + except Exception: + def _tuple(text): + parts = [] + for chunk in text.replace('-', '.').split('.'): + if chunk.isdigit(): + parts.append(int(chunk)) + else: + break + return tuple(parts) + lt, ct = _tuple(latest), _tuple(current) + if not lt: + return False + return lt > ct + + +def fetch_latest_release(repo=DEFAULT_REPO, opener=None, timeout=10): + """Fetch the latest published release from the GitHub API. + + Returns the parsed JSON dict, or ``None`` if the repo has no published + releases (the API 404s) or the response can't be parsed. ``opener`` is an + injectable ``opener(request_or_url, timeout=...)`` returning a readable, + context-managed response (defaults to ``urllib.request.urlopen``). 
+ """ + opener = opener if opener is not None else urllib.request.urlopen + url = _RELEASES_LATEST.format(repo=repo) + # GitHub recommends a User-Agent; the v3 Accept header pins the response shape. + request = urllib.request.Request(url, headers={ + 'Accept': 'application/vnd.github+json', + 'User-Agent': 'MPACT-update-check', + }) + try: + try: + response = opener(request, timeout=timeout) + except TypeError: + response = opener(request) + with response as resp: + payload = resp.read() + except Exception: + # Network error, 404 (no releases yet), DNS failure, offline -- all + # non-fatal: an update check must never break startup. + return None + try: + data = json.loads(payload.decode('utf-8') if isinstance(payload, bytes) else payload) + except Exception: + return None + if not isinstance(data, dict) or 'tag_name' not in data: + return None + return data + + +def check_for_update(current_version=__version__, repo=DEFAULT_REPO, opener=None, + timeout=10): + """Check whether a newer MPACT release exists. + + Returns an :class:`UpdateInfo`. ``available`` is False (and ``latest`` is + None) when the check can't reach the API or finds no newer release -- the + GUI can simply do nothing in that case. + """ + release = fetch_latest_release(repo=repo, opener=opener, timeout=timeout) + if release is None: + return UpdateInfo(available=False, current=current_version) + latest_tag = release.get('tag_name') + return UpdateInfo( + available=is_newer(latest_tag, current_version), + current=current_version, + latest=latest_tag, + url=release.get('html_url'), + notes=release.get('body') or '', + ) + + +def apply_git_update(repo_dir, runner=None, remote='origin', branch='main'): + """Best-effort ``git pull --ff-only`` of a source checkout. + + Only meaningful when MPACT is running from a git clone (not a frozen + build). ``runner`` is an injectable ``subprocess.run``-compatible callable + (so tests don't shell out). 
Returns ``(success, output)`` where ``output`` + is combined stdout/stderr text. + """ + runner = runner if runner is not None else subprocess.run + try: + completed = runner( + ['git', '-C', str(repo_dir), 'pull', '--ff-only', remote, branch], + capture_output=True, text=True, timeout=120, + ) + except Exception as exc: + return False, 'git pull could not be run: ' + str(exc) + output = (getattr(completed, 'stdout', '') or '') + (getattr(completed, 'stderr', '') or '') + return completed.returncode == 0, output.strip() diff --git a/code/npatlasupdate.py b/code/npatlasupdate.py new file mode 100644 index 0000000..0b83b6e --- /dev/null +++ b/code/npatlasupdate.py @@ -0,0 +1,144 @@ +""" +MPACT +Copyright 2022, Robert M. Samples, Sara P. Puckett, and Marcy J. Balunas + +Qt-free updater for the bundled Natural Products Atlas database +(``npatlas.tsv``). The Natural Products Atlas (https://www.npatlas.org) +publishes periodic full-database downloads; the copy MPACT ships with goes +stale over time as new compounds are deposited. + +This module provides the staleness check and the download/validation logic, +with no Qt dependency, so it can be unit-tested headlessly (see +``tests/test_npatlasupdate.py``). The GUI side (``main.py``) only has to ask +the user a yes/no question and call :func:`download_atlas`. + +Format decision (do not "upgrade" without reason): MPACT reads the atlas as a +tab-separated table via ``pd.read_csv('npatlas.tsv', sep='\\t', ...)`` and +``dbsearch.search_npatlas`` accesses specific columns +(``compound_m_plus_h``/``compound_m_plus_na``/``compound_smiles``/ +``origin_type``/``genus``). The published ``NPAtlas_download.tsv`` already has +exactly those columns, so the TSV is a drop-in replacement and is what we +fetch. 
The ``NPAtlas_download.json`` is the same data in a nested JSON shape +that would need flattening before pandas/dbsearch could use it -- there is no +benefit to switching formats and a real cost (rewriting the read + column +access), so we deliberately stay on the TSV. +""" + +import os +import shutil +import tempfile +import time +import urllib.request + +DEFAULT_TSV_URL = 'https://www.npatlas.org/static/downloads/NPAtlas_download.tsv' +DEFAULT_JSON_URL = 'https://www.npatlas.org/static/downloads/NPAtlas_download.json' + +# Columns the app actually consumes (main.py's atlas read + dbsearch). A +# download missing any of these is rejected rather than allowed to clobber a +# working atlas -- guards against the server returning an HTML error page or a +# truncated/renamed export. +REQUIRED_COLUMNS = frozenset({ + 'compound_id', + 'compound_m_plus_h', + 'compound_m_plus_na', + 'compound_smiles', + 'origin_type', + 'genus', +}) + +DEFAULT_MAX_AGE_DAYS = 30 + + +def atlas_age_days(path, now=None): + """Age of the atlas file in days based on its mtime, or ``None`` if the + file doesn't exist.""" + if not os.path.exists(path): + return None + now = time.time() if now is None else now + return (now - os.path.getmtime(path)) / 86400.0 + + +def is_update_due(path, max_age_days=DEFAULT_MAX_AGE_DAYS, now=None): + """True if the atlas is missing or older than ``max_age_days``. + + This is the cheap check the app runs at startup before deciding whether to + prompt the user -- it only stats the file, it never touches the network. + + Caveat: staleness is judged by file *mtime*, which is the requested + "last modified over a month ago" behaviour but reflects when the file was + last written locally, not the vintage of the data inside it. A fresh + ``git clone`` stamps the checked-out file with the clone time, so a + just-cloned-but-data-old atlas will read as "fresh" until 30 days pass. 
+ Embedding the NPAtlas release date in a sidecar and comparing that would + be more accurate if this ever matters. + """ + age = atlas_age_days(path, now=now) + return age is None or age > max_age_days + + +def validate_tsv_header(first_line): + """True if a TSV header line contains every :data:`REQUIRED_COLUMNS`.""" + columns = {c.strip() for c in first_line.rstrip('\n').split('\t')} + return REQUIRED_COLUMNS.issubset(columns) + + +def download_atlas(dest, url=DEFAULT_TSV_URL, opener=None, validate=True, + timeout=60): + """Download the NPAtlas TSV to ``dest`` atomically. + + The download streams to a temporary file in ``dest``'s directory, is + validated (the header must contain :data:`REQUIRED_COLUMNS`), and only then + ``os.replace``-d over ``dest`` -- so a network error, an HTML error page, + or a partial transfer can never leave a corrupt or truncated atlas in + place; the previous file is untouched on any failure. + + Args: + dest: path the atlas should end up at (e.g. ``code/npatlas.tsv``). + url: download URL (defaults to the published full TSV). + opener: callable ``opener(url, timeout=...)`` returning a readable, + context-managed response (defaults to ``urllib.request.urlopen``). + Injectable so tests can supply canned content without a network. + validate: if True, reject a download whose header is missing required + columns (raises ``ValueError``, leaving ``dest`` unchanged). + timeout: per-request timeout in seconds (urllib only). + + Returns: + Number of bytes written to ``dest``. + """ + dest = os.fspath(dest) + dest_dir = os.path.dirname(os.path.abspath(dest)) or '.' + opener = opener if opener is not None else urllib.request.urlopen + + fd, tmp_path = tempfile.mkstemp(prefix='.npatlas_', suffix='.tsv', dir=dest_dir) + try: + with os.fdopen(fd, 'wb') as tmp_file: + try: + response = opener(url, timeout=timeout) + except TypeError: + # Injected openers in tests may not accept a timeout kwarg. 
+ response = opener(url) + with response as resp: + shutil.copyfileobj(resp, tmp_file) + bytes_written = os.path.getsize(tmp_path) + + if bytes_written == 0: + raise ValueError('Downloaded atlas is empty') + if validate: + with open(tmp_path, 'r', encoding='utf-8', errors='replace') as check: + first_line = check.readline() + if not validate_tsv_header(first_line): + raise ValueError( + 'Downloaded file is not a valid NPAtlas TSV ' + '(missing required columns); keeping existing atlas') + + os.replace(tmp_path, dest) + return bytes_written + except BaseException: + # Clean up the temp file on ANY failure (including the validation + # errors above); never disturb the existing dest. + try: + if os.path.exists(tmp_path): + os.remove(tmp_path) + except OSError: + pass + raise diff --git a/code/plotting.py b/code/plotting.py index 137f2a5..cf8e019 100644 --- a/code/plotting.py +++ b/code/plotting.py @@ -11,6 +11,7 @@ from csvcache import cached_read_csv, invalidate as invalidate_csv_cache import ordination import clusterpurity +import qualityscore import matplotlib #matplotlib.style.use('ggplot') @@ -41,7 +42,6 @@ from matplotlib.patches import Ellipse from filter import listfilter import time -import math from pvclust import PvClust @@ -1473,63 +1473,23 @@ def __init__(self, parent, currplt, frame, file, filtereddfs, groupsets): self.plot(parent, file, filtereddfs, groupsets) def plot(self, parent, file, filtereddfs, groupsets): - # Load and filter ion data + # Load ion data and the average injections-per-sample (for the noise + # model that scales the CV axis), then compute the quality metrics in + # the Qt-free qualityscore module (unit-tested; see qualityscore.py). 
iondict = cached_read_csv(parent.analysis_paramsgui.outputdir / 'iondict.csv', header=0, index_col=0) - iondict = iondict[~np.isnan(iondict['average CV'])] - - # Calculate mean and median CV, and scale data - iondictmean = iondict.sort_values(['average CV']).reset_index() - iondictmed = iondict.sort_values(['median CV']).reset_index() - iondictmean = iondictmean.reset_index() - iondictmed = iondictmed.reset_index() - iondictmean.iloc[:,0] = 100 * iondictmean.iloc[:,0]/len(iondictmean['average CV']) - iondictmed.iloc[:,0] = 100 * iondictmed.iloc[:,0]/len(iondictmed['median CV']) - - # Calculate maximum theoretical CV based on neff msdata_header = cached_read_csv(parent.analysis_paramsgui.outputdir / (parent.analysis_paramsgui.filename.stem + '_filtered.csv'), sep=',', header=None, index_col=[0,1,2]).iloc[:3,:].transpose() msdata_header.columns = ['Biolgroup', 'Sample', 'Injection'] average_n = msdata_header['Injection'].nunique() / msdata_header['Sample'].nunique() - modelstdevlist = [1] + [0] * (int(average_n) - 1) - modelstdev = pd.Series(modelstdevlist).std() / pd.Series(modelstdevlist).mean() - cv50 = iondictmean.iloc[(iondictmean.iloc[:,0] - 50).abs().argsort()[:1]]['average CV'] - sortedcv = iondictmean.iloc[(iondictmean.iloc[:,0]).argsort()]['average CV'] - prevav = 0 - aucav = 0 - prevmed = 0 - aucmed = 0 - for pos in range(0,len(iondictmean.iloc[:,0])): - dist = iondictmean.iloc[pos,:]['average CV'] - prevav - aucav += dist*iondictmean.iloc[pos,0] - prevav = iondictmean.iloc[pos,:]['average CV'] - - dist = iondictmed.iloc[pos,:]['median CV'] - prevmed - aucmed += dist*iondictmed.iloc[pos,0] - prevmed = iondictmed.iloc[pos,:]['median CV'] - - meanav = 0 - meanmed = 0 - sumskew = 0 - if math.isnan(modelstdev): - modelstdev = 1.7 - for val in range(1, int((modelstdev*100))): - pos = val/100 - meanav = iondictmean[abs(iondictmean['average CV'] - pos-modelstdev/200) < modelstdev/200].iloc[:,0].mean() - meanmed = iondictmed[abs(iondictmed['average CV'] - 
pos-modelstdev/200) < modelstdev/200].iloc[:,0].mean() - skew = abs(meanmed-meanav) - if not np.isnan(skew): - sumskew += skew * modelstdev/100 - - - sumskew = sumskew/ ((aucmed+aucav)/2) - rep = ((aucmed+aucav)/2)/(modelstdev*100) - qualscore = (1-sumskew)*rep*100 - - #qualscore = round(100 * (1 - cv50 / modelstdev), 1) + + result = qualityscore.compute_cv_quality(iondict, average_n) + iondictmean = result.iondictmean + iondictmed = result.iondictmed + modelstdev = result.modelstdev # Update UI - parent.ui.lbl_spllist_3.setText('Reproducibility:\n' + str(round(100*rep,1)) + '%\n' + - 'Skewnewss:\n' + str(round(100*sumskew,1)) + '%\n\n' + - 'Overall:\n' + str(round(qualscore,1)) + '%') + parent.ui.lbl_spllist_3.setText('Reproducibility:\n' + str(round(100*result.rep,1)) + '%\n' + + 'Skewnewss:\n' + str(round(100*result.sumskew,1)) + '%\n\n' + + 'Overall:\n' + str(round(result.qualscore,1)) + '%') # Plot data currplt = 'cvplt' #instead take this from input diff --git a/code/qualityscore.py b/code/qualityscore.py new file mode 100644 index 0000000..889f33b --- /dev/null +++ b/code/qualityscore.py @@ -0,0 +1,140 @@ +""" +MPACT +Copyright 2022, Robert M. Samples, Sara P. Puckett, and Marcy J. Balunas + +Qt-free extraction of the data-quality metrics shown on the CV (coefficient of +variation) rarefaction plot tab. These numbers -- Reproducibility, Skewness, +and an Overall quality score -- already existed, but the math was buried inside +``plotting.prev_cv.plot()`` (entangled with the matplotlib drawing and the +``lbl_spllist_3`` label update) and had no test coverage. + +Moving the computation here (the same pattern as ``ordination.py`` / +``biogroups.py`` / ``dbsearch.py`` / ``clusterpurity.py``) makes it unit- +testable and keeps ``prev_cv`` as a thin draw-the-result wrapper. 
The logic is +a faithful port of the original inline code -- ``tests/test_qualityscore.py`` +pins it against a verbatim copy of that original to guarantee the extraction +didn't change any displayed number. + +Definitions (as originally implemented): +- The CV rarefaction curve plots, for the mean-CV and median-CV orderings, the + cumulative percentage of features (0-100) against their CV. ``aucav``/ + ``aucmed`` are the areas under those two curves (percentage x CV). +- ``modelstdev`` is the maximum theoretical CV expected from pure + count-statistics noise given the average number of injections per sample + (``[1] + [0]*(n-1)`` treated as a sample -> its CV); it sets the CV axis + scale and normalises the AUC. +- ``rep`` (Reproducibility) = mean AUC / (modelstdev x 100): how far left + (low-CV) the curve sits relative to the noise-model scale. +- ``sumskew`` (Skewness) = normalised integrated gap between the mean-CV and + median-CV curves: how asymmetric the CV distribution is. +- ``qualscore`` (Overall) = (1 - skew) x rep x 100. +""" + +import math + +import numpy as np +import pandas as pd + + +class CVQualityResult: + """Bundle of everything ``prev_cv`` needs to label and draw the CV plot. + + Attributes: + iondictmean: features sorted by 'average CV', with column 0 replaced by + the cumulative percentage (0-100) -- the mean-CV rarefaction curve. + iondictmed: same, sorted/ranked by 'median CV'. + modelstdev: the count-statistics noise-model CV (axis scale). + rep: reproducibility fraction (0-1); ``100*rep`` is the displayed %. + sumskew: skewness fraction (0-1); ``100*sumskew`` is the displayed %. + qualscore: overall quality score (already on a 0-100 scale). 
+ """ + __slots__ = ('iondictmean', 'iondictmed', 'modelstdev', 'rep', 'sumskew', 'qualscore') + + def __init__(self, iondictmean, iondictmed, modelstdev, rep, sumskew, qualscore): + self.iondictmean = iondictmean + self.iondictmed = iondictmed + self.modelstdev = modelstdev + self.rep = rep + self.sumskew = sumskew + self.qualscore = qualscore + + +def noise_model_cv(average_n): + """Maximum theoretical CV from pure presence/absence count noise given + ``average_n`` injections per sample: the CV of ``[1] + [0]*(n-1)``. + + Falls back to 1.7 when undefined (e.g. a single injection per sample makes + the model series a single value with no spread) -- matching the original. + """ + modelstdevlist = [1] + [0] * (int(average_n) - 1) + series = pd.Series(modelstdevlist) + modelstdev = series.std() / series.mean() + if math.isnan(modelstdev): + modelstdev = 1.7 + return modelstdev + + +def compute_cv_quality(iondict, average_n): + """Compute the CV-plot quality metrics from an ion dictionary. + + Args: + iondict: DataFrame with 'average CV' and 'median CV' columns (the + ``iondict.csv`` produced by the CV filter). Rows with NaN + 'average CV' are dropped, matching the plot. + average_n: average number of injections per sample (used for the + count-statistics noise model that scales the CV axis). + + Returns: + CVQualityResult. + """ + iondict = iondict[~np.isnan(iondict['average CV'])] + + # Cumulative-percentage rarefaction curves for the mean- and median-CV + # orderings. The double reset_index reproduces the original exactly: + # the first moves the Compound index to a column, the second materialises + # the 0..n-1 rank as column 0, which is then rescaled to a 0-100 percentage. + iondictmean = iondict.sort_values(['average CV']).reset_index() + iondictmed = iondict.sort_values(['median CV']).reset_index() + iondictmean = iondictmean.reset_index() + iondictmed = iondictmed.reset_index() + # Replace column 0 (the integer rank) with the 0-100 cumulative percentage. 
+ # Assign by column LABEL (not in-place via .iloc) so the column's dtype is + # replaced wholesale rather than an incompatible float cast into an int64 + # column -- the latter raises a FutureWarning on pandas 2.x and is slated + # to become a hard error (same class as the LossySetitemError fixed + # elsewhere). Numerically identical to the original. + mean_col0, med_col0 = iondictmean.columns[0], iondictmed.columns[0] + iondictmean[mean_col0] = 100 * iondictmean.iloc[:, 0] / len(iondictmean['average CV']) + iondictmed[med_col0] = 100 * iondictmed.iloc[:, 0] / len(iondictmed['median CV']) + + modelstdev = noise_model_cv(average_n) + + # Area under each rarefaction curve (percentage integrated over CV). + prevav = 0 + aucav = 0 + prevmed = 0 + aucmed = 0 + for pos in range(0, len(iondictmean.iloc[:, 0])): + dist = iondictmean.iloc[pos, :]['average CV'] - prevav + aucav += dist * iondictmean.iloc[pos, 0] + prevav = iondictmean.iloc[pos, :]['average CV'] + + dist = iondictmed.iloc[pos, :]['median CV'] - prevmed + aucmed += dist * iondictmed.iloc[pos, 0] + prevmed = iondictmed.iloc[pos, :]['median CV'] + + # Integrated gap between the mean and median curves (distribution skew). 
+ sumskew = 0 + for val in range(1, int((modelstdev * 100))): + pos = val / 100 + meanav = iondictmean[abs(iondictmean['average CV'] - pos - modelstdev / 200) < modelstdev / 200].iloc[:, 0].mean() + meanmed = iondictmed[abs(iondictmed['average CV'] - pos - modelstdev / 200) < modelstdev / 200].iloc[:, 0].mean() + skew = abs(meanmed - meanav) + if not np.isnan(skew): + sumskew += skew * modelstdev / 100 + + sumskew = sumskew / ((aucmed + aucav) / 2) + rep = ((aucmed + aucav) / 2) / (modelstdev * 100) + qualscore = (1 - sumskew) * rep * 100 + + return CVQualityResult(iondictmean, iondictmed, modelstdev, rep, sumskew, qualscore) diff --git a/code/tests/test_crashreport.py b/code/tests/test_crashreport.py new file mode 100644 index 0000000..d4ac0ee --- /dev/null +++ b/code/tests/test_crashreport.py @@ -0,0 +1,156 @@ +"""Unit tests for the crash/error reporter (``crashreport.py``). + +A real exception is manufactured (raise + catch) so there's a genuine +traceback object to format; the GUI dialog is replaced by a recording +callback, and the excepthook is exercised by calling it directly. 
+""" + +import sys +import urllib.parse + +import crashreport as cr + + +def _make_exc_info(): + try: + raise ValueError('boom happened in feature 0.80_418.1451n') + except ValueError: + return sys.exc_info() + + +# --------------------------------------------------------------------------- # +# format_report +# --------------------------------------------------------------------------- # + +def test_format_report_contains_traceback_and_environment(): + et, ev, tb = _make_exc_info() + report = cr.format_report(et, ev, tb, now=0) + assert 'MPACT crash report' in report + assert 'ValueError: boom happened' in report + assert 'Traceback' in report + assert 'Python:' in report and 'Platform:' in report + + +def test_format_report_includes_context_when_given(): + et, ev, tb = _make_exc_info() + report = cr.format_report(et, ev, tb, context='dataset=PTY087I2; step=heatmap') + assert 'Context:' in report + assert 'PTY087I2' in report + + +# --------------------------------------------------------------------------- # +# one_line_summary +# --------------------------------------------------------------------------- # + +def test_one_line_summary_uses_type_and_message(): + et, ev, _ = _make_exc_info() + assert cr.one_line_summary(et, ev) == 'ValueError: boom happened in feature 0.80_418.1451n' + + +def test_one_line_summary_handles_empty_message(): + try: + raise RuntimeError() + except RuntimeError: + et, ev, _ = sys.exc_info() + assert cr.one_line_summary(et, ev) == 'RuntimeError' + + +# --------------------------------------------------------------------------- # +# write_log +# --------------------------------------------------------------------------- # + +def test_write_log_creates_timestamped_file(tmp_path): + path = cr.write_log('hello report', tmp_path / 'crashlogs', now=0) + assert path is not None + with open(path) as f: + assert f.read() == 'hello report' + assert 'mpact_crash_' in path and path.endswith('.log') + + +def 
test_write_log_returns_none_on_failure(tmp_path): + # Point log_dir at a path that exists as a *file* so makedirs fails. + afile = tmp_path / 'not_a_dir' + afile.write_text('x') + assert cr.write_log('r', str(afile / 'sub')) is None + + +# --------------------------------------------------------------------------- # +# build_issue_url +# --------------------------------------------------------------------------- # + +def test_build_issue_url_is_wellformed_and_encoded(): + url = cr.build_issue_url('a traceback & stuff', 'Crash: ValueError: boom', + repo='robertsamples/mpact') + assert url.startswith('https://github.com/robertsamples/mpact/issues/new?') + parsed = urllib.parse.urlparse(url) + params = urllib.parse.parse_qs(parsed.query) + assert params['title'] == ['Crash: ValueError: boom'] + assert 'a traceback & stuff' in params['body'][0] + + +def test_build_issue_url_truncates_huge_body(): + huge = 'x' * 50000 + url = cr.build_issue_url(huge, 'Crash', repo='r/m') + params = urllib.parse.parse_qs(urllib.parse.urlparse(url).query) + assert 'truncated' in params['body'][0] + assert len(params['body'][0]) < 7000 + + +# --------------------------------------------------------------------------- # +# excepthook +# --------------------------------------------------------------------------- # + +def test_excepthook_invokes_handler_and_writes_log(tmp_path): + received = {} + + def handler(report, log_path, issue_url): + received['report'] = report + received['log_path'] = log_path + received['issue_url'] = issue_url + + chained = [] + hook = cr.make_excepthook(handler, log_dir=str(tmp_path / 'logs'), + repo='robertsamples/mpact', + prev_hook=lambda *a: chained.append(a)) + et, ev, tb = _make_exc_info() + hook(et, ev, tb) + + assert 'ValueError' in received['report'] + assert received['log_path'] is not None and received['log_path'].endswith('.log') + assert received['issue_url'].startswith('https://github.com/robertsamples/mpact/issues/new?') + # The previous hook was 
still chained (traceback reaches the console). + assert len(chained) == 1 + + +def test_excepthook_swallows_handler_errors(tmp_path): + def bad_handler(report, log_path, issue_url): + raise RuntimeError('dialog blew up') + + hook = cr.make_excepthook(bad_handler, log_dir=str(tmp_path), prev_hook=lambda *a: None) + et, ev, tb = _make_exc_info() + # Must not raise, even though the handler does. + hook(et, ev, tb) + + +def test_excepthook_uses_context_provider(tmp_path): + received = {} + + def handler(report, log_path, issue_url): + received['report'] = report + + hook = cr.make_excepthook(handler, log_dir=str(tmp_path), + context_provider=lambda: 'active dataset: foo', + prev_hook=lambda *a: None) + et, ev, tb = _make_exc_info() + hook(et, ev, tb) + assert 'active dataset: foo' in received['report'] + + +def test_install_excepthook_restores(tmp_path): + original = sys.excepthook + try: + prev = cr.install_excepthook(lambda *a: None, log_dir=str(tmp_path)) + assert prev is original + assert sys.excepthook is not original + finally: + sys.excepthook = original diff --git a/code/tests/test_getfragdb.py b/code/tests/test_getfragdb.py new file mode 100644 index 0000000..ebbd84a --- /dev/null +++ b/code/tests/test_getfragdb.py @@ -0,0 +1,114 @@ +"""Unit tests for the MSP fragmentation-database importer (``getfragdb.py``). + +Covers the two parsers (``importfrag_v1`` Progenesis-style, ``importfrag_v2`` +MS-DIAL-style) and the format auto-detection wrapper (``importfrag``), which +previously had no coverage at all. Synthetic MSP fixtures are written to +``tmp_path``; a couple of smoke checks run against the real example files at +the repo root (skipped when absent). +""" + +from pathlib import Path + +import pytest + +import getfragdb + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +# Progenesis-style: "Name: Unknown ()" with parenthetical id, no pipes. 
+PROGENESIS_MSP = ( + "Name: Unknown (0.80_627.2171n)\n" + "PrecursorMZ: 627.2171\n" + "Num Peaks: 2\n" + "418.1451 257254\n" + "200.1000 5000\n" + "\n" + "Name: Unknown (1.20_300.1000n)\n" + "PrecursorMZ: 300.1\n" + "Num Peaks: 1\n" + "150.0500 100\n" +) + +# MS-DIAL-style: "NAME: ...|ID=|MZ=|RT=" with pipes, PRECURSORMZ + RETENTIONTIME. +MSDIAL_MSP = ( + "NAME: Unknown|ID=0|MZ=150.0267|RT=9.09\n" + "PRECURSORMZ: 150.0267\n" + "RETENTIONTIME: 9.0898957\n" + "Num Peaks: 2\n" + "56.0500 334\n" + "70.0600 120\n" + "\n" + "NAME: Unknown|ID=1|MZ=200.1000|RT=3.50\n" + "PRECURSORMZ: 200.1\n" + "RETENTIONTIME: 3.5\n" + "Num Peaks: 1\n" + "99.0000 50\n" +) + + +def _write(tmp_path, name, text): + p = tmp_path / name + p.write_text(text) + return p + + +def test_importfrag_v1_parses_progenesis_ids_and_peaks(tmp_path): + db = getfragdb.importfrag_v1(_write(tmp_path, 'p.msp', PROGENESIS_MSP)) + assert set(db.ions.keys()) == {'0.80_627.2171n', '1.20_300.1000n'} + # First entry has 2 peaks parsed into an (n, 2) array. + first = db.ions['0.80_627.2171n'] + assert first.pattern.shape == (2, 2) + assert first.pattern[0][0] == pytest.approx(418.1451) + assert db.ions['1.20_300.1000n'].pattern.shape == (1, 2) + + +def test_importfrag_v2_parses_msdial_rt_mz_keyed_ids(tmp_path): + db = getfragdb.importfrag_v2(_write(tmp_path, 'd.msp', MSDIAL_MSP)) + # name = f"{round(rt,3)}_{precursormz}" using the raw PRECURSORMZ string. + assert '9.09_150.0267' in db.ions + assert '3.5_200.1' in db.ions + assert db.ions['9.09_150.0267'].pattern.shape == (2, 2) + assert db.ions['9.09_150.0267'].fragparams['PRECURSORMZ'] == '150.0267' + + +def test_importfrag_autodetects_progenesis(tmp_path): + db = getfragdb.importfrag(_write(tmp_path, 'p.msp', PROGENESIS_MSP)) + # Progenesis ids are the parenthetical comment, not RT_mz keys. 
+ assert '0.80_627.2171n' in db.ions + + +def test_importfrag_autodetects_msdial(tmp_path): + db = getfragdb.importfrag(_write(tmp_path, 'd.msp', MSDIAL_MSP)) + assert '9.09_150.0267' in db.ions + + +def test_importfrag_v2_skips_entries_without_rt_or_precursor(tmp_path): + # An entry missing RETENTIONTIME/PRECURSORMZ must be dropped, not crash. + msp = ( + "NAME: Unknown|ID=0|MZ=150\n" + "Num Peaks: 1\n" + "56.05 334\n" + "\n" + "NAME: Unknown|ID=1|MZ=200.1|RT=3.5\n" + "PRECURSORMZ: 200.1\n" + "RETENTIONTIME: 3.5\n" + "Num Peaks: 1\n" + "99.0 50\n" + ) + db = getfragdb.importfrag_v2(_write(tmp_path, 'd.msp', msp)) + assert list(db.ions.keys()) == ['3.5_200.1'] + + +@pytest.mark.parametrize('name', ['progenesis.msp', 'msdial.msp']) +def test_importfrag_on_real_example_files(name): + path = REPO_ROOT / name + if not path.exists(): + pytest.skip(name + ' not present') + db = getfragdb.importfrag(path) + assert len(db.ions) > 0 + # Every parsed ion's peak array is either empty (an entry with 0 peaks -- + # which does occur in the real MS-DIAL export) or a proper (n, 2) array. + for entry in db.ions.values(): + assert entry.pattern.size == 0 or ( + entry.pattern.ndim == 2 and entry.pattern.shape[1] == 2) diff --git a/code/tests/test_mpactupdate.py b/code/tests/test_mpactupdate.py new file mode 100644 index 0000000..b8130a3 --- /dev/null +++ b/code/tests/test_mpactupdate.py @@ -0,0 +1,144 @@ +"""Unit tests for the MPACT self-update checker (``mpactupdate.py``). + +The GitHub API and the git call are both injected, so no network or +subprocess is touched. 
+""" + +import io +import json + +import mpactupdate as mu + + +class _FakeResponse(io.BytesIO): + def __enter__(self): + return self + + def __exit__(self, *exc): + self.close() + return False + + +def _release_opener(tag, *, html_url='http://x/rel', body='notes', record=None): + payload = json.dumps({'tag_name': tag, 'html_url': html_url, 'body': body}).encode() + + def opener(request, timeout=None): + if record is not None: + # request is a urllib Request; record its full URL. + record.append(getattr(request, 'full_url', request)) + return _FakeResponse(payload) + return opener + + +# --------------------------------------------------------------------------- # +# version comparison +# --------------------------------------------------------------------------- # + +def test_is_newer_basic(): + assert mu.is_newer('2.1.0', '2.0.0') is True + assert mu.is_newer('2.0.0', '2.0.0') is False + assert mu.is_newer('1.9.0', '2.0.0') is False + + +def test_is_newer_strips_v_prefix(): + assert mu.is_newer('v2.1.0', '2.0.0') is True + assert mu.is_newer('V2.0.1', 'v2.0.0') is True + + +def test_is_newer_numeric_not_lexicographic(): + # 2.10.0 > 2.9.0 numerically (lexicographically it would be "<"). 
+ assert mu.is_newer('2.10.0', '2.9.0') is True + + +def test_is_newer_unparseable_tag_is_not_newer(): + assert mu.is_newer('not-a-version', '2.0.0') is False + + +# --------------------------------------------------------------------------- # +# release fetch + check +# --------------------------------------------------------------------------- # + +def test_check_reports_available_update(): + info = mu.check_for_update(current_version='2.0.0', + opener=_release_opener('v2.5.0')) + assert info.available is True + assert info.latest == 'v2.5.0' + assert info.url == 'http://x/rel' + assert info.notes == 'notes' + + +def test_check_reports_no_update_when_same_version(): + info = mu.check_for_update(current_version='2.0.0', + opener=_release_opener('v2.0.0')) + assert info.available is False + assert info.current == '2.0.0' + + +def test_check_hits_the_configured_repo(): + seen = [] + mu.check_for_update(current_version='2.0.0', repo='robertsamples/mpact', + opener=_release_opener('v2.0.0', record=seen)) + assert seen == ['https://api.github.com/repos/robertsamples/mpact/releases/latest'] + + +def test_check_is_safe_when_offline(): + def failing_opener(request, timeout=None): + raise OSError('no network') + info = mu.check_for_update(current_version='2.0.0', opener=failing_opener) + assert info.available is False + assert info.latest is None + + +def test_check_is_safe_when_no_releases_yet(): + # GitHub returns 404 -> urlopen raises HTTPError -> fetch returns None. 
+ def opener_404(request, timeout=None): + raise OSError('HTTP 404') + info = mu.check_for_update(current_version='2.0.0', opener=opener_404) + assert info.available is False + + +def test_fetch_handles_malformed_json(): + def opener(request, timeout=None): + return _FakeResponse(b'not json at all') + assert mu.fetch_latest_release(opener=opener) is None + + +# --------------------------------------------------------------------------- # +# git update +# --------------------------------------------------------------------------- # + +class _Completed: + def __init__(self, returncode, stdout='', stderr=''): + self.returncode = returncode + self.stdout = stdout + self.stderr = stderr + + +def test_apply_git_update_success(): + calls = [] + + def runner(cmd, **kwargs): + calls.append(cmd) + return _Completed(0, stdout='Updating abc..def\nFast-forward\n') + + ok, output = mu.apply_git_update('/repo', runner=runner) + assert ok is True + assert 'Fast-forward' in output + assert calls[0][:3] == ['git', '-C', '/repo'] + assert 'pull' in calls[0] and '--ff-only' in calls[0] + + +def test_apply_git_update_reports_failure(): + def runner(cmd, **kwargs): + return _Completed(1, stderr='error: local changes would be overwritten') + ok, output = mu.apply_git_update('/repo', runner=runner) + assert ok is False + assert 'local changes' in output + + +def test_apply_git_update_handles_missing_git(): + def runner(cmd, **kwargs): + raise FileNotFoundError('git not found') + ok, output = mu.apply_git_update('/repo', runner=runner) + assert ok is False + assert 'could not be run' in output diff --git a/code/tests/test_msfast_grpave_off.py b/code/tests/test_msfast_grpave_off.py new file mode 100644 index 0000000..806cc2b --- /dev/null +++ b/code/tests/test_msfast_grpave_off.py @@ -0,0 +1,74 @@ +"""Regression test for the ``groupionlists`` defensive-init fix in +``run_MSFaST`` (MSFaST.py). 
+ +``groupionlists`` is only *populated* inside ``if analysis_params.grpave:``, +but is referenced unconditionally afterwards (the ``groupionlists['cv'/...]`` +assignments and the iondict ``groups``-column loop). The GUI hardcodes +``grpave=True`` so this never fired in production, but a minimal run with +``grpave=False`` would have raised ``NameError`` before the fix. This drives +exactly that path against the bundled example dataset, with every optional +filter/stat turned off, and asserts the run completes and returns a result. +""" + +from pathlib import Path + +import pytest + +from MSFaST import AnalysisResult, analysis_parameters, run_MSFaST + +REPO_ROOT = Path(__file__).resolve().parents[2] +EXAMPLE_DIR = REPO_ROOT / 'rawdata' / 'PTY087I2' +ALL_GROUPS = ['Blanks', 'Media', '0um_Ce', '250um_Ce'] + + +def _minimal_params(tmp_path): + """Everything that gates a filtering/stats stage turned OFF, so the run + exercises the no-grpave branch. Threshold/echo-only fields still have to + be present because analysisinfo.txt prints them verbatim.""" + params = analysis_parameters() + params.filename = EXAMPLE_DIR / '200826_PTY087I2codingdataset.csv' + params.samplelistfilename = EXAMPLE_DIR / 'samplelist.csv' + params.extractmetadatafilename = EXAMPLE_DIR / 'extractmetadata.csv' + params.outputdir = tmp_path / params.filename.stem + params.outputdir.mkdir(parents=True) + + # All optional stages OFF -- this is the configuration that used to crash. + params.relfil = False + params.merge = False + params.grpave = False + params.prperr = False + params.blnkfltr = False + params.CVfil = False + params.decon = False + params.FC = False + params.Ttest = False + + # Thresholds / echo-only fields (printed into analysisinfo.txt). 
+ params.ringingwin = 0.5 + params.isopeakwin = 0.01 + params.dimerpeakwin = 0.01 + params.RTwin = 0.005 + params.maxisowin = 3 + params.blnkgrp = '' + params.cvthresh = 0.2 + params.statstgrps = ['250um_Ce', '0um_Ce'] + params.graphfilters = [] + params.MZRTplt = params.FC3Dplt = params.KMD = False + params.PCA = params.Dendrogram = params.Volcanoplt = False + + # No Plot Feature Sets configured -> empty querylist/querydict. + params.querydict = {} + params.querylist = [] + return params + + +def test_run_msfast_with_grpave_off_does_not_raise(tmp_path): + params = _minimal_params(tmp_path) + result = run_MSFaST(params) # used to raise NameError on groupionlists + assert isinstance(result, AnalysisResult) + assert isinstance(result.groupionlists, dict) + # The three filter keys are always added, each empty since every filter is off. + assert result.groupionlists == {'cv': [], 'relfil': [], 'insource': []} + # With no filters applied, the filtered table is written and analysisinfo exists. 
+ assert (params.outputdir / (params.filename.stem + '_filtered.csv')).exists() + assert (params.outputdir / 'analysisinfo.txt').exists() diff --git a/code/tests/test_msfast_pipeline.py b/code/tests/test_msfast_pipeline.py index 71275c5..4f424e6 100644 --- a/code/tests/test_msfast_pipeline.py +++ b/code/tests/test_msfast_pipeline.py @@ -136,3 +136,37 @@ def test_analysisinfo_written(pipeline_result): assert info_path.exists() text = info_path.read_text() assert 'Features passing all filters' in text + + +def test_fold_change_is_clamped_to_bounds(pipeline_result): + """runfc clamps FC into [0.01, 100]; nothing should escape that range.""" + params, _ = pipeline_result + iondict = pd.read_csv(params.outputdir / 'iondict.csv', sep=',', header=[0], index_col=[0]) + fc = iondict['fc'].dropna() + assert len(fc) > 0 + assert fc.min() >= 0.01 + assert fc.max() <= 100.0 + + +def test_stats_outputs_land_in_output_dir_not_cwd(pipeline_result): + """The t-test/q-value tables now write into the run's output directory + (previously bare 'msdata_teststats_test.csv'/'qdata.csv' in the cwd).""" + params, _ = pipeline_result + assert (params.outputdir / (params.filename.stem + '_teststats.csv')).exists() + assert (params.outputdir / (params.filename.stem + '_qvalues.csv')).exists() + + +def test_qvalues_are_finite_positive_and_consistent_with_logq(pipeline_result): + """The BH q-values must be finite and positive, and the persisted '-logq' + column must equal -log10(qval) (the relationship runttest derives). 
Strict + p-ascending monotonicity is deliberately NOT asserted: the cummin step-up + only guarantees it in the loop's processing order, and tied p-values can + reorder under an independent re-sort -- a known BH tie subtlety, not a bug.""" + import numpy as np + params, _ = pipeline_result + qdata = pd.read_csv(params.outputdir / (params.filename.stem + '_qvalues.csv'), + sep=',', header=[0]) + qval = qdata['qval'].to_numpy() + assert np.isfinite(qval).all() + assert (qval > 0).all() + np.testing.assert_allclose(qdata['-logq'].to_numpy(), -np.log10(qval), rtol=1e-9) diff --git a/code/tests/test_npatlasupdate.py b/code/tests/test_npatlasupdate.py new file mode 100644 index 0000000..ebfd2de --- /dev/null +++ b/code/tests/test_npatlasupdate.py @@ -0,0 +1,144 @@ +"""Unit tests for the NPAtlas updater (``npatlasupdate.py``). + +The network is never touched: ``download_atlas`` takes an injectable +``opener``, so tests feed canned bytes through a fake response object and +assert the staleness logic, header validation, and atomic-replace behaviour. 
+""" + +import io +import os +import time + +import pytest + +import npatlasupdate as nu + + +# --------------------------------------------------------------------------- # +# fixtures / helpers +# --------------------------------------------------------------------------- # + +VALID_HEADER = ( + 'npaid\tcompound_id\tcompound_name\tcompound_m_plus_h\tcompound_m_plus_na\t' + 'compound_smiles\torigin_type\tgenus\n' +) +VALID_TSV = VALID_HEADER + '1\t0.80_418n\tFoo\t419.1\t441.1\tCCO\tBacterium\tStreptomyces\n' + + +class _FakeResponse(io.BytesIO): + """A BytesIO that also works as a context manager (like urlopen's result).""" + def __enter__(self): + return self + + def __exit__(self, *exc): + self.close() + return False + + +def _opener_returning(content_bytes, record=None): + def opener(url, timeout=None): + if record is not None: + record.append(url) + return _FakeResponse(content_bytes) + return opener + + +# --------------------------------------------------------------------------- # +# staleness +# --------------------------------------------------------------------------- # + +def test_age_is_none_when_missing(tmp_path): + assert nu.atlas_age_days(tmp_path / 'nope.tsv') is None + + +def test_update_due_when_missing(tmp_path): + assert nu.is_update_due(tmp_path / 'nope.tsv') is True + + +def test_update_not_due_for_fresh_file(tmp_path): + p = tmp_path / 'npatlas.tsv' + p.write_text(VALID_TSV) + # Just created -> age ~0 days -> not due. + assert nu.is_update_due(p, max_age_days=30) is False + + +def test_update_due_for_old_file(tmp_path): + p = tmp_path / 'npatlas.tsv' + p.write_text(VALID_TSV) + # Backdate mtime to 45 days ago. 
+ old = time.time() - 45 * 86400 + os.utime(p, (old, old)) + assert nu.is_update_due(p, max_age_days=30) is True + assert nu.atlas_age_days(p) == pytest.approx(45, abs=0.1) + + +# --------------------------------------------------------------------------- # +# header validation +# --------------------------------------------------------------------------- # + +def test_validate_header_accepts_full_header(): + assert nu.validate_tsv_header(VALID_HEADER) is True + + +def test_validate_header_rejects_missing_columns(): + assert nu.validate_tsv_header('npaid\tcompound_id\tgenus\n') is False + + +def test_validate_header_rejects_html_error_page(): + assert nu.validate_tsv_header('') is False + + +# --------------------------------------------------------------------------- # +# download (atomic + validated) +# --------------------------------------------------------------------------- # + +def test_download_writes_validated_tsv(tmp_path): + dest = tmp_path / 'npatlas.tsv' + seen = [] + n = nu.download_atlas(dest, url='http://example/atlas.tsv', + opener=_opener_returning(VALID_TSV.encode(), record=seen)) + assert dest.exists() + assert n == len(VALID_TSV.encode()) + assert nu.validate_tsv_header(dest.read_text().splitlines(keepends=True)[0]) + assert seen == ['http://example/atlas.tsv'] + + +def test_download_overwrites_existing_atlas_atomically(tmp_path): + dest = tmp_path / 'npatlas.tsv' + dest.write_text('OLD CONTENT') + nu.download_atlas(dest, opener=_opener_returning(VALID_TSV.encode())) + assert 'OLD CONTENT' not in dest.read_text() + assert dest.read_text() == VALID_TSV + + +def test_invalid_download_is_rejected_and_existing_atlas_preserved(tmp_path): + dest = tmp_path / 'npatlas.tsv' + dest.write_text(VALID_TSV) # a good existing atlas + bad = b'503 Service Unavailable' + with pytest.raises(ValueError): + nu.download_atlas(dest, opener=_opener_returning(bad)) + # The good atlas must be untouched, and no temp files left behind. 
+ assert dest.read_text() == VALID_TSV + leftovers = [f for f in os.listdir(tmp_path) if f.startswith('.npatlas_')] + assert leftovers == [] + + +def test_empty_download_is_rejected(tmp_path): + dest = tmp_path / 'npatlas.tsv' + dest.write_text(VALID_TSV) + with pytest.raises(ValueError): + nu.download_atlas(dest, opener=_opener_returning(b'')) + assert dest.read_text() == VALID_TSV + + +def test_network_error_leaves_existing_atlas(tmp_path): + dest = tmp_path / 'npatlas.tsv' + dest.write_text(VALID_TSV) + + def failing_opener(url, timeout=None): + raise OSError('connection refused') + + with pytest.raises(OSError): + nu.download_atlas(dest, opener=failing_opener) + assert dest.read_text() == VALID_TSV + assert [f for f in os.listdir(tmp_path) if f.startswith('.npatlas_')] == [] diff --git a/code/tests/test_qualityscore.py b/code/tests/test_qualityscore.py new file mode 100644 index 0000000..817cdcf --- /dev/null +++ b/code/tests/test_qualityscore.py @@ -0,0 +1,157 @@ +"""Unit tests for the CV-plot quality metrics (``qualityscore.py``). + +The key test is a *faithfulness* check: ``_reference_inline`` is a verbatim +copy of the original computation that used to live in ``plotting.prev_cv.plot`` +(before it was extracted into ``qualityscore``), and the extracted function is +asserted to reproduce its numbers exactly -- so the refactor provably did not +change any displayed Reproducibility / Skewness / Overall value. Plus a couple +of sanity checks and a run against the real bundled example dataset's iondict. +""" + +import math +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest + +import qualityscore + +REPO_ROOT = Path(__file__).resolve().parents[2] +EXAMPLE_DIR = REPO_ROOT / 'rawdata' / 'PTY087I2' + + +def _reference_inline(iondict, average_n): + """Verbatim copy of the original prev_cv.plot() computation (pre-extraction), + returning (rep, sumskew, qualscore). 
Do not 'clean up' -- it exists to pin + the extracted module to the exact historical behaviour.""" + iondict = iondict[~np.isnan(iondict['average CV'])] + iondictmean = iondict.sort_values(['average CV']).reset_index() + iondictmed = iondict.sort_values(['median CV']).reset_index() + iondictmean = iondictmean.reset_index() + iondictmed = iondictmed.reset_index() + iondictmean.iloc[:, 0] = 100 * iondictmean.iloc[:, 0] / len(iondictmean['average CV']) + iondictmed.iloc[:, 0] = 100 * iondictmed.iloc[:, 0] / len(iondictmed['median CV']) + modelstdevlist = [1] + [0] * (int(average_n) - 1) + modelstdev = pd.Series(modelstdevlist).std() / pd.Series(modelstdevlist).mean() + prevav = 0 + aucav = 0 + prevmed = 0 + aucmed = 0 + for pos in range(0, len(iondictmean.iloc[:, 0])): + dist = iondictmean.iloc[pos, :]['average CV'] - prevav + aucav += dist * iondictmean.iloc[pos, 0] + prevav = iondictmean.iloc[pos, :]['average CV'] + dist = iondictmed.iloc[pos, :]['median CV'] - prevmed + aucmed += dist * iondictmed.iloc[pos, 0] + prevmed = iondictmed.iloc[pos, :]['median CV'] + sumskew = 0 + if math.isnan(modelstdev): + modelstdev = 1.7 + for val in range(1, int((modelstdev * 100))): + pos = val / 100 + meanav = iondictmean[abs(iondictmean['average CV'] - pos - modelstdev / 200) < modelstdev / 200].iloc[:, 0].mean() + meanmed = iondictmed[abs(iondictmed['average CV'] - pos - modelstdev / 200) < modelstdev / 200].iloc[:, 0].mean() + skew = abs(meanmed - meanav) + if not np.isnan(skew): + sumskew += skew * modelstdev / 100 + sumskew = sumskew / ((aucmed + aucav) / 2) + rep = ((aucmed + aucav) / 2) / (modelstdev * 100) + qualscore = (1 - sumskew) * rep * 100 + return rep, sumskew, qualscore + + +def _synthetic_iondict(n=200, seed=0): + rng = np.random.RandomState(seed) + # Plausible CVs: mostly low, a tail of high ones; median a touch below mean. 
+ avg = np.abs(rng.normal(0.15, 0.08, size=n)) + med = np.clip(avg - np.abs(rng.normal(0.01, 0.01, size=n)), 0, None) + return pd.DataFrame({'average CV': avg, 'median CV': med}, + index=[f'f{i}' for i in range(n)]) + + +# --------------------------------------------------------------------------- # +# noise model +# --------------------------------------------------------------------------- # + +def test_noise_model_cv_matches_count_statistics(): + # [1, 0, 0] -> std/mean of that series. + s = pd.Series([1, 0, 0]) + assert qualityscore.noise_model_cv(3) == pytest.approx(s.std() / s.mean()) + + +def test_noise_model_cv_falls_back_when_undefined(): + # average_n = 1 -> series is just [1] -> std is NaN -> fallback 1.7. + assert qualityscore.noise_model_cv(1) == 1.7 + + +# --------------------------------------------------------------------------- # +# faithful extraction +# --------------------------------------------------------------------------- # + +@pytest.mark.parametrize('average_n', [3, 4, 6]) +def test_matches_original_inline_on_synthetic(average_n): + iondict = _synthetic_iondict() + result = qualityscore.compute_cv_quality(iondict, average_n) + ref_rep, ref_skew, ref_qual = _reference_inline(iondict, average_n) + assert result.rep == pytest.approx(ref_rep) + assert result.sumskew == pytest.approx(ref_skew) + assert result.qualscore == pytest.approx(ref_qual) + + +def test_result_fields_are_finite_and_sensible(): + result = qualityscore.compute_cv_quality(_synthetic_iondict(), average_n=3) + assert np.isfinite(result.rep) and result.rep > 0 + assert np.isfinite(result.sumskew) + assert np.isfinite(result.qualscore) + # The rarefaction curves run from ~0 to 100% of features. 
+ assert result.iondictmean.iloc[:, 0].max() == pytest.approx(100, abs=1) + + +def test_matches_original_inline_on_real_iondict(tmp_path): + """If the example dataset is present, run the real pipeline once and check + the extracted metrics match the original inline computation on the real + iondict.csv (the strongest faithfulness guarantee).""" + csv = EXAMPLE_DIR / '200826_PTY087I2codingdataset.csv' + if not csv.exists(): + pytest.skip('example dataset not present') + + from MSFaST import analysis_parameters, run_MSFaST + from groupsets import GroupSetModel, build_query_dict + + params = analysis_parameters() + params.filename = csv + params.samplelistfilename = EXAMPLE_DIR / 'samplelist.csv' + params.extractmetadatafilename = EXAMPLE_DIR / 'extractmetadata.csv' + params.outputdir = tmp_path / params.filename.stem + params.outputdir.mkdir(parents=True) + params.relfil = True; params.merge = True + params.ringingwin = 0.5; params.isopeakwin = 0.01; params.dimerpeakwin = 0.01 + params.RTwin = 0.005; params.maxisowin = 3 + params.grpave = True; params.prperr = True + params.blnkfltr = True; params.blnkgrp = 'Blanks'; params.blankfilthresh = 0.01 + params.CVfil = True; params.cvthresh = 0.2; params.cvparam = 'median CV' + params.decon = True; params.deconthresh = 0.95 + params.FC = False; params.Ttest = False + params.statstgrps = ['250um_Ce', '0um_Ce'] + params.graphfilters = ['cv', 'rel', 'insource'] + params.MZRTplt = params.FC3Dplt = params.KMD = False + params.PCA = params.Dendrogram = params.Volcanoplt = False + model = GroupSetModel() + model.add('Features not in blanks', all_groups=['Blanks', 'Media', '0um_Ce', '250um_Ce']) + model.update(0, src=['Media', '0um_Ce', '250um_Ce'], excl=['Blanks']) + params.querydict = build_query_dict(model, params.graphfilters) + params.querylist = list(params.querydict.keys()) + run_MSFaST(params) + + iondict = pd.read_csv(params.outputdir / 'iondict.csv', header=0, index_col=0) + header = pd.read_csv(params.outputdir / 
(params.filename.stem + '_filtered.csv'), + sep=',', header=None, index_col=[0, 1, 2]).iloc[:3, :].transpose() + header.columns = ['Biolgroup', 'Sample', 'Injection'] + average_n = header['Injection'].nunique() / header['Sample'].nunique() + + result = qualityscore.compute_cv_quality(iondict, average_n) + ref_rep, ref_skew, ref_qual = _reference_inline(iondict, average_n) + assert result.qualscore == pytest.approx(ref_qual) + assert result.rep == pytest.approx(ref_rep) + assert result.sumskew == pytest.approx(ref_skew) diff --git a/code/tests/test_translators.py b/code/tests/test_translators.py index c3c7641..71a28c8 100644 --- a/code/tests/test_translators.py +++ b/code/tests/test_translators.py @@ -200,3 +200,21 @@ def test_parse_real_fragment_files(name): entries = t.parse_fragments(path) assert len(entries) > 0 assert all(e.mz is not None for e in entries) + + +def test_reindex_real_progenesis_msp_against_real_peaktable(tmp_path): + """End-to-end reindex on the bundled Progenesis MSP + peak table -- the + compound-id fast path should match (nearly) every entry, and the output + must be parseable and renumbered into ascending row order.""" + pk = REPO_ROOT / 'progenesis.csv' + msp = REPO_ROOT / 'progenesis.msp' + if not (pk.exists() and msp.exists()): + pytest.skip('progenesis example files not present') + out = tmp_path / 'reindexed.msp' + n = t.reindex_fragments(pk, msp, out) + assert n > 0 + # Output parses back to exactly the matched count, with assigned scan numbers. + reparsed = t.parse_msp(out) + assert len(reparsed) == n + text = out.read_text() + assert 'SCANNUMBER:' in text diff --git a/devnotes.md b/devnotes.md index 29d3e71..adbd9b8 100644 --- a/devnotes.md +++ b/devnotes.md @@ -746,27 +746,17 @@ is now **159 passing** (the count above is stale). 
### Findings NOT changed (need a decision or live-GUI validation) -- **The "Or Groups" Plot-Feature-Set control is functionally inert.** The - groupset editor has three lists — And (`listWidget_andgrps` -> `incl`), - Or (`listWidget_orgrps` -> `src`), Exclude (`listWidget_allgrps` -> - `excl`). `src` is edited, persisted to `.mpct`, and joined into the - descriptive name, but **`MSFaST.groupset.__init__` only filters on `incl` - and `excl` — it never applies `src`.** So a user can add groups to the - "Or" list and it silently changes nothing about which features are - selected/coloured. This is the most significant finding. It's *not* fixed - here because implementing the "feature present in at least one of `src`" - semantics changes which features plot — and `enumerate_inputs`'s default - "Features not in blanks" groupset already populates `src` with every - non-blank group, so turning `src` on would retroactively add a filter to - the default view. Needs the GUI run against real data to validate. Sketch - of the fix (in `groupset.__init__`, after the `excl`/`incl` passes): - ```python - if self.src: - pattern = '|'.join(' ' + str(g) for g in self.src) # leading-space convention - iondict = iondict.loc[iondict['groups'].str.contains(pattern), 'groups'].to_frame() - ``` - Decide first whether "Or" should be an independent constraint or whether - the default groupset should stop pre-filling `src`. +- **The "Or Groups" (`src`) control not being applied is intended, NOT a + bug** (confirmed by the developer, 2026-06-30). The groupset editor has + three lists — And (`listWidget_andgrps` -> `incl`, feature must be in all), + Exclude (`listWidget_allgrps` -> `excl`, feature must not be in any), and + Or (`listWidget_orgrps` -> `src`, the groups a feature is *allowed* to + appear in). 
`MSFaST.groupset.__init__` deliberately filters only on `incl` + and `excl`: a feature that already satisfies And/Exclude is a member of the + groupset, and `src` ("allowed in") by design doesn't further remove it, so + there's nothing for `src` to do at filter time. This matches the observed + behaviour. (Earlier in this review pass it was mis-flagged as an inert + control — that was wrong; leaving the note here so it isn't re-flagged.) - **`mspwriter.convert_to_msp` num-peaks loop is fragile.** `for frags in sources: numpeaks = len(frags)` overwrites rather than accumulates, and assumes `sources` is a list-of-one-list. It happens to be correct for the @@ -817,22 +807,32 @@ filling when convenient (all Qt-free, so headless-testable): - `stats.runfc`/`runttest` numeric outputs (FC clamping, q-value monotonicity) against a tiny synthetic `iondict.csv`. +**Update (2026-06-30, second pass):** all four gaps above are now filled — +`tests/test_translators_e2e.py`, `tests/test_getfragdb.py`, +`tests/test_msfast_grpave_off.py`, `tests/test_stats_numeric.py`. Plus the +three new subsystems below ship with their own Qt-free tests +(`test_npatlasupdate.py`, `test_mpactupdate.py`, `test_crashreport.py`). + ## Future feature dev plan (post-review, 2026-06-30) Candidate features, ordered roughly by value-to-effort. None started; all need the GUI runnable against real data to validate. Several already appear in `main.py`'s TODO block — this is the triaged version. -1. **Wire up the "Or Groups" groupset constraint** (see finding above). - Smallest, highest-impact correctness item — a visible UI control that - currently does nothing. Backend change is a few lines; the work is - deciding the default-groupset interaction and validating in the GUI. -2. **Data-quality score / summary** (TODO: "overall data quality score, AUC - on CV plot"). 
The pieces already exist (`average CV`/`median CV` columns - in `iondict.csv`, per-group RSDs in `_summarydata.csv`, the dendrogram - purity `n_pure/n_total` summary). A single headline QC number + a small - summary panel could be assembled Qt-free in a new `qualityscore.py` - module (testable) and surfaced on the Data Review tab. +1. ~~Wire up the "Or Groups" groupset constraint~~ — **withdrawn**: not a + bug, the `src` "allowed in" semantics are intended (see finding above). +2. ~~Data-quality score / summary~~ — **partially done.** The score the TODO + asked for already existed (Reproducibility / Skewness / Overall, from the + AUC of the CV rarefaction curve), but the math was buried untested inside + `plotting.prev_cv.plot()`. Extracted verbatim into the Qt-free, unit-tested + `qualityscore.py` (`compute_cv_quality`), pinned against a copy of the + original by `tests/test_qualityscore.py` so no displayed number changed; + `prev_cv` is now a thin draw-the-result wrapper. (Also fixed a latent pandas + FutureWarning in the extracted percentage assignment.) Remaining/optional: + surface the score outside the CV tab (e.g. Data Review summary), and fold in + the other available signals (per-group RSDs from `_summarydata.csv`, the + dendrogram purity `n_pure/n_total`) if a richer composite is wanted -- both + are scientific-design calls to make with the lab, not coded blind. 3. **OPLS-DA ordination method** (next item after the PCA/NMDS/PLS-DA rework, already deferred — see "Multivariate ordination plot"). Needs either the unmaintained `pyopls` or a from-scratch OSC implementation plus a @@ -852,3 +852,96 @@ in `main.py`'s TODO block — this is the triaged version. 7. **Specificity/sensitivity & comparison-mode plots** (TODO, "likely items that need more thought"). Larger scientific-design questions; needs spec work with the lab before implementation. 
+ +## New subsystems (2026-06-30, second pass) + +Three new Qt-free, unit-tested modules plus thin GUI wiring in `main.py`. The +cores are fully testable headlessly (network/git/dialog all injected); the +GUI wiring (`MainWindow._run_startup_checks`/`_check_atlas_freshness`/ +`_check_app_update` and the `__main__` crash-dialog) is the only part that +needs a live launch to verify. **No new hard dependencies** — all three use +only the stdlib (`urllib`, `json`, `subprocess`, `webbrowser`, `platform`) +plus `packaging` (already present, with a tuple-comparison fallback), so +`requirements.txt`/the portable build are unaffected. + +### NPAtlas auto-updater (`npatlasupdate.py`, `tests/test_npatlasupdate.py`) + +On startup (deferred via `QTimer.singleShot` so the window paints first), if +`npatlas.tsv` is missing or its mtime is > 30 days old, the user is asked +whether to re-download it from +`https://www.npatlas.org/static/downloads/NPAtlas_download.tsv`. The download +streams to a temp file, is **validated** (header must contain the columns the +app uses — `compound_id`/`compound_m_plus_h`/`compound_m_plus_na`/ +`compound_smiles`/`origin_type`/`genus`) and only then `os.replace`-d over the +existing file, so a server error page / partial transfer / network drop can +never clobber a working atlas. + +- **Format decision (asked: would changing format help?): no — stay on TSV.** + `main.py` reads the atlas with `pd.read_csv(sep='\t')` and `dbsearch` keys + off the specific columns above; the published `NPAtlas_download.tsv` already + has exactly those, so it's a drop-in. The `NPAtlas_download.json` is the + same data in a nested shape that would need flattening before pandas/dbsearch + could touch it — pure cost, no benefit. The `.json` URL is recorded in the + module (`DEFAULT_JSON_URL`) only for completeness. 
+- **Refactor evaluation (asked): minimal and not needed now.** `dbsearch.py` + is already the clean Qt-free matcher; the only related cleanup is that the + atlas read in `main.py:enumerate_inputs` (`pd.read_csv('npatlas.tsv', ...)`) + is hardcoded to that filename/cwd — the updater writes to the same path, so + no change required. If a second database is added later (HMDB etc., dev-plan + item 5), factor the atlas load + column-name mapping into a small loader then. +- **Threading caveat:** the 33 MB download currently runs on the main thread + behind a wait cursor. It's a user-confirmed, infrequent (>30-day-gated) + action so blocking briefly is acceptable, but moving it onto a `QThread` + worker (like `AnalysisWorker`) is the obvious future improvement — left out + here because GUI threading can't be verified headlessly. + +### MPACT self-update checker (`mpactupdate.py`, `tests/test_mpactupdate.py`) + +On startup, queries the GitHub Releases API for the configured repo +(`robertsamples/mpact` by default — Robert's fork), compares the latest +published release tag against the running version (`__version__`, kept in +`mpactupdate.py`; **keep it in sync with `main.py`'s `label_credits`** string, +currently `v1.00.01` -> `__version__ = '1.0.1'`), and if newer offers a +`git pull --ff-only` update (with a "please restart" prompt on success, or +opens the release page on failure). Version compare uses `packaging.version` +(PEP 440, numeric — so 2.10 > 2.9) with a dotted-int fallback; an unparseable +tag is treated as "not newer" (never nags). Every failure mode (offline, no +releases yet/404, malformed JSON, no git) is non-fatal and silent. + +- **Updater-framework evaluation (asked): no off-the-shelf framework.** The + standard option, `pyupdater`, targets *frozen* PyInstaller/cx_Freeze apps + and needs its own patch-server + signing setup — heavyweight for a tool run + from a git clone. 
For a source checkout the meaningful update is `git pull`, + and "is there a newer release" is one API call + a version compare, which is + all this module is. **Action needed from you:** tag releases on the fork + (e.g. `v1.0.1`) and bump `__version__` per release, or this finds nothing. +- For the *portable PyInstaller build* (no git), `apply_git_update` will fail + gracefully and the user is sent to the release page to download manually — + a real auto-updater for the frozen build is the `pyupdater`-shaped project + to consider only if/when that distribution channel matters. + +### Crash / error reporter (`crashreport.py`, `tests/test_crashreport.py`) + +Installs a `sys.excepthook` (after `QApplication` exists) that, on any +unhandled exception: chains to the default hook (traceback still hits the +console), formats a full report (traceback + MPACT/Python/platform versions + +timestamp + optional context), writes it to a timestamped file under +`~/.mpact/crashlogs/`, and shows a dialog offering to open a **prefilled +GitHub issue** (title + fenced traceback body) in the browser. Nothing is sent +without the user clicking through. The excepthook is hardened to never raise. + +- **Crash-logger-framework evaluation (asked): Sentry is the off-the-shelf + option, deliberately not used.** `sentry-sdk` is built for hosted/web + services: it sends events to a Sentry project by default (silent cloud + egress — wrong default for a desktop research tool), needs a DSN/account + provisioned, and *still* needs a custom `before_send` hook + dialog to honour + "ask the user first." The local-log + prefilled-GitHub-issue flow gives the + maintainer the same thing (a complete traceback) with zero infrastructure + and no privacy surprise. If MPACT later ships to many non-technical users and + a central error feed becomes worthwhile, Sentry with `before_send` gating is + the documented upgrade path (noted in `crashreport.py`). 
+- **PyQt5 note to verify live:** PyQt5 routes unhandled exceptions raised + inside Qt slots through `sys.excepthook` (then may abort), so this should + catch most in-GUI crashes — but the exact abort-after-hook behaviour is + PyQt5-version-dependent and is the one thing to confirm by actually + triggering an error in the running app. From 698af6f936522055bc6e491ff8ff7bf0004f5f7c Mon Sep 17 00:00:00 2001 From: Robert Samples Date: Tue, 30 Jun 2026 10:00:34 -0400 Subject: [PATCH 19/20] efficiency updates changes made to database searching to improve search speed, gitignore cleaned up --- .gitignore | 448 +----------------------------------- code/dbsearch.py | 59 ++++- code/dialogs.py | 111 +++++++++ code/main.py | 35 ++- code/mpactupdate.py | 2 +- code/qualityscore.py | 25 +- code/tests/test_dbsearch.py | 41 ++++ code/tests/test_dialogs.py | 68 ++++++ devnotes.md | 95 +++++++- docs/installation.md | 2 +- 10 files changed, 386 insertions(+), 500 deletions(-) create mode 100644 code/dialogs.py create mode 100644 code/tests/test_dialogs.py diff --git a/.gitignore b/.gitignore index 3b26373..4bc3623 100644 --- a/.gitignore +++ b/.gitignore @@ -26,459 +26,17 @@ code/__pycache__/files.cpython-39.pyc code/__pycache__/files_rc.cpython-38.pyc code/__pycache__/files_rc.cpython-39.pyc code/__pycache__/main.cpython-38.pyc -code/__pycache__/main.cpython-39.pyc code/__pycache__/plotting.cpython-38.pyc -code/__pycache__/plotting.cpython-39.pyc -code/__pycache__/stats.cpython-39.pyc code/__pycache__/ui_featureinfo.cpython-38.pyc code/__pycache__/ui_featureinfo.cpython-39.pyc code/__pycache__/ui_functions.cpython-38.pyc code/__pycache__/ui_main.cpython-38.pyc code/__pycache__/ui_main.cpython-39.pyc code/__pycache__/ui_plotparam.cpython-39.pyc -code/compoundimages/(+)-5(6),13-halimadiene-15-ol.png -code/compoundimages/(+)-caryolan-1-ol.png -code/compoundimages/(+)-discoipyrrole A.png -code/compoundimages/(-)-7-Geranylindolactam V.png 
-code/compoundimages/(-)-Neoverrucosan-5beta-ol.png -code/compoundimages/(-)-Verrucosan-2beta-ol.png -code/compoundimages/(1S,3aR)-jadomycin V.png -code/compoundimages/(2E,4E)-7-methylocta-2,4-dienoic acid amide.png -code/compoundimages/(2S,3S)-3-Hydroxy-1,4-diphenylbutan-2-yl-acetate.png -code/compoundimages/(5R) 5-hydroxy-3-\[\[2-(4-hydroxyphenyl)ethyl\]amino\]-5-vinyl-2-cyclopenten-1-one.png -code/compoundimages/(5S,S)-5-methyl-3-(2-methylbutyl)furan-2(5H)-one.png -code/compoundimages/(5S,S)-5-methyl-3-(3-methylpentyl)furan-2(5H)-one.png -code/compoundimages/(E)-12-methyltridec-3-enenitrile.png -code/compoundimages/(E)-ethyl 8-oxooctadec-9-enoate.png -code/compoundimages/(E)-tetradec-3-enenitrile.png -code/compoundimages/(S)-N-tetradecanoyl-HSL.png -code/compoundimages/(Z)-12-methyltridec-3-enenitrile.png -code/compoundimages/(Z)-15-Methylhexadec-10-en-2-one.png -code/compoundimages/(Z)-octadec-11-enenitrile.png -code/compoundimages/(Z)-tetradec-3-enenitrile.png -code/compoundimages/(Z)-tetradec-7-enenitrile.png -code/compoundimages/-L-glutamyl-L-leucine.png -code/compoundimages/1 '- (beta- Glucopyranosyloxy) di- O-demethylspirilloxan thin.png -code/compoundimages/1'-beta-glucopyranosyl-3,4,3',4'-tetradehydro-1', 2'-dihydro-beta,psi-caroten-2-one.png -code/compoundimages/1,4-dihydroxy-2,5-dimethoxy-9,10-anthraquinone.png -code/compoundimages/1-hydroxymethylindole-3-carboxylic acid.png -code/compoundimages/1-methyl-pseudouridine.png -code/compoundimages/1-methylthio-2,3-di-O-(3',7',11',15'-tetramethylhexadecyl)glycerol (diphytanylglyceryl methylthioether).png -code/compoundimages/10,15-dihydroxyamorph-4-en-3-one.png -code/compoundimages/12-Deoxy-deoxysaxitoxin.png -code/compoundimages/12-hydroxy-13-butoxyethoxyfumitremorgin B.png -code/compoundimages/16-methyloxazolomycin.png -code/compoundimages/17-Methylenespiramycin.png -code/compoundimages/17-O-ethylnotoamide M.png -code/compoundimages/18-methyltacrolimus.png 
-code/compoundimages/2-(2-carboxyethyl)-8-hydroxyquinazolin-4(3H)-one.png -code/compoundimages/2-(furan-2-yl)-6-(2S,3S,4-trihydroxybutyl)pyrazine.png -code/compoundimages/2-(Heptadecyl)-3,6-dihydroxy-1,4-benzoquinone.png -code/compoundimages/2-amino-4-methoxy-5-cyanopyrrolo\[2,3-d\]pyrimidine.png -code/compoundimages/2-amino-6-hydroxyphenoxazin-3-one.png -code/compoundimages/2-Demethylmonensin B.png -code/compoundimages/2-ethyl-7-hydroxy-6,7-dihydro-5H-indolizin-3-one.png -code/compoundimages/23-(6-methyl)heptanoic acid demalonylazalomycin F3a ester.png -code/compoundimages/3-hydroxydehydrodaidzein.png -code/compoundimages/3-Isobutylpropanamide-2-cyclopenten-1-one.png -code/compoundimages/3-N-formyl- holyrine A.png -code/compoundimages/4,5-dihydroxy-7-methylphthalide.png -code/compoundimages/4-\[2-O-9Z-hexadecenoyl--glucopyranosyl\]-4,4-diapolycopene-4,4-dioic acid.png -code/compoundimages/4-desmethylepothilone D.png -code/compoundimages/4-methoxy-3H-isobenzofuran-1-one.png -code/compoundimages/4-quinolinecarboxylic acid.png -code/compoundimages/41-Demethylhomooligomycin B.png -code/compoundimages/5'-deoxyguanosine.png -code/compoundimages/5,18-dedihydroxycyclooctatin.png -code/compoundimages/5,7,3',4'-Tetrahydroxy-8-methylisoflavon.png -code/compoundimages/5-Hydroxy-3-(1-hydroxy-2-methylbutyl)-4-methyl-2(5H)-furanone.png -code/compoundimages/5-hydroxydeoxyvasicinone.png -code/compoundimages/6-acetylphenazine-1-carboxylic acid.png -code/compoundimages/6-deoxyerythronolide B.png -code/compoundimages/6-Hydroxysordarin.png -code/compoundimages/7-Hydroxy-8,16-dimethyl-9-octadecenoic acid.png -code/compoundimages/7-Tetradecenoic acid.png -code/compoundimages/8,9-dihydrolactimidomycin.png -code/compoundimages/8-desmethoxy-isomigrastatin.png -code/compoundimages/8-hydroxy-8,9-dihydrolactidomycin.png -code/compoundimages/\[D-Asp, Dhb^7\]microcystin-LR.png -code/compoundimages/\[D-Asp3,Ser7\]MC-LR.png -code/compoundimages/Abenquine B2.png -code/compoundimages/Abenquine 
C.png -code/compoundimages/Abyssomicin C.png -code/compoundimages/Abyssomicin P.png -code/compoundimages/Acidiphilamide C.png -code/compoundimages/Actiketal.png -code/compoundimages/Actinoallolide D.png -code/compoundimages/Actinoramide E.png -code/compoundimages/AHB-6-Methylneamine.png -code/compoundimages/AI-77-F.png -code/compoundimages/Albatrelin F.png -code/compoundimages/Albogrisin B.png -code/compoundimages/Albucyclone A.png -code/compoundimages/Albumycin.png -code/compoundimages/Aldgamycin E.png -code/compoundimages/Aldgamycin I.png -code/compoundimages/Aldgamycin K.png -code/compoundimages/Alokicenone C.png -code/compoundimages/AM-2604 A.png -code/compoundimages/Amphibactin T.png -code/compoundimages/Amycolatopsin C.png -code/compoundimages/Ananstrep C.png -code/compoundimages/AnhydroSEK4b.png -code/compoundimages/Antarlide F.png -code/compoundimages/Antascomicin B.png -code/compoundimages/Antascomicin E.png -code/compoundimages/Antillatoxin.png -code/compoundimages/Aquayamycin.png -code/compoundimages/Argimicin B.png -code/compoundimages/Arthripenoid B.png -code/compoundimages/Ashimide B.png -code/compoundimages/Aspergilone A.png -code/compoundimages/Asterobactin B.png -code/compoundimages/Azicemicin A.png -code/compoundimages/Bacillamidin G.png -code/compoundimages/Bacilysocin.png -code/compoundimages/Bafilomycin C2.png -code/compoundimages/Bafilomycin G.png -code/compoundimages/Balgacyclamide C.png -code/compoundimages/Bananamide 2.png -code/compoundimages/Banegasine.png -code/compoundimages/Bartoloside H.png -code/compoundimages/Bartoloside I.png -code/compoundimages/BE-10988.png -code/compoundimages/BE-14106.png -code/compoundimages/BE-32030A.png -code/compoundimages/Benzastatin A.png -code/compoundimages/Biphenomycin C.png -code/compoundimages/Biseokeaniamide A.png -code/compoundimages/Blastmycetin C.png -code/compoundimages/Brasilibactin A.png -code/compoundimages/Brasiliquinone C.png -code/compoundimages/Brintonamide A.png 
-code/compoundimages/Brintonamide B.png -code/compoundimages/Caerulomycin G.png -code/compoundimages/Caldorin.png -code/compoundimages/Carboxymycobactin-7.png -code/compoundimages/Cepafungin I.png -code/compoundimages/Cephamycin C.png -code/compoundimages/Chaiyaphumine D.png -code/compoundimages/Chejuenolide A.png -code/compoundimages/Chlorotonil B.png -code/compoundimages/Chrondamide 12D.png -code/compoundimages/Circumdatin D.png -code/compoundimages/Cis-7-tetradecenoyl-D-asparagine.png -code/compoundimages/Citreamicin alpha.png -code/compoundimages/Citreo-g-pyrone.png -code/compoundimages/Clavirolide A.png -code/compoundimages/Coibacin C.png -code/compoundimages/Columbamide B.png -code/compoundimages/Concanamycin B.png -code/compoundimages/Conglobatin.png -code/compoundimages/Coronafacoyl-L-isoleucine.png -code/compoundimages/Coronatine.png -code/compoundimages/Cosmomycin A.png -code/compoundimages/Crocagin A.png -code/compoundimages/Cryptophycin-16.png -code/compoundimages/Cryptophycin-38.png -code/compoundimages/Cyanopeptolin 920.png -code/compoundimages/Cyclo(D)-Pro-(D)-Leu.png -code/compoundimages/Cyclo-(L-Ala-L-Tyr).png -code/compoundimages/Cyclodysidin D.png -code/compoundimages/Cylindrocyclophane C2.png -code/compoundimages/Daryamide E.png -code/compoundimages/Defumarylhygrolidin.png -code/compoundimages/Dehydro tilivalline.png -code/compoundimages/Dehydroxynocardamine.png -code/compoundimages/Demethylblasticidin S.png -code/compoundimages/Deoxynybomycin.png -code/compoundimages/Desferrioxamine X4.png -code/compoundimages/Desotamide A.png -code/compoundimages/Desotamide C.png -code/compoundimages/Desotamide G.png -code/compoundimages/Diaphorin.png -code/compoundimages/Dietziamide A.png -code/compoundimages/Dihydromaltophilin.png -code/compoundimages/Dioxolide A.png -code/compoundimages/Diploptene.png -code/compoundimages/DKxanthene 574.png -code/compoundimages/Dokdolipid B.png -code/compoundimages/Dolastatin 10.png -code/compoundimages/Dragonamide D.png 
-code/compoundimages/Eicosanedioic acid.png -code/compoundimages/Emericellamide A.png -code/compoundimages/Enniatin L.png -code/compoundimages/Enniatin M1.png -code/compoundimages/Epohelmin B.png -code/compoundimages/Eponemycin.png -code/compoundimages/Epothilone D.png -code/compoundimages/Epothilone D1.png -code/compoundimages/Epothilone I1.png -code/compoundimages/Erythromycin G.png -code/compoundimages/ethyl homononactyl homononactate.png -code/compoundimages/Etrogol.png -code/compoundimages/Eurystatin C.png -code/compoundimages/F-Met I.png -code/compoundimages/Flexirubin.png -code/compoundimages/Fluvirucin B2.png -code/compoundimages/Fluvirucin B6.png -code/compoundimages/Fontonamide.png -code/compoundimages/Formicamycin D.png -code/compoundimages/FR-66979.png -code/compoundimages/FR-900848.png -code/compoundimages/Frenolicin G.png -code/compoundimages/Fumaquinone.png -code/compoundimages/Furaquinocin B.png -code/compoundimages/Furaquinocin D.png -code/compoundimages/Fusaricidin D.png -code/compoundimages/Geralcin E.png -code/compoundimages/GGL.3.png -code/compoundimages/Glidobactin C.png -code/compoundimages/Gln-Asp-Val-Leu.png -code/compoundimages/Glomecidin.png -code/compoundimages/Glycocinnasperimicin D.png -code/compoundimages/Gobichelin B.png -code/compoundimages/Griselimycin.png -code/compoundimages/Guineamide C.png -code/compoundimages/H2-6-Hydroxymethylpterin.png -code/compoundimages/Halstoctacosanolide B.png -code/compoundimages/Hapalindole D.png -code/compoundimages/Heliomycin.png -code/compoundimages/Hexadecanenitrile.png -code/compoundimages/Hexose-palythine-serine.png -code/compoundimages/Homorapamycin A.png -code/compoundimages/Hoshinolactam.png -code/compoundimages/IC-202-A.png -code/compoundimages/Ilanefuranone.png -code/compoundimages/Indigoidine.png -code/compoundimages/Indisocin.png -code/compoundimages/Indole-3-acetic acid methyl ester.png -code/compoundimages/Inonotusic acid.png -code/compoundimages/Iromycin C.png 
-code/compoundimages/Isobongkrekic acid,.png -code/compoundimages/Isobutyrylvalindomycin.png -code/compoundimages/Isomalyngamide A.png -code/compoundimages/Isorhizopodin.png -code/compoundimages/Isotuberculosino.png -code/compoundimages/Izenamide B.png -code/compoundimages/JBIR-05.png -code/compoundimages/JBIR-80.png -code/compoundimages/Jomthonic acid E.png -code/compoundimages/Juglomycin I.png -code/compoundimages/Kalafungin.png -code/compoundimages/Kalimantacin B.png -code/compoundimages/Kandenol A.png -code/compoundimages/Kijimicin.png -code/compoundimages/Koreenceine A.png -code/compoundimages/Koreenceine B.png -code/compoundimages/Koreenceine C.png -code/compoundimages/Korormicin K.png -code/compoundimages/Kribelloside A.png -code/compoundimages/Kribelloside B.png -code/compoundimages/L--(3-hydroxyureido)-alanine.png -code/compoundimages/Lactoquinomycin.png -code/compoundimages/Lagunamide B.png -code/compoundimages/Landomycin A.png -code/compoundimages/Landomycin S.png -code/compoundimages/Lentzeoside E.png -code/compoundimages/Leptofuranin D.png -code/compoundimages/Leualacin G.png -code/compoundimages/Leupyrrin B2.png -code/compoundimages/Leuseramycin.png -code/compoundimages/Lipoamide C.png -code/compoundimages/Lipstatin.png -code/compoundimages/Lobarialide C.png -code/compoundimages/Lobosamide C.png -code/compoundimages/Lodopyridone.png -code/compoundimages/Luminmide B.png -code/compoundimages/Lutoside.png -code/compoundimages/Lyngbyatoxin A.png -code/compoundimages/Maculalactone K.png -code/compoundimages/Maculalactone M.png -code/compoundimages/Mandelalide A.png -code/compoundimages/Mansouramycin D.png -code/compoundimages/Maremycin D2.png -code/compoundimages/Marformycin D.png -code/compoundimages/Maridomycin III.png -code/compoundimages/Marinactinone A.png -code/compoundimages/Marinobactin-D1.png -code/compoundimages/Martinomycin.png -code/compoundimages/Matlystatin A.png -code/compoundimages/Mer-WF3010.png -code/compoundimages/Metacridamide A.png 
-code/compoundimages/methyl 1-(methyl propionate)--carboline-3-carboxylate.png -code/compoundimages/Microginin 576.png -code/compoundimages/Microginin 91-A.png -code/compoundimages/Microginin FR9.png -code/compoundimages/Microtermolide A.png -code/compoundimages/Milbemycin 10.png -code/compoundimages/Milbemycin 26.png -code/compoundimages/Minutissamide J.png -code/compoundimages/MKN-004C.png -code/compoundimages/Mohangic acid A.png -code/compoundimages/Mohangic acid E.png -code/compoundimages/Monactin.png -code/compoundimages/Mupirocin F.png -code/compoundimages/Mutaxanthene B.png -code/compoundimages/Mycemycin A.png -code/compoundimages/Mycemycin E.png -code/compoundimages/Myxochromide S3.png -code/compoundimages/Myxopyronin B.png -code/compoundimages/Myxotyroside B.png -code/compoundimages/N,N'-diisobutylurea.png -code/compoundimages/N-Acetyl-tyramine.png -code/compoundimages/N-carboxamido-staurosporine.png -code/compoundimages/N-methylphloretamide.png -code/compoundimages/N-Tetradecadienoyl-L-homoserine lactone.png -code/compoundimages/Nai414-B.png -code/compoundimages/Namalide C.png -code/compoundimages/Namalide E.png -code/compoundimages/Naphthacemycin B3.png -code/compoundimages/Naphthgeranine A.png -code/compoundimages/Neomacrophorin III.png -code/compoundimages/Nevaltophin A.png -code/compoundimages/Nigerapyrone H.png -code/compoundimages/Nitrosoxacin C.png -code/compoundimages/Nocapyrone R.png -code/compoundimages/Nocardichelin B.png -code/compoundimages/Nocardiopyrone A.png -code/compoundimages/Nostopeptolide A3.png -code/compoundimages/Nostopeptolide L3.png -code/compoundimages/Nostophycin.png -code/compoundimages/Not named.png -code/compoundimages/NP-101A.png -code/compoundimages/NW-G03.png -code/compoundimages/Obscurolide-C2 methyl ester.png -code/compoundimages/Octacyclomycin.png -code/compoundimages/Odyverdiene B.png -code/compoundimages/Okaramine H.png -code/compoundimages/Oryzamide B.png -code/compoundimages/Oscillamide B.png 
-code/compoundimages/Oscillatoxin E.png -code/compoundimages/Oxepinamide C.png -code/compoundimages/Palmyrrolinone.png -code/compoundimages/Panclicin E.png -code/compoundimages/Panosialin C.png -code/compoundimages/Paulomycin E.png -code/compoundimages/PD-118576-A3.png -code/compoundimages/Pepsatin Pr.png -code/compoundimages/Pestabacillin B.png -code/compoundimages/Phe+CO\[Lys+Val+Leu+MeHty+MetO\].png -code/compoundimages/Phenalinolactone A.png -code/compoundimages/Phenoxan.png -code/compoundimages/Phenylbutenote.png -code/compoundimages/Phenylnannolone A.png -code/compoundimages/Phenylnannolone C.png -code/compoundimages/Photopyrone A.png -code/compoundimages/Phototemtide A.png -code/compoundimages/Piericidin B5.png -code/compoundimages/Pimprinol A.png -code/compoundimages/Planktocyclin.png -code/compoundimages/Planktopeptin BL843.png -code/compoundimages/PM-toxin B.png -code/compoundimages/Porpoisamide B.png -code/compoundimages/Poststatin.png -code/compoundimages/Pseudoaeruginosin NS1.png -code/compoundimages/Pseudodestruxin A.png -code/compoundimages/Psi-tectorigenin.png -code/compoundimages/Psuedodestruxin C.png -code/compoundimages/Pukeleimide E.png -code/compoundimages/Pulicatin A.png -code/compoundimages/Pyonitrin A.png -code/compoundimages/Pyridindolol K2.png -code/compoundimages/Pyridinopyrone C.png -code/compoundimages/Pyrroindomycin B.png -code/compoundimages/Qinimycin C.png -code/compoundimages/Quinolobactin.png -code/compoundimages/Rakicidin B.png -code/compoundimages/Ralfuranone B.png -code/compoundimages/Ralfuranone I.png -code/compoundimages/Ralstonin B.png -code/compoundimages/Rhabdopeptide 8.png -code/compoundimages/Rhizomide B.png -code/compoundimages/RHM2.png -code/compoundimages/Rhodopeptin C1.png -code/compoundimages/Rhodopeptin C4.png -code/compoundimages/Ribocyclophane C.png -code/compoundimages/Rifamycin Z.png -code/compoundimages/RK-144171.png -code/compoundimages/Roquefortine A.png -code/compoundimages/Roseobacticide H.png 
-code/compoundimages/Salinipostin D.png -code/compoundimages/Salinosporamide I.png -code/compoundimages/Sanglifehrin C.png -code/compoundimages/Sarpeptin B.png -code/compoundimages/SCH 38518.png -code/compoundimages/Sch 39185.png -code/compoundimages/Sclerolizine.png -code/compoundimages/Semiplenamide F.png -code/compoundimages/Serinolamides D.png -code/compoundimages/Serratamolide C.png -code/compoundimages/Serratamolide D.png -code/compoundimages/Serratamolide E.png -code/compoundimages/SF-1902-A3.png -code/compoundimages/SF-1902-A4b.png -code/compoundimages/SF-2140.png -code/compoundimages/SF2738C.png -code/compoundimages/Shikometabolin A.png -code/compoundimages/Siastatin B.png -code/compoundimages/Silalthride.png -code/compoundimages/Sordarin-1-glucose ester.png -code/compoundimages/Spliceostatin E.png -code/compoundimages/Spongiporic acid A.png -code/compoundimages/Sporminarin B.png -code/compoundimages/Stoloniferone L.png -code/compoundimages/Strepantibin C.png -code/compoundimages/Streptoaminal-8n.png -code/compoundimages/Streptoaminal9n.png -code/compoundimages/Streptofactin.png -code/compoundimages/Streptoone C.png -code/compoundimages/Streptovirudin D1.png -code/compoundimages/Strevertene B.png -code/compoundimages/Syringolin G.png -code/compoundimages/T1801 A.png -code/compoundimages/Tasipeptin A.png -code/compoundimages/Tasipeptin B.png -code/compoundimages/Tautomycin.png -code/compoundimages/Teixobactin.png -code/compoundimages/Tenacibactin A.png -code/compoundimages/Tenacibactin B.png -code/compoundimages/Terresterol.png -code/compoundimages/Tetradecanenitrile.png -code/compoundimages/Tetronomycin.png -code/compoundimages/Thailandamide lactone.png -code/compoundimages/Thaxteramide A2.png -code/compoundimages/Thaxtomin B.png -code/compoundimages/Tjipanazole C1.png -code/compoundimages/Trichophycin C.png -code/compoundimages/Triedimycin A.png -code/compoundimages/Trierixin.png -code/compoundimages/Tyromycic acid G.png -code/compoundimages/U-77864.png 
-code/compoundimages/UK-78629.png -code/compoundimages/Unguisin E.png -code/compoundimages/USF-142A.png -code/compoundimages/Uvidin-A ester 2a.png -code/compoundimages/Violapyrone B.png -code/compoundimages/Violapyrone E.png -code/compoundimages/VLP T.png -code/compoundimages/Wortmanamide B.png -code/compoundimages/Xanthocillin-X dimethylether.png -code/compoundimages/Xefoampeptide C.png -code/compoundimages/Xentrivalpeptide O.png -code/compoundimages/Yanucamide A.png -code/compoundimages/Yatakemycin.png -code/compoundimages/Yicathin C.png -code/compoundimages/YM-47515 degradation product.png -code/compoundimages/Ypaoamide C.png -code/compoundimages/Z-4-2.png -code/test_upsetplt.png +code/compoundimages/*.png code/test_upsetplt.png code/treemap.png code/untitled0.py -code/test_upsetplt.png -code/test_upsetplt.png -code/treemap.png msdial.mgf msdial.msp msdial_unformatted.txt @@ -490,7 +48,3 @@ progenesis.csv progenesis.msp 241115_mpactreferenceguide.docx 220504_mpactmanual.docx -code/treemap.png -code/test_upsetplt.png -code/test_upsetplt.png -code/treemap.png diff --git a/code/dbsearch.py b/code/dbsearch.py index c752f4f..f4894ae 100644 --- a/code/dbsearch.py +++ b/code/dbsearch.py @@ -11,7 +11,6 @@ """ import numpy as np -import pandas as pd from csvcache import cached_read_csv, invalidate @@ -39,18 +38,54 @@ def search_npatlas(outputdir, filename_stem, atlas, ppm_threshold): msdata = cached_read_csv(outputdir / (filename_stem + '_filtered.csv'), sep=',', header=[2], index_col=None).iloc[:, :3] - for _, mrow in msdata.iterrows(): - # Iterates over iondict, filters DB matches within window. 
- # Repeats for adducts, uses length of concat DF for feature hits - mass = mrow['m/z'] - hits_h = atlas[abs(1000000 * (atlas['compound_m_plus_h'] - mass) / atlas['compound_m_plus_h']) < ppm_threshold].copy() - hits_h['ppm'] = abs(1000000 * (hits_h['compound_m_plus_h'] - mass) / hits_h['compound_m_plus_h']) - hits_na = atlas[abs(1000000 * (atlas['compound_m_plus_na'] - mass) / atlas['compound_m_plus_na']) < ppm_threshold].copy() - hits_na['ppm'] = abs(1000000 * (hits_na['compound_m_plus_na'] - mass) / hits_na['compound_m_plus_na']) - hits = pd.concat([hits_h, hits_na]) + # Pre-sort the two adduct-mass columns once so each feature only tests a + # tiny m/z window (via searchsorted) instead of scanning all ~36k atlas + # rows twice -- the old per-feature ``atlas[boolean_mask]`` over the whole + # table was O(features x atlas_rows). The exact original ppm test + # (``abs(1e6*(atlas_mz - mass)/atlas_mz) < ppm_threshold``) is re-applied to + # the windowed candidates, so the matched set is bit-for-bit identical; the + # window (mass*(1 +/- 2*t)) is a safe superset of the true ppm window for + # the small tolerances used here. Verified output-identical (hitdb frames, + # incl. row order + ppm, and the iondict 'hits' column) against the old + # implementation on the real example dataset (~5x faster there). + mph = atlas['compound_m_plus_h'].to_numpy(dtype=float) + mna = atlas['compound_m_plus_na'].to_numpy(dtype=float) + order_h = np.argsort(mph, kind='stable'); sorted_h = mph[order_h] + order_na = np.argsort(mna, kind='stable'); sorted_na = mna[order_na] + t = ppm_threshold / 1e6 + + def _match(mass, sorted_vals, order, col_vals): + # Atlas positions whose ppm error vs `mass` is below threshold, in + # ascending atlas-position (i.e. original boolean-mask) order, plus + # their ppm values. 
+ lo = np.searchsorted(sorted_vals, mass * (1 - 2 * t), side='left') + hi = np.searchsorted(sorted_vals, mass * (1 + 2 * t), side='right') + cand = order[lo:hi] + if cand.size == 0: + return cand, cand.astype(float) + cv = col_vals[cand] + sel = np.sort(cand[np.abs(1e6 * (cv - mass) / cv) < ppm_threshold]) + sel_cv = col_vals[sel] + return sel, np.abs(1e6 * (sel_cv - mass) / sel_cv) + + masses = msdata['m/z'].to_numpy(dtype=float) + compounds = msdata.iloc[:, 0].to_numpy() + counts = np.empty(len(masses), dtype=float) + for i in range(len(masses)): + mass = masses[i] + # m+h matches then m+na matches, concatenated in that order (matching + # the old ``pd.concat([hits_h, hits_na])``) and slicing the atlas once. + pos_h, ppm_h = _match(mass, sorted_h, order_h, mph) + pos_na, ppm_na = _match(mass, sorted_na, order_na, mna) + positions = np.concatenate([pos_h, pos_na]) + hits = atlas.iloc[positions].copy() + hits['ppm'] = np.concatenate([ppm_h, ppm_na]) hits = hits.sort_values(by=['ppm']) - hitdb[mrow['Compound']] = hits - iondict.loc[mrow['Compound'], 'hits'] = hits.shape[0] + hitdb[compounds[i]] = hits + counts[i] = positions.size + # One vectorised column assignment instead of a per-feature ``.loc`` scalar + # set (msdata's Compound ids are unique, a subset of iondict's index). + iondict.loc[compounds, 'hits'] = counts iondict.to_csv(outputdir / 'iondict.csv', header=True, index=True) # iondict.csv just changed on disk (gained/updated the 'hits' column) -- diff --git a/code/dialogs.py b/code/dialogs.py new file mode 100644 index 0000000..00862e3 --- /dev/null +++ b/code/dialogs.py @@ -0,0 +1,111 @@ +""" +MPACT +Copyright 2022, Robert M. Samples, Sara P. Puckett, and Marcy J. Balunas + +Dark-themed QMessageBox helpers, matching the main GUI palette so the app's +dialogs don't render with invisible black-on-black text (the default when a +QMessageBox inherits the app's dark styling but has no colours of its own). 
+ +Kept out of ``main.py`` (which can't be imported headlessly -- the documented +main<->ui_functions circular import) so the box construction/styling can be +unit-tested via Qt's offscreen platform (see ``tests/test_dialogs.py``), the +same approach used for ``searchtree.py``. + +Palette mirrors ui_main.py: background rgb(40,40,40), text rgb(212,212,212), +buttons rgb(62,62,62) / hover rgb(75,75,75). + +The push-buttons are styled *per widget* (``box.buttons()``) rather than via a +``QMessageBox QPushButton`` descendant selector: in practice that descendant +rule did not take effect on the standard buttons (they rendered borderless with +black text), while the box/label rules did -- styling each button object +directly is selector-independent and reliable. On Windows the native title bar +is also switched to dark + square corners via the DWM API so the dialog frame +matches the app's dark theme instead of the default light, rounded Win11 bar. +""" + +import sys + +from PyQt5 import QtWidgets + +_BG = 'rgb(40,40,40)' +_TEXT = 'rgb(212,212,212)' + +DIALOG_STYLE = """ +QMessageBox { background-color: %s; } +QMessageBox QLabel { color: %s; } +QMessageBox QTextEdit { background-color: rgb(35,35,35); color: %s; } +""" % (_BG, _TEXT, _TEXT) + +_BUTTON_STYLE = """ +QPushButton { + background-color: rgb(62,62,62); + color: rgb(212,212,212); + border: 1px solid rgb(120,120,120); + border-radius: 3px; + padding: 4px 16px; + min-width: 64px; +} +QPushButton:hover { background-color: rgb(75,75,75); } +QPushButton:pressed { background-color: rgb(55,55,55); } +QPushButton:default { border: 1px solid rgb(160,160,160); } +""" + + +def apply_dark_titlebar(widget): + """Best-effort: make a top-level window's title bar dark with square + corners on Windows 11 (no-op / silently ignored everywhere else). + + Uses the DWM window attributes (immersive dark mode + corner preference). 
+ Must run before the window is first shown to take effect cleanly, so call + it after ``winId()`` realises the native handle but before ``exec_()``. + """ + if sys.platform != 'win32': + return + try: + import ctypes + hwnd = int(widget.winId()) + dwm = ctypes.windll.dwmapi + flag = ctypes.c_int(1) + # DWMWA_USE_IMMERSIVE_DARK_MODE: 20 on current Win10/11, 19 on early + # 20H1 builds -- set both; the wrong one just returns a failure code. + for attr in (20, 19): + dwm.DwmSetWindowAttribute(hwnd, attr, ctypes.byref(flag), ctypes.sizeof(flag)) + # DWMWA_WINDOW_CORNER_PREFERENCE = 33, DWMWCP_DONOTROUND = 1 (Win11). + corner = ctypes.c_int(1) + dwm.DwmSetWindowAttribute(hwnd, 33, ctypes.byref(corner), ctypes.sizeof(corner)) + except Exception: + pass + + +def build_message_box(parent, icon, title, text, buttons=None, default=None, + detailed=None): + """Construct a QMessageBox styled to match the MPACT GUI (does NOT exec). + + Separated from :func:`styled_message_box` so tests can inspect the + configured box without blocking on a modal ``exec_()``. + """ + box = QtWidgets.QMessageBox(parent) + box.setIcon(icon) + box.setWindowTitle(title) + box.setText(text) + if buttons is not None: + box.setStandardButtons(buttons) + if default is not None: + box.setDefaultButton(default) + if detailed is not None: + box.setDetailedText(detailed) + box.setStyleSheet(DIALOG_STYLE) + # Style each button object directly -- reliable where the descendant + # selector wasn't applied to the standard buttons. 
+ for button in box.buttons(): + button.setStyleSheet(_BUTTON_STYLE) + return box + + +def styled_message_box(parent, icon, title, text, buttons=None, default=None, + detailed=None): + """Build the styled box, show it modally, and return the clicked button.""" + box = build_message_box(parent, icon, title, text, buttons=buttons, + default=default, detailed=detailed) + apply_dark_titlebar(box) + return box.exec_() diff --git a/code/main.py b/code/main.py index fc0e883..c884095 100644 --- a/code/main.py +++ b/code/main.py @@ -48,6 +48,7 @@ import npatlasupdate import mpactupdate import crashreport +from dialogs import styled_message_box from indigo import Indigo from indigo.renderer import IndigoRenderer @@ -573,12 +574,12 @@ def _check_atlas_freshness(self, atlas_path='npatlas.tsv', max_age_days=30): return age = npatlasupdate.atlas_age_days(atlas_path) age_msg = 'missing' if age is None else ('about %d days old' % int(age)) - reply = QtWidgets.QMessageBox.question( - self, 'Update Natural Products Atlas?', + reply = styled_message_box( + self, QtWidgets.QMessageBox.Question, 'Update Natural Products Atlas?', 'Your local Natural Products Atlas database is %s.\n\n' 'Download the latest copy from npatlas.org now (about 30 MB)?' % age_msg, - QtWidgets.QMessageBox.Yes | QtWidgets.QMessageBox.No, - QtWidgets.QMessageBox.No) + buttons=QtWidgets.QMessageBox.Yes | QtWidgets.QMessageBox.No, + default=QtWidgets.QMessageBox.No) if reply != QtWidgets.QMessageBox.Yes: return # The download is large and runs on the main thread (wait cursor); a @@ -604,20 +605,20 @@ def _check_app_update(self): notes = (info.notes or '').strip() if len(notes) > 800: notes = notes[:800] + '...' - reply = QtWidgets.QMessageBox.question( - self, 'MPACT update available', + reply = styled_message_box( + self, QtWidgets.QMessageBox.Question, 'MPACT update available', 'A newer MPACT release is available.\n\n' 'Installed: %s\nLatest: %s\n\n%s\n\n' 'Update now (git pull)?' 
% (info.current, info.latest, notes), - QtWidgets.QMessageBox.Yes | QtWidgets.QMessageBox.No, - QtWidgets.QMessageBox.No) + buttons=QtWidgets.QMessageBox.Yes | QtWidgets.QMessageBox.No, + default=QtWidgets.QMessageBox.No) if reply != QtWidgets.QMessageBox.Yes: return repo_dir = Path(__file__).resolve().parent.parent ok, output = mpactupdate.apply_git_update(repo_dir) if ok: - QtWidgets.QMessageBox.information( - self, 'Update complete', + styled_message_box( + self, QtWidgets.QMessageBox.Information, 'Update complete', 'MPACT was updated. Please restart the application.\n\n' + output) else: self.error('Automatic update failed; opening the release page instead.') @@ -1475,19 +1476,17 @@ def mousePressEvent(self, event): # See crashreport.py for the design (and why not Sentry). def _crash_dialog(report, log_path, issue_url): try: - box = QtWidgets.QMessageBox() - box.setIcon(QtWidgets.QMessageBox.Critical) - box.setWindowTitle('MPACT encountered an error') text = 'An unexpected error occurred.' if log_path: text += '\n\nA crash log was saved to:\n' + log_path text += ('\n\nReport this on GitHub? Your browser will open a ' 'prefilled issue — nothing is sent automatically.') - box.setText(text) - box.setDetailedText(report) - box.setStandardButtons(QtWidgets.QMessageBox.Yes | QtWidgets.QMessageBox.No) - box.setDefaultButton(QtWidgets.QMessageBox.No) - if box.exec_() == QtWidgets.QMessageBox.Yes and issue_url: + if styled_message_box( + None, QtWidgets.QMessageBox.Critical, + 'MPACT encountered an error', text, + buttons=QtWidgets.QMessageBox.Yes | QtWidgets.QMessageBox.No, + default=QtWidgets.QMessageBox.No, + detailed=report) == QtWidgets.QMessageBox.Yes and issue_url: webbrowser.open(issue_url) except Exception: pass diff --git a/code/mpactupdate.py b/code/mpactupdate.py index cc2810c..4eb9689 100644 --- a/code/mpactupdate.py +++ b/code/mpactupdate.py @@ -31,7 +31,7 @@ #: this checker can see it. 
Kept here as the single in-code source of truth; #: keep it consistent with main.py's ``label_credits`` display string #: (currently shows ``v1.00.01``). -__version__ = '1.0.1' +__version__ = '1.0.01' DEFAULT_REPO = 'robertsamples/mpact' _RELEASES_LATEST = 'https://api.github.com/repos/{repo}/releases/latest' diff --git a/code/qualityscore.py b/code/qualityscore.py index 889f33b..b35e11f 100644 --- a/code/qualityscore.py +++ b/code/qualityscore.py @@ -110,18 +110,19 @@ def compute_cv_quality(iondict, average_n): modelstdev = noise_model_cv(average_n) # Area under each rarefaction curve (percentage integrated over CV). - prevav = 0 - aucav = 0 - prevmed = 0 - aucmed = 0 - for pos in range(0, len(iondictmean.iloc[:, 0])): - dist = iondictmean.iloc[pos, :]['average CV'] - prevav - aucav += dist * iondictmean.iloc[pos, 0] - prevav = iondictmean.iloc[pos, :]['average CV'] - - dist = iondictmed.iloc[pos, :]['median CV'] - prevmed - aucmed += dist * iondictmed.iloc[pos, 0] - prevmed = iondictmed.iloc[pos, :]['median CV'] + # Vectorised equivalent of the original per-row loop + # aucav += (cv[pos] - cv[pos-1]) * pct[pos] (cv[-1] := 0) + # using np.diff(prepend=0) so the first step's "previous" value is 0, + # matching the loop's prevav/prevmed starting at 0. (np.sum's pairwise + # summation can differ from the loop's sequential add by <1 ULP, far below + # the 0.1%-rounded display precision -- the faithfulness test pins this.) + cv_av = iondictmean['average CV'].to_numpy() + pct_av = iondictmean.iloc[:, 0].to_numpy() + aucav = float(np.sum(np.diff(cv_av, prepend=0.0) * pct_av)) + + cv_med = iondictmed['median CV'].to_numpy() + pct_med = iondictmed.iloc[:, 0].to_numpy() + aucmed = float(np.sum(np.diff(cv_med, prepend=0.0) * pct_med)) # Integrated gap between the mean and median curves (distribution skew). 
sumskew = 0 diff --git a/code/tests/test_dbsearch.py b/code/tests/test_dbsearch.py index 94467e3..f12606f 100644 --- a/code/tests/test_dbsearch.py +++ b/code/tests/test_dbsearch.py @@ -66,6 +66,47 @@ def test_no_match_outside_ppm_window(tmp_path): assert hitdb['c2'].empty +def _write_single_feature(tmp_path, mz): + """One-feature filtered table + matching iondict, for the ordering tests.""" + stem = 'example' + pd.DataFrame({'Compound': ['feat'], 'other': [1]}).to_csv(tmp_path / 'iondict.csv', index=False) + with open(tmp_path / (stem + '_filtered.csv'), 'w') as f: + f.write(',,\n,,\nCompound,m/z,Retention time (min)\n') + f.write('feat,%s,1.0\n' % mz) + return tmp_path, stem + + +def test_hits_sorted_by_ppm_across_both_adducts(tmp_path): + # m+h matches A (~20 ppm) and B (~30 ppm); m+na matches C (~5 ppm). + # The combined result must be ascending by ppm: C, A, B. + atlas = pd.DataFrame({ + 'compound_name': ['A', 'B', 'C'], + 'compound_m_plus_h': [200.000, 200.010, 999.0], + 'compound_m_plus_na': [999.0, 999.0, 200.005], + }) + outputdir, stem = _write_single_feature(tmp_path, 200.004) + hitdb, _ = search_npatlas(outputdir, stem, atlas, ppm_threshold=100) + hits = hitdb['feat'] + assert list(hits['compound_name']) == ['C', 'A', 'B'] + assert list(hits['ppm']) == sorted(hits['ppm']) # ascending + + +def test_single_atlas_row_matching_both_adducts_appears_twice(tmp_path): + # One atlas row whose [M+H] and [M+Na] are both at the feature mass must + # appear once per adduct (two rows), matching the old concat behaviour. 
+ atlas = pd.DataFrame({ + 'compound_name': ['D'], + 'compound_m_plus_h': [300.000], + 'compound_m_plus_na': [300.000], + }) + outputdir, stem = _write_single_feature(tmp_path, 300.000) + hitdb, _ = search_npatlas(outputdir, stem, atlas, ppm_threshold=10) + assert len(hitdb['feat']) == 2 + assert list(hitdb['feat']['compound_name']) == ['D', 'D'] + on_disk = pd.read_csv(outputdir / 'iondict.csv', index_col=0) + assert on_disk.loc['feat', 'hits'] == 2 + + def test_invalidates_stale_cached_reads_under_other_shapes(tmp_path): """Regression guard: fillfttree() (main.py) reads iondict.csv with header=[0], index_col=None -- a different cache key than diff --git a/code/tests/test_dialogs.py b/code/tests/test_dialogs.py new file mode 100644 index 0000000..b951d44 --- /dev/null +++ b/code/tests/test_dialogs.py @@ -0,0 +1,68 @@ +"""Headless tests for the dark-themed dialog helpers (``dialogs.py``). + +Uses the offscreen Qt platform (the ``qapp`` fixture in conftest.py) to build +the message boxes without a display and without blocking on a modal exec. +Guards the regression that prompted this module: a QMessageBox with no +explicit colours rendered black-on-black under the app's dark styling. +""" + +from PyQt5 import QtWidgets + +import dialogs + + +def test_style_sets_visible_label_and_dark_background(qapp): + # The style must define a light label colour and a non-default background + # (the fix for the invisible black-on-black text). 
+ assert 'color: rgb(212,212,212)' in dialogs.DIALOG_STYLE + assert 'background-color: rgb(40,40,40)' in dialogs.DIALOG_STYLE + + +def test_build_message_box_applies_style_and_content(qapp): + box = dialogs.build_message_box( + None, QtWidgets.QMessageBox.Question, 'Title here', 'Body text', + buttons=QtWidgets.QMessageBox.Yes | QtWidgets.QMessageBox.No, + default=QtWidgets.QMessageBox.No) + assert isinstance(box, QtWidgets.QMessageBox) + assert box.text() == 'Body text' + assert box.windowTitle() == 'Title here' + # The dark theme is actually applied to this box. + assert 'rgb(212,212,212)' in box.styleSheet() + assert box.standardButtons() == (QtWidgets.QMessageBox.Yes | QtWidgets.QMessageBox.No) + assert box.defaultButton() == box.button(QtWidgets.QMessageBox.No) + box.deleteLater() + + +def test_each_button_is_styled_directly(qapp): + # Regression: the descendant selector didn't reach the standard buttons, + # so each button must carry the button stylesheet itself (visible text + + # border). + box = dialogs.build_message_box( + None, QtWidgets.QMessageBox.Question, 't', 'b', + buttons=QtWidgets.QMessageBox.Yes | QtWidgets.QMessageBox.No) + buttons = box.buttons() + assert len(buttons) == 2 + for button in buttons: + sheet = button.styleSheet() + assert 'color: rgb(212,212,212)' in sheet + assert 'border: 1px solid' in sheet + box.deleteLater() + + +def test_apply_dark_titlebar_never_raises(qapp): + # No-op off Windows; on Windows it best-effort sets DWM attributes and must + # swallow any failure (e.g. an offscreen/invalid HWND). 
+ box = dialogs.build_message_box(None, QtWidgets.QMessageBox.Information, 't', 'b') + dialogs.apply_dark_titlebar(box) # must not raise + box.deleteLater() + + +def test_build_message_box_supports_detailed_text(qapp): + box = dialogs.build_message_box( + None, QtWidgets.QMessageBox.Critical, 'Crash', 'Something failed', + buttons=QtWidgets.QMessageBox.Yes | QtWidgets.QMessageBox.No, + detailed='full traceback here') + assert box.detailedText() == 'full traceback here' + # The detailed-text pane is a QTextEdit, also styled for visibility. + assert 'QTextEdit' in box.styleSheet() + box.deleteLater() diff --git a/devnotes.md b/devnotes.md index adbd9b8..0de516e 100644 --- a/devnotes.md +++ b/devnotes.md @@ -766,14 +766,12 @@ is now **159 passing** (the count above is stale). length and the inner `for fragment in frags:` would iterate characters. Left as-is (single caller, wrapped in try/except), but worth hardening if the MSP writer is ever reused. -- **Docs repo-URL inconsistency.** `mkdocs.yml` `repo_url` and `docs/index.md` - link to `github.com/robertsamples/mpact` (the `origin` fork), but - `docs/installation.md`'s `git clone` line uses - `github.com/BalunasLab/mpact` (the `upstream`/lab repo). Pick one canonical - public URL and make all three consistent. Not changed because which one is - the intended *published* home is a call only you can make (both remotes - exist locally). `docs/index.md`'s stale "multivariate analysis (NMDS)" - feature blurb *was* updated to "(PCA/NMDS/PLS-DA)". +- ~~Docs repo-URL inconsistency~~ — **resolved.** Canonical repo is + `github.com/robertsamples/mpact` (confirmed by the developer); + `docs/installation.md`'s `git clone` line was corrected from `BalunasLab` to + `robertsamples` to match `mkdocs.yml`/`docs/index.md`. `docs/index.md`'s + stale "multivariate analysis (NMDS)" blurb was also updated to + "(PCA/NMDS/PLS-DA)". 
- **Two orphaned/broken scratch scripts in `code/`.** `npatlassearch.py` reads `npatlas.csv` (the real file is `npatlas.tsv`) at @@ -901,7 +899,7 @@ On startup, queries the GitHub Releases API for the configured repo (`robertsamples/mpact` by default — Robert's fork), compares the latest published release tag against the running version (`__version__`, kept in `mpactupdate.py`; **keep it in sync with `main.py`'s `label_credits`** string, -currently `v1.00.01` -> `__version__ = '1.0.1'`), and if newer offers a +currently `v1.00.01` -> `__version__ = '1.0.01'`), and if newer offers a `git pull --ff-only` update (with a "please restart" prompt on success, or opens the release page on failure). Version compare uses `packaging.version` (PEP 440, numeric — so 2.10 > 2.9) with a dotted-int fallback; an unparseable @@ -945,3 +943,82 @@ without the user clicking through. The excepthook is hardened to never raise. catch most in-GUI crashes — but the exact abort-after-hook behaviour is PyQt5-version-dependent and is the one thing to confirm by actually triggering an error in the running app. + +### Dialog styling (`dialogs.py`, `tests/test_dialogs.py`) + +The three subsystems above all pop `QMessageBox` dialogs. On the live app these +first rendered **black-on-black** (an unstyled dark background with invisible +black text — confirmed from a user screenshot): a `QMessageBox` inherits the +app's dark look but ships no text/background colours of its own. +`dialogs.styled_message_box()` applies a stylesheet matching the GUI palette +(background `rgb(40,40,40)`, text `rgb(212,212,212)`, detailed-text `QTextEdit` +darkened too) so every app dialog is legible and on-theme. Kept in its own +module (not `main.py`) so the box construction is headless-testable via +offscreen Qt (`build_message_box` returns the box without the blocking +`exec_`); `main.py`'s atlas/update/crash prompts all route through it. 
+ +Two follow-ups after the first attempt (from a second user screenshot): +- **Buttons stayed black-on-black/borderless.** The `QMessageBox QPushButton` + *descendant* selector did not take effect on the standard buttons even + though the box/label rules did. Fixed by styling each button object + directly (`for b in box.buttons(): b.setStyleSheet(...)`) with a clearly + visible border (`rgb(120,120,120)`) — selector-independent and reliable. +- **Native title bar was light + rounded** (Win11). `apply_dark_titlebar()` + sets the DWM window attributes (immersive dark mode `20`/`19`, corner + preference `33` = do-not-round) via `ctypes`/`dwmapi`, best-effort and + Windows-only (no-op elsewhere, all failures swallowed). Called from + `styled_message_box` after `winId()` realises the handle but before + `exec_()` (dark mode must be set pre-show). **Verify live on Win11** — this + is the part that can't be checked headlessly. + +## Performance pass (2026-06-30, measurement-driven) + +Profiled `run_MSFaST` on the bundled example dataset (cProfile + wall timing; +scratch scripts not committed) and benchmarked the algorithmic sections that +scale with feature/DB size. **Every change below was verified output-identical +against the original on real data, not just "looks equivalent"** — the bar the +user set ("functionally identical in terms of I/O"). + +Finding: on the small example the *pipeline* is dominated by pandas CSV +I/O (the inter-stage `iondict.csv`/`_formatted.csv` round-trips, ~0.6s of +to_csv + ~0.5s of read_csv out of ~2.3s), not by Python loops. That I/O chain +is the already-logged "bigger, multi-session" refactor (threading an `iondict` +DataFrame through `filter`/`stats`); left alone here as too invasive/risky for +this pass. The wins below are in the per-feature/per-DB-row algorithmic code, +which is what actually scales badly on large real datasets. 
+ +- **`dbsearch.search_npatlas`: ~5x faster, output identical.** Was + O(features x atlas_rows): per feature it scanned all ~36k atlas rows twice + (once per adduct) with a full-DataFrame boolean mask, then `.copy()` + + `pd.concat` + `sort_values` + a scalar `.loc` write. Now pre-sorts the two + adduct-mass columns once and uses `np.searchsorted` to test only a tiny m/z + window per feature; the **exact original ppm test is re-applied to the + windowed candidates** so the matched set is bit-identical (the window + `mass*(1 ± 2·ppm/1e6)` is a proven superset of the true ppm window). Also: + build one DataFrame per feature from concatenated m+h/m+na positions (no + per-feature `pd.concat`), iterate numpy arrays instead of `iterrows`, and + assign the `hits` column once instead of 979 scalar `.loc` sets. Verified on + the real example (979 feats × 36,454 atlas rows): 1.41s → 0.28s, **0 hitdb + DataFrame mismatches** (incl. row order + `ppm` values) and an identical + `iondict['hits']` column. New edge-case tests in `test_dbsearch.py` + (ppm-sort across both adducts; a single atlas row matching both adducts + appearing twice). +- **`qualityscore.compute_cv_quality`: ~6.5x faster, output identical.** The + AUC-under-the-CV-curve step was a per-feature Python loop doing + `iondict.iloc[pos, :]['col']` scalar lookups (the classic slow pandas + pattern) over thousands of rows. Replaced with the vectorised equivalent + `np.sum(np.diff(cv, prepend=0) * pct)`. ~0.4s → ~0.06s per call (n≈5000). + The faithfulness test (`test_qualityscore.py`, which pins against a verbatim + copy of the original loop) confirms identical values; np.sum's pairwise + summation can differ from the sequential loop by <1 ULP, far below the + 0.1%-rounded display precision. +- **`stats.groupave`: dead sum-of-squares chain removed** (pass 1) — dropped a + `(chunk**2).groupby().sum()` per CSV chunk that only fed an unused variance. 
+- **`filter.relationalfilter`: measured, left alone.** Looks O(n²) but the + early `break` once past the max isotope window makes it O(n·k) with small k: + benchmarked at 0.017 / 0.077 / 0.371 s for 2k / 8k / 20k synthetic features + (near-linear). Not a bottleneck; its intricate ringing/dimer-band logic + isn't worth the regression risk to micro-optimize. +- **`filter.decon` / `stats.groupave` remaining cost is the per-stage CSV + round-trips**, i.e. the same I/O-chain refactor noted above — not addressed + here. diff --git a/docs/installation.md b/docs/installation.md index 0ed9819..64508b1 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -15,7 +15,7 @@ scipy, NumPy) that ships with Anaconda's base environment. Either: - Download and unzip the repository (GitHub: **Code → Download ZIP**), or -- Clone it (`git clone https://github.com/BalunasLab/mpact.git`, or with +- Clone it (`git clone https://github.com/robertsamples/mpact.git`, or with GitHub Desktop: **Code → Open with GitHub Desktop**). It doesn't matter where you place the folder — MPACT's launcher script and From 5f9a119401853b573f96c04d26e75e0757d9ad62 Mon Sep 17 00:00:00 2001 From: Robert Samples Date: Tue, 30 Jun 2026 10:21:00 -0400 Subject: [PATCH 20/20] resolve ci errors on python 3.11 and 3.9 macos --- code/tests/test_dialogs.py | 13 +++++++++- code/tests/test_qualityscore.py | 22 +++++++++++++---- devnotes.md | 42 +++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 5 deletions(-) diff --git a/code/tests/test_dialogs.py b/code/tests/test_dialogs.py index b951d44..fc25820 100644 --- a/code/tests/test_dialogs.py +++ b/code/tests/test_dialogs.py @@ -6,6 +6,8 @@ explicit colours rendered black-on-black under the app's dark styling. 
""" +import sys + from PyQt5 import QtWidgets import dialogs @@ -25,7 +27,16 @@ def test_build_message_box_applies_style_and_content(qapp): default=QtWidgets.QMessageBox.No) assert isinstance(box, QtWidgets.QMessageBox) assert box.text() == 'Body text' - assert box.windowTitle() == 'Title here' + if sys.platform != 'darwin': + # Qt's Cocoa (macOS) integration treats QMessageBox as a native + # alert panel -- per Apple HIG, alerts have no title bar -- and + # doesn't retain the windowTitle property for it specifically (other + # widget types aren't affected). build_message_box still calls + # setWindowTitle unconditionally since it's meaningful on every other + # platform (and harmless here); only the readback assertion is + # platform-gated. Reproduced on stock PyQt5 with no styling applied + # at all, so this is not something dialogs.py's theming can fix. + assert box.windowTitle() == 'Title here' # The dark theme is actually applied to this box. assert 'rgb(212,212,212)' in box.styleSheet() assert box.standardButtons() == (QtWidgets.QMessageBox.Yes | QtWidgets.QMessageBox.No) diff --git a/code/tests/test_qualityscore.py b/code/tests/test_qualityscore.py index 817cdcf..a3f1a65 100644 --- a/code/tests/test_qualityscore.py +++ b/code/tests/test_qualityscore.py @@ -23,15 +23,29 @@ def _reference_inline(iondict, average_n): """Verbatim copy of the original prev_cv.plot() computation (pre-extraction), - returning (rep, sumskew, qualscore). Do not 'clean up' -- it exists to pin - the extracted module to the exact historical behaviour.""" + returning (rep, sumskew, qualscore). Do not 'clean up' the ALGORITHM -- it + exists to pin the extracted module to the exact historical behaviour. + + One mechanical exception: the original assigned the rescaled percentage + back via ``.iloc[:, 0] = ...`` directly into an int64-dtype + rank column. 
That was already a FutureWarning on the pandas pinned in + qualityscore.py's own fix (see there); on a newer pandas resolved by CI + (this repo's tests.yml installs an unpinned ``pandas`` for Python 3.11) + the same line raises a hard TypeError instead, which would make this + reference function -- not the code under test -- unable to run at all. + Assigning by column LABEL instead of positional .iloc (replacing the + column's dtype wholesale rather than casting a float into the existing + int64 column) is dtype-mechanics only and was already proven + value-identical by qualityscore.py's own equivalent fix. + """ iondict = iondict[~np.isnan(iondict['average CV'])] iondictmean = iondict.sort_values(['average CV']).reset_index() iondictmed = iondict.sort_values(['median CV']).reset_index() iondictmean = iondictmean.reset_index() iondictmed = iondictmed.reset_index() - iondictmean.iloc[:, 0] = 100 * iondictmean.iloc[:, 0] / len(iondictmean['average CV']) - iondictmed.iloc[:, 0] = 100 * iondictmed.iloc[:, 0] / len(iondictmed['median CV']) + mean_col0, med_col0 = iondictmean.columns[0], iondictmed.columns[0] + iondictmean[mean_col0] = 100 * iondictmean.iloc[:, 0] / len(iondictmean['average CV']) + iondictmed[med_col0] = 100 * iondictmed.iloc[:, 0] / len(iondictmed['median CV']) modelstdevlist = [1] + [0] * (int(average_n) - 1) modelstdev = pd.Series(modelstdevlist).std() / pd.Series(modelstdevlist).mean() prevav = 0 diff --git a/devnotes.md b/devnotes.md index 0de516e..d68ff0c 100644 --- a/devnotes.md +++ b/devnotes.md @@ -1022,3 +1022,45 @@ which is what actually scales badly on large real datasets. - **`filter.decon` / `stats.groupave` remaining cost is the per-stage CSV round-trips**, i.e. the same I/O-chain refactor noted above — not addressed here. 
+ +## CI matrix failures fixed (2026-07-01) + +`.github/workflows/tests.yml` runs a 3 OS x 2 Python-version matrix +(ubuntu/windows/macos x 3.9/3.11) with an **unpinned** `pandas` install (only +`numpy<2` is pinned) — so different runners can resolve genuinely different +pandas versions, and a test can pass on one cell and fail on another for +reasons that have nothing to do with the OS or Python version per se. + +- **`test_qualityscore.py`: 4 failures on Python 3.11 cells (pandas resolved + to 3.0.x there), 0 on 3.9 (older pandas).** `_reference_inline()`'s + deliberately-preserved verbatim copy of the *original* pre-extraction code + used `.iloc[:, 0] = ...` to overwrite an int64-dtype column — + exactly the pattern `qualityscore.py` itself was already fixed to avoid (see + "Performance pass" above). On pandas 2.x this was only a `FutureWarning`; + **on pandas 3.x it's a hard `TypeError`**, confirmed by reproducing both the + old and new patterns against a real pandas 3.0.3 install in an isolated + venv. Since the *algorithm* under test wasn't the issue (only a dtype- + mechanics detail of the reference copy, which would have made the original + app code itself crash on a fresh pandas 3.x install, not just this test), + fixed `_reference_inline()` to use the same label-based assignment + (`df[col] = ...`) as the production fix. Re-verified output-identical and + passing under pandas 3.0.3 (all 222 tests), not just inspected. +- **`test_dialogs.py::test_build_message_box_applies_style_and_content`: failed + on every macOS cell (both 3.9 and 3.11), passed on Windows/Ubuntu.** + `box.windowTitle()` reads back `''` after `setWindowTitle('Title here')` on + macOS specifically — Qt's Cocoa integration renders `QMessageBox` as a + native alert panel (no title bar, per Apple HIG) and doesn't retain the + `windowTitle` property for that widget type there, independent of any + styling. 
Not a `dialogs.py` defect: `build_message_box` still calls + `setWindowTitle` unconditionally (meaningful everywhere else, harmless on + macOS); the test's readback assertion is now gated on + `sys.platform != 'darwin'`. +- **Watch item, not fixed (no failing test, no coverage to verify a fix + against): `mzmineimport.py` has several `.iloc[:, N] = .iloc[:, M]`-style + column reassignments** (lines ~70-71, ~190-202) reading `header=None` CSVs. + These are column-to-column copies within the same frame (not a computed- + float into a known-int column like the bug above), so lower risk, but + unpinned `pandas` in CI means a future resolve could expose the same class + of issue here too. No dedicated test file exists for `mzmineimport.py` + (format detection is covered via `translators.py`/`test_translators.py` + instead) — add coverage before touching this blind.