diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f23753d..2a6ba67 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,11 @@ repos: hooks: - id: mypy name: Static type checking with MyPy - args: [--ignore-missing-imports] + exclude: ^bioneuralnet/network/pysmccnet/ + args: [ + --ignore-missing-imports, + --follow-imports=silent, + ] - repo: local hooks: diff --git a/CHANGELOG.md b/CHANGELOG.md index d5d0728..bd6454c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,48 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/). +## [1.3.0] - 2026-04-01 + +### Network Module (`bioneuralnet.network`) +- **New dedicated module**: Network construction and analysis moved from `bioneuralnet.utils` to `bioneuralnet.network`. +- **Renamed construction functions**: `gen_similarity_graph` -> `similarity_network`, `gen_correlation_graph` -> `correlation_network`, `gen_threshold_graph` -> `threshold_network`, `gen_gaussian_knn_graph` -> `gaussian_knn_network`. +- **`NetworkAnalyzer`**: Moved to `bioneuralnet.network`; GPU-accelerated via PyTorch; added `hub_analysis`, `cross_omics_analysis`, `edge_weight_analysis`, `find_strongest_edges`, `degree_distribution`, `clustering_coefficient_gpu`, `connected_components`. +- **`auto_pysmccnet`**: Phenotype-driven network construction via SmCCNet 2.0; supports CCA and PLS modes, now fully implemented in native Python, simplifying user experience and removing the R dependency. + +### Utils Module +- **`impute_omics` / `impute_omics_knn` renamed**: Now `impute_simple` and `impute_knn`. +- **`normalize_omics` renamed**: Now `normalize`; supports `"standard"`, `"minmax"`, `"log2"`. +- **`beta_to_m` renamed**: Now `m_transform`. +- **New `feature_selection` submodule**: `laplacian_score`, `mad_filter`, `pca_loadings`, `correlation_filter`, `importance_rf`, `variance_threshold`, `top_anova_f_features`. +- **New `data` functions**: `data_stats`, `sparse_filter`, `nan_summary`, `zero_summary`. +- **`clean_internal`**: New cleaning function with configurable NaN threshold. + +### DPMON Enhancements +- **`tune_trials`**: Already introduced in 1.2.2; now fully documented. +- **`ae_architecture`**: New parameter; supports `"original"` and `"dynamic"` autoencoder architectures. +- **`correlation_mode`**: New parameter; supports `"abs_pearson"` (default) and `"adaptive"` node feature computation. +- **Inner CV tuning**: Ray Tune now performs epoch-synchronized inner k-fold cross-validation across all trials. + +### Datasets +- **PAAD removed** from built-in datasets. +- **Dataset size reduction**: BRCA, LGG, and KIPAN datasets significantly reduced from ~4,000 omics features per dataset to 700 (400 methylation, 200 mRNA, 100 miRNA) using Laplacian Score filtering, replacing the previous ANOVA-F & Random Forest intersection strategy. This standardization was necessary to stay within the PyPI 100 MB package size limit (v1.2.2 reached 97.9 MB) and results in substantially faster installs and downloads for users. + +### Documentation +- **Data Decision Framework**: New comprehensive stage-by-stage parameter reference (`quick_start/data_framework.rst`). +- **Quick Start notebooks**: New home for end-to-end `Quick_Start.ipynb` and `quick_start_bio.rst`. +- **Subgraph page**: Updated case studies from KIPAN to TCGA-LGG and ROSMAP with full algorithm documentation. +- **`network.rst`**: New dedicated page for the network module. +- **`utils.rst`**, **`datasets.rst`**, **`index.rst`**, **`subgraph.rst`**: Major updates throughout. +- **README**: GitHub readme updated to reflect all API changes, new images, and corrected function names. + +### Removed +- `gen_similarity_graph`, `gen_correlation_graph`, `gen_threshold_graph`, `gen_gaussian_knn_graph` from `bioneuralnet.utils`. +- `graph_analysis`, `repair_graph_connectivity`, `find_optimal_graph` from `bioneuralnet.utils` (superseded by `NetworkAnalyzer` and `network_search`). +- `impute_omics`, `impute_omics_knn`, `normalize_omics`, `beta_to_m` (renamed, see above). + +### Testing +- Test suite updated to align with new `network` module and renamed utils functions. + ## [1.2.2] - 2025-12-29 ### Documentation diff --git a/README.md b/README.md index 963eb55..4237266 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ [![Documentation](https://img.shields.io/badge/docs-read%20the%20docs-blue.svg)](https://bioneuralnet.readthedocs.io/en/latest/) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.17503083.svg)](https://doi.org/10.5281/zenodo.17503083) -## Welcome to BioNeuralNet 1.2.2 +## Welcome to BioNeuralNet 1.3.0 ![BioNeuralNet Logo](assets/logo_update.png) @@ -386,4 +386,4 @@ For your convenience, you can use the following BibTeX entry: url={https://arxiv.org/abs/2507.20440}, } ``` - \ No newline at end of file + diff --git a/assets/logo_update.png b/assets/logo_update.png index b8eb7fe..84107c7 100644 Binary files a/assets/logo_update.png and b/assets/logo_update.png differ diff --git a/assets/logo_update2.png b/assets/logo_update2.png new file mode 100644 index 0000000..b8eb7fe Binary files /dev/null and b/assets/logo_update2.png differ diff --git a/bioneuralnet/__init__.py b/bioneuralnet/__init__.py index 25d5356..ef68588 100644 --- a/bioneuralnet/__init__.py +++ b/bioneuralnet/__init__.py @@ -13,7 +13,7 @@ """ -__version__ = "1.2.2" +__version__ = "1.3.0" # submodules to enable direct imports such as `from bioneuralnet import utils` from . import utils @@ -51,7 +51,7 @@ __all__ = [ "__version__", - + "utils", "metrics", "datasets", @@ -59,16 +59,17 @@ "network_embedding", "downstream_task", "network", + "external_tools", "GNNEmbedding", "SubjectRepresentation", "auto_pysmccnet", "DPMON", - + "DatasetLoader", "CorrelatedPageRank", "CorrelatedLouvain", - + "HybridLouvain", "load_example", diff --git a/bioneuralnet/clustering/__init__.py b/bioneuralnet/clustering/__init__.py index 6deedbb..a3020a2 100644 --- a/bioneuralnet/clustering/__init__.py +++ b/bioneuralnet/clustering/__init__.py @@ -1,16 +1,16 @@ r"""Network Clustering and Subgraph Detection. -This module implements hybrid algorithms for identifying phenotype-associated -subgraphs in multi-omics networks. It combines global modularity optimization +This module implements hybrid algorithms for identifying phenotype-associated +subgraphs in multi-omics networks. It combines global modularity optimization with local random-walk refinement, weighted by phenotypic correlation. Classes: - HybridLouvain: The primary pipeline. Iteratively alternates between global partitioning - (Louvain) and local refinement (PageRank) to find the most significant + HybridLouvain: The primary pipeline. Iteratively alternates between global partitioning + (Louvain) and local refinement (PageRank) to find the most significant subgraph associated with a phenotype. CorrelatedLouvain: Extends standard Louvain by optimizing a hybrid objective: Q_hybrid = k_L * Modularity + (1 - k_L) * Correlation. - CorrelatedPageRank: Performs a biased random walk (PageRank) followed by a sweep cut to + CorrelatedPageRank: Performs a biased random walk (PageRank) followed by a sweep cut to minimize a hybrid conductance objective: Phi_hybrid = k_P * Conductance + (1 - k_P) * Correlation. Louvain: Standard Louvain community detection (based on modularity maximization). @@ -27,4 +27,4 @@ "CorrelatedLouvain", "HybridLouvain", "Louvain" -] \ No newline at end of file +] diff --git a/bioneuralnet/clustering/correlated_louvain.py b/bioneuralnet/clustering/correlated_louvain.py index 12db0dd..2f674bd 100644 --- a/bioneuralnet/clustering/correlated_louvain.py +++ b/bioneuralnet/clustering/correlated_louvain.py @@ -1,18 +1,18 @@ r""" Correlated Louvain Community Detection. -This module extends the standard Louvain algorithm by incorporating an -absolute phenotype-correlation objective into the modularity maximization +This module extends the standard Louvain algorithm by incorporating an +absolute phenotype-correlation objective into the modularity maximization process. References: - Abdel-Hafiz et al. (2022), "Significant Subgraph Detection in - Multi-omics Networks for Disease Pathway Identification," + Abdel-Hafiz et al. (2022), "Significant Subgraph Detection in + Multi-omics Networks for Disease Pathway Identification," Frontiers in Big Data. Notes: **Hybrid Modularity Objective** - The algorithm optimizes connectivity and phenotype correlation + The algorithm optimizes connectivity and phenotype correlation simultaneously using the following weighted objective function: .. math:: @@ -20,22 +20,22 @@ Where: * :math:`Q`: Standard modularity (internal connectivity). - * :math:`\\rho`: Absolute Pearson correlation of the community's + * :math:`\\rho`: Absolute Pearson correlation of the community's first principal component (PC1) with phenotype :math:`Y`. * :math:`k_L`: User-defined weight on modularity (Suggested: 0.2). Algorithm: - The hierarchical loop and Phase 2 (network aggregation) remain - identical to the standard Louvain method. The modification occurs + The hierarchical loop and Phase 2 (network aggregation) remain + identical to the standard Louvain method. The modification occurs exclusively in **Phase 1 (Local Optimization)**. - When evaluating the movement of node :math:`v` from community :math:`D` + When evaluating the movement of node :math:`v` from community :math:`D` to community :math:`C`, the gain is calculated as: .. math:: \Delta_{hybrid} = k_L \Delta Q + (1 - k_L) \Delta \\rho - The correlation gain :math:`\Delta \\rho` is defined as the change in + The correlation gain :math:`\Delta \\rho` is defined as the change in total correlation across affected communities: .. math:: @@ -86,10 +86,10 @@ def __init__( raise ValueError(f"k_L must be in [0, 1], got {k_L}") super().__init__( - G=G, - weight=weight, + G=G, + weight=weight, max_passes=max_passes, - min_delta=min_delta, + min_delta=min_delta, seed=seed, ) @@ -196,7 +196,7 @@ def _collect_orig( s: Set[int] = set() for idx in np.where(community == comm_id)[0]: s.update(n2o[int(idx)]) - + return frozenset(s) def _correlated_phase1( @@ -227,7 +227,7 @@ def _correlated_phase1( for node in order: cur_comm = int(community[node]) nbr_idx = np.nonzero(A[node])[0] - + if len(nbr_idx) == 0: continue @@ -383,6 +383,6 @@ def get_top_communities(self, n: int = 1) -> List[Tuple[int, float, List[Any]]]: continue idx_set = frozenset(self.node_to_idx[nd] for nd in nds) ranked.append((cid, self._pc1_correlation(idx_set), nds)) - + ranked.sort(key=lambda x: x[1], reverse=True) - return ranked[:n] \ No newline at end of file + return ranked[:n] diff --git a/bioneuralnet/clustering/correlated_pagerank.py b/bioneuralnet/clustering/correlated_pagerank.py index fc8c651..87517ac 100644 --- a/bioneuralnet/clustering/correlated_pagerank.py +++ b/bioneuralnet/clustering/correlated_pagerank.py @@ -1,17 +1,17 @@ r""" Correlated PageRank Clustering. -This module implements a personalized PageRank algorithm combined with a +This module implements a personalized PageRank algorithm combined with a phenotype-aware sweep cut to detect significant subgraphs. References: - Abdel-Hafiz et al. (2022), "Significant Subgraph Detection in - Multi-omics Networks for Disease Pathway Identification," + Abdel-Hafiz et al. (2022), "Significant Subgraph Detection in + Multi-omics Networks for Disease Pathway Identification," Frontiers in Big Data. Algorithm: The PageRank vector is computed as the stationary distribution of: - + .. math:: pr_{\\alpha}(s) = \\alpha s + (1 - \\alpha) pr_{\\alpha}(s) W @@ -21,13 +21,13 @@ * :math:`W`: Transition matrix. .. important:: - The `networkx.pagerank` implementation uses a `alpha` parameter - representing the **damping factor** (link-following probability). + The `networkx.pagerank` implementation uses a `alpha` parameter + representing the **damping factor** (link-following probability). Therefore, :math:`\\text{nx_alpha} = 1 - \\alpha_{theoretical}`. Notes: **Sweep Cut Optimization** - Nodes are sorted by PageRank-per-degree in descending order. For each + Nodes are sorted by PageRank-per-degree in descending order. For each prefix set :math:`S_i`, the algorithm minimizes the **Hybrid Conductance**: .. math:: @@ -39,13 +39,13 @@ * :math:`k_P`: Trade-off weight (Default: ~0.5). **Personalization Vector (Seed Weighting)** - Teleportation probabilities for seeds are weighted by their marginal + Teleportation probabilities for seeds are weighted by their marginal contribution to correlation: .. math:: \\alpha_i = \\frac{\\rho_i}{\\max(\\rho_{seeds})} \\cdot \\alpha_{max} - Where :math:`\\rho_i = |\\rho(S)| - |\\rho(S \setminus \{i\})|`. + Where :math:`\\rho_i = |\\rho(S)| - |\\rho(S \setminus \{i\})|`. Values where :math:`\\rho_i < 0` are clamped to 0. """ @@ -103,13 +103,13 @@ def __init__( if not 0.0 <= teleport_prob <= 1.0: raise ValueError(f"teleport_prob must be in [0, 1], got {teleport_prob}") - + self.teleport_prob = teleport_prob self._nx_alpha = 1.0 - teleport_prob if not 0.0 <= k_P <= 1.0: raise ValueError(f"k_P must be in [0, 1], got {k_P}") - + self.k_P = k_P self.max_iter = max_iter self.tol = tol @@ -135,14 +135,14 @@ def _validate_inputs(self): """ if not isinstance(self.G, nx.Graph): raise TypeError("graph must be a networkx.Graph") - + if not isinstance(self.B, pd.DataFrame): raise TypeError("omics_data must be a pandas DataFrame") - + graph_nodes = set(str(n) for n in self.G.nodes()) omics_cols = set(str(c) for c in self.B.columns) missing = graph_nodes - omics_cols - + if missing: logger.warning( f"{len(missing)} graph nodes missing from omics columns " @@ -174,7 +174,7 @@ def phen_omics_corr(self, nodes: List[Any]) -> Tuple[float, float]: return 0.0, 1.0 B_sub = self.B[valid_cols] - + if B_sub.shape[0] < 2: return 0.0, 1.0 @@ -199,7 +199,7 @@ def phen_omics_corr(self, nodes: List[Any]) -> Tuple[float, float]: pc1, y_vals = pc1[:n_limit], y_vals[:n_limit] corr, pvalue = pearsonr(pc1, y_vals) - + return (float(corr), float(pvalue)) if np.isfinite(corr) else (0.0, 1.0) except Exception as e: @@ -255,7 +255,7 @@ def sweep_cut(self, pr_scores: Dict[Any, float]) -> Dict[str, Any]: vol_S = sum(d for _, d in self.G.degree(current_cluster, weight="weight")) vol_T = sum(d for _, d in self.G.degree(complement, weight="weight")) - + if min(vol_S, vol_T) == 0: continue @@ -310,7 +310,7 @@ def generate_weighted_personalization( if not nodes_excl: contributions.append(0.0) continue - + corr_excl, _ = self.phen_omics_corr(nodes_excl) rho_i = abs_total - abs(corr_excl) contributions.append(rho_i) @@ -347,12 +347,12 @@ def run(self, seed_nodes: List[Any]) -> Dict[str, Any]: graph_nodes = set(self.G.nodes()) missing = set(seed_nodes) - graph_nodes - + if missing: raise ValueError(f"Seed nodes not in graph: {missing}") personalization = self.generate_weighted_personalization(seed_nodes) - + logger.info( f"Personalization: {len(personalization)} nodes, " f"max_weight={max(personalization.values()):.4f}, " @@ -392,4 +392,4 @@ def run(self, seed_nodes: List[Any]) -> Dict[str, Any]: else: logger.warning("Sweep cut found no valid cluster.") - return results \ No newline at end of file + return results diff --git a/bioneuralnet/clustering/hybrid_louvain.py b/bioneuralnet/clustering/hybrid_louvain.py index 2fb22ae..948cf71 100644 --- a/bioneuralnet/clustering/hybrid_louvain.py +++ b/bioneuralnet/clustering/hybrid_louvain.py @@ -1,23 +1,23 @@ r""" Hybrid Louvain-PageRank - Significant Subgraph Detection. -This module implements an iterative refinement algorithm that alternates -between community detection (Louvain) and local ranking (PageRank) to +This module implements an iterative refinement algorithm that alternates +between community detection (Louvain) and local ranking (PageRank) to identify phenotype-correlated subgraphs. References: - Abdel-Hafiz et al. (2022), "Significant Subgraph Detection in - Multi-omics Networks for Disease Pathway Identification," + Abdel-Hafiz et al. (2022), "Significant Subgraph Detection in + Multi-omics Networks for Disease Pathway Identification," Frontiers in Big Data. Algorithm: - The process alternates between global community detection and local + The process alternates between global community detection and local refinement until convergence: Iteration 1 (Global Scope): 1. Run Correlated Louvain on the full graph to optimize Hybrid Modularity. 2. Select the community with the highest phenotype correlation :math:`|\rho|`. - 3. Assign seed weights to these nodes based on their marginal + 3. Assign seed weights to these nodes based on their marginal contribution to :math:`\rho`. 4. Execute Correlated PageRank on the full graph. 5. Use a sweep cut to produce the initial refined subgraph. @@ -33,23 +33,23 @@ Notes: **Hybrid Modularity (Correlated Louvain)** Balances internal topological connectivity with phenotype correlation: - + .. math:: Q_{hybrid} = k_L Q + (1 - k_L) \rho - + **Hybrid Conductance (Correlated PageRank)** Balances the external cut/internal volume ratio with correlation: - + .. math:: \Phi_{hybrid} = k_P \Phi + (1 - k_P) \rho **Seed Weighting** - Teleportation probabilities :math:`\alpha_i` are weighted by a node's + Teleportation probabilities :math:`\alpha_i` are weighted by a node's marginal contribution: - + .. math:: \alpha_i = \frac{\rho_i}{\max(\rho_{seeds})} \cdot \alpha_{max} - + Where :math:`\rho_i = \rho(S) - \rho(S \setminus \{i\})`. """ @@ -106,7 +106,7 @@ def __init__( if isinstance(G, pd.DataFrame): logger.info("Converting adjacency DataFrame to NetworkX graph.") G = nx.from_pandas_adjacency(G) - + if not isinstance(G, nx.Graph): raise TypeError("G must be a networkx.Graph or adjacency DataFrame.") @@ -116,10 +116,10 @@ def __init__( graph_nodes = set(str(n) for n in G.nodes()) keep = [c for c in B.columns if str(c) in graph_nodes] dropped = len(B.columns) - len(keep) - + if dropped > 0: logger.info(f"Dropped {dropped} omics columns not in graph.") - + self.B = B.loc[:, keep].copy() if isinstance(Y, pd.DataFrame): @@ -315,7 +315,7 @@ def best_subgraph(self) -> Tuple[List[Any], float, int]: """ if self._best_idx is None: raise ValueError("Call run() first.") - + best = self._iterations[self._best_idx] - - return best["refined_nodes"], best["refined_rho"], best["iteration"] \ No newline at end of file + + return best["refined_nodes"], best["refined_rho"], best["iteration"] diff --git a/bioneuralnet/clustering/louvain.py b/bioneuralnet/clustering/louvain.py index c733203..286147d 100644 --- a/bioneuralnet/clustering/louvain.py +++ b/bioneuralnet/clustering/louvain.py @@ -1,7 +1,7 @@ r""" Standard Louvain Method for Community Detection - NumPy Implementation. -References: +References: Blondel et al. (2008), "Fast unfolding of communities in large networks." """ @@ -379,4 +379,4 @@ def history(self) -> List[Dict[str, Any]]: List[Dict[str, Any]]: A list of dictionaries containing stats for each level. """ - return list(self._history) \ No newline at end of file + return list(self._history) diff --git a/bioneuralnet/downstream_task/dpmon.py b/bioneuralnet/downstream_task/dpmon.py index 8e19f8b..67eda03 100644 --- a/bioneuralnet/downstream_task/dpmon.py +++ b/bioneuralnet/downstream_task/dpmon.py @@ -1,12 +1,12 @@ r""" DPMON: Optimized Network Embedding and Fusion for Disease Prediction. -This module implements an end-to-end Graph Neural Network (GNN) pipeline +This module implements an end-to-end Graph Neural Network (GNN) pipeline integrating network topology with subject-level omics data. References: - Hussein, S. et al. (2024), "Learning from Multi-Omics Networks to - Enhance Disease Prediction: An Optimized Network Embedding and + Hussein, S. et al. (2024), "Learning from Multi-Omics Networks to + Enhance Disease Prediction: An Optimized Network Embedding and Fusion Approach" IEEE BIBM. Algorithm: @@ -16,21 +16,21 @@ 1. Construct a multi-omics network. 2. Initialize node features using clinical correlation vectors. 3. Pass graph through a GNN (GAT/GCN/GIN). - + Phase 2: Dimensionality Reduction Compress embeddings into scalar weights via AutoEncoder/MLP. Phase 3: Fusion and Prediction - Fuse embeddings with subject-level data via element-wise + Fuse embeddings with subject-level data via element-wise multiplication (Feature Reweighting). Notes: The embedding space is optimized dynamically using the loss function: - + .. math:: L_{total} = L_{classification} + \lambda L_{regularization} - The fusion acts as a **Network-Guided Attention Mechanism**, + The fusion acts as a **Network-Guided Attention Mechanism**, amplifying features that are topologically central. """ @@ -73,7 +73,7 @@ os.environ["TUNE_DISABLE_IPY_WIDGETS"] = "1" os.environ["TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S"] = "0" os.environ["RAY_DEDUP_LOGS"] = "0" - + for logger_name in ("ray", "raylet", "ray.train.session", "ray.tune", "torch_geometric"): logging.getLogger(logger_name).setLevel(logging.WARNING) @@ -945,7 +945,7 @@ def get_stats(data_list): results_df = pd.DataFrame(all_fold_results) config_save_path = os.path.join(output_dir, "cv_tuning_results.csv") config_df = pd.DataFrame(fold_best_configs) - combined = pd.concat([results_df,config_df], axis=1) + combined = pd.concat([results_df,config_df], axis=1) combined.to_csv(config_save_path) logger.info(f"Successfully saved CV tuning parameters to: {config_save_path}") except Exception as e: @@ -1037,7 +1037,7 @@ def run_hyperparameter_tuning(X_train, y_train, adjacency_matrix, clinical_data, "X_val": torch.FloatTensor(X.iloc[val_idx].values), "y_val": torch.LongTensor(Y.iloc[val_idx].values), } - + fold_data_refs.append(ray.put(fold_tensors)) omics_network_ref = ray.put(omics_network_tg.cpu()) @@ -1066,7 +1066,7 @@ def tune_train_fn(config): "criterion": nn.CrossEntropyLoss() } folds.append(fold_dict) - + # one model + optimizer per inner fold models, optimizers = [], [] for _ in range(len(folds)): @@ -1103,7 +1103,7 @@ def tune_train_fn(config): epoch_val_auprs = [] for fi, fold in enumerate(folds): - # train step + # train step models[fi].train() optimizers[fi].zero_grad() out, _, _ = models[fi](fold["X_train"], omics_net) @@ -1116,7 +1116,7 @@ def tune_train_fn(config): models[fi].eval() with torch.no_grad(): val_out, _, _ = models[fi](fold["X_val"], omics_net) - vl = fold["criterion"](val_out, fold["y_val"]).item() + vl = fold["criterion"](val_out, fold["y_val"]).item() _, preds = torch.max(val_out, 1) probs = torch.softmax(val_out, dim=1) @@ -1126,7 +1126,7 @@ def tune_train_fn(config): va = (preds == fold["y_val"]).sum().item() / fold["y_val"].size(0) f1_mac = f1_score(y_true_np, predicted_np, average='macro', zero_division=0) - + try: n_classes = probs_np.shape[1] if n_classes == 2: @@ -1158,7 +1158,7 @@ def tune_train_fn(config): ckpt_dir = "trial_checkpoint" os.makedirs(ckpt_dir, exist_ok=True) - + torch.save( {"epoch": epoch, "model_state": models[0].state_dict()}, os.path.join(ckpt_dir, "checkpoint.pt"), @@ -1186,12 +1186,12 @@ def tune_train_fn(config): def short_dirname_creator(trial): return f"T{trial.trial_id}" - + #dummy reporter that fixes Ray-rune screen-clearing for jupyter notebooks class SilentReporter(CLIReporter): def should_report(self, trials, done=False): return False - + def report(self, trials, done, *sys_info): pass @@ -1256,7 +1256,7 @@ def report(self, trials, done, *sys_info): if new_num_samples == num_samples and new_gpu_per_trial == gpu_per_trial: logger.error("Cannot reduce num_samples or increase gpu_per_trial any further. Aborting.") raise - + logger.warning( f"Ray Tune failed (attempt {attempt + 1}). " f"Adjusting resources -> num_samples: {num_samples} to {new_num_samples}, " @@ -1301,7 +1301,7 @@ def train_model(model, criterion, optimizer, train_features, train_labels, epoch loss.backward() optimizer.step() - scheduler.step() + scheduler.step() if (epoch + 1) % 100 == 0 or epoch == 0: logger.debug(f"Epoch [{epoch+1}/{epoch_num}], Loss: {loss.item():.4f}") @@ -1482,7 +1482,7 @@ def __init__(self, input_dim: int, encoding_dim: int, architecture: str = "origi # 3-step funnel h1 = max(input_dim // 2, 8) h2 = max(input_dim // 4, 4) - + self.encoder = nn.Sequential( nn.Linear(input_dim, h1), nn.ReLU(), @@ -1495,7 +1495,7 @@ def __init__(self, input_dim: int, encoding_dim: int, architecture: str = "origi hidden_dim = max(input_dim // 2, encoding_dim * 2) if hidden_dim > input_dim: hidden_dim = input_dim - + self.encoder = nn.Sequential( nn.Linear(input_dim, hidden_dim), nn.ReLU(), diff --git a/bioneuralnet/downstream_task/subject_representation.py b/bioneuralnet/downstream_task/subject_representation.py index 6c7d115..9754f1a 100644 --- a/bioneuralnet/downstream_task/subject_representation.py +++ b/bioneuralnet/downstream_task/subject_representation.py @@ -29,6 +29,7 @@ from ..utils import get_logger logger = get_logger(__name__) + class SubjectRepresentation: """SubjectRepresentation Class for Integrating Network Embeddings into Omics Data. @@ -74,14 +75,14 @@ def __init__( raise ValueError("Omics data must be non-empty.") if embeddings is None: - self.logger.warning("No embeddings provided, please review documentation to see how to generate embeddings.") + logger.warning("No embeddings provided, please review documentation to see how to generate embeddings.") raise ValueError("Embeddings must be non-empty.") if not isinstance(embeddings, pd.DataFrame): raise ValueError("Embeddings must be provided as a pandas DataFrame.") if embeddings.empty: - self.logger.warning("No embeddings provided, please review documentation to see how to generate embeddings.") + logger.warning("No embeddings provided, please review documentation to see how to generate embeddings.") raise ValueError("Embeddings must be non-empty.") if tune and phenotype_data is None: @@ -144,7 +145,7 @@ def run(self) -> pd.DataFrame: logger.info("Starting Subject Representation workflow.") if self.embeddings.empty: - self.logger.warning( + logger.warning( "No embeddings provided. Please generate embeddings using GNNEmbeddings class.\nReturning original omics_data." ) return self.omics_data @@ -164,14 +165,14 @@ def run(self) -> pd.DataFrame: enhanced_omics_data = self._integrate_embeddings(reduced, method="multiply", alpha=1.0, beta=0.8) if enhanced_omics_data.empty: - self.logger.warning("Enhanced omics data is empty. Returning original omics_data.") + logger.warning("Enhanced omics data is empty. Returning original omics_data.") return self.omics_data logger.info(f"Subject Representation completed successfully. Final shape: {enhanced_omics_data.shape}") return enhanced_omics_data except Exception as e: - self.logger.error(f"Error in Subject Representation workflow: {e}") + logger.error(f"Error in Subject Representation workflow: {e}") raise def _reduce_embeddings(self, method: str, ae_params: Optional[dict[Any, Any]] = None, compressed_dim: int = 2) -> pd.DataFrame: @@ -273,7 +274,7 @@ def _integrate_embeddings(self, reduced: pd.DataFrame, method="multiply", alpha= missing_features = set(self.omics_data.columns) - set(reduced.index) if missing_features: - self.logger.warning(f"Missing {len(missing_features)} features in reduced embeddings: {list(missing_features)[:5]}") + logger.warning(f"Missing {len(missing_features)} features in reduced embeddings: {list(missing_features)[:5]}") if method == "multiply": # the reduced embeddings to match the omics data columns. @@ -295,7 +296,7 @@ def _integrate_embeddings(self, reduced: pd.DataFrame, method="multiply", alpha= # look for duplicate feature names. if not scaled_ranks.index.is_unique: - self.logger.error("Duplicate feature names detected in the reduced embeddings index.") + logger.error("Duplicate feature names detected in the reduced embeddings index.") # re-index to ensure we have a weight per omics_data feature. feature_weights = scaled_ranks.reindex(self.omics_data.columns,fill_value=0.0) @@ -309,7 +310,7 @@ def _integrate_embeddings(self, reduced: pd.DataFrame, method="multiply", alpha= if pd.notnull(weight_val): enhanced[feature] = beta * enhanced[feature] + (1 - beta) * (alpha * weight_val * enhanced[feature]) else: - self.logger.warning(f"Feature {feature} not found in the reduced weights.") + logger.warning(f"Feature {feature} not found in the reduced weights.") else: #Curently only supoort one method but left the parameter in case we supp0ort more in the future. raise ValueError(f"Unknown integration method: {method}.") @@ -383,7 +384,7 @@ def tune_helper(config): tune.report({"score": score}) except Exception as e: - self.logger.error(f"Tuning trial failed: {e}") + logger.error(f"Tuning trial failed: {e}") import traceback traceback.print_exc() tune.report({"score": 0.0}) @@ -410,7 +411,7 @@ def short_dirname_creator(trial): verbose=1, ) except TuneError as e: - self.logger.warning(f"Tuning completed with failures: {e}") + logger.warning(f"Tuning completed with failures: {e}") analysis = None if len(e.args) > 1 and hasattr(e.args[1], "get_best_trial"): analysis = e.args[1] @@ -423,9 +424,9 @@ def short_dirname_creator(trial): best_trial = analysis.get_best_trial("score", "max", "last") best_score = best_trial.last_result.get("score", 0.0) except Exception as e: - self.logger.error(f"Could not retrieve best trial details: {e}") + logger.error(f"Could not retrieve best trial details: {e}") else: - self.logger.error("Analysis object is invalid or missing get_best_trial().") + logger.error("Analysis object is invalid or missing get_best_trial().") logger.info(f"Best trial final score: {best_score}") diff --git a/bioneuralnet/external_tools/__init__.py b/bioneuralnet/external_tools/__init__.py index c806723..f923913 100644 --- a/bioneuralnet/external_tools/__init__.py +++ b/bioneuralnet/external_tools/__init__.py @@ -2,8 +2,8 @@ External Tools Module This module provides utility functions for interoperability between Python and R. -It handles the execution of external R scripts to extract, convert, and load -RData structures (such as cross-validation folds and network matrices) into +It handles the execution of external R scripts to extract, convert, and load +RData structures (such as cross-validation folds and network matrices) into standardized Python data structures like pandas DataFrames and NumPy arrays. Available Functions: @@ -20,4 +20,4 @@ "extract_and_load_folds", "load_r_export_folds", "rdata_to_df" -] \ No newline at end of file +] diff --git a/bioneuralnet/external_tools/extract_CVfold.R b/bioneuralnet/external_tools/extract_CVfold.R index dc83af8..5911535 100644 --- a/bioneuralnet/external_tools/extract_CVfold.R +++ b/bioneuralnet/external_tools/extract_CVfold.R @@ -10,7 +10,7 @@ options(scipen = 999) success <- tryCatch({ CVdata_path <- file.path(output_path, "CVFold.Rdata") load(CVdata_path) - + network_path <- file.path(output_path, "globalNetwork.Rdata") load(network_path) @@ -18,59 +18,59 @@ success <- tryCatch({ dir.create(export_base, showWarnings = FALSE, recursive = TRUE) for (f_name in names(folddata)) { - + fold_dir <- file.path(export_base, f_name) dir.create(fold_dir, showWarnings = FALSE) - + current_fold <- folddata[[f_name]] - + for (i in 1:length(current_fold$X_train)) { - write.csv(current_fold$X_train[[i]], - file = file.path(fold_dir, paste0("X_train_Omics_", i, ".csv")), + write.csv(current_fold$X_train[[i]], + file = file.path(fold_dir, paste0("X_train_Omics_", i, ".csv")), row.names = FALSE) } - + for (i in 1:length(current_fold$X_test)) { - write.csv(current_fold$X_test[[i]], - file = file.path(fold_dir, paste0("X_test_Omics_", i, ".csv")), + write.csv(current_fold$X_test[[i]], + file = file.path(fold_dir, paste0("X_test_Omics_", i, ".csv")), row.names = FALSE) } if (is.list(current_fold$Y_train)) { for (i in 1:length(current_fold$Y_train)) { - write.csv(current_fold$Y_train[[i]], - file = file.path(fold_dir, paste0("Y_train_Omics_", i, ".csv")), + write.csv(current_fold$Y_train[[i]], + file = file.path(fold_dir, paste0("Y_train_Omics_", i, ".csv")), row.names = FALSE) } } else { - write.csv(current_fold$Y_train, - file = file.path(fold_dir, "Y_train.csv"), + write.csv(current_fold$Y_train, + file = file.path(fold_dir, "Y_train.csv"), row.names = FALSE) } if (is.list(current_fold$Y_test)) { for (i in 1:length(current_fold$Y_test)) { - write.csv(current_fold$Y_test[[i]], - file = file.path(fold_dir, paste0("Y_test_Omics_", i, ".csv")), + write.csv(current_fold$Y_test[[i]], + file = file.path(fold_dir, paste0("Y_test_Omics_", i, ".csv")), row.names = FALSE) } } else { - write.csv(current_fold$Y_test, - file = file.path(fold_dir, "Y_test.csv"), + write.csv(current_fold$Y_test, + file = file.path(fold_dir, "Y_test.csv"), row.names = FALSE) } - + message("Finished exporting: ", f_name) } message("All files (X and Y) saved to: ", export_base) - write.csv(globalNetwork$AdjacencyMatrix, - file.path(output_path, "globalNetwork_R_matrix.csv"), + write.csv(globalNetwork$AdjacencyMatrix, + file.path(output_path, "globalNetwork_R_matrix.csv"), row.names = TRUE) TRUE - + }, error = function(e) { message("Error during extraction: ", e$message) FALSE @@ -80,4 +80,4 @@ if (!success) { quit(save = "no", status = 1) } else { quit(save = "no", status = 0) -} \ No newline at end of file +} diff --git a/bioneuralnet/external_tools/extract_CVfold.py b/bioneuralnet/external_tools/extract_CVfold.py index 151688f..a42236a 100644 --- a/bioneuralnet/external_tools/extract_CVfold.py +++ b/bioneuralnet/external_tools/extract_CVfold.py @@ -18,7 +18,7 @@ def load_r_export_folds(base_path: str, num_omics: int, k: int = 5) -> dict: Returns: - dict: A dictionary where keys are fold names (e.g., 'fold_1') and values are + dict: A dictionary where keys are fold names (e.g., 'fold_1') and values are dictionaries containing 'X_train' (list of numpy arrays), 'X_test' (list of numpy arrays), 'Y_train' (numpy array), and 'Y_test' (numpy array). @@ -29,40 +29,40 @@ def load_r_export_folds(base_path: str, num_omics: int, k: int = 5) -> dict: """ folddata = {} print(f"Loading R-exported folds from: {base_path}") - + for i in range(1, k + 1): fold_key = f"fold_{i}" fold_dir = os.path.join(base_path, fold_key) - + if not os.path.exists(fold_dir): raise FileNotFoundError(f"Could not find directory: {fold_dir}") - + x_train_list = [] x_test_list = [] - + for omics_idx in range(1, num_omics + 1): xtrain_path = os.path.join(fold_dir, f"X_train_Omics_{omics_idx}.csv") xtest_path = os.path.join(fold_dir, f"X_test_Omics_{omics_idx}.csv") - + x_train = pd.read_csv(xtrain_path).to_numpy() x_test = pd.read_csv(xtest_path).to_numpy() - + x_train_list.append(x_train) x_test_list.append(x_test) - + ytrain_path = os.path.join(fold_dir, "Y_train.csv") ytest_path = os.path.join(fold_dir, "Y_test.csv") - + y_train = pd.read_csv(ytrain_path).iloc[:, 0].to_numpy() y_test = pd.read_csv(ytest_path).iloc[:, 0].to_numpy() - + folddata[fold_key] = { "X_train": x_train_list, "X_test": x_test_list, "Y_train": y_train, "Y_test": y_test } - + print(f"Successfully loaded {k} folds.") return folddata @@ -97,7 +97,7 @@ def extract_and_load_folds(output_path: str, num_omics: int = 3, k: int = 5) -> raise EnvironmentError("Rscript not found in system path.") target_dir = Path(output_path).resolve() - + script_path = (Path(__file__).parent / "extract_CVfold.R").resolve() if not script_path.exists(): @@ -110,7 +110,7 @@ def extract_and_load_folds(output_path: str, num_omics: int = 3, k: int = 5) -> if proc.stdout: print(f"Rscript stdout:\n{proc.stdout}") - + if proc.stderr: if proc.returncode == 0: print(f"Rscript messages:\n{proc.stderr}") @@ -122,4 +122,4 @@ def extract_and_load_folds(output_path: str, num_omics: int = 3, k: int = 5) -> export_base_dir = os.path.join(str(target_dir), "CV_Export") - return load_r_export_folds(base_path=export_base_dir, num_omics=num_omics, k=k) \ No newline at end of file + return load_r_export_folds(base_path=export_base_dir, num_omics=num_omics, k=k) diff --git a/bioneuralnet/external_tools/rdata_to_df.R b/bioneuralnet/external_tools/rdata_to_df.R index 9d2f5f8..a1f584f 100644 --- a/bioneuralnet/external_tools/rdata_to_df.R +++ b/bioneuralnet/external_tools/rdata_to_df.R @@ -11,7 +11,7 @@ rm(list = setdiff(ls(), c("orig_input", "orig_csv", "target_obj"))) success <- tryCatch({ load(orig_input) - + loaded_objs <- setdiff(ls(), c("orig_input", "orig_csv", "target_obj")) message("Loaded objects: ", paste(loaded_objs, collapse = ", ")) @@ -97,4 +97,4 @@ if (!success) { quit(save = "no", status = 1) } else { quit(save = "no", status = 0) -} \ No newline at end of file +} diff --git a/bioneuralnet/external_tools/rdata_to_df.py b/bioneuralnet/external_tools/rdata_to_df.py index 4b45972..c0b7cd2 100644 --- a/bioneuralnet/external_tools/rdata_to_df.py +++ b/bioneuralnet/external_tools/rdata_to_df.py @@ -60,4 +60,4 @@ def rdata_to_df(rdata_file: Path, csv_file: Path, Object=None) -> pd.DataFrame: if not found: raise FileNotFoundError(f"No CSV at {csv_file}, nor in {possibilities}") - return pd.read_csv(csv_file, index_col=0) \ No newline at end of file + return pd.read_csv(csv_file, index_col=0) diff --git a/bioneuralnet/metrics/correlation.py b/bioneuralnet/metrics/correlation.py index 9e4de7a..817bca1 100644 --- a/bioneuralnet/metrics/correlation.py +++ b/bioneuralnet/metrics/correlation.py @@ -43,7 +43,6 @@ def omics_correlation(omics: pd.DataFrame, pheno: pd.DataFrame) -> Tuple[float, return corr, pvalue - def cluster_pca_correlation(cluster_df: pd.DataFrame, pheno: pd.DataFrame) -> tuple: """Computes the Pearson correlation coefficient between PC1 of a cluster and phenotype. @@ -90,8 +89,8 @@ def cluster_pca_correlation(cluster_df: pd.DataFrame, pheno: pd.DataFrame) -> tu def cluster_correlation(louvain_cluster: pd.DataFrame) -> pd.DataFrame: """Builds a new correlation network from an extracted cluster/subnetwork. - - This function is often used as an intermediate step between HybridLouvain and plotting. + + This function is often used as an intermediate step between HybridLouvain and plotting. It allows the subnetwork to reveal its internal topological structure by computing the Pearson correlation of node connectivity profiles. Args: @@ -103,7 +102,9 @@ def cluster_correlation(louvain_cluster: pd.DataFrame) -> pd.DataFrame: pd.DataFrame: Adjacency matrix. """ adjacency_matrix = louvain_cluster.corr(method="pearson") - np.fill_diagonal(adjacency_matrix.values, 0) + vals = adjacency_matrix.to_numpy().copy() + np.fill_diagonal(vals, 0) + adjacency_matrix = pd.DataFrame(vals, index=adjacency_matrix.index, columns=adjacency_matrix.columns) adjacency_matrix = adjacency_matrix.fillna(0) - return adjacency_matrix \ No newline at end of file + return adjacency_matrix diff --git a/bioneuralnet/network/__init__.py b/bioneuralnet/network/__init__.py index ce5cf3b..d703f0d 100644 --- a/bioneuralnet/network/__init__.py +++ b/bioneuralnet/network/__init__.py @@ -1,8 +1,8 @@ """Network Construction and Analysis. -This module provides tools for generating, searching, and analyzing multi-omics -networks. It includes methods for building networks from raw tabular data using -similarity, correlation, thresholding, and Gaussian KNN, as well as +This module provides tools for generating, searching, and analyzing multi-omics +networks. It includes methods for building networks from raw tabular data using +similarity, correlation, thresholding, and Gaussian KNN, as well as phenotype-driven strategies like PySmCCNet. """ @@ -29,4 +29,4 @@ "threshold_network", "gaussian_knn_network", "auto_pysmccnet", -] \ No newline at end of file +] diff --git a/bioneuralnet/network/pysmccnet/analysis.py b/bioneuralnet/network/pysmccnet/analysis.py index 3e0c260..e4938a7 100644 --- a/bioneuralnet/network/pysmccnet/analysis.py +++ b/bioneuralnet/network/pysmccnet/analysis.py @@ -36,25 +36,25 @@ def data_preprocess(X: Union[pd.DataFrame, np.ndarray], covariates: Optional[Uni """ if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) - + data = X.copy() - + # cv filtering if is_cv: if device is None: device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - + data_tensor = torch.tensor(data.values, dtype=dtype, device=device) N = data_tensor.shape[0] col_mean = data_tensor.mean(dim=0) col_std = torch.sqrt(torch.sum((data_tensor - col_mean.unsqueeze(0)) ** 2, dim=0) / (N - 1)) - + safe_mean = torch.where(col_mean == 0, torch.ones_like(col_mean), col_mean) cv_values = torch.abs(col_std / safe_mean).cpu().numpy() - + if cv_quantile is None: raise ValueError("cv filtering quantile must be provided!") - + thresh = np.quantile(cv_values, cv_quantile) keep_mask = cv_values > thresh data = data.loc[:, keep_mask] @@ -62,7 +62,7 @@ def data_preprocess(X: Union[pd.DataFrame, np.ndarray], covariates: Optional[Uni # center and scale if center: data = data - data.mean(axis=0) - + if scale: std_devs = data.std(axis=0, ddof=1) std_devs[std_devs == 0] = 1 @@ -72,10 +72,10 @@ def data_preprocess(X: Union[pd.DataFrame, np.ndarray], covariates: Optional[Uni if covariates is not None: if not isinstance(covariates, pd.DataFrame): covariates = pd.DataFrame(covariates) - + if covariates.shape[1] == 0: raise ValueError('Covariate dataframe must have at least 1 column!') - + if not data.index.equals(covariates.index): common_idx = data.index.intersection(covariates.index) data = data.loc[common_idx] @@ -83,18 +83,18 @@ def data_preprocess(X: Union[pd.DataFrame, np.ndarray], covariates: Optional[Uni if device is None: device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - + data_tensor = torch.tensor(data.values, dtype=dtype, device=device) cov_tensor = torch.tensor(covariates.values, dtype=dtype, device=device) - + ones_col = torch.ones((cov_tensor.shape[0], 1), device=device, dtype=dtype) X_cov = torch.cat([ones_col, cov_tensor], dim=1) - + result = torch.linalg.lstsq(X_cov, data_tensor) beta = result.solution - + residuals = data_tensor - torch.matmul(X_cov, beta) - + data = pd.DataFrame(residuals.cpu().numpy(), index=data.index, columns=data.columns) return data @@ -116,77 +116,77 @@ def get_can_cor_multi(X: List[torch.Tensor], cc_coef: Union[np.ndarray, torch.Te """ device = X[0].device dtype = X[0].dtype - + if not isinstance(Y, torch.Tensor): Y = torch.tensor(np.array(Y), device=device, dtype=dtype) - + if not isinstance(cc_coef, torch.Tensor): cc_coef = torch.tensor(np.array(cc_coef), device=device, dtype=dtype) - + num_datasets = len(X) - + # projections projections = [] for i in range(num_datasets): w = cc_weight[i] if isinstance(cc_weight[i], torch.Tensor) else torch.tensor(np.array(cc_weight[i]), device=device, dtype=dtype) - + if w.ndim == 1: w = w.view(-1, 1) - + proj = torch.matmul(X[i], w) projections.append(proj.reshape(-1, 1)) - + omics_projection = torch.cat(projections, dim=1) - + # correlation matrix k = omics_projection.shape[1] centered = omics_projection - omics_projection.mean(dim=0, keepdim=True) - + N = omics_projection.shape[0] cov_mat = torch.matmul(centered.t(), centered) / (N - 1) std_vec = torch.sqrt(torch.diag(cov_mat)) - + std_outer = std_vec.unsqueeze(1) * std_vec.unsqueeze(0) std_outer = torch.where(std_outer == 0, torch.ones_like(std_outer), std_outer) - + omics_cor_mat = cov_mat / std_outer - + # between-omics correlations if k > 1: row_idx, col_idx = torch.triu_indices(k, k, offset=1) omics_cor_vec = omics_cor_mat[row_idx, col_idx] else: omics_cor_vec = torch.tensor([], device=device, dtype=dtype) - + cc_coef_between = cc_coef[:num_datasets] cc_between = r_vec_mult_sum(cc_coef_between, omics_cor_vec) - + # omics-phenotype correlations y_flat = Y.flatten() y_centered = y_flat - torch.mean(y_flat) y_norm = torch.sqrt(torch.sum(y_centered ** 2)) - + pheno_cor_list = [] for i in range(k): col = omics_projection[:, i] col_centered = col - torch.mean(col) col_norm = torch.sqrt(torch.sum(col_centered ** 2)) - + denom = col_norm * y_norm if denom == 0: corr = torch.tensor(0.0, device=device, dtype=dtype) else: corr = torch.sum(col_centered * y_centered) / denom - + if torch.isnan(corr): corr = torch.tensor(0.0, device=device, dtype=dtype) - + pheno_cor_list.append(corr) - + pheno_cor_vec = torch.stack(pheno_cor_list) cc_coef_pheno = cc_coef[num_datasets:] pheno_cor_val = r_vec_mult_sum(pheno_cor_vec, cc_coef_pheno) - + return cc_between + pheno_cor_val @@ -208,32 +208,32 @@ def get_abar(ws: Union[pd.DataFrame, np.ndarray, torch.Tensor, List[float]], fea ws = torch.tensor(ws.values, dtype=torch.float32) elif not isinstance(ws, torch.Tensor): ws = torch.tensor(ws, dtype=torch.float32) - + w_abs = torch.abs(ws) - + # compute similarity matrix if w_abs.ndim == 1: abar = torch.outer(w_abs, w_abs) else: abar = torch.matmul(w_abs, w_abs.T) - + # zero diagonal abar.fill_diagonal_(0) - + # normalize max_val = torch.max(abar) if max_val > 0: abar = abar / max_val - + # format output if feature_label is None: raise ValueError("Need to provide FeatureLabel.") - + if len(feature_label) != abar.shape[0]: raise ValueError(f"FeatureLabel length ({len(feature_label)}) does not match matrix rows ({abar.shape[0]}).") abar_cpu = abar.detach().cpu().numpy() - + return pd.DataFrame(abar_cpu, index=feature_label, columns=feature_label) def get_omics_modules(Abar: pd.DataFrame, cut_height: float = 1 - 0.1**10) -> List[List[int]]: @@ -265,7 +265,7 @@ def get_omics_modules(Abar: pd.DataFrame, cut_height: float = 1 - 0.1**10) -> Li labels = fcluster(Z, t=cut_height, criterion='distance') - # Logic here only keep leaf nodes from merges below cut_height + # Logic here only keep leaf nodes from merges below cut_height merged_below = Z[:, 2] < cut_height n = abar_mat.shape[0] @@ -462,4 +462,4 @@ def prune_modules(Abar: pd.DataFrame, X_combined: np.ndarray, Y: np.ndarray, mod with open(os.path.join(saving_dir, save_name), 'wb') as f: pickle.dump(module_result, f) - return results \ No newline at end of file + return results diff --git a/bioneuralnet/network/pysmccnet/core.py b/bioneuralnet/network/pysmccnet/core.py index 0cfde2a..afd0384 100644 --- a/bioneuralnet/network/pysmccnet/core.py +++ b/bioneuralnet/network/pysmccnet/core.py @@ -21,13 +21,13 @@ def my_get_crit(xlist: List[torch.Tensor], ws: List[torch.Tensor], pair_cc: Unio """ # device management device = xlist[0].device - + if not isinstance(cc_coef, torch.Tensor): cc_coef = torch.tensor(cc_coef, device=device, dtype=torch.float32) - + if isinstance(pair_cc, torch.Tensor): pair_cc = pair_cc.cpu().numpy() - + num_pairs = pair_cc.shape[1] crits = [] @@ -35,12 +35,12 @@ def my_get_crit(xlist: List[torch.Tensor], ws: List[torch.Tensor], pair_cc: Unio for k in range(num_pairs): i = int(pair_cc[0, k]) j = int(pair_cc[1, k]) - + proj_i = torch.matmul(xlist[i], ws[i]) proj_j = torch.matmul(xlist[j], ws[j]) - + val = torch.matmul(proj_i.t(), proj_j) - + crits.append(val.view(-1)) # weighted sum @@ -49,7 +49,7 @@ def my_get_crit(xlist: List[torch.Tensor], ws: List[torch.Tensor], pair_cc: Unio crit = torch.sum(crits_vec * cc_coef) else: crit = torch.tensor(0.0, device=device) - + return crit.item() def my_get_cors(xlist: List[torch.Tensor], ws: List[torch.Tensor], pair_cc: Union[np.ndarray, torch.Tensor], cc_coef: Union[np.ndarray, torch.Tensor]) -> float: @@ -68,54 +68,54 @@ def my_get_cors(xlist: List[torch.Tensor], ws: List[torch.Tensor], pair_cc: Unio """ device = xlist[0].device - + if not isinstance(cc_coef, torch.Tensor): cc_coef = torch.tensor(cc_coef, device=device, dtype=torch.float32) - + if isinstance(pair_cc, torch.Tensor): pair_cc = pair_cc.cpu().numpy() - + num_pairs = pair_cc.shape[1] ccs = [] for k in range(num_pairs): i = int(pair_cc[0, k]) j = int(pair_cc[1, k]) - + # calculate projections u = torch.matmul(xlist[i], ws[i]).flatten() v = torch.matmul(xlist[j], ws[j]).flatten() - + # pearson correlation u_mean = torch.mean(u) v_mean = torch.mean(v) - + u_centered = u - u_mean v_centered = v - v_mean - + numerator = torch.sum(u_centered * v_centered) - + denom_u = torch.sqrt(torch.sum(u_centered**2)) denom_v = torch.sqrt(torch.sum(v_centered**2)) denominator = denom_u * denom_v - + if denominator == 0: corr = torch.tensor(0.0, device=device) else: corr = numerator / denominator - + if torch.isnan(corr): corr = torch.tensor(0.0, device=device) - + ccs.append(corr) - + # weighted sum if len(ccs) > 0: ccs_vec = torch.stack(ccs) cors = torch.sum(ccs_vec * cc_coef) else: cors = torch.tensor(0.0, device=device) - + return cors.item() @@ -143,26 +143,26 @@ def my_update_w(xlist: List[torch.Tensor], i: int, K: int, sumabsthis: float, ws """ device = xlist[i].device dtype = xlist[i].dtype - + if isinstance(cc_coef, torch.Tensor): cc_coef_list = cc_coef.cpu().tolist() elif isinstance(cc_coef, np.ndarray): cc_coef_list = cc_coef.tolist() else: cc_coef_list = list(cc_coef) - + if isinstance(pair_cc, torch.Tensor): pair_cc = pair_cc.cpu().numpy() else: pair_cc = np.array(pair_cc) - + tots = 0 num_pairs = len(cc_coef_list) - + # phase 1: matrix operations (gpu) for x in range(num_pairs): pairx = pair_cc[:, x] - + if pairx[0] != i and pairx[1] != i: continue else: @@ -170,41 +170,41 @@ def my_update_w(xlist: List[torch.Tensor], i: int, K: int, sumabsthis: float, ws j = int(pairx[1]) elif pairx[1] == i: j = int(pairx[0]) - + Xi = xlist[i] Xj = xlist[j] - + diagmat = torch.matmul( torch.matmul(ws_final[i].t(), Xi.t()), torch.matmul(Xj, ws_final[j]) ) - + diagmat = torch.diag(torch.diag(diagmat)) - + term1 = torch.matmul(Xi.t(), torch.matmul(Xj, ws[j])) - + term2_inner = torch.matmul(diagmat, torch.matmul(ws_final[j].t(), ws[j])) term2 = torch.matmul(ws_final[i], term2_inner) - + y = term1 - term2 y = y * cc_coef_list[x] - + tots = tots + y - + # phase 2: scalar optimization (cpu) if type == "standard": tots_cpu = tots.cpu().numpy() - + sumabsthis = binary_search(tots_cpu, sumabsthis) numerator = soft(tots_cpu, sumabsthis) denominator = l2n(numerator) w_cpu = numerator / denominator - + w = torch.tensor(w_cpu, device=device, dtype=dtype) - + else: raise ValueError("Current version requires all element types to be standard (not ordered).") - + return w def my_multi_cca(xlist: List[torch.Tensor], penalty: Optional[Union[float, List[float], np.ndarray]] = None, ws: Optional[List[torch.Tensor]] = None, niter: int = 25, type: str = "standard", ncomponents: int = 1, trace: bool = True, standardize: bool = True, cc_coef: Optional[Union[List[float], np.ndarray, torch.Tensor]] = None) -> Dict[str, Any]: @@ -230,7 +230,7 @@ def my_multi_cca(xlist: List[torch.Tensor], penalty: Optional[Union[float, List[ # device management device = xlist[0].device dtype = xlist[0].dtype - + last_dims = xlist[-1].shape last_ncol = last_dims[1] if len(last_dims) > 1 else 1 @@ -240,29 +240,29 @@ def my_multi_cca(xlist: List[torch.Tensor], penalty: Optional[Union[float, List[ pairs = list(itertools.combinations(range(K), 2)) pair_cc = np.array(pairs).T num_cc = pair_cc.shape[1] - + if cc_coef is None: cc_coef = torch.ones(num_cc, device=device, dtype=dtype) else: if not isinstance(cc_coef, torch.Tensor): cc_coef = torch.tensor(cc_coef, device=device, dtype=dtype) - + if cc_coef.numel() != num_cc: raise ValueError(f"Invalid coefficients. Provide {num_cc} values.") - + if isinstance(type, str): if type != "standard": raise ValueError("Phenotype data must be continuous/standard.") type_vec = np.array([type] * K) else: type_vec = np.array(type) - + if len(type_vec) != K: raise ValueError("Type must be vector of length 1 or K.") - + if standardize: xlist = [r_scale_torch(x) for x in xlist] - + if ws is not None: make_null = False for i in range(K): @@ -270,22 +270,22 @@ def my_multi_cca(xlist: List[torch.Tensor], penalty: Optional[Union[float, List[ make_null = True if make_null: ws = None - + if ws is None: ws = [] for i in range(K): U, S, Vh = torch.linalg.svd(xlist[i], full_matrices=False) V = Vh.T ws.append(V[:, :ncomponents]) - + ws_init = [w.clone() for w in ws] - + if penalty is None: penalty = np.full(K, np.nan) for k in range(K): if type_vec[k] == "standard": penalty[k] = 4 - + if np.ndim(penalty) == 0: penalty = np.full(K, penalty) else: @@ -294,41 +294,41 @@ def my_multi_cca(xlist: List[torch.Tensor], penalty: Optional[Union[float, List[ ws_final = [w.clone() for w in ws_init] for i in range(K): ws_final[i] = torch.zeros((xlist[i].shape[1], ncomponents), device=device, dtype=dtype) - + cors = [] - + # optimization loop for comp in range(ncomponents): ws_curr = [] for i in range(K): ws_curr.append(ws_init[i][:, comp].reshape(-1, 1)) - + curiter = 1 crit_old = -10.0 crit = -20.0 storecrits = [] - + while (curiter <= niter and abs(crit_old - crit) / abs(crit_old) > 0.001 and crit_old != 0): - + crit_old = crit crit = my_get_crit(xlist, ws_curr, pair_cc, cc_coef) storecrits.append(crit) - + if trace: print(curiter, end=" ", flush=True) curiter += 1 - + for i in range(K): ws_curr[i] = my_update_w(xlist, i, K, penalty[i], ws_curr, ws_final, pair_cc, cc_coef, type=type_vec[i]) - + if trace: print("") - + for i in range(K): ws_final[i][:, comp] = ws_curr[i].flatten() - + cors.append(my_get_cors(xlist, ws_curr, pair_cc, cc_coef)) - + return { "ws": ws_final, "ws_init": ws_init, @@ -337,14 +337,14 @@ def my_multi_cca(xlist: List[torch.Tensor], penalty: Optional[Union[float, List[ "penalty": penalty, "cors": cors } - + else: # branch 2: phenotype included K = len(xlist) pairs = list(itertools.combinations(range(K), 2)) pair_cc = np.array(pairs).T num_cc = pair_cc.shape[1] - + if cc_coef is None: cc_coef = torch.ones(num_cc, device=device, dtype=dtype) else: @@ -357,10 +357,10 @@ def my_multi_cca(xlist: List[torch.Tensor], penalty: Optional[Union[float, List[ type_vec = np.array([type] * K) else: type_vec = np.array(type) - + if standardize: xlist = [r_scale_torch(x) for x in xlist] - + if ws is not None: make_null = False for i in range(K - 1): @@ -368,7 +368,7 @@ def my_multi_cca(xlist: List[torch.Tensor], penalty: Optional[Union[float, List[ make_null = True if make_null: ws = None - + if ws is None: ws = [] for i in range(K - 1): @@ -377,15 +377,15 @@ def my_multi_cca(xlist: List[torch.Tensor], penalty: Optional[Union[float, List[ ws.append(V[:, :ncomponents]) # phenotype weight fixed at 1.0 ws.append(torch.tensor([[1.0]], device=device, dtype=dtype)) - + ws_init = [w.clone() for w in ws] - + if penalty is None: penalty = np.full(K, np.nan) for k in range(K): if type_vec[k] == "standard": penalty[k] = 4 - + if np.ndim(penalty) == 0: penalty = np.full(K, penalty) else: @@ -394,43 +394,43 @@ def my_multi_cca(xlist: List[torch.Tensor], penalty: Optional[Union[float, List[ ws_final = [w.clone() for w in ws_init] for i in range(K - 1): ws_final[i] = torch.zeros((xlist[i].shape[1], ncomponents), device=device, dtype=dtype) - + cors = [] - + # optimization loop for comp in range(ncomponents): ws_curr = [] for i in range(K - 1): ws_curr.append(ws_init[i][:, comp].reshape(-1, 1)) ws_curr.append(torch.tensor([[1.0]], device=device, dtype=dtype)) - + curiter = 1 crit_old = -10.0 crit = -20.0 storecrits = [] - + while (curiter <= niter and abs(crit_old - crit) / abs(crit_old) > 0.001 and crit_old != 0): - + crit_old = crit crit = my_get_crit(xlist, ws_curr, pair_cc, cc_coef) storecrits.append(crit) - + if trace: print(curiter, end=" ", flush=True) curiter += 1 - + # update k-1 datasets for i in range(K - 1): ws_curr[i] = my_update_w(xlist, i, K, penalty[i], ws_curr, ws_final, pair_cc, cc_coef, type=type_vec[i]) - + if trace: print("") - + for i in range(K - 1): ws_final[i][:, comp] = ws_curr[i].flatten() - + cors.append(my_get_cors(xlist, ws_curr, pair_cc, cc_coef)) - + return { "ws": ws_final, "ws_init": ws_init, @@ -438,4 +438,4 @@ def my_multi_cca(xlist: List[torch.Tensor], penalty: Optional[Union[float, List[ "type": type_vec, "penalty": penalty, "cors": cors - } \ No newline at end of file + } diff --git a/bioneuralnet/network/pysmccnet/math_helpers.py b/bioneuralnet/network/pysmccnet/math_helpers.py index b32df18..4969db6 100644 --- a/bioneuralnet/network/pysmccnet/math_helpers.py +++ b/bioneuralnet/network/pysmccnet/math_helpers.py @@ -53,10 +53,10 @@ def binary_search(argu: Union[np.ndarray, list], sumabs: float) -> float: """ argu = np.array(argu) - + # calculate norm norm_val = l2n(argu) - + # check conditions if norm_val == 0 or np.sum(np.abs(argu / norm_val)) <= sumabs: return 0 @@ -64,21 +64,21 @@ def binary_search(argu: Union[np.ndarray, list], sumabs: float) -> float: lam1 = 0 lam2 = np.max(np.abs(argu)) - 1e-5 iter_count = 1 - + while iter_count < 150: mid_val = (lam1 + lam2) / 2 su = soft(argu, mid_val) - + su_norm = l2n(su) - + if np.sum(np.abs(su / su_norm)) < sumabs: lam2 = mid_val else: lam1 = mid_val - + if (lam2 - lam1) < 1e-6: return (lam1 + lam2) / 2 - + iter_count += 1 warnings.warn("Didn't quite converge") @@ -103,33 +103,33 @@ def r_vec_mult_sum(v1: Union[torch.Tensor, np.ndarray, list], v2: Union[torch.Te v1 = torch.tensor(np.array(v1).flatten(), dtype=torch.float64) else: v1 = v1.flatten() - + if not isinstance(v2, torch.Tensor): v2 = torch.tensor(np.array(v2).flatten(), dtype=torch.float64) else: v2 = v2.flatten() - + # device alignment device = v1.device v2 = v2.to(device) - + n1 = v1.numel() n2 = v2.numel() - + if n1 == 0 or n2 == 0: return 0.0 - + target_len = max(n1, n2) - + # recycling via repeat if n1 < target_len: repeats = (target_len // n1) + 1 v1 = v1.repeat(repeats)[:target_len] - + if n2 < target_len: repeats = (target_len // n2) + 1 v2 = v2.repeat(repeats)[:target_len] - + return torch.sum(v1 * v2).item() @@ -148,20 +148,20 @@ def r_scale_torch(x: Union[torch.Tensor, np.ndarray, list]) -> torch.Tensor: # convert to tensor if not isinstance(x, torch.Tensor): x = torch.tensor(x, dtype=torch.float32) - + # handle 1d arrays if x.ndim == 1: x = x.view(-1, 1) - + # calculate mean mean = torch.mean(x, dim=0) - + # calculate std std = torch.std(x, dim=0, unbiased=True) - + # handle constant columns std[std == 0] = 1.0 - + return (x - mean) / std @@ -179,20 +179,20 @@ def r_scale(x: Union[np.ndarray, list]) -> np.ndarray: """ # convert to float numpy x = np.array(x, dtype=float) - + # handle 1d arrays if x.ndim == 1: x = x.reshape(-1, 1) - + # calculate mean mean = np.mean(x, axis=0) - + # calculate std - std = np.std(x, axis=0, ddof=1) - + std = np.std(x, axis=0, ddof=1) + # handle constant columns std[std == 0] = 1.0 - + return (x - mean) / std def _splsda(x: np.ndarray, y: Union[np.ndarray, list], K: int = 3, eta: float = 0.5, kappa: float = 0.5, scale_x: bool = False) -> Dict[str, Union[np.ndarray, List[int]]]: @@ -214,16 +214,16 @@ def _splsda(x: np.ndarray, y: Union[np.ndarray, list], K: int = 3, eta: float = """ n, p = x.shape y_vec = np.array(y).flatten().astype(float) - + if scale_x: x = r_scale(x) - + X_res = x.copy() T_scores = np.zeros((n, K)) W_full = np.zeros((p, K)) - + K_actual = min(K, min(n, p)) - + for k in range(K_actual): # direction vector if kappa > 0: @@ -236,52 +236,52 @@ def _splsda(x: np.ndarray, y: Union[np.ndarray, list], K: int = 3, eta: float = w = X_res.T @ y_vec else: w = X_res.T @ y_vec - + # normalize w_norm = np.linalg.norm(w) if w_norm > 0: w = w / w_norm - + # soft thresholding threshold = eta * np.max(np.abs(w)) w_sparse = np.sign(w) * np.maximum(0, np.abs(w) - threshold) - + # normalize sparse weights w_sparse_norm = np.linalg.norm(w_sparse) if w_sparse_norm > 0: w_sparse = w_sparse / w_sparse_norm - + # compute scores t = X_res @ w_sparse t_norm_sq = t @ t - + if t_norm_sq == 0: T_scores[:, k] = 0 W_full[:, k] = 0 continue - + # deflation p_loading = X_res.T @ t / t_norm_sq X_res = X_res - np.outer(t, p_loading) - + T_scores[:, k] = t W_full[:, k] = w_sparse - + # identify selected variables selected_mask = np.any(W_full != 0, axis=1) A = np.where(selected_mask)[0] - + # fallback for empty selection if len(A) == 0: importance = np.sum(np.abs(W_full), axis=1) n_keep = max(1, int(p * (1 - eta))) A = np.argsort(importance)[-n_keep:] A = np.sort(A) - + W_selected = W_full[A, :] - + return { 'T': T_scores, 'W': W_selected, 'A': A - } \ No newline at end of file + } diff --git a/bioneuralnet/network/pysmccnet/pipeline.py b/bioneuralnet/network/pysmccnet/pipeline.py index a2d37ff..c9cf1d9 100644 --- a/bioneuralnet/network/pysmccnet/pipeline.py +++ b/bioneuralnet/network/pysmccnet/pipeline.py @@ -55,11 +55,11 @@ def auto_pysmccnet(X: List[Union[pd.DataFrame, np.ndarray]], Y: Union[pd.DataFra """ np.random.seed(seed) - + # device configuration if device is None: device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - + print("\n") print("**********************************") print("* Welcome to Automated SmCCNet! *") @@ -69,22 +69,22 @@ def auto_pysmccnet(X: List[Union[pd.DataFrame, np.ndarray]], Y: Union[pd.DataFra if not isinstance(X, list): X = [X] - + if DataType is None: DataType = [f"Omics{i+1}" for i in range(len(X))] feature_labels = [] - + for i, data_obj in enumerate(X): prefix = DataType[i] - + if hasattr(data_obj, 'columns'): # DataFrame: use real column names with DataType prefix labels = [f"{prefix}_{col}" for col in data_obj.columns] else: # Numpy array: fallback to generic labels = [f"{prefix}_Feat{j+1}" for j in range(data_obj.shape[1])] - + feature_labels.extend(labels) # preprocessing @@ -92,17 +92,17 @@ def auto_pysmccnet(X: List[Union[pd.DataFrame, np.ndarray]], Y: Union[pd.DataFra print("\n--------------------------------------------------") print(">> Starting data preprocessing...") print("--------------------------------------------------\n") - + if AdjustedCovar is not None: print("Covariate(s) provided. Regressing out effects.\n") AdjustedCovar = pd.DataFrame(AdjustedCovar) - + X_processed = [] for xx in range(len(X)): x_df = pd.DataFrame(X[xx]) processed = data_preprocess(X=x_df, covariates=AdjustedCovar, is_cv=False, center=True, scale=True, device=device, dtype=dtype) X_processed.append(processed) - + X = [x.to_numpy() for x in X_processed] else: X = [np.array(x) for x in X] @@ -112,9 +112,9 @@ def auto_pysmccnet(X: List[Union[pd.DataFrame, np.ndarray]], Y: Union[pd.DataFra # determine analysis type and method if len(X) <= 1: raise ValueError("User must provide a list of data matrices (multiple omics).") - + AnalysisType = "multiomics" - + # method selection: CCA for continuous, PLS for binary if Y.ndim > 1 and Y.shape[1] > 1: method = "CCA" @@ -122,14 +122,14 @@ def auto_pysmccnet(X: List[Union[pd.DataFrame, np.ndarray]], Y: Union[pd.DataFra method = "PLS" else: method = "CCA" - + print(f"This project uses {AnalysisType} {method}\n") # multi-omics scaling factor ScalingFactor = None ScalingFactorCC = None BDRatio = None - + print("\n--------------------------------------------------") print(">> Determining scaling factor...") print("--------------------------------------------------\n") @@ -143,17 +143,17 @@ def auto_pysmccnet(X: List[Union[pd.DataFrame, np.ndarray]], Y: Union[pd.DataFra for i, (idx1, idx2) in enumerate(comb_indices): name = f"{DataTypePheno[idx1]}-{DataTypePheno[idx2]}" ScalingFactorNames.append(name) - + X_pair = [ torch.tensor(X[idx1], device=device, dtype=dtype), torch.tensor(X[idx2], device=device, dtype=dtype) ] - + CC_weight = get_can_weights_multi(X_pair, Trait=None, Lambda=ScalingPen, no_trait=True) - + proj1 = torch.matmul(X_pair[0], CC_weight[0]) proj2 = torch.matmul(X_pair[1], CC_weight[1]) - + p1 = proj1.flatten() p2 = proj2.flatten() p1_c = p1 - torch.mean(p1) @@ -161,7 +161,7 @@ def auto_pysmccnet(X: List[Union[pd.DataFrame, np.ndarray]], Y: Union[pd.DataFra num = torch.sum(p1_c * p2_c) den = torch.sqrt(torch.sum(p1_c ** 2)) * torch.sqrt(torch.sum(p2_c ** 2)) corr_val = (num / den).item() if den.item() != 0 else 0.0 - + ScalingFactor_Omics[i] = abs(corr_val) # shrink between-omics scaling factor @@ -170,30 +170,30 @@ def auto_pysmccnet(X: List[Union[pd.DataFrame, np.ndarray]], Y: Union[pd.DataFra if method == 'PLS': BDRatio = [np.mean(ScalingFactor_Omics), 1.0] print(f"The between-omics and omics-phenotype importance ratio is: {BDRatio[0]:.4f}:{BDRatio[1]}\n") - + # build full scaling factor vector ScalingFactorCC = np.concatenate([ScalingFactor_Omics, np.ones(len(X))]) - + K_total = len(X) + 1 comb_total = list(itertools.combinations(range(K_total), 2)) - + # non-phenotype index non_pheno_index = [i for i, pair in enumerate(comb_total) if pair[1] != len(X)] - + ScalingFactor_full = np.ones(len(comb_total)) omics_pairs_lookup = {pair: val for pair, val in zip(comb_indices, ScalingFactor_Omics)} - + for i, pair in enumerate(comb_total): if pair in omics_pairs_lookup: ScalingFactor_full[i] = omics_pairs_lookup[pair] - + if method == 'PLS': ScalingFactor = ScalingFactor_full[non_pheno_index] else: ScalingFactor = ScalingFactor_full print("The scaling factor selection is: \n") - + # print all scaling factors all_names = [f"{DataTypePheno[pair[0]]}-{DataTypePheno[pair[1]]}" for pair in comb_total] if method == 'PLS': @@ -216,7 +216,7 @@ def auto_pysmccnet(X: List[Union[pd.DataFrame, np.ndarray]], Y: Union[pd.DataFra pen_seq = np.linspace(tuneRangePLS[0], tuneRangePLS[1], tuneLength) grid_inputs = [pen_seq for _ in range(len(X) + 1)] PenComb = np.array(list(itertools.product(*grid_inputs))) - + n_grid = PenComb.shape[0] SubsamplingPercent = np.zeros(len(X)) @@ -242,7 +242,7 @@ def auto_pysmccnet(X: List[Union[pd.DataFrame, np.ndarray]], Y: Union[pd.DataFra folddata_cpu = precomputed_fold_data else: print(">> Generating random K-Fold splits...") - + # scale data if method == 'CCA': X_scaled = [r_scale_torch(torch.tensor(x, device=device, dtype=dtype)) for x in X] @@ -251,19 +251,19 @@ def auto_pysmccnet(X: List[Union[pd.DataFrame, np.ndarray]], Y: Union[pd.DataFra # PLS: scale X, keep Y binary X_scaled_np = [r_scale(x) for x in X] Y_binary = Y.flatten() - + n_samples = X[0].shape[0] indices = np.arange(n_samples) np.random.shuffle(indices) fold_indices = np.array_split(indices, Kfold) - + folddata = {} folddata_cpu = {} - + for k in range(Kfold): test_idx = np.sort(fold_indices[k]) train_idx = np.sort(np.setdiff1d(indices, test_idx)) - + if method == 'CCA': folddata[f"fold_{k+1}"] = { "X_train": [x[train_idx, :] for x in X_scaled], @@ -290,37 +290,37 @@ def auto_pysmccnet(X: List[Union[pd.DataFrame, np.ndarray]], Y: Union[pd.DataFra # cv loop: CCA if method == 'CCA': CVResult_list = [] - + for k in range(Kfold): fold_key = f"fold_{k+1}" d = folddata[fold_key] - + RhoTrain = np.zeros(n_grid) RhoTest = np.zeros(n_grid) DeltaCor = np.zeros(n_grid) - + print(f"Processing fold {k+1}/{Kfold}...") for idx in range(n_grid): l1 = PenComb[idx, :] - + Y_train = d["Y_train"] if isinstance(Y_train, torch.Tensor) and Y_train.ndim == 1: Y_train = Y_train.reshape(-1, 1) Y_test = d["Y_test"] if isinstance(Y_test, torch.Tensor) and Y_test.ndim == 1: Y_test = Y_test.reshape(-1, 1) - + Ws = get_can_weights_multi(X=d["X_train"], Trait=Y_train, Lambda=l1, no_trait=False, cc_coef=ScalingFactor) - + rho_train = get_can_cor_multi(X=d["X_train"], Y=Y_train, cc_weight=Ws, cc_coef=ScalingFactorCC) - + rho_test = get_can_cor_multi(X=d["X_test"], Y=Y_test, cc_weight=Ws, cc_coef=ScalingFactorCC) - + RhoTrain[idx] = round(rho_train, 5) RhoTest[idx] = round(rho_test, 5) DeltaCor[idx] = abs(rho_train - rho_test) - + CVResult_df = pd.DataFrame({ "RhoTrain": RhoTrain, "RhoTest": RhoTest, @@ -329,55 +329,55 @@ def auto_pysmccnet(X: List[Union[pd.DataFrame, np.ndarray]], Y: Union[pd.DataFra CVResult_list.append(CVResult_df) AggregatedCVResult = sum(CVResult_list) / len(CVResult_list) - + epsilon = 1e-10 EvalMetric = AggregatedCVResult["DeltaCor"] / (np.abs(AggregatedCVResult["RhoTest"]) + epsilon) best_idx = EvalMetric.idxmin() BestPen = PenComb[best_idx, :] - + print("\n") for xx in range(len(BestPen)): print(f"The best penalty term for {DataType[xx]} is: {BestPen[xx]}") - + best_rho = round(AggregatedCVResult.iloc[best_idx]["RhoTest"], 3) best_err = round(AggregatedCVResult.iloc[best_idx]["DeltaCor"], 3) print(f"Testing Canonical Correlation: {best_rho}, Prediction Error: {best_err}\n") - + # final run: CCA print("Running multi-omics CCA with best penalty on complete dataset.\n") - + X_scaled_final = [r_scale_torch(torch.tensor(x, device=device, dtype=dtype)) for x in X] Y_scaled_final = r_scale_torch(torch.tensor(Y, device=device, dtype=dtype)) - + Ws_final = get_robust_weights_multi(X=X_scaled_final, Trait=Y_scaled_final, no_trait=False, cc_coef=ScalingFactor, Lambda=BestPen, s=SubsamplingPercent, subsampling_num=subSampNum) # cv loop: PLS elif method == 'PLS': import statsmodels.api as sm - + CVResult_list = [] - + for k in range(Kfold): fold_key = f"fold_{k+1}" d = folddata[fold_key] - + TrainMetric = np.zeros(n_grid) TestMetric = np.zeros(n_grid) - + print(f"Processing fold {k+1}/{Kfold}...") - + for idx in range(n_grid): l1 = PenComb[idx, :] - + lambda_between = l1[:len(X)] lambda_pheno = l1[len(X)] - + X_train_torch = [torch.tensor(x, device=device, dtype=dtype) for x in d["X_train"]] X_test_list = d["X_test"] - + Y_train = np.array(d["Y_train"]).flatten().astype(float) Y_test = np.array(d["Y_test"]).flatten().astype(float) - + has_error = False try: projection = get_robust_weights_multi_binary( @@ -393,53 +393,53 @@ def auto_pysmccnet(X: List[Union[pd.DataFrame, np.ndarray]], Y: Union[pd.DataFra eval_classifier=True, test_data=X_test_list ) - + out_train = projection['out_train'] out_test = projection['out_test'] - + train_data = out_train - + model = sm.GLM(Y_train, train_data, family=sm.families.Binomial()) result = model.fit(disp=0) - + train_pred = result.predict(train_data) test_pred = result.predict(out_test) - + TrainMetric[idx] = _compute_binary_metrics(Y_train, train_pred, EvalMethod) TestMetric[idx] = _compute_binary_metrics(Y_test, test_pred, EvalMethod) - + except Exception as e: print(f"Caught an error: {e} on iteration {idx}") has_error = True - + if has_error: continue - + CVResult_df = pd.DataFrame({ "TrainMetric": TrainMetric, "TestMetric": TestMetric }) CVResult_list.append(CVResult_df) - + AggregatedCVResult = sum(CVResult_list) / len(CVResult_list) - + best_idx = AggregatedCVResult["TestMetric"].idxmax() BestPen = PenComb[best_idx, :] - + print("\n") for xx in range(len(X)): print(f"The best penalty term for {DataType[xx]} is: {BestPen[xx]}") print(f"The best penalty term on classifier is: {BestPen[len(X)]}") - + best_metric = round(AggregatedCVResult.iloc[best_idx]["TestMetric"], 3) print(f"Testing {EvalMethod} score: {best_metric}\n") - + print("Running multi-omics PLS with best penalty on complete dataset.\n") - + # final run: PLS outcome = Y.flatten().astype(float) X_scaled_final = [torch.tensor(r_scale(x), device=device, dtype=dtype) for x in X] - + Ws_final = get_robust_weights_multi_binary( X=X_scaled_final, Y=outcome, @@ -452,7 +452,7 @@ def auto_pysmccnet(X: List[Union[pd.DataFrame, np.ndarray]], Y: Union[pd.DataFra ncomp_pls=ncomp_pls, eval_classifier=False ) - + if isinstance(Ws_final, np.ndarray): Ws_final = torch.tensor(Ws_final, device=device, dtype=dtype) @@ -464,28 +464,28 @@ def auto_pysmccnet(X: List[Union[pd.DataFrame, np.ndarray]], Y: Union[pd.DataFra feature_labels = [] for i in range(len(X)): feature_labels.extend([f"{DataType[i]}_Feat{j+1}" for j in range(X[i].shape[1])]) - + Abar = get_abar(Ws_final, feature_label=feature_labels) - + if not os.path.exists(saving_dir): os.makedirs(saving_dir) print("\n--------------------------------------------------") print(">> Now starting network clustering...") print("--------------------------------------------------\n") - + modules = get_omics_modules(Abar, cut_height=CutHeight) - + print("Clustering completed...\n") - + print("--------------------------------------------------") print(">> Now starting network pruning and summarization score extraction...") print("--------------------------------------------------\n") - + # Build combined omics matrix (n x p) for summarization X_combined = np.hstack(X) - + subnetwork_results = prune_modules( Abar=Abar, X_combined=X_combined, @@ -503,20 +503,20 @@ def auto_pysmccnet(X: List[Union[pd.DataFrame, np.ndarray]], Y: Union[pd.DataFra PenComb_df = pd.DataFrame(PenComb, columns=[f"l{i+1}" for i in range(PenComb.shape[1])]) AggregatedCVResultPen = pd.concat([PenComb_df, AggregatedCVResult], axis=1) AggregatedCVResultPen.to_csv(os.path.join(saving_dir, "CVResult.csv"), index=False) - + # save fold data with open(os.path.join(saving_dir, "CVFold.pkl"), 'wb') as f: pickle.dump(folddata_cpu, f) - + # save global network globalNetwork = {"AdjacencyMatrix": Abar, "Data": X, "Phenotype": Y} with open(os.path.join(saving_dir, "globalNetwork.pkl"), 'wb') as f: pickle.dump(globalNetwork, f) - + print("\n************************************") print("* Execution Finished! *") print("************************************\n") - + return { "AdjacencyMatrix": Abar, "Data": X, @@ -561,58 +561,58 @@ def _compute_binary_metrics(y_true: Union[np.ndarray, list], y_pred_prob: Union[ """ y_true = np.array(y_true).flatten() y_pred_prob = np.array(y_pred_prob).flatten() - + # AUC calculation if eval_method == 'auc': desc_order = np.argsort(-y_pred_prob) y_sorted = y_true[desc_order] - + n_pos = np.sum(y_true == 1) n_neg = np.sum(y_true == 0) - + if n_pos == 0 or n_neg == 0: return 0.5 - + tp = 0 fp = 0 auc_val = 0.0 - + for i in range(len(y_sorted)): if y_sorted[i] == 1: tp += 1 else: fp += 1 auc_val += tp / n_pos - + auc_val /= n_neg return auc_val - + # threshold predictions y_pred = (y_pred_prob > 0.5).astype(int) - + TP = np.sum((y_true == 1) & (y_pred == 1)) TN = np.sum((y_true == 0) & (y_pred == 0)) FP = np.sum((y_true == 0) & (y_pred == 1)) FN = np.sum((y_true == 1) & (y_pred == 0)) - + if eval_method == 'accuracy': return (TP + TN) / len(y_true) if len(y_true) > 0 else 0.0 - + elif eval_method == 'precision': return TP / (TP + FP) if (TP + FP) > 0 else 0.0 - + elif eval_method == 'recall': return TP / (TP + FN) if (TP + FN) > 0 else 0.0 - + elif eval_method == 'f1': precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0 recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0 if precision + recall == 0: return 0.0 return 2 * (precision * recall) / (precision + recall) - + else: raise ValueError( f"Invalid eval_method '{eval_method}'. " "Choose from: accuracy, auc, precision, recall, f1." - ) \ No newline at end of file + ) diff --git a/bioneuralnet/network/pysmccnet/wrappers.py b/bioneuralnet/network/pysmccnet/wrappers.py index 63df9ee..31320d8 100644 --- a/bioneuralnet/network/pysmccnet/wrappers.py +++ b/bioneuralnet/network/pysmccnet/wrappers.py @@ -31,11 +31,11 @@ def get_can_weights_multi(X: List[torch.Tensor], Trait: Optional[torch.Tensor] = if Lambda is None: raise ValueError("Lambda must be provided.") Lambda = np.atleast_1d(np.array(Lambda, dtype=float)) - + for lam in Lambda: if abs(lam - 0.5) > 0.5: raise ValueError("Invalid penalty parameter. Lambda1 needs to be between zero and one.") - + if np.min(Lambda) == 0: raise ValueError("Invalid penalty parameter. Both Lambda1 and Lambda2 has to be greater than 0.") @@ -53,13 +53,13 @@ def get_can_weights_multi(X: List[torch.Tensor], Trait: Optional[torch.Tensor] = else: if Trait is None: raise ValueError("Trait must be provided if no_trait is False.") - + scaled_trait = r_scale_torch(Trait) current_X.append(scaled_trait) - + trait_ncol = scaled_trait.shape[1] L.append(np.sqrt(trait_ncol)) - + out = my_multi_cca(current_X, penalty=L, cc_coef=cc_coef, trace=trace) # output extraction @@ -70,7 +70,7 @@ def get_can_weights_multi(X: List[torch.Tensor], Trait: Optional[torch.Tensor] = ws = out['ws'] else: ws = out['ws'][:-1] - + return ws def get_robust_weights_multi(X: List[torch.Tensor], Trait: Optional[torch.Tensor], Lambda: Union[List[float], np.ndarray], s: Optional[Union[List[float], np.ndarray]] = None, no_trait: bool = False, subsampling_num: int = 1000, cc_coef: Optional[np.ndarray] = None, trace: bool = False, trait_weight: bool = False) -> torch.Tensor: """PyTorch version of get_robust_weights_multi with subsampling loop. @@ -95,14 +95,14 @@ def get_robust_weights_multi(X: List[torch.Tensor], Trait: Optional[torch.Tensor # device management device = X[0].device dtype = X[0].dtype - + if s is None: raise ValueError("s (subsampling proportions) must be provided.") - + s = np.atleast_1d(np.array(s, dtype=float)) if s.size == 1 and len(X) > 1: s = np.repeat(s, len(X)) - + Lambda = np.atleast_1d(np.array(Lambda, dtype=float)) # validation checks @@ -111,7 +111,7 @@ def get_robust_weights_multi(X: List[torch.Tensor], Trait: Optional[torch.Tensor else: if np.sum(np.abs(s - 0.5) > 0.5) > 0: raise ValueError("Subsampling proportions can not exceed one.") - + if (np.sum(np.abs(Lambda - 0.5) > 0.5) > 0) or (np.sum(Lambda == 0) > 0): raise ValueError("Invalid penalty parameter. Lambda1 needs to be between zero and one.") @@ -122,7 +122,7 @@ def get_robust_weights_multi(X: List[torch.Tensor], Trait: Optional[torch.Tensor # subsampling loop results = [] - + iter_range = range(subsampling_num) if subsampling_num > 1: iter_range = tqdm(range(subsampling_num), desc="Robust Weights") @@ -154,13 +154,13 @@ def get_robust_weights_multi(X: List[torch.Tensor], Trait: Optional[torch.Tensor for h in range(len(p_cum) - 1): global_indices = samp[h] + int(p_cum[h]) idx = torch.tensor(global_indices, device=device, dtype=torch.long) - + val = out[h] if not isinstance(val, torch.Tensor): val = torch.tensor(np.array(val).flatten(), device=device, dtype=dtype) else: val = val.flatten() - + w[idx] = val results.append(w) @@ -188,33 +188,33 @@ def get_robust_weights_single_binary(X1: np.ndarray, Trait: np.ndarray, Lambda1: """ X1 = np.array(X1, dtype=float) Trait = np.array(Trait).flatten() - + p1 = X1.shape[1] p1_sub = int(np.ceil(s1 * p1)) - + results = [] - + iter_range = range(subsampling_num) if subsampling_num > 1: iter_range = tqdm(range(subsampling_num), desc="Single Binary Weights") - + for _ in iter_range: # subsample features samp1 = np.sort(np.random.choice(p1, p1_sub, replace=False)) - + # scale subsampled data x1_par = r_scale(X1[:, samp1]) - + # run sparse pls-da out = _splsda(x=x1_par, y=Trait, K=K, eta=Lambda1, kappa=0.5, scale_x=False) - + u = np.zeros(p1_sub) w = np.zeros(p1) - + T_scores = out['T'] W_weights = out['W'] A_indices = out['A'] - + # fit logistic regression on latent factors try: model = sm.GLM(Trait, T_scores, family=sm.families.Binomial()) @@ -223,19 +223,19 @@ def get_robust_weights_single_binary(X1: np.ndarray, Trait: np.ndarray, Lambda1: except Exception: results.append(np.zeros(p1)) continue - + # compute weights u[A_indices] = np.abs(W_weights) @ np.abs(glm_coefs) - + # normalize norm_val = np.linalg.norm(u[A_indices]) if norm_val > 0: u[A_indices] = u[A_indices] / norm_val - + # scatter back to full feature space w[samp1] = u results.append(w) - + beta = np.column_stack(results) return beta @@ -264,13 +264,13 @@ def get_robust_weights_multi_binary(X: List[torch.Tensor], Y: Union[np.ndarray, if between_discriminate_ratio is None: between_discriminate_ratio = [1, 1] between_discriminate_ratio = np.array(between_discriminate_ratio, dtype=float) - + if lambda_between is None: raise ValueError("lambda_between must be provided.") lambda_between = np.array(lambda_between) - + eta = lambda_pheno - + # ensure Y is numpy if isinstance(Y, torch.Tensor): Y_np = Y.cpu().numpy().flatten() @@ -282,32 +282,32 @@ def get_robust_weights_multi_binary(X: List[torch.Tensor], Y: Union[np.ndarray, X, Trait=None, Lambda=lambda_between, s=subsampling_percent, no_trait=True, cc_coef=cc_coef, subsampling_num=subsampling_num ) - + # move to cpu for pls-da between_omics_weight = between_omics_weight.cpu().numpy() - + # column-bind all omics X_all = np.hstack([x.cpu().numpy() for x in X]) - + # feature type index type_index = np.concatenate([np.full(X[h].shape[1], h) for h in range(len(X))]) - + if not eval_classifier: # branch a: network construction n_subsamples = between_omics_weight.shape[1] p_total = between_omics_weight.shape[0] - + omics_pheno_weight = np.zeros_like(between_omics_weight) - + for iii in tqdm(range(n_subsamples), desc="Omics-Phenotype PLS"): selected_mask = between_omics_weight[:, iii] != 0 selected_indices = np.where(selected_mask)[0] - + if len(selected_indices) == 0: continue - + X_subset = X_all[:, selected_indices] - + try: # cpu: small matrix pls-da Ws_pheno = get_robust_weights_single_binary( @@ -317,19 +317,19 @@ def get_robust_weights_multi_binary(X: List[torch.Tensor], Y: Union[np.ndarray, ) except Exception: continue - + omics_pheno_weight[selected_indices, iii] = Ws_pheno.flatten() - + # normalize per data type for j in range(len(X)): type_mask = type_index == j norm_val = np.linalg.norm(omics_pheno_weight[type_mask, iii]) if norm_val > 0: omics_pheno_weight[type_mask, iii] /= norm_val - + # zero out between-omics where pheno is zero between_omics_weight[omics_pheno_weight == 0] = 0 - + # remove zero/nan columns if subsampling_num > 1: zero_cols = [] @@ -337,53 +337,53 @@ def get_robust_weights_multi_binary(X: List[torch.Tensor], Y: Union[np.ndarray, col = omics_pheno_weight[:, col_idx] if np.all(col == 0) or np.any(np.isnan(col)): zero_cols.append(col_idx) - + if len(zero_cols) > 0: keep_cols = np.setdiff1d(np.arange(omics_pheno_weight.shape[1]), zero_cols) between_omics_weight = between_omics_weight[:, keep_cols] omics_pheno_weight = omics_pheno_weight[:, keep_cols] - + # aggregate ratio_sum = np.sum(between_discriminate_ratio) w1 = between_discriminate_ratio[0] / ratio_sum w2 = between_discriminate_ratio[1] / ratio_sum - + cc_weight = w1 * between_omics_weight + w2 * omics_pheno_weight - + return cc_weight - + else: # branch b: classifier evaluation if subsampling_num != 1: raise ValueError("Subsampling number must be 1 when evaluating the classifier.") - + if test_data is None: raise ValueError("test_data must be provided when eval_classifier=True.") - + selected_mask = between_omics_weight[:, 0] != 0 selected_indices = np.where(selected_mask)[0] - + X_subset = X_all[:, selected_indices] - + try: out = _splsda(x=r_scale(X_subset), y=Y_np, K=ncomp_pls, eta=lambda_pheno, kappa=0.5, scale_x=False) - + out_data = np.zeros((X_subset.shape[1], ncomp_pls)) out_data[out['A'], :] = out['W'] - + # process test data X_all_test = np.hstack([t.cpu().numpy() if isinstance(t, torch.Tensor) else np.array(t) for t in test_data]) X_subset_test = X_all_test[:, selected_indices] out_test = X_subset_test @ out_data - + out_train = out['T'] - + return {'out_train': out_train, 'out_test': out_test} - + except Exception as e: print(f"Caught an error: {e}") n_train = X_all.shape[0] n_test = np.hstack([t.cpu().numpy() if isinstance(t, torch.Tensor) else np.array(t) for t in test_data]).shape[0] out_train = np.zeros((n_train, ncomp_pls)) out_test = np.zeros((n_test, ncomp_pls)) - return {'out_train': out_train, 'out_test': out_test} \ No newline at end of file + return {'out_train': out_train, 'out_test': out_test} diff --git a/bioneuralnet/network/tools.py b/bioneuralnet/network/tools.py index e56598c..c229da9 100644 --- a/bioneuralnet/network/tools.py +++ b/bioneuralnet/network/tools.py @@ -38,14 +38,14 @@ class NetworkAnalyzer: source_omics (list): Optional list of original DataFrames used to build the network to dynamically assign omics types. device (str): The target computing device, defaulting to 'cuda' if available. """ - def __init__(self, adjacency_matrix: pd.DataFrame, source_omics: list = None, device: str = 'cuda'): + def __init__(self, adjacency_matrix: pd.DataFrame, source_omics: Optional[list] = None, device: str = 'cuda'): self.device = device if torch.cuda.is_available() else 'cpu' self.feature_names = adjacency_matrix.index.tolist() self.n_nodes = len(self.feature_names) - - self.omics_types = {} - self.feature_to_omic = {} - + + self.omics_types: Dict[str, List[str]] = {} + self.feature_to_omic: Dict[str, str] = {} + if source_omics is not None: for i, df in enumerate(source_omics): omic_name = f"omic_{i+1}" @@ -61,13 +61,13 @@ def __init__(self, adjacency_matrix: pd.DataFrame, source_omics: list = None, de self.omics_types[omics] = [] self.omics_types[omics].append(feat) self.feature_to_omic[feat] = omics - + self.A = torch.tensor( - adjacency_matrix.values, - dtype=torch.float32, + adjacency_matrix.values, + dtype=torch.float32, device=self.device ) - + logger.info(f"Initialized on {self.device.upper()}") logger.info(f"Nodes: {self.n_nodes:,}") logger.info(f"Omics types: {list(self.omics_types.keys())}") @@ -105,21 +105,21 @@ def basic_statistics(self, threshold: float = 0.5) -> Dict[str, Union[float, int logger.info(f"\n{'='*60}") logger.info(f"BASIC NETWORK STATISTICS (threshold > {threshold})") logger.info(f"{'='*60}") - + A_bin = self.threshold_network(threshold) - + num_nodes = self.n_nodes num_edges = A_bin.sum().item() / 2 max_possible_edges = num_nodes * (num_nodes - 1) / 2 density = num_edges / max_possible_edges - + degrees = A_bin.sum(dim=1) avg_degree = degrees.mean().item() max_degree = degrees.max().item() min_degree = degrees.min().item() - + isolated = (degrees == 0).sum().item() - + logger.info(f"Nodes: {num_nodes:,}") logger.info(f"Edges: {int(num_edges):,}") logger.info(f"Density: {density:.6f}") @@ -127,7 +127,7 @@ def basic_statistics(self, threshold: float = 0.5) -> Dict[str, Union[float, int logger.info(f"Max Degree: {int(max_degree)}") logger.info(f"Min Degree: {int(min_degree)}") logger.info(f"Isolated Nodes: {isolated:,} ({100*isolated/num_nodes:.1f}%)") - + return { 'nodes': num_nodes, 'edges': int(num_edges), @@ -154,9 +154,9 @@ def degree_distribution(self, threshold: float = 0.5) -> pd.DataFrame: """ A_bin = self.threshold_network(threshold) degrees = A_bin.sum(dim=1).cpu().numpy().astype(int) - + unique, counts = np.unique(degrees, return_counts=True) - + return pd.DataFrame({ 'degree': unique, 'count': counts, @@ -181,18 +181,18 @@ def hub_analysis(self, threshold: float = 0.5, top_n: int = 10) -> pd.DataFrame: logger.info(f"\n{'='*60}") logger.info(f"TOP {top_n} HUB NODES (threshold > {threshold})") logger.info(f"{'='*60}") - + A_bin = self.threshold_network(threshold) degrees = A_bin.sum(dim=1) - + top_values, top_indices = torch.topk(degrees, top_n) - + results = [] for i, (idx, deg) in enumerate(zip(top_indices.cpu().numpy(), top_values.cpu().numpy())): feat_name = self.feature_names[idx] omics_type = self.feature_to_omic.get(feat_name, 'unknown') actual_name = feat_name - + results.append({ 'rank': i + 1, 'feature': feat_name, @@ -201,7 +201,7 @@ def hub_analysis(self, threshold: float = 0.5, top_n: int = 10) -> pd.DataFrame: 'degree': int(deg) }) logger.info(f"{i+1:2d}. {feat_name:<40s} | {omics_type:<6s} | degree: {int(deg)}") - + return pd.DataFrame(results) def clustering_coefficient_gpu(self, threshold: float = 0.5, sample_size: Optional[int] = None) -> Dict[str, Union[float, np.ndarray]]: @@ -222,18 +222,18 @@ def clustering_coefficient_gpu(self, threshold: float = 0.5, sample_size: Option logger.info(f"\n{'='*60}") logger.info(f"CLUSTERING COEFFICIENT ANALYSIS (threshold > {threshold})") logger.info(f"{'='*60}") - + A_bin = self.threshold_network(threshold) degrees = A_bin.sum(dim=1) - + if sample_size is None: valid_mask = degrees >= 2 n_valid = valid_mask.sum().item() - + if n_valid > 5000: logger.info(f"Large network ({n_valid} valid nodes). Sampling 5000 nodes...") sample_size = 5000 - + if sample_size: valid_indices = torch.where(degrees >= 2)[0] if len(valid_indices) > sample_size: @@ -243,44 +243,44 @@ def clustering_coefficient_gpu(self, threshold: float = 0.5, sample_size: Option sample_indices = valid_indices else: sample_indices = torch.where(degrees >= 2)[0] - + logger.info(f"Computing clustering for {len(sample_indices)} nodes...") - + clustering_coeffs = torch.zeros(self.n_nodes, device=self.device) - + batch_size = 500 n_batches = (len(sample_indices) + batch_size - 1) // batch_size - + for batch_idx in range(n_batches): start = batch_idx * batch_size end = min((batch_idx + 1) * batch_size, len(sample_indices)) batch_nodes = sample_indices[start:end] - + for node_idx in batch_nodes: node = node_idx.item() neighbors = torch.where(A_bin[node] > 0)[0] k = len(neighbors) - + if k >= 2: neighbor_subgraph = A_bin[neighbors][:, neighbors] triangles = neighbor_subgraph.sum().item() / 2 max_triangles = k * (k - 1) / 2 clustering_coeffs[node] = triangles / max_triangles - + if (batch_idx + 1) % 10 == 0: logger.info(f" Processed batch {batch_idx + 1}/{n_batches}") - + valid_cc = clustering_coeffs[sample_indices] avg_cc = valid_cc.mean().item() max_cc = valid_cc.max().item() min_cc = valid_cc[valid_cc > 0].min().item() if (valid_cc > 0).any() else 0 - + logger.info(f"\nClustering Coefficient Statistics:") logger.info(f" Average: {avg_cc:.4f}") logger.info(f" Maximum: {max_cc:.4f}") logger.info(f" Minimum (non-zero): {min_cc:.4f}") logger.info(f" Nodes with CC > 0: {(valid_cc > 0).sum().item()}") - + return { 'average': avg_cc, 'max': max_cc, @@ -305,45 +305,45 @@ def cross_omics_analysis(self, threshold: float = 0.5) -> Dict[tuple, Dict]: logger.info(f"\n{'='*60}") logger.info(f"CROSS-OMICS CONNECTIVITY (threshold > {threshold})") logger.info(f"{'='*60}") - + A_bin = self.threshold_network(threshold) - + omics_indices = {} for omics, features in self.omics_types.items(): omics_indices[omics] = [self.feature_names.index(f) for f in features] - + results = {} omics_list = list(self.omics_types.keys()) - + logger.info(f"\n{'Omics Pair':<20s} | {'Edges':>10s} | {'Max Possible':>12s} | {'Density':>10s}") logger.info("-" * 60) - + for i, om1 in enumerate(omics_list): for j, om2 in enumerate(omics_list): if i <= j: idx1 = torch.tensor(omics_indices[om1], device=self.device, dtype=torch.long) idx2 = torch.tensor(omics_indices[om2], device=self.device, dtype=torch.long) - + submatrix = A_bin[idx1][:, idx2] n_edges = submatrix.sum().item() - + if i == j: n_edges = n_edges / 2 max_edges = len(idx1) * (len(idx1) - 1) / 2 else: max_edges = len(idx1) * len(idx2) - + density = n_edges / max_edges if max_edges > 0 else 0 - + pair_name = f"{om1}-{om2}" if i != j else f"{om1} (within)" results[(om1, om2)] = { 'edges': int(n_edges), 'max_edges': int(max_edges), 'density': density } - + logger.info(f"{pair_name:<20s} | {int(n_edges):>10,} | {int(max_edges):>12,} | {density:>10.6f}") - + return results def edge_weight_analysis(self) -> Optional[np.ndarray]: @@ -363,16 +363,16 @@ def edge_weight_analysis(self) -> Optional[np.ndarray]: logger.info(f"\n{'='*60}") logger.info(f"EDGE WEIGHT DISTRIBUTION") logger.info(f"{'='*60}") - + upper_tri = torch.triu(self.A, diagonal=1) weights = upper_tri[upper_tri > 0] - + if len(weights) == 0: logger.info("No edges found!") return None - + weights_cpu = weights.cpu().numpy() - + logger.info(f"Total edges (weight > 0): {len(weights_cpu):,}") logger.info(f"Weight statistics:") logger.info(f" Mean: {weights_cpu.mean():.6f}") @@ -380,17 +380,17 @@ def edge_weight_analysis(self) -> Optional[np.ndarray]: logger.info(f" Median: {np.median(weights_cpu):.6f}") logger.info(f" Min: {weights_cpu.min():.6f}") logger.info(f" Max: {weights_cpu.max():.6f}") - + logger.info(f"\nPercentiles:") for p in [25, 50, 75, 90, 95, 99]: val = np.percentile(weights_cpu, p) logger.info(f" {p}th: {val:.6f}") - + logger.info(f"\nEdges at different biological thresholds:") for thresh in [0.001, 0.1, 0.3, 0.5, 0.7, 0.8, 0.9]: n_edges = (weights_cpu > thresh).sum() logger.info(f" > {thresh}: {n_edges:,} edges") - + return weights_cpu def find_strongest_edges(self, top_n: int = 50) -> pd.DataFrame: @@ -410,28 +410,28 @@ def find_strongest_edges(self, top_n: int = 50) -> pd.DataFrame: logger.info(f"\n{'='*60}") logger.info(f"TOP {top_n} STRONGEST EDGES") logger.info(f"{'='*60}") - + upper_tri = torch.triu(self.A, diagonal=1) - + flat = upper_tri.flatten() top_values, top_flat_indices = torch.topk(flat, top_n) - + n = self.n_nodes row_indices = top_flat_indices // n col_indices = top_flat_indices % n - + results = [] logger.info(f"{'Rank':<5s} | {'Feature 1':<35s} | {'Feature 2':<35s} | {'Weight':>10s}") logger.info("-" * 95) - + for i in range(top_n): row = row_indices[i].item() col = col_indices[i].item() weight = top_values[i].item() - + feat1 = self.feature_names[row] feat2 = self.feature_names[col] - + results.append({ 'rank': i + 1, 'feature1': feat1, @@ -440,9 +440,9 @@ def find_strongest_edges(self, top_n: int = 50) -> pd.DataFrame: 'omics2': self.feature_to_omic.get(feat2, 'unknown'), 'weight': weight }) - + logger.info(f"{i+1:<5d} | {feat1:<35s} | {feat2:<35s} | {weight:>10.6f}") - + return pd.DataFrame(results) def connected_components(self, threshold: float = 0.5) -> Dict[str, Union[int, np.ndarray, List[int]]]: @@ -462,27 +462,27 @@ def connected_components(self, threshold: float = 0.5) -> Dict[str, Union[int, n logger.info(f"\n{'='*60}") logger.info(f"CONNECTED COMPONENTS (threshold > {threshold})") logger.info(f"{'='*60}") - + A_bin = self.threshold_network(threshold) - + A_cpu = A_bin.cpu().numpy() - + A_sparse = csr_matrix(A_cpu) n_components, labels = connected_components(A_sparse, directed=False) - + unique, counts = np.unique(labels, return_counts=True) component_sizes = sorted(counts, reverse=True) - + logger.info(f"Number of components: {n_components}") logger.info(f"Largest component: {component_sizes[0]} nodes ({100*component_sizes[0]/self.n_nodes:.1f}%)") - + if n_components > 1: logger.info(f"Second largest: {component_sizes[1]} nodes") logger.info(f"\nTop 10 component sizes: {component_sizes[:10]}") - + isolated = (counts == 1).sum() logger.info(f"Isolated nodes: {isolated}") - + return { 'n_components': n_components, 'labels': labels, @@ -553,12 +553,12 @@ def network_search( for idx, (method_name, gen_params) in enumerate(all_configs, start=1): builder = dispatch.get(method_name) - if builder is None: + if builder is None or not callable(builder): continue try: G = builder(omics_data, **{**gen_params, "self_loops": False}) - f1_score = _feature_proxy(G, X_scaled, y_vec, cv, mode=centrality_mode, scoring=scoring) + f1_score = _feature_proxy(G, X_scaled, y_vec, cv, mode=centrality_mode, scoring=scoring) topo_score = _topology_quality(G) topo_norm = np.clip(topo_score, 0, 1) combined = ((1.0 - topology_weight) * f1_score+ topology_weight * topo_norm) @@ -660,25 +660,25 @@ def _build_config_grid(methods: list[str],seed: int,trials: Optional[int],) -> l return configs def _topology_quality(adj_df: pd.DataFrame) -> float: - A = adj_df.fillna(0.0).values + A = adj_df.fillna(0.0).values.copy() np.fill_diagonal(A, 0.0) n_nodes = max(A.shape[0], 1) - degrees = np.count_nonzero(A, axis=1) + degrees = np.count_nonzero(A, axis=1) connectivity = np.sum(degrees > 0) / n_nodes - + G = nx.from_numpy_array(A) - + if n_nodes > 0: largest_cc_size = len(max(nx.connected_components(G), key=len)) lcc_ratio = largest_cc_size / n_nodes else: lcc_ratio = 0.0 - + return 0.5 * connectivity + 0.5 * lcc_ratio def _feature_proxy(adj_df, X_df, y, cv, mode="laplacian", scoring="f1_macro"): - A = adj_df.fillna(0.0).values + A = adj_df.fillna(0.0).values.copy() np.fill_diagonal(A, 0.0) if mode == "laplacian": @@ -692,12 +692,12 @@ def _feature_proxy(adj_df, X_df, y, cv, mode="laplacian", scoring="f1_macro"): weights = nx.eigenvector_centrality_numpy(G_nx, weight="weight") except Exception: weights = dict(G_nx.degree(weight=None)) - w_vec = np.log1p(np.maximum(0.0, + w_vec = np.log1p(np.maximum(0.0, np.array([weights.get(c, 0.0) for c in X_df.columns]))) X_smooth = X_df.values * w_vec[None, :] scores = cross_val_score( RidgeClassifier(class_weight="balanced"), - X_smooth, y, cv=cv, scoring=scoring, n_jobs=-1, + X_smooth, y, cv=cv, scoring=scoring ) return float(scores.mean()) diff --git a/bioneuralnet/network_embedding/gnn_models.py b/bioneuralnet/network_embedding/gnn_models.py index 934c99b..0d29ab0 100644 --- a/bioneuralnet/network_embedding/gnn_models.py +++ b/bioneuralnet/network_embedding/gnn_models.py @@ -47,12 +47,12 @@ def get_activation(activation_choice): "elu": nn.ELU(), "leaky_relu": nn.LeakyReLU(negative_slope=0.01), } - + act = activations.get(activation_choice.lower()) - + if act is None: raise ValueError(f"Unsupported activation function: {activation_choice}") - + return act class GCN(nn.Module): @@ -88,7 +88,7 @@ def __init__( ): if seed is not None: set_seed(seed) - + super().__init__() self.dropout = process_dropout(dropout) @@ -119,7 +119,7 @@ def _message_pass(self, data): x = self.conv_first(x, edge_index, edge_weight=edge_weight) x = self.activation(x) - + if self.dropout > 0.0: x = F.dropout(x, p=self.dropout, training=self.training) @@ -128,7 +128,7 @@ def _message_pass(self, data): x = self.activation(x) if self.dropout > 0.0: x = F.dropout(x, p=self.dropout, training=self.training) - + return x def forward(self, data): @@ -136,7 +136,7 @@ def forward(self, data): Full forward pass including the task-specific head. """ x = self._message_pass(data) - + return self.regressor(x) def get_embeddings(self, data): @@ -177,7 +177,7 @@ def __init__( ): if seed is not None: set_seed(seed) - + super().__init__() self.dropout = process_dropout(dropout) @@ -208,13 +208,13 @@ def _message_pass(self, data): """ x, edge_index = data.x, data.edge_index edge_attr = getattr(data, "edge_attr", None) - + if edge_attr is not None and edge_attr.dim() == 1: edge_attr = edge_attr.unsqueeze(1) x = self.conv_first(x, edge_index, edge_attr=edge_attr) x = self.activation(x) - + if self.dropout > 0.0: x = F.dropout(x, p=self.dropout, training=self.training) @@ -223,7 +223,7 @@ def _message_pass(self, data): x = self.activation(x) if self.dropout > 0.0: x = F.dropout(x, p=self.dropout, training=self.training) - + return x def forward(self, data): @@ -231,7 +231,7 @@ def forward(self, data): Full forward pass. """ x = self._message_pass(data) - + return self.regressor(x) def get_embeddings(self, data): @@ -267,7 +267,7 @@ def __init__( ): if seed is not None: set_seed(seed) - + super().__init__() self.dropout = process_dropout(dropout) @@ -297,7 +297,7 @@ def _message_pass(self, data): x = self.conv_first(x, edge_index) x = self.activation(x) - + if self.dropout > 0.0: x = F.dropout(x, p=self.dropout, training=self.training) @@ -306,7 +306,7 @@ def _message_pass(self, data): x = self.activation(x) if self.dropout > 0.0: x = F.dropout(x, p=self.dropout, training=self.training) - + return x def forward(self, data): @@ -314,7 +314,7 @@ def forward(self, data): Full forward pass. """ x = self._message_pass(data) - + return self.regressor(x) def get_embeddings(self, data): @@ -351,7 +351,7 @@ def __init__( ): if seed is not None: set_seed(seed) - + super().__init__() self.dropout = process_dropout(dropout) @@ -383,13 +383,13 @@ def _message_pass(self, data): """ x, edge_index = data.x, data.edge_index edge_attr = getattr(data, "edge_attr", None) - + if edge_attr is not None and edge_attr.dim() == 1: edge_attr = edge_attr.unsqueeze(1) x = self.conv_first(x, edge_index, edge_attr=edge_attr) x = self.activation(x) - + if self.dropout > 0.0: x = F.dropout(x, p=self.dropout, training=self.training) @@ -398,7 +398,7 @@ def _message_pass(self, data): x = self.activation(x) if self.dropout > 0.0: x = F.dropout(x, p=self.dropout, training=self.training) - + return x def forward(self, data): @@ -406,11 +406,11 @@ def forward(self, data): Full forward pass. """ x = self._message_pass(data) - + return self.regressor(x) def get_embeddings(self, data): """ Extract latent node embeddings. """ - return self._message_pass(data) \ No newline at end of file + return self._message_pass(data) diff --git a/bioneuralnet/utils/__init__.py b/bioneuralnet/utils/__init__.py index b0ce7dc..98311a3 100644 --- a/bioneuralnet/utils/__init__.py +++ b/bioneuralnet/utils/__init__.py @@ -1,6 +1,6 @@ """Utility Module -This module provides a collection of helper functions for data preprocessing, +This module provides a collection of helper functions for data preprocessing, feature selection, statistical data exploration, graph network pruning, and reproducibility. """ @@ -44,7 +44,7 @@ __all__ = [ "get_logger", "set_seed", - + "variance_summary", "zero_summary", "expression_summary", @@ -52,7 +52,7 @@ "nan_summary", "sparse_filter" "data_stats", - + "variance_threshold", "mad_filter", "pca_loadings", @@ -60,7 +60,7 @@ "correlation_filter", "importance_rf", "top_anova_f_features", - + "m_transform", "impute_simple", "impute_knn", @@ -68,9 +68,9 @@ "clean_inf_nan", "clean_internal", "preprocess_clinical", - + "prune_network", "prune_network_by_quantile", "network_remove_low_variance", "network_remove_high_zero_fraction", -] \ No newline at end of file +] diff --git a/bioneuralnet/utils/data.py b/bioneuralnet/utils/data.py index cf8126f..287cef9 100644 --- a/bioneuralnet/utils/data.py +++ b/bioneuralnet/utils/data.py @@ -15,7 +15,7 @@ def variance_summary(df: pd.DataFrame, var_threshold: Optional[float] = None) -> var_threshold (Optional[float]): A threshold used to count features falling below this variance level. Returns: - dict: A dictionary containing the mean, median, min, max, and standard deviation of the column variances. + dict: A dictionary containing the mean, median, min, max, and standard deviation of the column variances. If a threshold is provided, it also includes 'Number Of Low Variance Features'. """ variances = df.var() @@ -41,7 +41,7 @@ def zero_summary(df: pd.DataFrame, zero_threshold: Optional[float] = None) -> di zero_threshold (Optional[float]): A threshold used to count features whose zero-fraction exceeds this value. Returns: - dict: A dictionary containing the mean, median, min, max, and standard deviation of the zero fractions. + dict: A dictionary containing the mean, median, min, max, and standard deviation of the zero fractions. If a threshold is provided, it includes 'Number Of High Zero Features'. """ zero_fraction = (df == 0).sum(axis=0) / df.shape[0] @@ -90,7 +90,9 @@ def correlation_summary(df: pd.DataFrame) -> dict: dict: A dictionary containing the mean, median, min, max, and std of the max absolute correlations. """ corr_matrix = df.corr().abs() - np.fill_diagonal(corr_matrix.values, 0) + vals = corr_matrix.to_numpy().copy() + np.fill_diagonal(vals, 0) + corr_matrix = pd.DataFrame(vals, index=corr_matrix.index, columns=corr_matrix.columns) max_corr = corr_matrix.max() summary = { @@ -109,36 +111,36 @@ def nan_summary(df: pd.DataFrame, name: str = "Dataset", missing_threshold: floa df (pd.DataFrame): The input omics DataFrame. name (str): A descriptive name for the dataset for clear output labeling. missing_threshold (float): Percentage threshold (0-100) to trigger a warning for highly missing data. - + Returns: float: The global percentage of missing values (NaNs) in the DataFrame. """ total_elements = df.size total_nans = df.isna().sum().sum() pct_missing = (total_nans / total_elements) * 100 - + logger.info(f"=== {name} NaN Report ===") logger.info(f"Shape: {df.shape} (Samples: {df.shape[0]}, Features: {df.shape[1]})") logger.info(f"Global NaN: {pct_missing:.2f}%\n") - + if total_nans > 0: feature_nan_pct = (df.isna().sum(axis=0) / df.shape[0]) * 100 sample_nan_pct = (df.isna().sum(axis=1) / df.shape[1]) * 100 - + logger.info("Top 5 FEATURES with most missing data:") logger.info("\n" + feature_nan_pct.sort_values(ascending=False).head(5).to_string(float_format="{:.2f}%".format)) - + logger.info("\nTop 5 SAMPLES with most missing data:") logger.info("\n" + sample_nan_pct.sort_values(ascending=False).head(5).to_string(float_format="{:.2f}%".format)) - + high_missing_features = (feature_nan_pct > missing_threshold).sum() high_missing_samples = (sample_nan_pct > missing_threshold).sum() - + if high_missing_features > 0: logger.warning(f"{high_missing_features} features are missing in >{missing_threshold}% of samples.") if high_missing_samples > 0: logger.warning(f"{high_missing_samples} samples are missing >{missing_threshold}% of their features.") - + logger.info("-" * 50) return pct_missing @@ -155,14 +157,14 @@ def sparse_filter(df: pd.DataFrame, missing_fraction: float = 0.20) -> pd.DataFr """ min_valid_samples = int(df.shape[0] * (1 - missing_fraction)) df_filtered = df.dropna(axis=1, thresh=min_valid_samples) - + min_valid_features = int(df_filtered.shape[1] * (1 - missing_fraction)) return df_filtered.dropna(axis=0, thresh=min_valid_features) def data_stats(df: pd.DataFrame, name: str = "Data", compute_correlation: bool = False) -> None: """Prints a comprehensive set of key statistics for an omics DataFrame. - Combines variance, zero fraction, expression, correlation, and missingness summaries + Combines variance, zero fraction, expression, correlation, and missingness summaries for rapid data quality assessment. Recommends data cleaning steps if high missingness is found. Args: @@ -174,8 +176,7 @@ def data_stats(df: pd.DataFrame, name: str = "Data", compute_correlation: bool = None: Logs the statistics directly to the console. """ logger.info(f"=== {name} Statistics Overview ===") - - # --- Variance Summary --- + var_stats = variance_summary(df, var_threshold=1e-4) logger.info("--- Variance Summary ---") for key, value in var_stats.items(): @@ -183,7 +184,6 @@ def data_stats(df: pd.DataFrame, name: str = "Data", compute_correlation: bool = logger.info(f"{key:<32}: {clean_val}") logger.info("") - # --- Zero Summary --- zero_stats = zero_summary(df, zero_threshold=0.50) logger.info("--- Zero Summary ---") for key, value in zero_stats.items(): @@ -191,7 +191,6 @@ def data_stats(df: pd.DataFrame, name: str = "Data", compute_correlation: bool = logger.info(f"{key:<32}: {clean_val}") logger.info("") - # --- Expression Summary --- expr_stats = expression_summary(df) logger.info("--- Expression Summary ---") for key, value in expr_stats.items(): @@ -199,7 +198,6 @@ def data_stats(df: pd.DataFrame, name: str = "Data", compute_correlation: bool = logger.info(f"{key:<32}: {clean_val}") logger.info("") - # --- Correlation Summary --- if compute_correlation: try: corr_stats = correlation_summary(df) @@ -209,28 +207,27 @@ def data_stats(df: pd.DataFrame, name: str = "Data", compute_correlation: bool = logger.info(f"{key:<32}: {clean_val}") logger.info("") except Exception as e: - logger.warning(f"--- Correlation Summary ---\nCould not compute due to: {e}\n") + logger.info("--- Correlation Summary ---") + logger.info(f"Could not compute due to: {e}\n") else: logger.info("--- Correlation Summary ---") logger.info(f"{'Skipped':<32}: (compute_correlation=False)\n") - # --- NaN / Missingness Summary --- pct_missing = nan_summary(df, name=name) - - # --- SMART RECOMMENDATIONS ENGINE --- + logger.info(f"--- {name} Recommendations ---") - + # 1. Missingness Check if pct_missing > 30.0: logger.warning( f"SPARSITY: {pct_missing:.2f}% missing data. " f"Consider running `df = sparse_filter(df, missing_fraction=0.30)`." ) - + # 2. Beta Value Check (Bounded between 0 and 1) expr_min = expr_stats["Expression Min"] expr_max = expr_stats["Expression Max"] - + if expr_min >= 0.0 and expr_max <= 1.0: logger.warning( "NORMALIZATION: Values are strictly bounded between 0 and 1. " @@ -246,5 +243,5 @@ def data_stats(df: pd.DataFrame, name: str = "Data", compute_correlation: bool = else: logger.info("NORMALIZATION: Data distribution looks unbounded with low exact zeros. " "Appears properly transformed.") - - logger.info("=" * 50 + "\n") \ No newline at end of file + + logger.info("=" * 50 + "\n") diff --git a/bioneuralnet/utils/feature_selection.py b/bioneuralnet/utils/feature_selection.py index 809900c..5814cd2 100644 --- a/bioneuralnet/utils/feature_selection.py +++ b/bioneuralnet/utils/feature_selection.py @@ -55,10 +55,10 @@ def mad_filter(df: pd.DataFrame, n_keep: int) -> pd.DataFrame: """ if n_keep >= df.shape[1]: return df - + mad = (df - df.median(axis=0)).abs().median(axis=0) top_features = mad.nlargest(n_keep).index - + return df[top_features] def pca_loadings(df: pd.DataFrame, n_keep: int, n_components: int = 50, seed: int = 1883) -> pd.DataFrame: @@ -86,22 +86,22 @@ def pca_loadings(df: pd.DataFrame, n_keep: int, n_components: int = 50, seed: in scaled_values = scaler.fit_transform(df.values) pca = PCA(n_components=n_components, random_state=seed) - pca.fit(scaled_values) + pca.fit(scaled_values) weighted_loadings = np.abs(pca.components_).T * pca.explained_variance_ratio_ feature_importance = weighted_loadings.max(axis=1) - + importance_series = pd.Series(feature_importance, index=df.columns) top_features = importance_series.nlargest(n_keep).index - + return df[top_features] def laplacian_score(df: pd.DataFrame, n_keep: int, k_neighbors: int = 5) -> pd.DataFrame: """Unsupervised feature selection via the Laplacian Score to address dimensionality. - Evaluates a feature's ability to preserve the local manifold structure of the data. - The score is computed as the sum of squared differences between connected samples - weighted by the global network (W_ij), divided by the feature's variance. + Evaluates a feature's ability to preserve the local manifold structure of the data. + The score is computed as the sum of squared differences between connected samples + weighted by the global network (W_ij), divided by the feature's variance. Lower scores indicate higher importance (smoothness on the graph). Args: @@ -111,7 +111,7 @@ def laplacian_score(df: pd.DataFrame, n_keep: int, k_neighbors: int = 5) -> pd.D k_neighbors (int): Number of neighbors to use when building the k-NN graph. Returns: - + pd.DataFrame: DataFrame containing only the top n_keep features. """ if n_keep >= df.shape[1]: @@ -120,23 +120,23 @@ def laplacian_score(df: pd.DataFrame, n_keep: int, k_neighbors: int = 5) -> pd.D X = StandardScaler().fit_transform(df.values) A = kneighbors_graph(X, n_neighbors=k_neighbors, mode='connectivity', metric='cosine', include_self=False) W = A.maximum(A.T) - + D = np.array(W.sum(axis=1)).flatten() D_sum = D.sum() - + mean_f = np.dot(X.T, D) / D_sum X_centered = X - mean_f[np.newaxis, :] - + den = np.dot(D, X_centered**2) W_X = W.dot(X_centered) f_W_f = np.sum(X_centered * W_X, axis=0) num = den - f_W_f - + laplacian_scores = num / (den + 1e-8) - + importance_series = pd.Series(laplacian_scores, index=df.columns) top_features = importance_series.nsmallest(n_keep).index - + return df[top_features] def correlation_filter(X: pd.DataFrame, y: pd.Series | None = None, top_k: int = 1000) -> pd.DataFrame: @@ -314,4 +314,4 @@ def top_anova_f_features(X: pd.DataFrame, y: pd.Series, max_features: int, alpha logger.info(f"Selected {len(final_idx)} features by ANOVA (task={task}), {n_sig} significant, {n_pad} padded") - return num.iloc[:, final_idx] \ No newline at end of file + return num.iloc[:, final_idx] diff --git a/bioneuralnet/utils/logger.py b/bioneuralnet/utils/logger.py index 3039bd0..fb45883 100644 --- a/bioneuralnet/utils/logger.py +++ b/bioneuralnet/utils/logger.py @@ -34,4 +34,4 @@ def get_logger(name: str) -> logging.Logger: logger.addHandler(fh) logger.addHandler(ch) - return logger \ No newline at end of file + return logger diff --git a/bioneuralnet/utils/preprocess.py b/bioneuralnet/utils/preprocess.py index 0cef20e..1226664 100644 --- a/bioneuralnet/utils/preprocess.py +++ b/bioneuralnet/utils/preprocess.py @@ -74,8 +74,8 @@ def impute_simple(df: pd.DataFrame, method: str = "mean") -> pd.DataFrame: def impute_knn(df: pd.DataFrame, n_neighbors: int = 5) -> pd.DataFrame: """Imputes missing values (NaNs) using the K-Nearest Neighbors (KNN) approach. - KNN imputation replaces missing values with the average value from the - 'n_neighbors' most similar samples. + KNN imputation replaces missing values with the average value from the + 'n_neighbors' most similar samples. NOTE: Input data should be scaled/normalized prior to imputation. Args: @@ -93,7 +93,7 @@ def impute_knn(df: pd.DataFrame, n_neighbors: int = 5) -> pd.DataFrame: raise ValueError(err_msg) n_missing_before = df.isna().sum().sum() - + if n_missing_before == 0: logger.info(f"No NaNs found in DataFrame of shape {df.shape}. Skipping imputation.") return df @@ -105,7 +105,7 @@ def impute_knn(df: pd.DataFrame, n_neighbors: int = 5) -> pd.DataFrame: imputer = KNNImputer(n_neighbors=n_neighbors) imputed_data = imputer.fit_transform(df.values) - + imputed_df = pd.DataFrame(imputed_data, index=df.index, columns=df.columns) n_missing_after = imputed_df.isna().sum().sum() @@ -271,7 +271,7 @@ def preprocess_clinical(X: pd.DataFrame, scale: bool = False, drop_columns: Opti if impute: for col in df_numeric.columns: df_numeric[col] = df_numeric[col].fillna(df_numeric[col].median()) - + if scale: scaler = RobustScaler() scaled_array = scaler.fit_transform(df_numeric) @@ -286,7 +286,7 @@ def preprocess_clinical(X: pd.DataFrame, scale: bool = False, drop_columns: Opti for col in df_categorical.columns: df_categorical[col] = df_categorical[col].astype(str).str.lower().str.strip() df_categorical[col] = df_categorical[col].replace('nan', np.nan) - + df_cat_encoded = pd.get_dummies(df_categorical, dummy_na=True, drop_first=True, dtype=int) else: logger.info("No categorical data found to encode.") @@ -294,7 +294,7 @@ def preprocess_clinical(X: pd.DataFrame, scale: bool = False, drop_columns: Opti df_combined = pd.concat([df_numeric_scaled, df_cat_encoded], axis=1, join="outer") df_final = df_combined.loc[:, df_combined.std(axis=0, ddof=0) > 0] - + df_final = df_final.astype(np.float32) logger.info(f"Clinical data cleaning complete. Final shape: {df_final.shape}") @@ -435,4 +435,4 @@ def network_remove_high_zero_fraction(network: pd.DataFrame, threshold: float = filtered_network = network.loc[valid_indices, valid_indices] logger.info(f"Original network shape: {network.shape}, Filtered shape: {filtered_network.shape}") - return filtered_network \ No newline at end of file + return filtered_network diff --git a/docs/source/_static/logo_update.png b/docs/source/_static/logo_update.png index b8eb7fe..84107c7 100644 Binary files a/docs/source/_static/logo_update.png and b/docs/source/_static/logo_update.png differ diff --git a/docs/source/_static/logo_update2.png b/docs/source/_static/logo_update2.png new file mode 100644 index 0000000..b8eb7fe Binary files /dev/null and b/docs/source/_static/logo_update2.png differ diff --git a/docs/source/conf.py b/docs/source/conf.py index 8ca89e1..181f17b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -12,7 +12,7 @@ try: release = metadata.version("bioneuralnet") except metadata.PackageNotFoundError: - release = "1.2.2" + release = "1.3.0" project = "BioNeuralNet" version = release @@ -54,7 +54,7 @@ html_css_files = ["custom.css"] html_theme = "furo" #html_title = "" -html_logo = "_static/LOGO_TB.svg" +html_logo = "_static/logo_update.png" #html_theme = "sphinx_rtd_theme" intersphinx_mapping = { diff --git a/setup.cfg b/setup.cfg index 7d93340..8d9f9e8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = bioneuralnet -version = 1.2.2 +version = 1.3.0 author = Vicente Ramos author_email = vicente.ramos@ucdenver.edu description = A Graph Neural Network based Multi-Omics Network Data Analysis Tool diff --git a/tests/__init__.py b/tests/__init__.py index 6a00bac..7bcdb40 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,11 +1,12 @@ from . import ( test_gnn_embedding, + test_network_tools, + test_network_generate, test_subject_representation, test_hybrid_louvain, test_downstream_dpmon, - test_utils_graph, - test_utils_graph_tools, test_utils_preprocess, + test_utils_feature_selection, test_utils_data, test_correlation_metrics, test_dataset_loader, @@ -13,12 +14,13 @@ __all__ = [ "test_gnn_embedding", + "test_network_tools", + "test_network_generate", "test_subject_representation", "test_hybrid_louvain", "test_downstream_dpmon", - "test_utils_graph", - "test_utils_graph_tools", "test_utils_preprocess", + "test_utils_feature_selection", "test_utils_data", "test_correlation_metrics", "test_dataset_loader", diff --git a/tests/test_correlation_metrics.py b/tests/test_correlation_metrics.py index 72b0579..ebd6c48 100644 --- a/tests/test_correlation_metrics.py +++ b/tests/test_correlation_metrics.py @@ -1,85 +1,106 @@ import unittest import pandas as pd import numpy as np -from scipy.stats import pearsonr -from bioneuralnet.metrics import omics_correlation -from bioneuralnet.metrics import cluster_correlation -from bioneuralnet.metrics import louvain_to_adjacency +from bioneuralnet.metrics.correlation import ( + omics_correlation, + cluster_pca_correlation, + cluster_correlation, +) -class TestDataFunctions(unittest.TestCase): +class TestCorrelationMetrics(unittest.TestCase): def setUp(self): self.omics = pd.DataFrame({ "g1": [1.0, 2.0, 3.0, 4.0], - "g2": [2.0, 4.0, 6.0, 8.0] + "g2": [2.0, 4.0, 6.0, 8.0], }, index=["s1", "s2", "s3", "s4"]) - self.pheno = pd.DataFrame({"phen": [10.0, 20.0, 30.0, 40.0]}, index=["s1", "s2", "s3", "s4"]) + self.pheno = pd.DataFrame( + {"phen": [10.0, 20.0, 30.0, 40.0]}, + index=["s1", "s2", "s3", "s4"], + ) def test_omics_correlation_valid(self): corr, pval = omics_correlation(self.omics, self.pheno) - expected_pc1 = np.array([ -1.34164, -0.44721, 0.44721, 1.34164 ]) - expected_corr, _ = pearsonr(expected_pc1, self.pheno["phen"].values) - self.assertAlmostEqual(corr, expected_corr, places=5) + self.assertIsInstance(corr, float) self.assertTrue(0 <= pval <= 1) + self.assertAlmostEqual(abs(corr), 1.0, places=5) - def test_omics_correlation_empty(self): - empty = pd.DataFrame() + def test_omics_correlation_empty_raises(self): with self.assertRaises(ValueError): - omics_correlation(empty, self.pheno) + omics_correlation(pd.DataFrame(), self.pheno) with self.assertRaises(ValueError): omics_correlation(self.omics, pd.DataFrame()) - def test_omics_correlation_mismatch(self): - pheno_short = pd.DataFrame({"phen": [1.0, 2.0]}, index=["s1", "s2"]) + def test_omics_correlation_length_mismatch_raises(self): + short_pheno = pd.DataFrame({"phen": [1.0, 2.0]}, index=["s1", "s2"]) with self.assertRaises(ValueError): - omics_correlation(self.omics, pheno_short) + omics_correlation(self.omics, short_pheno) - def test_cluster_correlation_size_one(self): + def test_cluster_pca_correlation_valid(self): + df = pd.DataFrame({ + "f1": [1.0, 2.0, 3.0, 4.0], + "f2": [2.0, 4.0, 6.0, 8.0], + }, index=["s1", "s2", "s3", "s4"]) + ph = pd.DataFrame({"p": [5.0, 10.0, 15.0, 20.0]}, index=["s1", "s2", "s3", "s4"]) + size, corr = cluster_pca_correlation(df, ph) + self.assertEqual(size, 2) + self.assertIsNotNone(corr) + self.assertAlmostEqual(abs(corr), 1.0, places=5) + + def test_cluster_pca_correlation_size_one(self): df = pd.DataFrame({"a": [1.0, 2.0, 3.0]}, index=["x", "y", "z"]) - size, corr = cluster_correlation(df, pd.DataFrame({"p": [1.0, 2.0, 3.0]}, index=["x", "y", "z"])) + ph = pd.DataFrame({"p": [1.0, 2.0, 3.0]}, index=["x", "y", "z"]) + size, corr = cluster_pca_correlation(df, ph) self.assertEqual(size, 1) self.assertIsNone(corr) - def test_cluster_correlation_zero_variance(self): + def test_cluster_pca_correlation_zero_variance(self): df = pd.DataFrame({ "c1": [1.0, 1.0, 1.0], - "c2": [2.0, 2.0, 2.0] + "c2": [2.0, 2.0, 2.0], }, index=["i1", "i2", "i3"]) - size, corr = cluster_correlation(df, pd.DataFrame({"p": [1.0, 2.0, 3.0]}, index=["i1", "i2", "i3"])) + ph = pd.DataFrame({"p": [1.0, 2.0, 3.0]}, index=["i1", "i2", "i3"]) + size, corr = cluster_pca_correlation(df, ph) self.assertEqual(size, 2) self.assertIsNone(corr) - def test_cluster_correlation_insufficient_samples(self): + def test_cluster_pca_correlation_insufficient_samples(self): df = pd.DataFrame({ "f1": [1.0, 2.0], - "f2": [2.0, 4.0] + "f2": [2.0, 4.0], }, index=["s1", "s2"]) ph = pd.DataFrame({"p": [5.0, 10.0]}, index=["s1", "s2"]) - size, corr = cluster_correlation(df, ph) + size, corr = cluster_pca_correlation(df, ph) self.assertEqual(size, 2) self.assertIsNone(corr) - def test_cluster_correlation_valid(self): - df = pd.DataFrame({ - "f1": [1.0, 2.0, 3.0, 4.0], - "f2": [2.0, 4.0, 6.0, 8.0] - }, index=["s1", "s2", "s3", "s4"]) - ph = pd.DataFrame({"p": [5.0, 10.0, 15.0, 20.0]}, index=["s1", "s2", "s3", "s4"]) - size, corr = cluster_correlation(df, ph) - self.assertEqual(size, 2) - self.assertAlmostEqual(corr, 1.0, places=5) - - def test_louvain_to_adjacency(self): + def test_cluster_correlation_returns_adjacency(self): df = pd.DataFrame({ "a": [1.0, 2.0, 3.0], "b": [2.0, 4.0, 6.0], - "c": [1.0, 0.0, 1.0] + "c": [1.0, 0.0, 1.0], }, index=["x", "y", "z"]) - adj = louvain_to_adjacency(df) + adj = cluster_correlation(df) + self.assertIsInstance(adj, pd.DataFrame) self.assertEqual(set(adj.columns), {"a", "b", "c"}) self.assertEqual(set(adj.index), {"a", "b", "c"}) + + def test_cluster_correlation_diagonal_zero(self): + adj = cluster_correlation(self.omics) self.assertTrue((np.diag(adj.values) == 0).all()) + + def test_cluster_correlation_no_nans(self): + adj = cluster_correlation(self.omics) self.assertFalse(adj.isna().any().any()) + def test_cluster_correlation_symmetric(self): + df = pd.DataFrame({ + "a": [1.0, 2.0, 3.0, 4.0], + "b": [4.0, 3.0, 2.0, 1.0], + "c": [1.0, 3.0, 2.0, 4.0], + }, index=["s1", "s2", "s3", "s4"]) + adj = cluster_correlation(df) + pd.testing.assert_frame_equal(adj, adj.T) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_dataset_loader.py b/tests/test_dataset_loader.py index 10a01c9..9880dc2 100644 --- a/tests/test_dataset_loader.py +++ b/tests/test_dataset_loader.py @@ -8,7 +8,6 @@ load_brca, load_lgg, load_kipan, - load_paad ) class TestDatasetLoader(unittest.TestCase): @@ -35,7 +34,7 @@ def test_monet_loads(self): def test_brca_loads(self): loader = DatasetLoader("brca") keys = set(loader.data.keys()) - self.assertEqual(keys, {"mirna", "target", "clinical", "rna", "meth"}) + self.assertEqual(keys, {"mirna", "target", "clinical", "rna", "methylation"}) for df in loader.data.values(): self.assertIsInstance(df, pd.DataFrame) @@ -45,7 +44,7 @@ def test_brca_loads(self): def test_lgg_loads(self): loader = DatasetLoader("lgg") keys = set(loader.data.keys()) - self.assertEqual(keys, {"mirna", "target", "clinical", "rna", "meth"}) + self.assertEqual(keys, {"mirna", "target", "clinical", "rna", "methylation"}) for df in loader.data.values(): self.assertIsInstance(df, pd.DataFrame) @@ -55,17 +54,7 @@ def test_lgg_loads(self): def test_kipan_loads(self): loader = DatasetLoader("kipan") keys = set(loader.data.keys()) - self.assertEqual(keys, {"mirna", "target", "clinical", "rna", "meth"}) - - for df in loader.data.values(): - self.assertIsInstance(df, pd.DataFrame) - self.assertGreater(df.shape[0], 0) - self.assertGreater(df.shape[1], 0) - - def test_paad_loads(self): - loader = DatasetLoader("paad") - keys = set(loader.data.keys()) - self.assertEqual(keys, {"cnv", "target", "clinical", "rna", "meth"}) + self.assertEqual(keys, {"mirna", "target", "clinical", "rna", "methylation"}) for df in loader.data.values(): self.assertIsInstance(df, pd.DataFrame) @@ -84,25 +73,11 @@ def test_functional_loaders(self): self.assertIsInstance(load_brca(), dict) self.assertIsInstance(load_lgg(), dict) self.assertIsInstance(load_kipan(), dict) - self.assertIsInstance(load_paad(), dict) self.assertEqual(load_brca().keys(), DatasetLoader("brca").data.keys()) def test_invalid_folder_raises(self): with self.assertRaises(FileNotFoundError): DatasetLoader("nonexistent_folder") - def test_unrecognized_name_raises(self): - base = Path(__file__).parent.parent / "bioneuralnet" / "datasets" - dummy = base / "dummy" - dummy.mkdir(exist_ok=True) - (dummy / "placeholder.csv").write_text("a,b\n1,2") - - with self.assertRaises(ValueError): - DatasetLoader("dummy") - - for child in dummy.iterdir(): - child.unlink() - dummy.rmdir() - if __name__ == "__main__": unittest.main() diff --git a/tests/test_hybrid_louvain.py b/tests/test_hybrid_louvain.py index a58cac8..500272e 100644 --- a/tests/test_hybrid_louvain.py +++ b/tests/test_hybrid_louvain.py @@ -5,137 +5,123 @@ from bioneuralnet.clustering.hybrid_louvain import HybridLouvain from bioneuralnet.datasets import DatasetLoader -from bioneuralnet.network.generate import gen_correlation_graph - +from bioneuralnet.network.generate import correlation_network class TestHybridLouvain(unittest.TestCase): def setUp(self): - # Using our synthetic example dataset for testing example = DatasetLoader("example") X1 = example["X1"] X2 = example["X2"] Y_df = example["Y"] - if isinstance(Y_df, pd.DataFrame): - self.Y = Y_df.iloc[:, 0] - else: - self.Y = Y_df - - # A small subset of features to keep tests light - X1_small = X1.iloc[:, 0:15] - X2_small = X2.iloc[:, 0:15] - - self.B = pd.concat([X1_small, X2_small], axis=1) - - # Build a feature–feature graph from the omics data - adj = gen_correlation_graph( - self.B, - k=3, - method="pearson", - signed=True, - normalize=True, - mutual=False, - per_node=True, - threshold=None, - self_loops=False, - ) + self.Y = Y_df.iloc[:, 0] if isinstance(Y_df, pd.DataFrame) else Y_df - self.G = nx.from_pandas_adjacency(adj) + self.B = pd.concat([X1.iloc[:, :15], X2.iloc[:, :15]], axis=1) - # we store node list so we can build fake partitions over real node names - self.node_list = [] - for n in self.G.nodes(): - self.node_list.append(n) + adj = correlation_network( + self.B, k=3, method="pearson", signed=True, + normalize=True, mutual=False, per_node=True, + threshold=None, self_loops=False, + ) + self.G = nx.from_pandas_adjacency(adj) + self.node_list = list(self.G.nodes()) + + def _make_fake_louvain(self): + fake = MagicMock() + partition = {n: 0 for n in self.node_list} + fake.run.return_value = partition + fake.get_top_communities.return_value = [(0, 0.7, self.node_list)] + fake.get_combined_quality.return_value = 0.5 + return fake + + def _make_fake_pagerank(self): + fake = MagicMock() + fake.run.return_value = { + "cluster_nodes": self.node_list, + "conductance": 0.5, + "composite_score": 0.6, + } + fake.phen_omics_corr.return_value = (0.8,) + return fake @patch("bioneuralnet.clustering.hybrid_louvain.CorrelatedLouvain", autospec=True) @patch("bioneuralnet.clustering.hybrid_louvain.CorrelatedPageRank", autospec=True) - def test_run_returns_partition_and_clusters_dict(self, mock_page_rank_cls, mock_louvain_cls): - fake_louvain = MagicMock() - - # partition: put every node into community 0 - partition = {} - for n in self.node_list: - partition[n] = 0 - fake_louvain.run.return_value = partition - fake_louvain.get_quality.return_value = 0.5 - - def fake_compute_corr(nodes): - return (0.7, None) - - fake_louvain._compute_community_correlation.side_effect = fake_compute_corr - mock_louvain_cls.return_value = fake_louvain + def test_run_returns_dict_with_expected_keys(self, mock_pr_cls, mock_lou_cls): + mock_lou_cls.return_value = self._make_fake_louvain() + mock_pr_cls.return_value = self._make_fake_pagerank() - fake_pagerank = MagicMock() - - def fake_pr_run(best_seed): - return {"cluster_nodes": best_seed,"conductance": 0.5,"correlation": 0.8,"composite_score": 0.6} - - fake_pagerank.run.side_effect = fake_pr_run - mock_page_rank_cls.return_value = fake_pagerank - - hybrid = HybridLouvain(G=self.G, B=self.B, Y=self.Y.to_frame()) + hybrid = HybridLouvain(G=self.G, B=self.B, Y=self.Y) result = hybrid.run(as_dfs=False) self.assertIsInstance(result, dict) - self.assertIn("curr", result) - self.assertIn("clus", result) + for key in ("best_nodes", "best_correlation", "best_iteration", "iterations", "all_subgraphs"): + self.assertIn(key, result) - # partition should be exactly what CorrelatedLouvain.run returned - self.assertEqual(result["curr"], partition) + @patch("bioneuralnet.clustering.hybrid_louvain.CorrelatedLouvain", autospec=True) + @patch("bioneuralnet.clustering.hybrid_louvain.CorrelatedPageRank", autospec=True) + def test_run_best_nodes_and_correlation(self, mock_pr_cls, mock_lou_cls): + mock_lou_cls.return_value = self._make_fake_louvain() + mock_pr_cls.return_value = self._make_fake_pagerank() - # 1 cluster recorded for iteration 0 - clus = result["clus"] - self.assertIsInstance(clus, dict) - self.assertIn(0, clus) + hybrid = HybridLouvain(G=self.G, B=self.B, Y=self.Y) + result = hybrid.run(as_dfs=False) - cluster_nodes = clus[0] - self.assertEqual(len(cluster_nodes), len(self.node_list)) + self.assertIsInstance(result["best_nodes"], list) + self.assertIsInstance(result["best_correlation"], float) + self.assertGreaterEqual(result["best_correlation"], 0.0) + self.assertIsInstance(result["iterations"], list) + self.assertGreater(len(result["iterations"]), 0) - # Checking that all graph nodes are in the cluster - for n in self.node_list: - self.assertIn(n, cluster_nodes) + @patch("bioneuralnet.clustering.hybrid_louvain.CorrelatedLouvain", autospec=True) + @patch("bioneuralnet.clustering.hybrid_louvain.CorrelatedPageRank", autospec=True) + def test_run_as_dfs_returns_adjacency_dataframes(self, mock_pr_cls, mock_lou_cls): + mock_lou_cls.return_value = self._make_fake_louvain() + mock_pr_cls.return_value = self._make_fake_pagerank() - mock_louvain_cls.assert_called() - mock_page_rank_cls.assert_called() + hybrid = HybridLouvain(G=self.G, B=self.B, Y=self.Y) + result = hybrid.run(as_dfs=True) + + self.assertIsInstance(result, list) + self.assertGreater(len(result), 0) + for df in result: + self.assertIsInstance(df, pd.DataFrame) + self.assertEqual(df.shape[0], df.shape[1]) @patch("bioneuralnet.clustering.hybrid_louvain.CorrelatedLouvain", autospec=True) @patch("bioneuralnet.clustering.hybrid_louvain.CorrelatedPageRank", autospec=True) - def test_run_as_dfs_returns_list_of_dataframes(self, mock_page_rank_cls, mock_louvain_cls): - fake_louvain = MagicMock() - - partition = {} - for n in self.node_list: - partition[n] = 0 - fake_louvain.run.return_value = partition - fake_louvain.get_quality.return_value = 0.5 - fake_louvain._compute_community_correlation.side_effect = lambda nodes: (0.7, None) - mock_louvain_cls.return_value = fake_louvain + def test_run_accepts_dataframe_phenotype(self, mock_pr_cls, mock_lou_cls): + mock_lou_cls.return_value = self._make_fake_louvain() + mock_pr_cls.return_value = self._make_fake_pagerank() - fake_pagerank = MagicMock() + hybrid = HybridLouvain(G=self.G, B=self.B, Y=self.Y.to_frame()) + result = hybrid.run(as_dfs=False) + self.assertIsInstance(result, dict) - fake_pagerank.run.side_effect = lambda best_seed: {"cluster_nodes": best_seed,"conductance": 0.1,"correlation": 0.1,"composite_score": 0.1} - mock_page_rank_cls.return_value = fake_pagerank + @patch("bioneuralnet.clustering.hybrid_louvain.CorrelatedLouvain", autospec=True) + @patch("bioneuralnet.clustering.hybrid_louvain.CorrelatedPageRank", autospec=True) + def test_run_calls_louvain_and_pagerank(self, mock_pr_cls, mock_lou_cls): + mock_lou_cls.return_value = self._make_fake_louvain() + mock_pr_cls.return_value = self._make_fake_pagerank() hybrid = HybridLouvain(G=self.G, B=self.B, Y=self.Y) - dfs_list = hybrid.run(as_dfs=True) - - self.assertIsInstance(dfs_list, list) - # we expect a single omics subnetwork DataFrame - self.assertEqual(len(dfs_list), 1) + hybrid.run(as_dfs=False) - df0 = dfs_list[0] - self.assertIsInstance(df0, pd.DataFrame) + mock_lou_cls.assert_called() + mock_pr_cls.assert_called() - # Columns should match the omics columns used to build the graph - cols_set = set(df0.columns) - expected_cols_set = set(self.B.columns) - self.assertEqual(cols_set, expected_cols_set) + def test_init_accepts_adjacency_dataframe(self): + adj_df = nx.to_pandas_adjacency(self.G) + hybrid = HybridLouvain(G=adj_df, B=self.B, Y=self.Y) + self.assertIsInstance(hybrid.G_original, nx.Graph) - # And values should match the original subset of omics data - pd.testing.assert_frame_equal(df0, self.B.loc[:, df0.columns]) + def test_init_invalid_graph_raises(self): + with self.assertRaises(TypeError): + HybridLouvain(G="not_a_graph", B=self.B, Y=self.Y) - mock_louvain_cls.assert_called() - mock_page_rank_cls.assert_called() + def test_best_subgraph_before_run_raises(self): + hybrid = HybridLouvain(G=self.G, B=self.B, Y=self.Y) + with self.assertRaises(ValueError): + _ = hybrid.best_subgraph if __name__ == "__main__": unittest.main() diff --git a/tests/test_utils_graph.py b/tests/test_network_generate.py similarity index 64% rename from tests/test_utils_graph.py rename to tests/test_network_generate.py index 27c8d22..d7d5cf6 100644 --- a/tests/test_utils_graph.py +++ b/tests/test_network_generate.py @@ -2,16 +2,12 @@ import pandas as pd import numpy as np -from bioneuralnet.network.generate import gen_similarity_graph -from bioneuralnet.network.generate import gen_correlation_graph -from bioneuralnet.network.generate import gen_threshold_graph -from bioneuralnet.network.generate import gen_gaussian_knn_graph -from bioneuralnet.network.generate import gen_lasso_graph -from bioneuralnet.network.generate import gen_mst_graph -from bioneuralnet.network.generate import gen_snn_graph +from bioneuralnet.network.generate import similarity_network +from bioneuralnet.network.generate import correlation_network +from bioneuralnet.network.generate import threshold_network +from bioneuralnet.network.generate import gaussian_knn_network from bioneuralnet.datasets import DatasetLoader - class TestUtilsGraph(unittest.TestCase): def setUp(self): example = DatasetLoader("example") @@ -49,15 +45,15 @@ def _basic_checks(self, G: pd.DataFrame): self.assertTrue(is_normalized.all()) def test_gen_similarity_graph_default(self): - G = gen_similarity_graph(self.X, k=3) + G = similarity_network(self.X, k=3) self._basic_checks(G) def test_gen_similarity_graph_type_error(self): with self.assertRaises(TypeError): - gen_similarity_graph(np.array([[1, 2], [3, 4]])) + similarity_network(np.array([[1, 2], [3, 4]])) def test_gen_correlation_graph_default(self): - G = gen_correlation_graph( + G = correlation_network( self.X, k=3, method="pearson", @@ -72,10 +68,10 @@ def test_gen_correlation_graph_default(self): def test_gen_correlation_graph_type_error(self): with self.assertRaises(TypeError): - gen_correlation_graph("not a df") + correlation_network("not a df") def test_gen_threshold_graph_default(self): - G = gen_threshold_graph( + G = threshold_network( self.X, b=4.0, k=3, @@ -87,10 +83,10 @@ def test_gen_threshold_graph_default(self): def test_gen_threshold_graph_type_error(self): with self.assertRaises(TypeError): - gen_threshold_graph(None) + threshold_network(None) def test_gen_gaussian_knn_graph_default(self): - G = gen_gaussian_knn_graph( + G = gaussian_knn_network( self.X, k=3, mutual=False, @@ -101,31 +97,7 @@ def test_gen_gaussian_knn_graph_default(self): def test_gen_gaussian_knn_graph_type_error(self): with self.assertRaises(TypeError): - gen_gaussian_knn_graph(123) - - def test_gen_lasso_graph_default(self): - G = gen_lasso_graph(self.X, alpha=0.1) - self._basic_checks(G) - - def test_gen_lasso_graph_type_error(self): - with self.assertRaises(TypeError): - gen_lasso_graph([1, 2, 3]) - - def test_gen_mst_graph_default(self): - G = gen_mst_graph(self.X) - self._basic_checks(G) - - def test_gen_mst_graph_type_error(self): - with self.assertRaises(TypeError): - gen_mst_graph(5.0) - - def test_gen_snn_graph_default(self): - G = gen_snn_graph(self.X, k=3) - self._basic_checks(G) - - def test_gen_snn_graph_type_error(self): - with self.assertRaises(TypeError): - gen_snn_graph("oops, not a df") + gaussian_knn_network(123) if __name__ == "__main__": diff --git a/tests/test_network_tools.py b/tests/test_network_tools.py new file mode 100644 index 0000000..5b214d1 --- /dev/null +++ b/tests/test_network_tools.py @@ -0,0 +1,127 @@ +import unittest +import pandas as pd +import numpy as np + +from bioneuralnet.network.tools import NetworkAnalyzer +from bioneuralnet.network.tools import network_search +from bioneuralnet.datasets import DatasetLoader + +class TestNetworkAnalyzer(unittest.TestCase): + def setUp(self): + self.adj = pd.DataFrame( + [ + [0.0, 0.8, 0.2, 0.0], + [0.8, 0.0, 0.5, 0.1], + [0.2, 0.5, 0.0, 0.9], + [0.0, 0.1, 0.9, 0.0], + ], + index=["n1", "n2", "n3", "n4"], + columns=["n1", "n2", "n3", "n4"], + ) + self.analyzer = NetworkAnalyzer(self.adj) + + def test_basic_statistics_returns_dict(self): + stats = self.analyzer.basic_statistics(threshold=0.5) + self.assertIsInstance(stats, dict) + for key in ("nodes", "edges", "density", "avg_degree", "isolated"): + self.assertIn(key, stats) + self.assertEqual(stats["nodes"], 4) + + def test_degree_distribution_returns_dataframe(self): + df = self.analyzer.degree_distribution(threshold=0.5) + self.assertIsInstance(df, pd.DataFrame) + for col in ("degree", "count", "percentage"): + self.assertIn(col, df.columns) + self.assertAlmostEqual(df["percentage"].sum(), 100.0, places=5) + + def test_hub_analysis_returns_top_n(self): + top_n = 3 + df = self.analyzer.hub_analysis(threshold=0.1, top_n=top_n) + self.assertIsInstance(df, pd.DataFrame) + self.assertLessEqual(len(df), top_n) + self.assertIn("degree", df.columns) + self.assertIn("feature", df.columns) + + def test_edge_weight_analysis_returns_array(self): + weights = self.analyzer.edge_weight_analysis() + self.assertIsInstance(weights, np.ndarray) + self.assertGreater(len(weights), 0) + self.assertTrue((weights > 0).all()) + + def test_find_strongest_edges_returns_dataframe(self): + top_n = 3 + df = self.analyzer.find_strongest_edges(top_n=top_n) + self.assertIsInstance(df, pd.DataFrame) + self.assertLessEqual(len(df), top_n) + for col in ("feature1", "feature2", "weight"): + self.assertIn(col, df.columns) + # weights should be sorted descending + self.assertTrue((df["weight"].diff().dropna() <= 0).all()) + + def test_connected_components_returns_dict(self): + result = self.analyzer.connected_components(threshold=0.1) + self.assertIsInstance(result, dict) + self.assertIn("n_components", result) + self.assertIn("labels", result) + self.assertIn("sizes", result) + self.assertEqual(len(result["labels"]), 4) + + def test_source_omics_assignment(self): + omics1 = pd.DataFrame(np.random.rand(5, 2), columns=["n1", "n2"]) + omics2 = pd.DataFrame(np.random.rand(5, 2), columns=["n3", "n4"]) + analyzer = NetworkAnalyzer(self.adj, source_omics=[omics1, omics2]) + self.assertIn("omic_1", analyzer.omics_types) + self.assertIn("omic_2", analyzer.omics_types) + +class TestNetworkSearch(unittest.TestCase): + def setUp(self): + example = DatasetLoader("example") + X1 = example["X1"] + X2 = example["X2"] + y_ex = example["Y"] + + if isinstance(y_ex, pd.DataFrame): + y_ex = y_ex.iloc[:, 0] + + y_binary = (y_ex > float(np.median(y_ex.values))).astype(int) + + self.omics_data = pd.concat([X1, X2], axis=1) + self.y = y_binary + + def test_network_search_returns_tuple(self): + best_G, best_params, results = network_search( + self.omics_data, + self.y, + methods=["correlation", "threshold"], + trials=10, + verbose=False, + ) + self.assertIsInstance(results, pd.DataFrame) + self.assertIsInstance(best_G, pd.DataFrame) + self.assertIsInstance(best_params, dict) + + def test_network_search_results_columns(self): + _, _, results = network_search( + self.omics_data, + self.y, + methods=["correlation"], + trials=10, + verbose=False, + ) + for col in ("method", "score", "f1", "topology"): + self.assertIn(col, results.columns) + + def test_network_search_graph_shape(self): + best_G, _, _ = network_search( + self.omics_data, + self.y, + methods=["threshold"], + trials=10, + verbose=False, + ) + n_features = self.omics_data.shape[1] + self.assertEqual(best_G.shape, (n_features, n_features)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_utils_data.py b/tests/test_utils_data.py index 4759909..54a2d95 100644 --- a/tests/test_utils_data.py +++ b/tests/test_utils_data.py @@ -1,15 +1,18 @@ import unittest -from unittest.mock import patch, call +from unittest.mock import patch import pandas as pd import numpy as np -import io -import logging import torch -from bioneuralnet.utils.data import variance_summary -from bioneuralnet.utils.data import zero_fraction_summary -from bioneuralnet.utils.data import expression_summary -from bioneuralnet.utils.data import correlation_summary -from bioneuralnet.utils.data import explore_data_stats + +from bioneuralnet.utils.data import ( + variance_summary, + zero_summary, + expression_summary, + correlation_summary, + nan_summary, + sparse_filter, + data_stats, +) from bioneuralnet.utils.reproducibility import set_seed class TestDataUtils(unittest.TestCase): @@ -25,8 +28,8 @@ def setUp(self): }) self.df_expr = pd.DataFrame({ "P": [10.0, 20.0, 30.0], - "Q": [1.0, 1.0, 1.0], - "R": [0.0, 5.0, 0.0], + "Q": [1.0, 1.0, 1.0], + "R": [0.0, 5.0, 0.0], }) self.df_corr = pd.DataFrame({ "U": [1.0, 2.0, 3.0], @@ -34,107 +37,110 @@ def setUp(self): "W": [1.0, 0.0, 1.0], }) - self.mock_logger = logging.getLogger('test_logger') - self.mock_logger.setLevel(logging.INFO) - self.mock_stream = io.StringIO() - self.mock_handler = logging.StreamHandler(self.mock_stream) - self.mock_logger.addHandler(self.mock_handler) - def test_variance_summary_no_threshold(self): - stats = variance_summary(self.df_var, low_var_threshold=None) - - self.assertAlmostEqual(stats["variance_mean"], 4.0) - self.assertAlmostEqual(stats["variance_median"], 4.0) - self.assertAlmostEqual(stats["variance_min"], 4.0) - self.assertAlmostEqual(stats["variance_max"], 4.0) - self.assertAlmostEqual(stats["variance_std"], 0.0) - self.assertNotIn("num_low_variance_features", stats) + stats = variance_summary(self.df_var, var_threshold=None) + self.assertAlmostEqual(stats["Variance Mean"], 4.0) + self.assertAlmostEqual(stats["Variance Median"], 4.0) + self.assertAlmostEqual(stats["Variance Min"], 4.0) + self.assertAlmostEqual(stats["Variance Max"], 4.0) + self.assertAlmostEqual(stats["Variance Std"], 0.0) + self.assertNotIn("Number Of Low Variance Features", stats) def test_variance_summary_with_threshold(self): - stats = variance_summary(self.df_var, low_var_threshold=5.0) - - self.assertIn("num_low_variance_features", stats) - self.assertEqual(stats["num_low_variance_features"], 2) - - def test_zero_fraction_summary_no_threshold(self): - stats = zero_fraction_summary(self.df_zero, high_zero_threshold=None) - - self.assertAlmostEqual(stats["zero_fraction_mean"], 0.5) - self.assertAlmostEqual(stats["zero_fraction_median"], 0.5) - self.assertAlmostEqual(stats["zero_fraction_min"], 0.0) - self.assertAlmostEqual(stats["zero_fraction_max"], 1.0) - + stats = variance_summary(self.df_var, var_threshold=5.0) + self.assertIn("Number Of Low Variance Features", stats) + self.assertEqual(stats["Number Of Low Variance Features"], 2) + + def test_zero_summary_no_threshold(self): + stats = zero_summary(self.df_zero, zero_threshold=None) + self.assertAlmostEqual(stats["Zero Mean"], 0.5) + self.assertAlmostEqual(stats["Zero Median"], 0.5) + self.assertAlmostEqual(stats["Zero Min"], 0.0) + self.assertAlmostEqual(stats["Zero Max"], 1.0) expected_std = pd.Series([0.5, 0.0, 1.0]).std() - self.assertAlmostEqual(stats["zero_fraction_std"], expected_std) - self.assertNotIn("num_high_zero_features", stats) - - def test_zero_fraction_summary_with_threshold(self): - stats = zero_fraction_summary(self.df_zero, high_zero_threshold=0.5) + self.assertAlmostEqual(stats["Zero Std"], expected_std) + self.assertNotIn("Number Of High Zero Features", stats) - self.assertIn("num_high_zero_features", stats) - self.assertEqual(stats["num_high_zero_features"], 1) + def test_zero_summary_with_threshold(self): + stats = zero_summary(self.df_zero, zero_threshold=0.5) + self.assertIn("Number Of High Zero Features", stats) + self.assertEqual(stats["Number Of High Zero Features"], 1) def test_expression_summary(self): stats = expression_summary(self.df_expr) - mean_exprs = pd.Series({"P": 20.0, "Q": 1.0, "R": (0.0 + 5.0 + 0.0) / 3}) - - self.assertAlmostEqual(stats["expression_mean"], mean_exprs.mean()) - self.assertAlmostEqual(stats["expression_median"], mean_exprs.median()) - self.assertAlmostEqual(stats["expression_min"], mean_exprs.min()) - self.assertAlmostEqual(stats["expression_max"], mean_exprs.max()) - self.assertAlmostEqual(stats["expression_std"], mean_exprs.std()) + mean_expr = self.df_expr.mean() + self.assertAlmostEqual(stats["Expression Mean"], float(mean_expr.mean())) + self.assertAlmostEqual(stats["Expression Median"], float(mean_expr.median())) + self.assertAlmostEqual(stats["Expression Min"], float(mean_expr.min())) + self.assertAlmostEqual(stats["Expression Max"], float(mean_expr.max())) + self.assertAlmostEqual(stats["Expression Std"], float(mean_expr.std())) def test_correlation_summary(self): stats = correlation_summary(self.df_corr) corr_abs = self.df_corr.corr().abs() - - np.fill_diagonal(corr_abs.values, 0.0) + vals = corr_abs.to_numpy().copy() + np.fill_diagonal(vals, 0.0) + corr_abs = pd.DataFrame(vals, index=corr_abs.index, columns=corr_abs.columns) max_corr = corr_abs.max() - - self.assertAlmostEqual(max_corr["U"], 1.0) - self.assertAlmostEqual(max_corr["V"], 1.0) - - self.assertAlmostEqual(stats["max_corr_mean"], max_corr.mean()) - self.assertAlmostEqual(stats["max_corr_median"], max_corr.median()) - self.assertAlmostEqual(stats["max_corr_min"], max_corr.min()) - self.assertAlmostEqual(stats["max_corr_max"], max_corr.max()) - self.assertAlmostEqual(stats["max_corr_std"], max_corr.std()) - - def test_set_seed_reproducibility(self): - test_seed = 177 - - set_seed(test_seed) - result1 = np.random.rand(5) - - set_seed(test_seed) - result2 = np.random.rand(5) - - np.testing.assert_array_equal(result1, result2) - set_seed(test_seed + 1) - tensor1 = torch.rand(5) - - set_seed(test_seed + 1) - tensor2 = torch.rand(5) - - self.assertTrue(torch.equal(tensor1, tensor2)) - - @patch('bioneuralnet.utils.data.logger') - def test_explore_data_stats_logs_all_sections(self, mock_logger): - mock_logger.addHandler(self.mock_handler) - explore_data_stats(self.df_corr, name="TestDF") - - all_call_args = [call[0][0] for call in mock_logger.info.call_args_list] - output = "\n".join(all_call_args) - - self.assertIn("Statistics for TestDF:", output) - self.assertIn("Variance Summary:", output) - self.assertIn("Zero Fraction Summary:", output) - self.assertIn("Expression Summary:", output) - self.assertIn("Correlation Summary:", output) - self.assertIn("variance_mean", output) - self.assertIn("zero_fraction_mean", output) - self.assertIn("expression_mean", output) - self.assertIn("max_corr_mean", output) + self.assertAlmostEqual(stats["Max Corr Mean"], float(max_corr.mean())) + self.assertAlmostEqual(stats["Max Corr Median"], float(max_corr.median())) + self.assertAlmostEqual(stats["Max Corr Min"], float(max_corr.min())) + self.assertAlmostEqual(stats["Max Corr Max"], float(max_corr.max())) + self.assertAlmostEqual(stats["Max Corr Std"], float(max_corr.std())) + + def test_nan_summary_no_nans(self): + pct = nan_summary(self.df_var, name="clean") + self.assertAlmostEqual(pct, 0.0) + + def test_nan_summary_with_nans(self): + df = pd.DataFrame({"x": [1.0, np.nan], "y": [np.nan, np.nan]}) + pct = nan_summary(df, name="sparse") + self.assertAlmostEqual(pct, 75.0) + + def test_sparse_filter_drops_sparse_columns_and_rows(self): + df = pd.DataFrame({ + "dense": [1.0, 2.0, 3.0, 4.0], + "sparse": [np.nan, np.nan, np.nan, 1.0], + }) + result = sparse_filter(df, missing_fraction=0.20) + self.assertNotIn("sparse", result.columns) + self.assertIn("dense", result.columns) + + def test_sparse_filter_clean_data_unchanged(self): + result = sparse_filter(self.df_var, missing_fraction=0.20) + self.assertEqual(result.shape, self.df_var.shape) + + @patch("bioneuralnet.utils.data.logger") + def test_data_stats_logs_sections(self, mock_logger): + data_stats(self.df_corr, name="TestDF", compute_correlation=True) + all_args = [c[0][0] for c in mock_logger.info.call_args_list] + output = "\n".join(all_args) + self.assertIn("TestDF", output) + self.assertIn("Variance", output) + self.assertIn("Zero", output) + self.assertIn("Expression", output) + self.assertIn("Correlation", output) + + @patch("bioneuralnet.utils.data.logger") + def test_data_stats_skips_correlation_when_false(self, mock_logger): + data_stats(self.df_corr, name="TestDF", compute_correlation=False) + all_args = [c[0][0] for c in mock_logger.info.call_args_list] + output = "\n".join(all_args) + self.assertIn("Skipped", output) + + def test_set_seed_reproducibility_numpy(self): + set_seed(42) + r1 = np.random.rand(5) + set_seed(42) + r2 = np.random.rand(5) + np.testing.assert_array_equal(r1, r2) + + def test_set_seed_reproducibility_torch(self): + set_seed(42) + t1 = torch.rand(5) + set_seed(42) + t2 = torch.rand(5) + self.assertTrue(torch.equal(t1, t2)) if __name__ == "__main__": unittest.main() diff --git a/tests/test_utils_feature_selection.py b/tests/test_utils_feature_selection.py new file mode 100644 index 0000000..6d98346 --- /dev/null +++ b/tests/test_utils_feature_selection.py @@ -0,0 +1,172 @@ +import unittest +import warnings +import numpy as np +import pandas as pd + +from bioneuralnet.utils.feature_selection import ( + variance_threshold, + mad_filter, + pca_loadings, + laplacian_score, + correlation_filter, + importance_rf, + top_anova_f_features, +) +from bioneuralnet.datasets import DatasetLoader + +class TestFeatureSelection(unittest.TestCase): + def setUp(self): + example = DatasetLoader("example") + X1 = example["X1"] + X2 = example["X2"] + Y = example["Y"] + + self.X = pd.concat([X1, X2], axis=1) + + if isinstance(Y, pd.DataFrame): + Y = Y.iloc[:, 0] + self.y_cont = Y + self.y_bin = (Y > float(np.median(Y.values))).astype(int) + + # small fixture for precise tests + self.X_small = pd.DataFrame( + { + "high_var": [1.0, 10.0, 100.0, 1000.0], + "low_var": [1.0, 1.1, 1.0, 1.1], + "zero_var": [5.0, 5.0, 5.0, 5.0], + } + ) + + self.df_nan = pd.DataFrame( + { + "C1": [1.0, 2.0, np.nan, 4.0], + "C2": [10.0, np.nan, 30.0, 40.0], + "C3": [5.0, 5.0, 5.0, 5.0], + } + ) + + def test_variance_threshold_selects_top_k(self): + result = variance_threshold(self.X_small, k=2) + self.assertEqual(result.shape[1], 2) + # zero-variance column should be dropped first + self.assertNotIn("zero_var", result.columns) + + def test_variance_threshold_k_larger_than_cols(self): + # k > number of columns should return all columns (minus zero-var after clean) + result = variance_threshold(self.X_small, k=100) + self.assertLessEqual(result.shape[1], self.X_small.shape[1]) + + def test_variance_threshold_on_example_data(self): + X_sub = self.X.iloc[:, :50] + result = variance_threshold(X_sub, k=10) + self.assertIsInstance(result, pd.DataFrame) + self.assertLessEqual(result.shape[1], 10) + self.assertEqual(result.shape[0], X_sub.shape[0]) + + def test_mad_filter_reduces_columns(self): + X_sub = self.X.iloc[:, :20] + result = mad_filter(X_sub, n_keep=5) + self.assertIsInstance(result, pd.DataFrame) + self.assertLessEqual(result.shape[1], 5) + self.assertEqual(result.shape[0], X_sub.shape[0]) + + def test_mad_filter_passthrough_when_n_keep_ge_cols(self): + X_sub = self.X.iloc[:, :5] + result = mad_filter(X_sub, n_keep=100) + self.assertEqual(result.shape, X_sub.shape) + + def test_pca_loadings_reduces_columns(self): + X_sub = self.X.iloc[:, :30] + result = pca_loadings(X_sub, n_keep=5) + self.assertIsInstance(result, pd.DataFrame) + self.assertLessEqual(result.shape[1], 5) + self.assertEqual(result.shape[0], X_sub.shape[0]) + + def test_pca_loadings_passthrough_when_n_keep_ge_cols(self): + X_sub = self.X.iloc[:, :5] + result = pca_loadings(X_sub, n_keep=100) + self.assertEqual(result.shape, X_sub.shape) + + def test_laplacian_score_reduces_columns(self): + X_sub = self.X.iloc[:, :20] + result = laplacian_score(X_sub, n_keep=5) + self.assertIsInstance(result, pd.DataFrame) + self.assertLessEqual(result.shape[1], 5) + self.assertEqual(result.shape[0], X_sub.shape[0]) + + def test_laplacian_score_passthrough_when_n_keep_ge_cols(self): + X_sub = self.X.iloc[:, :5] + result = laplacian_score(X_sub, n_keep=100) + self.assertEqual(result.shape, X_sub.shape) + + def test_correlation_filter_unsupervised(self): + X_sub = self.X.iloc[:40, :20] + result = correlation_filter(X_sub, y=None, top_k=5) + self.assertIsInstance(result, pd.DataFrame) + self.assertEqual(result.shape[0], 40) + self.assertLessEqual(result.shape[1], 5) + + def test_correlation_filter_supervised(self): + X_sub = self.X.iloc[:40, :20] + y_sub = self.y_cont.iloc[:40] + result = correlation_filter(X_sub, y=y_sub, top_k=5) + self.assertIsInstance(result, pd.DataFrame) + self.assertEqual(result.shape[0], 40) + self.assertLessEqual(result.shape[1], 5) + + def test_correlation_filter_bad_y_raises(self): + X_sub = self.X.iloc[:5, :10] + bad_y = pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 3, 4, 5]}) + with self.assertRaises(ValueError): + correlation_filter(X_sub, y=bad_y, top_k=3) + + def test_importance_rf_classification(self): + X_sub = self.X.iloc[:40, :20] + y_sub = self.y_bin.iloc[:40] + result = importance_rf(X_sub, y_sub, top_k=5, seed=0) + self.assertIsInstance(result, pd.DataFrame) + self.assertEqual(result.shape[0], 40) + self.assertLessEqual(result.shape[1], 5) + + def test_importance_rf_non_numeric_raises(self): + X_sub = self.X.iloc[:10, :5].copy() + X_sub["text_col"] = ["a"] * 10 + y_sub = self.y_bin.iloc[:10] + with self.assertRaises(ValueError): + importance_rf(X_sub, y_sub, top_k=3) + + def test_importance_rf_bad_y_raises(self): + X_sub = self.X.iloc[:10, :5] + bad_y = pd.DataFrame({"a": range(10), "b": range(10)}) + with self.assertRaises(ValueError): + importance_rf(X_sub, bad_y, top_k=3) + + def test_top_anova_classification(self): + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + X_sub = self.X.iloc[:40, :20] + y_sub = self.y_bin.iloc[:40] + result = top_anova_f_features(X_sub, y_sub, max_features=5, task="classification") + self.assertIsInstance(result, pd.DataFrame) + self.assertEqual(result.shape[0], 40) + self.assertLessEqual(result.shape[1], 5) + + def test_top_anova_regression(self): + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + X_sub = self.X.iloc[:40, :20] + y_sub = self.y_cont.iloc[:40] + result = top_anova_f_features(X_sub, y_sub, max_features=5, task="regression") + self.assertIsInstance(result, pd.DataFrame) + self.assertEqual(result.shape[0], 40) + self.assertLessEqual(result.shape[1], 5) + + def test_top_anova_invalid_task_raises(self): + X_sub = self.X.iloc[:40, :20] + y_sub = self.y_cont.iloc[:40] + with self.assertRaises(ValueError): + top_anova_f_features(X_sub, y_sub, max_features=5, task="invalid") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_utils_graph_tools.py b/tests/test_utils_graph_tools.py deleted file mode 100644 index 021a453..0000000 --- a/tests/test_utils_graph_tools.py +++ /dev/null @@ -1,141 +0,0 @@ -import unittest -from unittest.mock import patch -import pandas as pd -import numpy as np -import networkx as nx -import warnings - -from bioneuralnet.network.tools import analyze_network -from bioneuralnet.network.tools import repair_network -from bioneuralnet.network.tools import optimize_network -from bioneuralnet.datasets import DatasetLoader - - -class TestGraphTools(unittest.TestCase): - def setUp(self): - self.adj_disconnected = pd.DataFrame( - [ - [0.0, 1.0, 0.0, 0.0], - [1.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 1.0], - [0.0, 0.0, 1.0, 0.0], - ], - index=["n1", "n2", "n3", "n4"], - columns=["n1", "n2", "n3", "n4"], - ) - - index_list = [] - i = 0 - while i < 10: - index_list.append("s" + str(i)) - i += 1 - - self.omics_data = pd.DataFrame( - np.random.rand(10, 4), - index=index_list, - columns=["n1", "n2", "n3", "n4"], - ) - - y_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] - self.y = pd.Series(y_values, index=self.omics_data.index) - - self.omics_list = [] - self.omics_list.append(self.omics_data[["n1", "n2"]]) - self.omics_list.append(self.omics_data[["n3", "n4"]]) - - example = DatasetLoader("example") - X1 = example["X1"] - X2 = example["X2"] - y_ex = example["Y"] - - if isinstance(y_ex, pd.DataFrame): - y_ex = y_ex.iloc[:, 0] - - y_array = y_ex.values - median_val = float(np.median(y_array)) - y_binary = (y_ex > median_val).astype(int) - - self.omics_data_example = pd.concat([X1, X2], axis=1) - self.y_example = y_binary - - @patch("bioneuralnet.utils.graph_tools.logger") - def test_graph_analysis_logs(self, mock_logger): - graph_analysis(self.adj_disconnected, "TestGraph", omics_list=None) - self.assertTrue(mock_logger.info.called) - - calls = [] - for c in mock_logger.info.call_args_list: - calls.append(str(c)) - - found_nodes = False - found_components = False - for c in calls: - if "Nodes: 4" in c: - found_nodes = True - if "Connected components: 2" in c: - found_components = True - - self.assertTrue(found_nodes) - self.assertTrue(found_components) - - def test_repair_graph_connectivity_eigen(self): - with warnings.catch_warnings(): - warnings.simplefilter("ignore", RuntimeWarning) - - eps = 0.01 - repaired = repair_graph_connectivity( - self.adj_disconnected, - epsilon=eps, - selection_mode="eigen", - ) - - self.assertIsInstance(repaired, pd.DataFrame) - self.assertEqual(repaired.shape, (4, 4)) - - G = nx.from_pandas_adjacency(repaired) - self.assertEqual(nx.number_connected_components(G), 1) - - original_sum = float(self.adj_disconnected.values.sum()) - repaired_sum = float(repaired.values.sum()) - self.assertGreater(repaired_sum, original_sum) - - def test_repair_graph_connectivity_modality_corr(self): - with warnings.catch_warnings(): - warnings.simplefilter("ignore", RuntimeWarning) - - eps = 0.01 - repaired = repair_graph_connectivity( - self.adj_disconnected, - epsilon=eps, - selection_mode="modality_corr", - omics_list=self.omics_list, - ) - - G = nx.from_pandas_adjacency(repaired) - self.assertEqual(nx.number_connected_components(G), 1) - - def test_find_optimal_graph_execution(self): - best_G, best_params, results = find_optimal_graph( - self.omics_data_example, - self.y_example, - methods=["correlation", "threshold"], - trials=2, - verbose=False, - ) - - self.assertIsInstance(results, pd.DataFrame) - - if not results.empty: - has_method = "method" in results.columns - has_score = "score" in results.columns - self.assertTrue(has_method) - self.assertTrue(has_score) - - if best_G is not None: - self.assertIsInstance(best_G, pd.DataFrame) - self.assertIsInstance(best_params, dict) - self.assertEqual(best_G.shape[0], self.omics_data_example.shape[1]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_utils_preprocess.py b/tests/test_utils_preprocess.py index a0e289b..5af1aac 100644 --- a/tests/test_utils_preprocess.py +++ b/tests/test_utils_preprocess.py @@ -1,64 +1,35 @@ import unittest -import pandas as pd import numpy as np -import warnings - -from bioneuralnet.utils.preprocess import preprocess_clinical -from bioneuralnet.utils.preprocess import clean_inf_nan -from bioneuralnet.utils.preprocess import select_top_k_variance -from bioneuralnet.utils.preprocess import select_top_k_correlation -from bioneuralnet.utils.preprocess import select_top_randomforest -from bioneuralnet.utils.preprocess import top_anova_f_features -from bioneuralnet.utils.preprocess import prune_network -from bioneuralnet.utils.preprocess import prune_network_by_quantile -from bioneuralnet.utils.preprocess import network_remove_low_variance -from bioneuralnet.utils.preprocess import network_remove_high_zero_fraction -from bioneuralnet.utils.preprocess import impute_omics -from bioneuralnet.utils.preprocess import impute_omics_knn -from bioneuralnet.utils.preprocess import normalize_omics -from bioneuralnet.utils.preprocess import beta_to_m -from bioneuralnet.datasets import DatasetLoader +import pandas as pd +from bioneuralnet.utils.preprocess import ( + m_transform, + impute_simple, + impute_knn, + normalize, + clean_inf_nan, + clean_internal, + preprocess_clinical, + prune_network, + prune_network_by_quantile, + network_remove_low_variance, + network_remove_high_zero_fraction, +) +from bioneuralnet.datasets import DatasetLoader class TestPreprocessFunctions(unittest.TestCase): def setUp(self): example = DatasetLoader("example") - X1 = example["X1"] - X2 = example["X2"] - Y = example["Y"] - clinical = example["clinical"] - - self.omics_example = pd.concat([X1, X2], axis=1) - - if isinstance(Y, pd.DataFrame): - self.y_example = Y.iloc[:, 0] - else: - self.y_example = Y - - y_array = self.y_example.values - median_val = float(np.median(y_array)) - self.y_binary = (self.y_example > median_val).astype(int) - - self.clinical = clinical.copy() - self.clinical["ignore"] = 1 + self.clinical = example["clinical"].copy() - self.X_small = pd.DataFrame( - { - "f1": [1.0, 2.0, 3.0, 4.0], - "f2": [1.0, 1.0, 1.0, 1.0], - "f3": [4.0, 3.0, 2.0, 1.0], - "f4": [10.0, 20.0, 30.0, 40.0], - }, - index=["i1", "i2", "i3", "i4"], - ) - - self.df_var = pd.DataFrame( + self.df_beta = pd.DataFrame( { - "A": [1.0, 3.0, 5.0], - "B": [2.0, 4.0, 6.0], + "B1": [0.1, 0.5, 0.9], + "B2": [0.0, 1.0, 0.5], } ) + # zero variance example self.df_nan = pd.DataFrame( { "C1": [1.0, 2.0, np.nan, 4.0], @@ -67,15 +38,13 @@ def setUp(self): } ) - self.df_beta = pd.DataFrame( + self.df_var = pd.DataFrame( { - "B1": [0.1, 0.5, 0.9], - "B2": [0.0, 1.0, 0.5], + "A": [1.0, 3.0, 5.0], + "B": [2.0, 4.0, 6.0], } ) - self.y_reg = pd.Series([0.1, 0.4, 0.5, 0.8], index=self.X_small.index) - adj = np.array( [ [1.0, 0.2, 0.0, 0.8], @@ -85,12 +54,75 @@ def setUp(self): ], dtype=float, ) - self.adj_df = pd.DataFrame( adj, index=["a", "b", "c", "d"], columns=["a", "b", "c", "d"] ) - def test_clean_inf_nan_replaces_and_drops(self): + def test_m_transform_shape_preserved(self): + result = m_transform(self.df_beta, eps=1e-6) + self.assertEqual(result.shape, self.df_beta.shape) + self.assertFalse(result.isnull().any().any()) + + def test_m_transform_values(self): + result = m_transform(self.df_beta, eps=1e-6) + # B1=0.1 -> log2(0.1/0.9) around -3.17 + self.assertAlmostEqual(result.loc[0, "B1"], -3.169925, places=4) + # B1=0.9 -> log2(0.9/0.1) around +3.17 + self.assertAlmostEqual(result.loc[2, "B1"], 3.169925, places=4) + + def test_impute_simple_mean(self): + result = impute_simple(self.df_nan[["C1", "C2"]], method="mean") + self.assertEqual(result.isna().sum().sum(), 0) + self.assertAlmostEqual(result.loc[2, "C1"], (1.0 + 2.0 + 4.0) / 3, places=5) + + def test_impute_simple_median(self): + result = impute_simple(self.df_nan[["C1", "C2"]], method="median") + self.assertEqual(result.isna().sum().sum(), 0) + self.assertAlmostEqual(result.loc[1, "C2"], 30.0, places=5) + + def test_impute_simple_zero(self): + result = impute_simple(self.df_nan[["C1", "C2"]], method="zero") + self.assertEqual(result.isna().sum().sum(), 0) + self.assertEqual(result.loc[2, "C1"], 0.0) + + def test_impute_simple_invalid_method_raises(self): + with self.assertRaises(ValueError): + impute_simple(self.df_nan, method="bad_method") + + def test_impute_knn_fills_nans(self): + result = impute_knn(self.df_nan[["C1", "C2"]], n_neighbors=2) + self.assertEqual(result.shape, self.df_nan[["C1", "C2"]].shape) + self.assertEqual(result.isna().sum().sum(), 0) + + def test_impute_knn_no_nans_passthrough(self): + df_clean = pd.DataFrame({"x": [1.0, 2.0], "y": [3.0, 4.0]}) + result = impute_knn(df_clean, n_neighbors=1) + pd.testing.assert_frame_equal(result, df_clean) + + def test_impute_knn_non_numeric_raises(self): + df_bad = pd.DataFrame({"x": [1.0, np.nan], "label": ["a", "b"]}) + with self.assertRaises(ValueError): + impute_knn(df_bad) + + def test_normalize_standard(self): + result = normalize(self.df_var, method="standard") + self.assertAlmostEqual(result.mean().sum(), 0.0, places=5) + + def test_normalize_minmax(self): + result = normalize(self.df_var, method="minmax") + self.assertAlmostEqual(result.min().min(), 0.0, places=5) + self.assertAlmostEqual(result.max().max(), 1.0, places=5) + + def test_normalize_log2(self): + result = normalize(self.df_var, method="log2") + self.assertAlmostEqual(result.loc[0, "A"], np.log2(2.0), places=5) + + def test_normalize_invalid_method_raises(self): + with self.assertRaises(ValueError): + normalize(self.df_var, method="bad_method") + + # zero variance -> should be dropped + def test_clean_inf_nan_removes_inf_and_nan(self): df = pd.DataFrame( { "x": [1.0, np.inf, 3.0, -np.inf], @@ -99,139 +131,82 @@ def test_clean_inf_nan_replaces_and_drops(self): } ) cleaned = clean_inf_nan(df) - - self.assertTrue(np.allclose(cleaned["x"].values, [1.0, 2.0, 3.0, 2.0])) - self.assertTrue(np.allclose(cleaned["y"].values, [3.0, 2.0, 3.0, 4.0])) - self.assertNotIn("z", cleaned.columns) self.assertFalse(cleaned.isin([np.inf, -np.inf]).any().any()) self.assertFalse(cleaned.isna().any().any()) + self.assertNotIn("z", cleaned.columns) - def test_preprocess_clinical_basic(self): - result = preprocess_clinical( - X=self.clinical, - top_k=3, - scale=False, - ignore_columns=["ignore"], + # 75 percent NaN and zero variance + def test_clean_internal_drops_sparse_columns(self): + df = pd.DataFrame( + { + "dense": [1.0, 2.0, 3.0, 4.0], + "sparse": [np.nan, np.nan, np.nan, 1.0], + "const": [1.0, 1.0, 1.0, 1.0], + } ) + result = clean_internal(df, nan_threshold=0.5) + self.assertNotIn("sparse", result.columns) + self.assertNotIn("const", result.columns) + self.assertEqual(result.isna().sum().sum(), 0) + + def test_clean_internal_no_nans_passthrough(self): + df = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [4.0, 5.0, 6.0]}) + result = clean_internal(df, nan_threshold=0.5) + self.assertEqual(result.shape, df.shape) + self.assertEqual(result.isna().sum().sum(), 0) + + def test_preprocess_clinical_basic(self): + result = preprocess_clinical(self.clinical) self.assertIsInstance(result, pd.DataFrame) self.assertEqual(result.shape[0], self.clinical.shape[0]) - self.assertNotIn("ignore", result.columns) self.assertGreater(result.shape[1], 0) - self.assertLessEqual(result.shape[1], 3) - - def test_preprocess_clinical_errors(self): - with self.assertRaises(TypeError): - preprocess_clinical( - self.clinical.iloc[:2, :], - top_k=1, - nan_threshold="bad", - ) - with self.assertRaises(KeyError): - preprocess_clinical(self.clinical, ignore_columns=["nonexistent"]) - - def test_select_top_k_variance(self): - X_sub = self.omics_example.iloc[:40, :20] - top5 = select_top_k_variance(X_sub, k=5) - self.assertEqual(top5.shape[0], 40) - self.assertLessEqual(top5.shape[1], 5) - self.assertTrue((top5.var(axis=0) > 0).all()) - - def test_select_top_k_correlation_unsupervised(self): - X_sub = self.omics_example.iloc[:40, :20] - unsup = select_top_k_correlation(X_sub, y=None, top_k=5) - self.assertEqual(unsup.shape[0], 40) - self.assertLessEqual(unsup.shape[1], 5) - self.assertTrue((unsup.var(axis=0) > 0).all()) - - def test_select_top_k_correlation_supervised(self): - X_sub = self.omics_example.iloc[:40, :20] - y_sub = self.y_example.iloc[:40] - - sup = select_top_k_correlation(X_sub, y=y_sub, top_k=5) - self.assertEqual(sup.shape[0], 40) - self.assertLessEqual(sup.shape[1], 5) - - bad_y = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) - with self.assertRaises(ValueError): - select_top_k_correlation(X_sub.iloc[:2, :], bad_y, top_k=1) - - def test_select_top_randomforest(self): - X_sub = self.omics_example.iloc[:40, :20] - y_sub = self.y_example.iloc[:40] - - top_rf = select_top_randomforest(X_sub, y_sub, top_k=5, seed=0) - self.assertEqual(top_rf.shape[0], 40) - self.assertLessEqual(top_rf.shape[1], 5) - - X_mixed = X_sub.copy() - str_list = [] - i = 0 - n_rows = X_sub.shape[0] - base_strings = ["x", "y", "z", "w"] - while len(str_list) < n_rows: - str_list.append(base_strings[i % len(base_strings)]) - i += 1 - X_mixed["non_numeric"] = str_list + self.assertEqual(result.isna().sum().sum(), 0) - with self.assertRaises(ValueError): - select_top_randomforest(X_mixed, y_sub, top_k=1) - - def test_top_anova_f_features_classification(self): - with warnings.catch_warnings(): - warnings.simplefilter("ignore", RuntimeWarning) - - X_clf = self.omics_example.iloc[:40, :20] - y_bin = self.y_binary.iloc[:40] - - top_feats = top_anova_f_features( - X_clf, - y_bin, - max_features=5, - alpha=0.05, - task="classification", - ) - self.assertEqual(top_feats.shape[0], 40) - self.assertLessEqual(top_feats.shape[1], 5) - - X_reg = self.omics_example.iloc[:40, :20] - y_cont = self.y_example.iloc[:40] - top_feats_reg = top_anova_f_features( - X_reg, - y_cont, - max_features=5, - alpha=0.05, - task="regression", - ) - self.assertEqual(top_feats_reg.shape[0], 40) - self.assertLessEqual(top_feats_reg.shape[1], 5) - - with self.assertRaises(ValueError): - top_anova_f_features( - X_reg, - y_cont, - max_features=1, - alpha=0.05, - task="invalid", - ) + def test_preprocess_clinical_drop_columns(self): + col_to_drop = self.clinical.columns[0] + result = preprocess_clinical(self.clinical, drop_columns=[col_to_drop]) + self.assertNotIn(col_to_drop, result.columns) + + def test_preprocess_clinical_scale(self): + result = preprocess_clinical(self.clinical, scale=True) + self.assertIsInstance(result, pd.DataFrame) + self.assertEqual(result.shape[0], self.clinical.shape[0]) + + def test_preprocess_clinical_ordinal_mapping(self): + df = pd.DataFrame( + { + "grade": ["low", "high", "low", "medium"], + "score": [1.0, 2.0, 3.0, 4.0], + } + ) + mapping = {"ordinal_mappings": {"grade": {"low": 0, "medium": 1, "high": 2}}} + result = preprocess_clinical(df, **mapping) + self.assertIsInstance(result, pd.DataFrame) + self.assertEqual(result.isna().sum().sum(), 0) def test_prune_network_threshold(self): pruned = prune_network(self.adj_df, weight_threshold=0.15) - self.assertEqual(set(pruned.index), set(self.adj_df.index)) - self.assertEqual(set(pruned.columns), set(self.adj_df.columns)) + self.assertIsInstance(pruned, pd.DataFrame) + # all remaining nodes should be in the original index + self.assertTrue(set(pruned.index).issubset(set(self.adj_df.index))) + + def test_prune_network_zero_threshold_keeps_all_connected(self): full = prune_network(self.adj_df, weight_threshold=0.0) self.assertEqual(set(full.index), set(self.adj_df.index)) def test_prune_network_by_quantile(self): - pruned_q = prune_network_by_quantile(self.adj_df, quantile=0.5) - self.assertEqual(set(pruned_q.index), set(self.adj_df.index)) - empty_adj = pd.DataFrame( - np.zeros((3, 3)), - index=["x", "y", "z"], - columns=["x", "y", "z"], + pruned = prune_network_by_quantile(self.adj_df, quantile=0.5) + self.assertIsInstance(pruned, pd.DataFrame) + self.assertTrue(set(pruned.index).issubset(set(self.adj_df.index))) + + def test_prune_network_by_quantile_empty_graph(self): + empty = pd.DataFrame( + np.zeros((3, 3)), index=["x", "y", "z"], columns=["x", "y", "z"] ) - pruned_empty = prune_network_by_quantile(empty_adj, quantile=0.5) - self.assertTrue(pruned_empty.equals(empty_adj)) + result = prune_network_by_quantile(empty, quantile=0.5) + self.assertIsInstance(result, pd.DataFrame) + # zero variance -> should be removed def test_network_remove_low_variance(self): net = pd.DataFrame( { @@ -244,8 +219,8 @@ def test_network_remove_low_variance(self): filtered = network_remove_low_variance(net, threshold=0.5) self.assertNotIn("n2", filtered.index) self.assertNotIn("n2", filtered.columns) - self.assertEqual(set(filtered.index), {"n3"}) + # 2/3 zeros (c2)-> should be removed at threshold=0.5 def test_network_remove_high_zero_fraction(self): net = pd.DataFrame( { @@ -258,42 +233,8 @@ def test_network_remove_high_zero_fraction(self): filtered = network_remove_high_zero_fraction(net, threshold=0.5) self.assertNotIn("c2", filtered.index) self.assertNotIn("c2", filtered.columns) - self.assertEqual(set(filtered.index), {"c1", "c3"}) - - def test_impute_omics_mean(self): - df_imputed = impute_omics(self.df_nan, method="mean") - self.assertAlmostEqual(df_imputed.loc[2, "C1"], 2.3333333333333335) - self.assertAlmostEqual(df_imputed.loc[1, "C2"], 26.666666666666668) - self.assertEqual(df_imputed.isna().sum().sum(), 0) - - def test_impute_omics_median(self): - df_imputed = impute_omics(self.df_nan, method="median") - self.assertAlmostEqual(df_imputed.loc[1, "C2"], 30.0) - self.assertEqual(df_imputed.isna().sum().sum(), 0) - - def test_impute_omics_knn(self): - df_imputed = impute_omics_knn(self.df_nan, n_neighbors=2) - self.assertEqual(df_imputed.shape, self.df_nan.shape) - self.assertEqual(df_imputed.isna().sum().sum(), 0) - self.assertNotAlmostEqual(df_imputed.loc[1, "C2"], 26.666666666666668) - - def test_normalize_omics_standard(self): - df_normalized = normalize_omics(self.df_nan.dropna(), method="standard") - self.assertAlmostEqual(df_normalized.mean().sum(), 0.0, places=5) - expected_std_sum = np.sqrt(2) * 2 - self.assertAlmostEqual(df_normalized.std().sum(), expected_std_sum, places=5) - - def test_normalize_omics_log2(self): - df_log = normalize_omics(self.df_var, method="log2") - self.assertAlmostEqual(df_log.loc[0, "A"], 1.0) - self.assertAlmostEqual(df_log.loc[0, "B"], 1.5849625) - - def test_beta_to_m_conversion(self): - df_m_values = beta_to_m(self.df_beta, eps=1e-6) - self.assertAlmostEqual(df_m_values.loc[0, "B1"], -3.169925, places=5) - self.assertAlmostEqual(df_m_values.loc[2, "B2"], 0.0, places=5) - self.assertAlmostEqual(df_m_values.loc[0, "B2"], -19.931567126628412) - + self.assertIn("c1", filtered.index) + self.assertIn("c3", filtered.index) if __name__ == "__main__": unittest.main()