OpenMS · hjn0415a · Jun 22, 2026 · Jun 26, 2026 · Jun 30, 2026 · coderabbitai
diff --git a/app.py b/app.py
@@ -23,12 +23,18 @@
             st.Page(Path("content", "results_rescoring.py"), title="Rescoring", icon="📈"),
             st.Page(Path("content", "results_filtered.py"), title="Filtered PSMs", icon="🎯"),
             st.Page(Path("content", "results_abundance.py"), title="Abundance", icon="📋"),
+            st.Page(Path("content", "results_library.py"), title="Spectral Library", icon="📚"),
+            st.Page(Path("content", "enrichment.py"), title="Pathway Analysis", icon="📉"),
+        ],
+        "Differential Protein Analysis": [
+            st.Page(Path("content", "filtering.py"), title="Filtering", icon="🧹"),
+            st.Page(Path("content", "imputation.py"), title="Imputation", icon="🩹"),
+            st.Page(Path("content", "normalization.py"), title="Normalization", icon="⚖️"),
+            st.Page(Path("content", "statistical.py"), title="Statistical", icon="🔢"),
             st.Page(Path("content", "results_volcano.py"), title="Volcano", icon="🌋"),
             st.Page(Path("content", "results_pca.py"), title="PCA", icon="📊"),
             st.Page(Path("content", "results_heatmap.py"), title="Heatmap", icon="🔥"),
-            st.Page(Path("content", "results_library.py"), title="Spectral Library", icon="📚"),
-            st.Page(Path("content", "results_proteomicslfq.py"), title="Proteomics LFQ", icon="🧪"),
-        ],
+        ]
     }
 
     pg = st.navigation(pages)

diff --git a/content/enrichment.py b/content/enrichment.py
@@ -0,0 +1,140 @@
+"""Pathway Analysis Page."""
+
+from pathlib import Path
+import pandas as pd
+import polars as pl
+import streamlit as st
+from src.common.common import page_setup
+from src.common.results_helpers import get_abundance_data
+# Import GO Enrichment modules from openms_insight engine
+from openms_insight.analysis.enrichment import calculate_go_enrichment
+
+# page_setup() is imported from src.common.common; its return value is kept in
+# `params` (not read again on this page).
+params = page_setup()
+st.title("GO Enrichment Analysis")
+
+st.markdown(
+    """
+Identify overrepresented biological themes (BP, CC, MF) within your differentially expressed protein features using MyGene.info and Fisher's Exact Test.
+"""
+)
+
+# Nothing below is rendered until a workspace exists in the session.
+if "workspace" not in st.session_state:
+    st.warning("Please initialize your workspace first.")
+    st.stop()
+
+# --- STEP 1: Upstream Statistics Checkpoint ---
+# The page consumes the table stored under "statistics_df" in session state.
+# A missing key and an explicit None are both treated as "not computed yet".
+# NOTE(review): presumably written by the Statistical page - confirm.
+if (
+    "statistics_df" in st.session_state
+    and st.session_state["statistics_df"] is not None
+):
+    final_statistics_report = st.session_state["statistics_df"]
+    st.info(
+        "🔄 **Upstream Pipeline Detected**: Using analyzed matrices from the **Statistical Inference** step."
+    )
+else:
+    st.warning(
+        "⚠️ **Missing Prerequisites**: Statistical inference data not detected. Please run hypothesis testing first."
+    )
+    # The link target must be a page registered with st.navigation; app.py
+    # registers content/statistical.py, not content/results_statistics.py.
+    st.page_link(
+        "content/statistical.py", label="Go to Statistical Inference", icon="🔬"
+    )
+    st.stop()
+
+# --- STEP 2: Preprocessing Mapping Key Configuration ---
+# Validate every column this page relies on in one pass, so the error names
+# all missing columns at once and no widget is rendered on a malformed table.
+# NOTE(review): "log2FC" is presumably what calculate_go_enrichment compares
+# against fc_cutoff - confirm against the engine.
+required_cols = {"ProteinName", "log2FC"}
+missing_cols = required_cols - set(final_statistics_report.columns)
+if missing_cols:
+    st.error(
+        f"❌ Structural Error: Missing required columns: {', '.join(sorted(missing_cols))}."
+    )
+    st.stop()
+
+# Identifier column handed to the enrichment engine.
+id_col = "ProteinName"
+
+# --- SECTION 1: Parameter Setup & Dynamic Cutoff Labels ---
+st.subheader("Configure Enrichment Thresholds")
+
+# Prefer the adjusted p-value column, fall back to the raw one, and stop
+# instead of silently selecting a column that does not exist.
+if "p-adj" in final_statistics_report.columns:
+    target_p_col = "p-adj"
+elif "p-value" in final_statistics_report.columns:
+    target_p_col = "p-value"
+else:
+    st.error("❌ Structural Error: Missing both 'p-adj' and 'p-value'.")
+    st.stop()
+
+# Widget label mirrors whichever significance column was selected above.
+p_label = (
+    "Adjusted P-value (p-adj) Cutoff"
+    if target_p_col == "p-adj"
+    else "Raw P-value (p-value) Cutoff"
+)
+
+# Two side-by-side inputs: significance cutoff on the left, effect-size
+# cutoff on the right. Widgets are created directly on the column objects.
+p_input_col, fc_input_col = st.columns(2)
+
+p_cutoff = p_input_col.number_input(
+    f"🔬 {p_label}",
+    min_value=0.0001,
+    max_value=1.0,
+    value=0.05,
+    step=0.01,
+    format="%.4f",
+    help="Proteins with significance metrics below this value are mapped to the foreground cohort.",
+)
+
+fc_cutoff = fc_input_col.number_input(
+    "📈 Absolute Difference Cutoff (|log2FC|)",
+    min_value=0.0,
+    max_value=10.0,
+    value=1.0,
+    step=0.1,
+    format="%.2f",
+    help="Proteins with absolute log2 fold change greater than or equal to this threshold will be selected.",
+)
+
+# --- SECTION 2: Execution and Interactive View Charts ---
+st.markdown("<br>", unsafe_allow_html=True)
+if st.button("🚀 Run GO Enrichment Analysis", type="primary", key="run_go_analysis"):
+
+    with st.spinner("Querying MyGene.info API & executing hyper-geometric calculation loops..."):
+        # The statistics table is converted with pl.from_pandas before the call.
+        # NOTE(review): assumes statistics_df is a pandas DataFrame - confirm.
+        stats_pl = pl.from_pandas(final_statistics_report)
+
+        # The engine returns a (status, output) pair; the shape of `output`
+        # depends on `status`, as handled branch by branch below.
+        status, output = calculate_go_enrichment(
+            final_report=stats_pl,
+            id_col=id_col,
+            target_p_col=target_p_col,
+            p_cutoff=p_cutoff,
+            fc_cutoff=fc_cutoff,
+        )
+
+    # Route response structures based on analysis output status code
+    if status == "empty_data":
+        st.error("❌ No valid statistical rows found containing standard columns to run GO alignment.")
+
+    elif status == "insufficient_proteins":
+        # In this branch `output` is interpolated as the significant-protein count.
+        st.warning(
+            f"⚠️ Not enough significant proteins found to construct target datasets. "
+            f"(Criteria: {target_p_col} < {p_cutoff:.4f}, |log2FC| ≥ {fc_cutoff:.2f})."
+        )
+        st.info(f"💡 Found significant proteins count: **{output}**. Try relaxing your p-value or log2FC filters.")
+
+    elif status == "success":
+        st.success("⭕ GO Enrichment Analysis completed successfully!")
+
+        # Display operational matrix scale
+        st.markdown(
+            f"📊 **Analysis Profile Scope**: Mapped **{output['fg_count']}** significant foreground profiles out of **{output['bg_count']}** reference background items."
+        )
+
+        # One tab per ontology; tab order must match the category order below.
+        go_tabs = st.tabs([
+            "🧬 Biological Process (BP)",
+            "🔬 Cellular Component (CC)",
+            "🧪 Molecular Function (MF)"
+        ])
+        categories_data = output["categories"]
+
+        for go_tab, go_type in zip(go_tabs, ["BP", "CC", "MF"]):
+            with go_tab:
+                fig = categories_data[go_type]["fig"]
+                df_go = categories_data[go_type]["df"]
+
+                if fig is not None and df_go is not None:
+                    # Render plotly bar figures generated straight from backend engine
+                    st.plotly_chart(fig, use_container_width=True)
+
+                    st.subheader(f"📊 {go_type} Results Dataframe")
+                    st.dataframe(df_go, use_container_width=True)
+                else:
+                    st.info(f"No statistically overrepresented terms identified for Category: **{go_type}**")
+
+    else:
+        # Any status not handled above would otherwise leave the page blank
+        # after the spinner; surface it instead of failing silently.
+        st.error(f"❌ GO enrichment returned an unexpected status: '{status}'.")
diff --git a/content/filtering.py b/content/filtering.py
@@ -0,0 +1,163 @@
+"""Filtering Page."""
+
+from pathlib import Path
+import pandas as pd
+import polars as pl
+import streamlit as st
+from src.common.common import page_setup
+from src.common.results_helpers import get_abundance_data
+
+# Import filtering functions from openms_insight package
+from openms_insight.analysis.filter import (
+    filter_low_abundance,
+    filter_low_repeatability,
+    filter_low_variance,
+)
+
+# page_setup() is imported from src.common.common; its return value is kept in
+# `params` (not read again on this page).
+params = page_setup()
+st.title("Data Filtering")
+
+st.markdown(
+    """
+Filter out low-quality proteins from your dataset based on abundance, repeatability, or variance thresholds.
+"""
+)
+
+# Guard: the abundance lookup below needs the session's workspace.
+if "workspace" not in st.session_state:
+    st.warning("Please initialize your workspace first.")
+    st.stop()
+
+# get_abundance_data yields None when there is nothing to show; otherwise it
+# is unpacked into the abundance table and a sample -> group lookup.
+abundance_result = get_abundance_data(st.session_state["workspace"])
+if abundance_result is None:
+    st.info(
+        "Abundance data not available. Please run the workflow and configure sample groups first."
+    )
+    st.page_link("content/results_abundance.py", label="Go to Abundance", icon="📋")
+    st.stop()
+
+pivot_df, group_map = abundance_result
+
+# 1. Every column that is not an identifier column is treated as a sample.
+non_sample_cols = ("ProteinName", "PeptideSequence")
+sample_cols = [col for col in pivot_df.columns if col not in non_sample_cols]
+
+# --- SECTION 1: Original Data View ---
+st.subheader("Original Abundance Table")
+st.markdown(
+    f"Currently displaying **{pivot_df.shape[0]}** proteins and **{len(sample_cols)}** samples before filtering."
+)
+st.dataframe(pivot_df, use_container_width=True)
+
+st.markdown("---")
+
+# --- SECTION 2: Filter Configuration ---
+st.subheader("Configure Filter Engine")
+
+# One metadata row per sample column, pairing it with its group from
+# group_map; this frame is passed to the openms_insight filter functions.
+metadata_pl = pl.DataFrame(
+    [{"sample_id": sample, "group": group_map[sample]} for sample in sample_cols],
+    schema={"sample_id": pl.String, "group": pl.String},
+)
+
+# User selection for filtering strategy
+filter_method = st.selectbox(
+    "Select Filtering Method",
+    options=["Low Abundance", "Low Repeatability", "Low Variance"],
+    index=0,
+    help="Choose the statistical criteria to prune unreliable protein entries.",
+)
+
+# Exactly one slider is rendered per run; whichever branch runs binds
+# `threshold`, which the execution section reads.
+if filter_method == "Low Abundance":
+    st.markdown(
+        "**Low Abundance Filter**: Keeps rows where at least one group's median is above the selected percentile threshold."
+    )
+    threshold = st.slider(
+        "Threshold Percentile (%)",
+        min_value=0.0,
+        max_value=100.0,
+        value=10.0,
+        step=5.0,
+    )
+
+elif filter_method == "Low Repeatability":
+    st.markdown(
+        "**Low Repeatability Filter**: Keeps rows where at least one group has a missing value ratio within the allowed maximum."
+    )
+    # The slider works in percent (0-100) and is divided by 100 when the
+    # filter is applied, so the label and help text state the unit explicitly.
+    threshold = st.slider(
+        "Max Missing Ratio (%)",
+        min_value=0.0,
+        max_value=100.0,
+        value=50.0,
+        step=5.0,
+        help="Allowed percentage of missing values (zero or null) per group.",
+    )
+
+elif filter_method == "Low Variance":
+    st.markdown(
+        "**Low Variance Filter**: Keeps rows where at least one group's variance is above the selected percentile threshold."
+    )
+    threshold = st.slider(
+        "Threshold Percentile (%)",
+        min_value=0.0,
+        max_value=100.0,
+        value=10.0,
+        step=5.0,
+    )
+
+# --- SECTION 3: Filter Execution and Collected Results View ---
+if st.button("Apply Filter", type="primary"):
+    # Convert the original Pandas DataFrame into a Polars LazyFrame graph
+    quant_lazy = pl.from_pandas(pivot_df).lazy()
+
+    # Route execution to the chosen openms_insight engine function
+    if filter_method == "Low Abundance":
+        filtered_lazy = filter_low_abundance(
+            quantification_data=quant_lazy,
+            metadata=metadata_pl,
+            group_column="group",
+            threshold_percentile=threshold,
+        )
+    elif filter_method == "Low Repeatability":
+        # Convert percent slider input to ratio expected by the function (e.g., 50.0% -> 0.5)
+        filtered_lazy = filter_low_repeatability(
+            quantification_data=quant_lazy,
+            metadata=metadata_pl,
+            group_column="group",
+            max_missing_ratio=threshold / 100.0,
+        )
+    elif filter_method == "Low Variance":
+        filtered_lazy = filter_low_variance(
+            quantification_data=quant_lazy,
+            metadata=metadata_pl,
+            group_column="group",
+            threshold_percentile=threshold,
+        )
+
+    # Collect the lazy graph exactly once and convert back to Pandas for display.
+    filtered_df = filtered_lazy.collect().to_pandas()
+
+    # Results previously derived from an older filter are dropped before the
+    # new table is stored, so later pages cannot pick up stale data.
+    for key in ("imputed_df", "normalized_df", "statistics_df"):
+        st.session_state.pop(key, None)
+    st.session_state["filtered_df"] = filtered_df
+
+    # Layout response metrics and the filtered matrix
+    st.success(f"Successfully applied **{filter_method}** filter!")
+
+    # Display dataset scale compression stats
+    col1, col2, col3 = st.columns(3)
+    col1.metric("Original Proteins", pivot_df.shape[0])
+    col2.metric("Filtered Proteins", filtered_df.shape[0])
+    col3.metric(
+        "Removed Proteins", pivot_df.shape[0] - filtered_df.shape[0], delta=None
+    )
+
+    st.subheader("Filtered Abundance Table")
+    if filtered_df.empty:
+        st.warning(
+            "The filtered table is empty. Try relaxing the threshold constraints."
+        )
+    else:
+        st.dataframe(filtered_df, use_container_width=True)