Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,18 @@
st.Page(Path("content", "results_rescoring.py"), title="Rescoring", icon="📈"),
st.Page(Path("content", "results_filtered.py"), title="Filtered PSMs", icon="🎯"),
st.Page(Path("content", "results_abundance.py"), title="Abundance", icon="📋"),
st.Page(Path("content", "results_library.py"), title="Spectral Library", icon="📚"),
st.Page(Path("content", "enrichment.py"), title="Pathway Analysis", icon="📉"),
],
"Differential Protein Analysis": [
st.Page(Path("content", "filtering.py"), title="Filtering", icon="🧹"),
st.Page(Path("content", "imputation.py"), title="Imputation", icon="🩹"),
st.Page(Path("content", "normalization.py"), title="Normalization", icon="⚖️"),
st.Page(Path("content", "statistical.py"), title="Statistical", icon="🔢"),
st.Page(Path("content", "results_volcano.py"), title="Volcano", icon="🌋"),
st.Page(Path("content", "results_pca.py"), title="PCA", icon="📊"),
st.Page(Path("content", "results_heatmap.py"), title="Heatmap", icon="🔥"),
st.Page(Path("content", "results_library.py"), title="Spectral Library", icon="📚"),
st.Page(Path("content", "results_proteomicslfq.py"), title="Proteomics LFQ", icon="🧪"),
],
]
}

pg = st.navigation(pages)
Expand Down
140 changes: 140 additions & 0 deletions content/enrichment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
"""Pathway Analysis Page."""

from pathlib import Path
import pandas as pd
import polars as pl
import streamlit as st
from src.common.common import page_setup
from src.common.results_helpers import get_abundance_data
# Import GO Enrichment modules from openms_insight engine
from openms_insight.analysis.enrichment import calculate_go_enrichment

# Run the shared page bootstrap; its return value is kept in `params`
# (presumably the page/workflow parameters — confirm against page_setup).
params = page_setup()
st.title("GO Enrichment Analysis")

# Introductory blurb rendered directly beneath the page title.
st.markdown(
"""
Identify overrepresented biological themes (BP, CC, MF) within your differentially expressed protein features using MyGene.info and Fisher's Exact Test.
"""
)

# Guard: without a "workspace" entry in session state the page warns the user
# and halts this script run, so nothing below executes.
if "workspace" not in st.session_state:
st.warning("Please initialize your workspace first.")
st.stop()

# --- STEP 1: Upstream Statistics Checkpoint ---
if (
"statistics_df" in st.session_state
and st.session_state["statistics_df"] is not None
):
final_statistics_report = st.session_state["statistics_df"]
st.info(
"🔄 **Upstream Pipeline Detected**: Using analyzed matrices from the **Statistical Inference** step."
)
else:
st.warning(
"⚠️ **Missing Prerequisites**: Statistical inference data not detected. Please run hypothesis testing first."
)
st.page_link(
"content/results_statistics.py", label="Go to Statistical Inference", icon="🔬"
Comment on lines +38 to +39

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🎯 Functional Correctness | 🟠 Major | ⚡ Quick win

Point this fallback link at the registered statistics page.

app.py registers content/statistical.py, but this button sends users to content/results_statistics.py. If that file is absent, the prerequisite flow dead-ends here.

Proposed fix
     st.page_link(
-        "content/results_statistics.py", label="Go to Statistical Inference", icon="🔬"
+        "content/statistical.py", label="Go to Statistical Inference", icon="🔬"
     )
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
st.page_link(
"content/results_statistics.py", label="Go to Statistical Inference", icon="🔬"
st.page_link(
"content/statistical.py", label="Go to Statistical Inference", icon="🔬"
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@content/enrichment.py` around lines 38 - 39, The fallback navigation in the
`st.page_link` call currently points to the wrong page module, which can break
the prerequisite flow if that target does not exist. Update the link in
`content/enrichment.py` to use the same registered statistics page path that
`app.py` registers for the statistics screen, keeping the label and icon
unchanged.

)
st.stop()

# --- STEP 2: Preprocessing Mapping Key Configuration ---
# Protein identifier column that is handed to the enrichment backend as `id_col`.
id_col = "ProteinName"
if id_col not in final_statistics_report.columns:
    st.error(f"❌ Structural Error: Column '{id_col}' is missing from the active matrix context.")
    st.stop()

# --- SECTION 1: Parameter Setup & Dynamic Cutoff Labels ---
st.subheader("Configure Enrichment Thresholds")

# Prefer the adjusted p-value and fall back to the raw one. If the table has
# neither, stop here: otherwise the page would label a cutoff for, and pass the
# backend, a significance column that does not exist.
target_p_col = next(
    (col for col in ("p-adj", "p-value") if col in final_statistics_report.columns),
    None,
)
if target_p_col is None:
    st.error(
        "❌ Structural Error: Neither 'p-adj' nor 'p-value' is present in the active matrix context."
    )
    st.stop()
# NOTE(review): the |log2FC| cutoff presumably needs a `log2FC` column too, but
# the backend call receives only the cutoff value, not a column name — confirm
# against calculate_go_enrichment before adding a hard check for it.
Comment on lines +45 to +54

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🎯 Functional Correctness | 🟠 Major | ⚡ Quick win

Validate every column the enrichment call depends on.

This only checks ProteinName, but the execution path also requires log2FC and one of p-adj / p-value. Right now the page can pass an incompatible statistics table into calculate_go_enrichment and fail much later with a less useful error.

Proposed fix
-# Identify target identifier columns dynamically
-id_col = "ProteinName"
-if id_col not in final_statistics_report.columns:
-    st.error(f"❌ Structural Error: Column '{id_col}' is missing from the active matrix context.")
-    st.stop()
+required_cols = {"ProteinName", "log2FC"}
+missing_cols = required_cols - set(final_statistics_report.columns)
+if missing_cols:
+    st.error(
+        f"❌ Structural Error: Missing required columns: {', '.join(sorted(missing_cols))}."
+    )
+    st.stop()
+
+id_col = "ProteinName"
 
 # --- SECTION 1: Parameter Setup & Dynamic Cutoff Labels ---
 st.subheader("Configure Enrichment Thresholds")
 
-# Check if target p-value should be adjusted or raw based on previous selections (Fallback safely to 'p-adj')
-target_p_col = "p-adj" if "p-adj" in final_statistics_report.columns else "p-value"
+if "p-adj" in final_statistics_report.columns:
+    target_p_col = "p-adj"
+elif "p-value" in final_statistics_report.columns:
+    target_p_col = "p-value"
+else:
+    st.error("❌ Structural Error: Missing both 'p-adj' and 'p-value'.")
+    st.stop()
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
id_col = "ProteinName"
if id_col not in final_statistics_report.columns:
st.error(f"❌ Structural Error: Column '{id_col}' is missing from the active matrix context.")
st.stop()
# --- SECTION 1: Parameter Setup & Dynamic Cutoff Labels ---
st.subheader("Configure Enrichment Thresholds")
# Check if target p-value should be adjusted or raw based on previous selections (Fallback safely to 'p-adj')
target_p_col = "p-adj" if "p-adj" in final_statistics_report.columns else "p-value"
required_cols = {"ProteinName", "log2FC"}
missing_cols = required_cols - set(final_statistics_report.columns)
if missing_cols:
st.error(
f"❌ Structural Error: Missing required columns: {', '.join(sorted(missing_cols))}."
)
st.stop()
id_col = "ProteinName"
# --- SECTION 1: Parameter Setup & Dynamic Cutoff Labels ---
st.subheader("Configure Enrichment Thresholds")
if "p-adj" in final_statistics_report.columns:
target_p_col = "p-adj"
elif "p-value" in final_statistics_report.columns:
target_p_col = "p-value"
else:
st.error("❌ Structural Error: Missing both 'p-adj' and 'p-value'.")
st.stop()
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@content/enrichment.py` around lines 45 - 54, The enrichment setup currently
validates only ProteinName, but the later call path also depends on log2FC and a
valid p-value column. In content/enrichment.py, extend the existing preflight
checks near target_p_col selection to verify final_statistics_report contains
log2FC plus either p-adj or p-value before reaching calculate_go_enrichment. If
any required column is missing, show a clear st.error and st.stop early so the
page fails fast with a useful message.

# The cutoff label names whichever significance column was selected above.
if target_p_col == "p-adj":
    p_label = "Adjusted P-value (p-adj) Cutoff"
else:
    p_label = "Raw P-value (p-value) Cutoff"

# Two inputs side by side: significance cutoff on the left, effect-size cutoff
# on the right.
significance_column, effect_column = st.columns(2)

p_cutoff = significance_column.number_input(
    f"🔬 {p_label}",
    min_value=0.0001,
    max_value=1.0,
    value=0.05,
    step=0.01,
    format="%.4f",
    help="Proteins with significance metrics below this value are mapped to the foreground cohort.",
)

fc_cutoff = effect_column.number_input(
    "📈 Absolute Difference Cutoff (|log2FC|)",
    min_value=0.0,
    max_value=10.0,
    value=1.0,
    step=0.1,
    format="%.2f",
    help="Proteins with absolute log2 fold change greater than or equal to this threshold will be selected.",
)

# --- SECTION 2: Execution and Interactive View Charts ---
st.markdown("<br>", unsafe_allow_html=True)
if st.button("🚀 Run GO Enrichment Analysis", type="primary", key="run_go_analysis"):

    with st.spinner("Querying MyGene.info API & executing hyper-geometric calculation loops..."):
        # The statistics table is held as a pandas DataFrame; the backend is
        # called with a Polars frame, so convert before the call.
        stats_pl = pl.from_pandas(final_statistics_report)

        status, output = calculate_go_enrichment(
            final_report=stats_pl,
            id_col=id_col,
            target_p_col=target_p_col,
            p_cutoff=p_cutoff,
            fc_cutoff=fc_cutoff,
        )

    # Route the response on the status string returned by the backend.
    if status == "empty_data":
        st.error("❌ No valid statistical rows found containing standard columns to run GO alignment.")

    elif status == "insufficient_proteins":
        st.warning(
            f"⚠️ Not enough significant proteins found to construct target datasets. "
            f"(Criteria: {target_p_col} < {p_cutoff:.4f}, |log2FC| ≥ {fc_cutoff:.2f})."
        )
        # For this status `output` is interpolated as the significant-protein count.
        st.info(f"💡 Found significant proteins count: **{output}**. Try relaxing your p-value or log2FC filters.")

    elif status == "success":
        st.success("⭕ GO Enrichment Analysis completed successfully!")

        # Foreground / background sizes reported by the backend.
        st.markdown(
            f"📊 **Analysis Profile Scope**: Mapped **{output['fg_count']}** significant foreground profiles out of **{output['bg_count']}** reference background items."
        )

        # One tab per ontology namespace, in the same order as the keys below.
        tabs = st.tabs([
            "🧬 Biological Process (BP)",
            "🔬 Cellular Component (CC)",
            "🧪 Molecular Function (MF)"
        ])
        categories_data = output["categories"]

        for tab, go_type in zip(tabs, ("BP", "CC", "MF")):
            with tab:
                fig = categories_data[go_type]["fig"]
                df_go = categories_data[go_type]["df"]

                if fig is not None and df_go is not None:
                    # The figure is produced by the backend; it is only rendered here.
                    st.plotly_chart(fig, use_container_width=True)

                    st.subheader(f"📊 {go_type} Results Dataframe")
                    st.dataframe(df_go, use_container_width=True)
                else:
                    st.info(f"No statistically overrepresented terms identified for Category: **{go_type}**")

    else:
        # Any status not handled above would otherwise render nothing at all,
        # leaving the user without feedback after pressing the button.
        st.error(f"❌ GO Enrichment Analysis returned an unexpected status: '{status}'.")
163 changes: 163 additions & 0 deletions content/filtering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
"""Filtering Page."""

from pathlib import Path
import pandas as pd
import polars as pl
import streamlit as st
from src.common.common import page_setup
from src.common.results_helpers import get_abundance_data

# Import filtering functions from openms_insight package
from openms_insight.analysis.filter import (
filter_low_abundance,
filter_low_repeatability,
filter_low_variance,
)

# Run the shared page bootstrap; its return value is kept in `params`
# (presumably the page/workflow parameters — confirm against page_setup).
params = page_setup()
st.title("Data Filtering")

# Introductory blurb rendered directly beneath the page title.
st.markdown(
"""
Filter out low-quality proteins from your dataset based on abundance, repeatability, or variance thresholds.
"""
)

# Guard: the abundance lookup below reads st.session_state["workspace"], so
# warn and halt this script run when that entry is absent.
if "workspace" not in st.session_state:
st.warning("Please initialize your workspace first.")
st.stop()

# Fetch the abundance data for the active workspace. A None result means there
# is nothing to filter yet, so point the user at the Abundance page and halt.
result = get_abundance_data(st.session_state["workspace"])
if result is None:
    st.info("Abundance data not available. Please run the workflow and configure sample groups first.")
    st.page_link("content/results_abundance.py", label="Go to Abundance", icon="📋")
    st.stop()

pivot_df, group_map = result

# 1. Every column that is not one of the identifier columns counts as a sample.
identifier_columns = ("ProteinName", "PeptideSequence")
sample_cols = [col for col in pivot_df.columns if col not in identifier_columns]

# --- SECTION 1: Original Data View ---
st.subheader("Original Abundance Table")
protein_count = pivot_df.shape[0]
sample_count = len(sample_cols)
st.markdown(
    f"Currently displaying **{protein_count}** proteins and **{sample_count}** samples before filtering."
)
st.dataframe(pivot_df, use_container_width=True)

st.markdown("---")

# --- SECTION 2: Filter Configuration ---
st.subheader("Configure Filter Engine")

# Sample-to-group table passed to the filter functions as `metadata`.
metadata_rows = [{"sample_id": s, "group": group_map[s]} for s in sample_cols]
metadata_pl = pl.DataFrame(
    metadata_rows, schema={"sample_id": pl.String, "group": pl.String}
)

# User selection for filtering strategy
filter_method = st.selectbox(
    "Select Filtering Method",
    options=["Low Abundance", "Low Repeatability", "Low Variance"],
    index=0,
    help="Choose the statistical criteria to prune unreliable protein entries.",
)

# The abundance and variance filters share the same percentile slider and
# differ only in their description, so they are handled together.
percentile_filter_notes = {
    "Low Abundance": "**Low Abundance Filter**: Keeps rows where at least one group's median is above the selected percentile threshold.",
    "Low Variance": "**Low Variance Filter**: Keeps rows where at least one group's variance is above the selected percentile threshold.",
}

if filter_method in percentile_filter_notes:
    st.markdown(percentile_filter_notes[filter_method])
    threshold = st.slider(
        "Threshold Percentile (%)",
        min_value=0.0,
        max_value=100.0,
        value=10.0,
        step=5.0,
    )

elif filter_method == "Low Repeatability":
    st.markdown(
        "**Low Repeatability Filter**: Keeps rows where at least one group has a missing value ratio within the allowed maximum."
    )
    # The slider works in percent (0-100) and is divided by 100 when the filter
    # is applied, so the label and help text state the unit explicitly.
    threshold = st.slider(
        "Max Missing Ratio (%)",
        min_value=0.0,
        max_value=100.0,
        value=50.0,
        step=5.0,
        help="Allowed percentage of missing values (zero or null) per group.",
    )

# --- SECTION 3: Filter Execution and Collected Results View ---
if st.button("Apply Filter", type="primary"):
    # The filter functions are called with a Polars LazyFrame, so convert the
    # pandas table first.
    quant_lazy = pl.from_pandas(pivot_df).lazy()

    # Route execution to the chosen openms_insight engine function
    if filter_method == "Low Abundance":
        filtered_lazy = filter_low_abundance(
            quantification_data=quant_lazy,
            metadata=metadata_pl,
            group_column="group",
            threshold_percentile=threshold,
        )
    elif filter_method == "Low Repeatability":
        # Convert percent slider input to ratio expected by the function (e.g., 50.0% -> 0.5)
        filtered_lazy = filter_low_repeatability(
            quantification_data=quant_lazy,
            metadata=metadata_pl,
            group_column="group",
            max_missing_ratio=threshold / 100.0,
        )
    elif filter_method == "Low Variance":
        filtered_lazy = filter_low_variance(
            quantification_data=quant_lazy,
            metadata=metadata_pl,
            group_column="group",
            threshold_percentile=threshold,
        )

    # Evaluate the lazy query and convert back to pandas for display.
    filtered_df = filtered_lazy.collect().to_pandas()

    # Results derived from an earlier filter run are stale once the filtered
    # table changes. Drop them so later pages (the enrichment page, for one,
    # reads "statistics_df") cannot keep working on pre-filter data.
    for stale_key in ("imputed_df", "normalized_df", "statistics_df"):
        if stale_key in st.session_state:
            del st.session_state[stale_key]
    st.session_state["filtered_df"] = filtered_df

Comment on lines +142 to +145

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🗄️ Data Integrity & Integration | 🟠 Major | ⚡ Quick win

Invalidate downstream analysis state when the filter output changes.

After Line 144 writes a new filtered_df, the old imputed_df, normalized_df, and statistics_df remain live. content/normalization.py prefers imputed_df over filtered_df, and content/statistical.py prefers normalized_df/imputed_df, so later pages can keep running on stale pre-filter data.

Proposed fix
     # Collect the evaluated lazy graph and convert back to Pandas for visualization
     filtered_df = filtered_lazy.collect().to_pandas()
+    for key in ("imputed_df", "normalized_df", "statistics_df"):
+        st.session_state.pop(key, None)
     st.session_state["filtered_df"] = filtered_df
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
# Collect the evaluated lazy graph and convert back to Pandas for visualization
filtered_df = filtered_lazy.collect().to_pandas()
st.session_state["filtered_df"] = filtered_df
# Collect the evaluated lazy graph and convert back to Pandas for visualization
filtered_df = filtered_lazy.collect().to_pandas()
for key in ("imputed_df", "normalized_df", "statistics_df"):
st.session_state.pop(key, None)
st.session_state["filtered_df"] = filtered_df
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@content/filtering.py` around lines 142 - 145, The filter update in
filtering.py should also invalidate dependent analysis state so downstream pages
stop using stale data. In the filtering flow where filtered_df is written in the
filtering logic, clear or remove the existing imputed_df, normalized_df, and
statistics_df entries from st.session_state whenever the filter result changes.
Make the change in the same place that updates filtered_df so normalization and
statistical paths that read from imputed_df/normalized_df in
content/normalization.py and content/statistical.py will recompute from the new
filtered data instead of reusing old cached values.

# Report the outcome of the filter run and show the filtered matrix.
# NOTE(review): this fragment reads `filtered_df`, which is assigned only in the
# "Apply Filter" branch, so it presumably belongs to that branch — confirm
# against the unflattened file.
st.success(f"Successfully applied **{filter_method}** filter!")

# Three metrics in a row: row count before filtering, row count after, and the
# difference between the two.
col1, col2, col3 = st.columns(3)
col1.metric("Original Proteins", pivot_df.shape[0])
col2.metric("Filtered Proteins", filtered_df.shape[0])
col3.metric(
"Removed Proteins", pivot_df.shape[0] - filtered_df.shape[0], delta=None
)

st.subheader("Filtered Abundance Table")
# An empty result gets a hint to relax the threshold instead of an empty table.
if filtered_df.empty:
st.warning(
"The filtered table is empty. Try relaxing the threshold constraints."
)
else:
st.dataframe(filtered_df, use_container_width=True)
Loading