From 92afc2892ade9eae2f6dd5fe95d023c14a3777fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?= Date: Tue, 27 Jan 2026 12:15:34 +0100 Subject: [PATCH 1/7] refactor: move window data rule to VAF snakefile --- workflow/rules/report.smk | 19 ------------------- workflow/rules/vaf.smk | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk index 1d709d1..ce3816c 100644 --- a/workflow/rules/report.smk +++ b/workflow/rules/report.smk @@ -110,25 +110,6 @@ rule polymorphic_sites_over_time_plot: "../scripts/report/polymorphic_sites_over_time_plot.R" -rule window_data: - conda: "../envs/biopython.yaml" - params: - window = config["WINDOW"]["WIDTH"], - step = config["WINDOW"]["STEP"], - features = config.get("GB_FEATURES", {}), - gb_qualifier_display = "gene" - input: - variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv", - gb = OUTDIR/"reference.gb", - output: - window_df = REPORT_DIR_TABLES/"window.csv", - json = temp(REPORT_DIR_TABLES/"window.json"), - log: - LOGDIR / "window_data" / "log.txt" - script: - "../scripts/report/window_data.py" - - rule nv_panel_data: conda: "../envs/renv.yaml" input: diff --git a/workflow/rules/vaf.smk b/workflow/rules/vaf.smk index c42d580..6cc8325 100644 --- a/workflow/rules/vaf.smk +++ b/workflow/rules/vaf.smk @@ -273,3 +273,22 @@ rule fill_all_sites: LOGDIR / "fill_all_sites" / "log.txt" script: "../scripts/fill_all_sites.R" + + +rule window_data: + conda: "../envs/biopython.yaml" + params: + window = config["WINDOW"]["WIDTH"], + step = config["WINDOW"]["STEP"], + features = config.get("GB_FEATURES", {}), + gb_qualifier_display = "gene" + input: + variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv", + gb = OUTDIR/"reference.gb", + output: + window_df = REPORT_DIR_TABLES/"window.csv", + json = temp(REPORT_DIR_TABLES/"window.json"), + log: + LOGDIR / "window_data" / "log.txt" + script: + "../scripts/report/window_data.py" From 7a1f467ce5044fe730622be0c93370238f81f7bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?= Date: Tue, 27 Jan 2026 16:13:04 +0100 Subject: [PATCH 2/7] chore: remove unused column from selection --- workflow/scripts/calculate_dnds.R | 1 - 1 file changed, 1 deletion(-) diff --git a/workflow/scripts/calculate_dnds.R b/workflow/scripts/calculate_dnds.R index f545c27..506395b 100644 --- a/workflow/scripts/calculate_dnds.R +++ b/workflow/scripts/calculate_dnds.R @@ -19,7 +19,6 @@ variants <- read_delim( col_select = c( "SAMPLE", "VARIANT_NAME", - "REGION", "ALT_FREQ", "SYNONYMOUS", "POS" From 8f214c4da911c4fb580b0b0abdaf24987d3cc829 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?= Date: Tue, 27 Jan 2026 17:11:14 +0100 Subject: [PATCH 3/7] perf: use set for masked site lookup and validate sequence lengths --- workflow/scripts/extract_afwdist_variants.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/workflow/scripts/extract_afwdist_variants.py b/workflow/scripts/extract_afwdist_variants.py index 6a7bf3f..a446c1e 100644 --- a/workflow/scripts/extract_afwdist_variants.py +++ b/workflow/scripts/extract_afwdist_variants.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import logging -from typing import List +from typing import List, Set import pandas as pd from Bio import SeqIO @@ -17,7 +17,7 @@ def read_monofasta(path: str) -> SeqRecord: return record -def read_masked_sites(vcf_path: str, mask_classes: List[str]) -> List[int]: +def read_masked_sites(vcf_path: str, mask_classes: List[str]) -> Set[int]: """ Parse a VCF containing positions for masking. Assumes the VCF file is formatted as in: @@ -31,10 +31,10 @@ def read_masked_sites(vcf_path: str, mask_classes: List[str]) -> List[int]: comment="#", names=("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO") ) - return vcf.loc[vcf.FILTER.isin(mask_classes), "POS"].tolist() + return set(vcf.loc[vcf.FILTER.isin(mask_classes), "POS"].unique()) -def build_ancestor_variant_table(ancestor: Seq, reference: Seq, reference_name: str, masked_positions: List[int]) -> pd.DataFrame: +def build_ancestor_variant_table(ancestor: Seq, reference: Seq, reference_name: str, masked_positions: Set[int]) -> pd.DataFrame: pos = [] alt = [] for i in range(1, len(ancestor) + 1): @@ -77,10 +77,11 @@ def build_ancestor_variant_table(ancestor: Seq, reference: Seq, reference_name: logging.info("Reading input FASTA files") # Case ancestor ancestor = read_monofasta(snakemake.input.ancestor) - logging.info(f"Ancestor: '{ancestor.description}', length={len(ancestor.seq)}") + logging.info(f"Ancestor: '{ancestor.description}', length={len(ancestor)}") # Alignment reference reference = read_monofasta(snakemake.input.reference) - logging.info(f"Reference: '{reference.description}', length={len(reference.seq)}") + logging.info(f"Reference: '{reference.description}', length={len(reference)}") + assert len(ancestor) == len(reference) logging.info("Processing ancestor variants") ancestor_table = build_ancestor_variant_table(ancestor.seq, reference.seq, reference.id, masked_sites) From a72a2f4ac111d1218b75cd96961d9f10941e9798 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?= Date: Fri, 30 Jan 2026 13:08:25 +0100 Subject: [PATCH 4/7] refactor: move NV panel data rules to VCF snakefile --- workflow/rules/report.smk | 38 -------------------------------------- workflow/rules/vaf.smk | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk index ce3816c..73ad7a1 100644 --- a/workflow/rules/report.smk +++ b/workflow/rules/report.smk @@ -110,44 +110,6 @@ rule polymorphic_sites_over_time_plot: "../scripts/report/polymorphic_sites_over_time_plot.R" -rule nv_panel_data: - conda: "../envs/renv.yaml" - input: - variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv", - metadata = config["METADATA"], - output: - table = REPORT_DIR_TABLES/"nv_panel.csv", - json = temp(REPORT_DIR_TABLES/"nv_panel.json"), - log: - LOGDIR / "nv_panel_data" / "log.txt" - script: - "../scripts/report/nv_panel_data.R" - - -rule nv_panel_zoom_on_feature_data: - input: - table = REPORT_DIR_TABLES/"nv_panel.csv", - regions = REPORT_DIR_TABLES/"genbank_regions.json", - output: - table = temp(REPORT_DIR_TABLES/"nv_panel.{region_name}.csv"), - log: - LOGDIR / "nv_panel_zoom_on_feature_data" / "{region_name}.log.txt" - script: - "../scripts/report/nv_panel_zoom_on_feature_data.py" - - -rule window_zoom_on_feature_data: - input: - table = REPORT_DIR_TABLES/"window.csv", - regions = REPORT_DIR_TABLES/"genbank_regions.json", - output: - table = temp(REPORT_DIR_TABLES/"window.{region_name}.csv"), - log: - LOGDIR / "window_zoom_on_feature_data" / "{region_name}.log.txt" - script: - "../scripts/report/window_zoom_on_feature_data.py" - - rule nv_panel_plot: conda: "../envs/renv.yaml" params: diff --git a/workflow/rules/vaf.smk b/workflow/rules/vaf.smk index 6cc8325..8c1979b 100644 --- a/workflow/rules/vaf.smk +++ b/workflow/rules/vaf.smk @@ -292,3 +292,41 @@ rule window_data: LOGDIR / "window_data" / "log.txt" script: "../scripts/report/window_data.py" + + +rule nv_panel_data: + conda: "../envs/renv.yaml" + input: + variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv", + metadata = config["METADATA"], + output: + table = REPORT_DIR_TABLES/"nv_panel.csv", + json = temp(REPORT_DIR_TABLES/"nv_panel.json"), + log: + LOGDIR / "nv_panel_data" / "log.txt" + script: + "../scripts/report/nv_panel_data.R" + + +rule nv_panel_zoom_on_feature_data: + input: + table = REPORT_DIR_TABLES/"nv_panel.csv", + regions = REPORT_DIR_TABLES/"genbank_regions.json", + output: + table = temp(REPORT_DIR_TABLES/"nv_panel.{region_name}.csv"), + log: + LOGDIR / "nv_panel_zoom_on_feature_data" / "{region_name}.log.txt" + script: + "../scripts/report/nv_panel_zoom_on_feature_data.py" + + +rule window_zoom_on_feature_data: + input: + table = REPORT_DIR_TABLES/"window.csv", + regions = REPORT_DIR_TABLES/"genbank_regions.json", + output: + table = temp(REPORT_DIR_TABLES/"window.{region_name}.csv"), + log: + LOGDIR / "window_zoom_on_feature_data" / "{region_name}.log.txt" + script: + "../scripts/report/window_zoom_on_feature_data.py" From f89a3bc37a9dce0ccdcb9f113f7cd1ad9a1f96d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?= Date: Mon, 9 Feb 2026 13:22:10 +0100 Subject: [PATCH 5/7] refactor: move correlation rules to VCF snakefile --- workflow/rules/report.smk | 35 ----------------------------------- workflow/rules/vaf.smk | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk index 73ad7a1..6fcb7aa 100644 --- a/workflow/rules/report.smk +++ b/workflow/rules/report.smk @@ -282,24 +282,6 @@ rule dnds_plots: "../scripts/report/dnds_plots.R" -rule af_time_correlation_data: - conda: "../envs/renv.yaml" - params: - cor_method = config["COR"]["METHOD"], - cor_exact = config["COR"]["EXACT"], - input: - variants = OUTDIR/f"{OUTPUT_NAME}.variants.all_sites.tsv", - metadata = config["METADATA"], - output: - fmt_variants = temp(REPORT_DIR_TABLES/"variants.filled.dated.tsv"), - correlations = report(REPORT_DIR_TABLES/"af_time_correlation.csv"), - subset = REPORT_DIR_TABLES/"af_time_correlation.subset.txt", - log: - LOGDIR / "af_time_correlation_data" / "log.txt" - script: - "../scripts/report/af_time_correlation_data.R" - - rule af_time_correlation_plot: conda: "../envs/renv.yaml" params: @@ -335,23 +317,6 @@ rule af_trajectory_panel_plot: "../scripts/report/af_trajectory_panel_plot.R" -rule pairwise_trajectory_correlation_data: - conda: "../envs/renv.yaml" - params: - cor_method = config["COR"]["METHOD"], - cor_use = "pairwise.complete.obs", - input: - variants = OUTDIR/f"{OUTPUT_NAME}.variants.all_sites.tsv", - metadata = config["METADATA"], - output: - table = REPORT_DIR_TABLES/"pairwise_trajectory_frequency_data.csv", - matrix = report(REPORT_DIR_TABLES/"pairwise_trajectory_correlation_matrix.csv"), - log: - LOGDIR / "pairwise_trajectory_correlation_data" / "log.txt" - script: - "../scripts/report/pairwise_trajectory_correlation_data.R" - - rule summary_table: conda: "../envs/renv.yaml" input: diff --git a/workflow/rules/vaf.smk b/workflow/rules/vaf.smk index 8c1979b..cc30a29 100644 --- a/workflow/rules/vaf.smk +++ b/workflow/rules/vaf.smk @@ -330,3 +330,38 @@ rule window_zoom_on_feature_data: LOGDIR / "window_zoom_on_feature_data" / "{region_name}.log.txt" script: "../scripts/report/window_zoom_on_feature_data.py" + + +rule af_time_correlation_data: + conda: "../envs/renv.yaml" + params: + cor_method = config["COR"]["METHOD"], + cor_exact = config["COR"]["EXACT"], + input: + variants = OUTDIR/f"{OUTPUT_NAME}.variants.all_sites.tsv", + metadata = config["METADATA"], + output: + fmt_variants = temp(REPORT_DIR_TABLES/"variants.filled.dated.tsv"), + correlations = report(REPORT_DIR_TABLES/"af_time_correlation.csv"), + subset = REPORT_DIR_TABLES/"af_time_correlation.subset.txt", + log: + LOGDIR / "af_time_correlation_data" / "log.txt" + script: + "../scripts/report/af_time_correlation_data.R" + + +rule pairwise_trajectory_correlation_data: + conda: "../envs/renv.yaml" + params: + cor_method = config["COR"]["METHOD"], + cor_use = "pairwise.complete.obs", + input: + variants = OUTDIR/f"{OUTPUT_NAME}.variants.all_sites.tsv", + metadata = config["METADATA"], + output: + table = REPORT_DIR_TABLES/"pairwise_trajectory_frequency_data.csv", + matrix = report(REPORT_DIR_TABLES/"pairwise_trajectory_correlation_matrix.csv"), + log: + LOGDIR / "pairwise_trajectory_correlation_data" / "log.txt" + script: + "../scripts/report/pairwise_trajectory_correlation_data.R" From 4f8570e207e3012fee7efa39d00835f97577b69c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?= Date: Tue, 10 Feb 2026 14:29:51 +0100 Subject: [PATCH 6/7] refactor: move rate analyses to core snakefiles --- workflow/rules/distances.smk | 31 ++++++++++++++++++++++++ workflow/rules/report.smk | 47 ------------------------------------ workflow/rules/vaf.smk | 16 ++++++++++++ 3 files changed, 47 insertions(+), 47 deletions(-) diff --git a/workflow/rules/distances.smk b/workflow/rules/distances.smk index 77a4ae9..c655320 100644 --- a/workflow/rules/distances.smk +++ b/workflow/rules/distances.smk @@ -50,3 +50,34 @@ rule format_afwdist_results: LOGDIR/"format_afwdist_results"/"log.txt" script: "../scripts/format_afwdist_results.py" + + +rule allele_freq_tree_data: + conda: "../envs/renv.yaml" + params: + use_bionj = config["USE_BIONJ"], + outgroup_id = config["ALIGNMENT_REFERENCE"], + input: + dist = OUTDIR/f"{OUTPUT_NAME}.distances.csv", + output: + tree = REPORT_DIR_TABLES/"allele_freq_tree.nwk", + log: + LOGDIR / "allele_freq_tree_data" / "log.txt" + script: + "../scripts/report/allele_freq_tree_data.R" + + +rule time_signal_data: + conda: "../envs/renv.yaml" + params: + outgroup_id = config["ALIGNMENT_REFERENCE"], + input: + tree = report(REPORT_DIR_TABLES/"allele_freq_tree.nwk"), + metadata = config["METADATA"], + output: + table = report(REPORT_DIR_TABLES/"time_signal.csv"), + json = REPORT_DIR_TABLES/"time_signal.json", + log: + LOGDIR / "time_signal_data" / "log.txt" + script: + "../scripts/report/time_signal_data.R" diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk index 6fcb7aa..50a0574 100644 --- a/workflow/rules/report.smk +++ b/workflow/rules/report.smk @@ -78,22 +78,6 @@ rule extract_genbank_regions: "../scripts/report/extract_genbank_regions.py" -rule polymorphic_sites_over_time_data: - conda: "../envs/renv.yaml" - params: - max_alt_freq = 1.0 - config["VC"]["MIN_FREQ"], - input: - variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv", - metadata = config["METADATA"], - output: - table = REPORT_DIR_PLOTS/"polymorphic_sites_over_time.csv", - json = temp(REPORT_DIR_TABLES/"polymorphic_sites_over_time.json"), - log: - LOGDIR / "polymorphic_sites_over_time_data" / "log.txt" - script: - "../scripts/report/polymorphic_sites_over_time_data.R" - - rule polymorphic_sites_over_time_plot: conda: "../envs/renv.yaml" params: @@ -199,21 +183,6 @@ rule context_phylogeny_plot: "../scripts/report/context_phylogeny_plot.R" -rule allele_freq_tree_data: - conda: "../envs/renv.yaml" - params: - use_bionj = config["USE_BIONJ"], - outgroup_id = config["ALIGNMENT_REFERENCE"], - input: - dist = OUTDIR/f"{OUTPUT_NAME}.distances.csv", - output: - tree = REPORT_DIR_TABLES/"allele_freq_tree.nwk", - log: - LOGDIR / "allele_freq_tree_data" / "log.txt" - script: - "../scripts/report/allele_freq_tree_data.R" - - rule allele_freq_tree_plot: conda: "../envs/renv.yaml" params: @@ -233,22 +202,6 @@ rule allele_freq_tree_plot: "../scripts/report/allele_freq_tree_plot.R" -rule time_signal_data: - conda: "../envs/renv.yaml" - params: - outgroup_id = config["ALIGNMENT_REFERENCE"], - input: - tree = report(REPORT_DIR_TABLES/"allele_freq_tree.nwk"), - metadata = config["METADATA"], - output: - table = report(REPORT_DIR_TABLES/"time_signal.csv"), - json = REPORT_DIR_TABLES/"time_signal.json", - log: - LOGDIR / "time_signal_data" / "log.txt" - script: - "../scripts/report/time_signal_data.R" - - rule time_signal_plot: conda: "../envs/renv.yaml" params: diff --git a/workflow/rules/vaf.smk b/workflow/rules/vaf.smk index b42e9f3..100e231 100644 --- a/workflow/rules/vaf.smk +++ b/workflow/rules/vaf.smk @@ -296,3 +296,19 @@ rule pairwise_trajectory_correlation_data: LOGDIR / "pairwise_trajectory_correlation_data" / "log.txt" script: "../scripts/report/pairwise_trajectory_correlation_data.R" + + +rule polymorphic_sites_over_time_data: + conda: "../envs/renv.yaml" + params: + max_alt_freq = 1.0 - config["VC"]["MIN_FREQ"], + input: + variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv", + metadata = config["METADATA"], + output: + table = REPORT_DIR_PLOTS/"polymorphic_sites_over_time.csv", + json = temp(REPORT_DIR_TABLES/"polymorphic_sites_over_time.json"), + log: + LOGDIR / "polymorphic_sites_over_time_data" / "log.txt" + script: + "../scripts/report/polymorphic_sites_over_time_data.R" From fab09f86c25a1ab042a50eb301ae91489cfb1991 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?= Date: Tue, 10 Feb 2026 14:47:32 +0100 Subject: [PATCH 7/7] feat: calculate confidence interval of evolutionary rate --- workflow/rules/distances.smk | 1 + workflow/scripts/report/time_signal_data.R | 3 +++ 2 files changed, 4 insertions(+) diff --git a/workflow/rules/distances.smk b/workflow/rules/distances.smk index c655320..ca5f69d 100644 --- a/workflow/rules/distances.smk +++ b/workflow/rules/distances.smk @@ -71,6 +71,7 @@ rule time_signal_data: conda: "../envs/renv.yaml" params: outgroup_id = config["ALIGNMENT_REFERENCE"], + confidence_interval = 0.95, input: tree = report(REPORT_DIR_TABLES/"allele_freq_tree.nwk"), metadata = config["METADATA"], diff --git a/workflow/scripts/report/time_signal_data.R b/workflow/scripts/report/time_signal_data.R index 04448b2..1877d8b 100644 --- a/workflow/scripts/report/time_signal_data.R +++ b/workflow/scripts/report/time_signal_data.R @@ -60,12 +60,15 @@ write_csv(time.signal, snakemake@output$table) log_debug("Building linear model") model <- lm(distance ~ date_interval, data = time.signal) p.value <- summary(model)$coefficients[2, 4] +ci_models <- confint(model, level = snakemake@params$confidence_interval) # TREE STATS log_debug("Saving linear model") list( "sub_rate" = model$coefficients[[2]] * 365, "r2" = summary(model)$r.squared[[1]], + "ci_low" = ci_models[2, 1] * 365, + "ci_high" = ci_models[2, 2] * 365, "pvalue" = ifelse(p.value < 0.001, "< 0.001", p.value) ) %>% toJSON() %>%