diff --git a/workflow/rules/distances.smk b/workflow/rules/distances.smk index 77a4ae9..ca5f69d 100644 --- a/workflow/rules/distances.smk +++ b/workflow/rules/distances.smk @@ -50,3 +50,35 @@ rule format_afwdist_results: LOGDIR/"format_afwdist_results"/"log.txt" script: "../scripts/format_afwdist_results.py" + + +rule allele_freq_tree_data: + conda: "../envs/renv.yaml" + params: + use_bionj = config["USE_BIONJ"], + outgroup_id = config["ALIGNMENT_REFERENCE"], + input: + dist = OUTDIR/f"{OUTPUT_NAME}.distances.csv", + output: + tree = REPORT_DIR_TABLES/"allele_freq_tree.nwk", + log: + LOGDIR / "allele_freq_tree_data" / "log.txt" + script: + "../scripts/report/allele_freq_tree_data.R" + + +rule time_signal_data: + conda: "../envs/renv.yaml" + params: + outgroup_id = config["ALIGNMENT_REFERENCE"], + confidence_interval = 0.95, + input: + tree = report(REPORT_DIR_TABLES/"allele_freq_tree.nwk"), + metadata = config["METADATA"], + output: + table = report(REPORT_DIR_TABLES/"time_signal.csv"), + json = REPORT_DIR_TABLES/"time_signal.json", + log: + LOGDIR / "time_signal_data" / "log.txt" + script: + "../scripts/report/time_signal_data.R" diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk index 1d709d1..50a0574 100644 --- a/workflow/rules/report.smk +++ b/workflow/rules/report.smk @@ -78,22 +78,6 @@ rule extract_genbank_regions: "../scripts/report/extract_genbank_regions.py" -rule polymorphic_sites_over_time_data: - conda: "../envs/renv.yaml" - params: - max_alt_freq = 1.0 - config["VC"]["MIN_FREQ"], - input: - variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv", - metadata = config["METADATA"], - output: - table = REPORT_DIR_PLOTS/"polymorphic_sites_over_time.csv", - json = temp(REPORT_DIR_TABLES/"polymorphic_sites_over_time.json"), - log: - LOGDIR / "polymorphic_sites_over_time_data" / "log.txt" - script: - "../scripts/report/polymorphic_sites_over_time_data.R" - - rule polymorphic_sites_over_time_plot: conda: "../envs/renv.yaml" params: @@ -110,63 +94,6 @@ rule polymorphic_sites_over_time_plot: "../scripts/report/polymorphic_sites_over_time_plot.R" -rule window_data: - conda: "../envs/biopython.yaml" - params: - window = config["WINDOW"]["WIDTH"], - step = config["WINDOW"]["STEP"], - features = config.get("GB_FEATURES", {}), - gb_qualifier_display = "gene" - input: - variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv", - gb = OUTDIR/"reference.gb", - output: - window_df = REPORT_DIR_TABLES/"window.csv", - json = temp(REPORT_DIR_TABLES/"window.json"), - log: - LOGDIR / "window_data" / "log.txt" - script: - "../scripts/report/window_data.py" - - -rule nv_panel_data: - conda: "../envs/renv.yaml" - input: - variants = OUTDIR/f"{OUTPUT_NAME}.variants.tsv", - metadata = config["METADATA"], - output: - table = REPORT_DIR_TABLES/"nv_panel.csv", - json = temp(REPORT_DIR_TABLES/"nv_panel.json"), - log: - LOGDIR / "nv_panel_data" / "log.txt" - script: - "../scripts/report/nv_panel_data.R" - - -rule nv_panel_zoom_on_feature_data: - input: - table = REPORT_DIR_TABLES/"nv_panel.csv", - regions = REPORT_DIR_TABLES/"genbank_regions.json", - output: - table = temp(REPORT_DIR_TABLES/"nv_panel.{region_name}.csv"), - log: - LOGDIR / "nv_panel_zoom_on_feature_data" / "{region_name}.log.txt" - script: - "../scripts/report/nv_panel_zoom_on_feature_data.py" - - -rule window_zoom_on_feature_data: - input: - table = REPORT_DIR_TABLES/"window.csv", - regions = REPORT_DIR_TABLES/"genbank_regions.json", - output: - table = temp(REPORT_DIR_TABLES/"window.{region_name}.csv"), - log: - LOGDIR / "window_zoom_on_feature_data" / "{region_name}.log.txt" - script: - "../scripts/report/window_zoom_on_feature_data.py" - - rule nv_panel_plot: conda: "../envs/renv.yaml" params: @@ -256,21 +183,6 @@ rule context_phylogeny_plot: "../scripts/report/context_phylogeny_plot.R" -rule allele_freq_tree_data: - conda: "../envs/renv.yaml" - params: - use_bionj = config["USE_BIONJ"], - outgroup_id = config["ALIGNMENT_REFERENCE"], - input: - dist = OUTDIR/f"{OUTPUT_NAME}.distances.csv", - output: - tree = REPORT_DIR_TABLES/"allele_freq_tree.nwk", - log: - LOGDIR / "allele_freq_tree_data" / "log.txt" - script: - "../scripts/report/allele_freq_tree_data.R" - - rule allele_freq_tree_plot: conda: "../envs/renv.yaml" params: @@ -290,22 +202,6 @@ rule allele_freq_tree_plot: "../scripts/report/allele_freq_tree_plot.R" -rule time_signal_data: - conda: "../envs/renv.yaml" - params: - outgroup_id = config["ALIGNMENT_REFERENCE"], - input: - tree = report(REPORT_DIR_TABLES/"allele_freq_tree.nwk"), - metadata = config["METADATA"], - output: - table = report(REPORT_DIR_TABLES/"time_signal.csv"), - json = REPORT_DIR_TABLES/"time_signal.json", - log: - LOGDIR / "time_signal_data" / "log.txt" - script: - "../scripts/report/time_signal_data.R" - - rule time_signal_plot: conda: "../envs/renv.yaml" params: @@ -339,24 +235,6 @@ rule dnds_plots: "../scripts/report/dnds_plots.R" -rule af_time_correlation_data: - conda: "../envs/renv.yaml" - params: - cor_method = config["COR"]["METHOD"], - cor_exact = config["COR"]["EXACT"], - input: - variants = OUTDIR/f"{OUTPUT_NAME}.variants.all_sites.tsv", - metadata = config["METADATA"], - output: - fmt_variants = temp(REPORT_DIR_TABLES/"variants.filled.dated.tsv"), - correlations = report(REPORT_DIR_TABLES/"af_time_correlation.csv"), - subset = REPORT_DIR_TABLES/"af_time_correlation.subset.txt", - log: - LOGDIR / "af_time_correlation_data" / "log.txt" - script: - "../scripts/report/af_time_correlation_data.R" - - rule af_time_correlation_plot: conda: "../envs/renv.yaml" params: @@ -392,23 +270,6 @@ rule af_trajectory_panel_plot: "../scripts/report/af_trajectory_panel_plot.R" -rule pairwise_trajectory_correlation_data: - conda: "../envs/renv.yaml" - params: - cor_method = config["COR"]["METHOD"], - cor_use = "pairwise.complete.obs", - input: - variants = OUTDIR/f"{OUTPUT_NAME}.variants.all_sites.tsv", - metadata = config["METADATA"], - output: - table = REPORT_DIR_TABLES/"pairwise_trajectory_frequency_data.csv", - matrix = report(REPORT_DIR_TABLES/"pairwise_trajectory_correlation_matrix.csv"), - log: - LOGDIR / "pairwise_trajectory_correlation_data" / "log.txt" - script: - "../scripts/report/pairwise_trajectory_correlation_data.R" - - rule summary_table: conda: "../envs/renv.yaml" input: diff --git a/workflow/rules/vaf.smk b/workflow/rules/vaf.smk index 1af66f7..6d78e63 100644 --- a/workflow/rules/vaf.smk +++ b/workflow/rules/vaf.smk @@ -167,7 +167,8 @@ rule format_vcf_fields_longer: filter_exclude=config["ANNOTATION"]["FILTER_EXCLUDE"], variant_name_pattern=lambda wildcards: config["ANNOTATION"][ "VARIANT_NAME_PATTERN" - ], # lambda to deactivate automatic wildcard expansion in pattern + ], + # lambda to deactivate automatic wildcard expansion in pattern sep=",", input: tsv=OUTDIR / "vaf" / "fields" / "{sample}.tsv", @@ -189,7 +190,6 @@ rule concat_vcf_fields: run: import pandas as pd from functools import reduce - reduce( lambda a, b: pd.concat((a, b), axis="rows", ignore_index=True), (pd.read_csv(path, sep=params.sep) for path in input), @@ -219,3 +219,116 @@ use rule concat_vcf_fields as concat_variants with: expand(OUTDIR / "vaf" / "variants" / "{sample}.tsv", sample=iter_samples()), output: OUTDIR / f"{OUTPUT_NAME}.variants.tsv", + + +rule window_data: + conda: + "../envs/biopython.yaml" + params: + window=config["WINDOW"]["WIDTH"], + step=config["WINDOW"]["STEP"], + features=config.get("GB_FEATURES", {}), + gb_qualifier_display="gene", + input: + variants=OUTDIR / f"{OUTPUT_NAME}.variants.tsv", + gb=OUTDIR / "reference.gb", + output: + window_df=REPORT_DIR_TABLES / "window.csv", + json=temp(REPORT_DIR_TABLES / "window.json"), + log: + LOGDIR / "window_data" / "log.txt", + script: + "../scripts/report/window_data.py" + + +rule nv_panel_data: + conda: + "../envs/renv.yaml" + input: + variants=OUTDIR / f"{OUTPUT_NAME}.variants.tsv", + metadata=config["METADATA"], + output: + table=REPORT_DIR_TABLES / "nv_panel.csv", + json=temp(REPORT_DIR_TABLES / "nv_panel.json"), + log: + LOGDIR / "nv_panel_data" / "log.txt", + script: + "../scripts/report/nv_panel_data.R" + + +rule nv_panel_zoom_on_feature_data: + input: + table=REPORT_DIR_TABLES / "nv_panel.csv", + regions=REPORT_DIR_TABLES / "genbank_regions.json", + output: + table=temp(REPORT_DIR_TABLES / "nv_panel.{region_name}.csv"), + log: + LOGDIR / "nv_panel_zoom_on_feature_data" / "{region_name}.log.txt", + script: + "../scripts/report/nv_panel_zoom_on_feature_data.py" + + +rule window_zoom_on_feature_data: + input: + table=REPORT_DIR_TABLES / "window.csv", + regions=REPORT_DIR_TABLES / "genbank_regions.json", + output: + table=temp(REPORT_DIR_TABLES / "window.{region_name}.csv"), + log: + LOGDIR / "window_zoom_on_feature_data" / "{region_name}.log.txt", + script: + "../scripts/report/window_zoom_on_feature_data.py" + + +rule af_time_correlation_data: + conda: + "../envs/renv.yaml" + params: + cor_method=config["COR"]["METHOD"], + cor_exact=config["COR"]["EXACT"], + input: + variants=OUTDIR / f"{OUTPUT_NAME}.variants.all_sites.tsv", + metadata=config["METADATA"], + output: + fmt_variants=temp(REPORT_DIR_TABLES / "variants.filled.dated.tsv"), + correlations=report(REPORT_DIR_TABLES / "af_time_correlation.csv"), + subset=REPORT_DIR_TABLES / "af_time_correlation.subset.txt", + log: + LOGDIR / "af_time_correlation_data" / "log.txt", + script: + "../scripts/report/af_time_correlation_data.R" + + +rule pairwise_trajectory_correlation_data: + conda: + "../envs/renv.yaml" + params: + cor_method=config["COR"]["METHOD"], + cor_use="pairwise.complete.obs", + input: + variants=OUTDIR / f"{OUTPUT_NAME}.variants.all_sites.tsv", + metadata=config["METADATA"], + output: + table=REPORT_DIR_TABLES / "pairwise_trajectory_frequency_data.csv", + matrix=report(REPORT_DIR_TABLES / "pairwise_trajectory_correlation_matrix.csv"), + log: + LOGDIR / "pairwise_trajectory_correlation_data" / "log.txt", + script: + "../scripts/report/pairwise_trajectory_correlation_data.R" + + +rule polymorphic_sites_over_time_data: + conda: + "../envs/renv.yaml" + params: + max_alt_freq=1.0 - config["VC"]["MIN_FREQ"], + input: + variants=OUTDIR / f"{OUTPUT_NAME}.variants.tsv", + metadata=config["METADATA"], + output: + table=REPORT_DIR_PLOTS / "polymorphic_sites_over_time.csv", + json=temp(REPORT_DIR_TABLES / "polymorphic_sites_over_time.json"), + log: + LOGDIR / "polymorphic_sites_over_time_data" / "log.txt", + script: + "../scripts/report/polymorphic_sites_over_time_data.R" diff --git a/workflow/scripts/calculate_dnds.R b/workflow/scripts/calculate_dnds.R index 3e37a4d..4c34688 100644 --- a/workflow/scripts/calculate_dnds.R +++ b/workflow/scripts/calculate_dnds.R @@ -19,7 +19,6 @@ variants <- read_delim( col_select = c( "SAMPLE", "VARIANT_NAME", - "CHROM", "ALT_FREQ", "SYNONYMOUS", "POS" diff --git a/workflow/scripts/extract_afwdist_variants.py b/workflow/scripts/extract_afwdist_variants.py index 6a7bf3f..a446c1e 100644 --- a/workflow/scripts/extract_afwdist_variants.py +++ b/workflow/scripts/extract_afwdist_variants.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import logging -from typing import List +from typing import List, Set import pandas as pd from Bio import SeqIO @@ -17,7 +17,7 @@ def read_monofasta(path: str) -> SeqRecord: return record -def read_masked_sites(vcf_path: str, mask_classes: List[str]) -> List[int]: +def read_masked_sites(vcf_path: str, mask_classes: List[str]) -> Set[int]: """ Parse a VCF containing positions for masking. Assumes the VCF file is formatted as in: @@ -31,10 +31,10 @@ def read_masked_sites(vcf_path: str, mask_classes: List[str]) -> List[int]: comment="#", names=("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO") ) - return vcf.loc[vcf.FILTER.isin(mask_classes), "POS"].tolist() + return set(vcf.loc[vcf.FILTER.isin(mask_classes), "POS"].unique()) -def build_ancestor_variant_table(ancestor: Seq, reference: Seq, reference_name: str, masked_positions: List[int]) -> pd.DataFrame: +def build_ancestor_variant_table(ancestor: Seq, reference: Seq, reference_name: str, masked_positions: Set[int]) -> pd.DataFrame: pos = [] alt = [] for i in range(1, len(ancestor) + 1): @@ -77,10 +77,11 @@ def build_ancestor_variant_table(ancestor: Seq, reference: Seq, reference_name: logging.info("Reading input FASTA files") # Case ancestor ancestor = read_monofasta(snakemake.input.ancestor) - logging.info(f"Ancestor: '{ancestor.description}', length={len(ancestor.seq)}") + logging.info(f"Ancestor: '{ancestor.description}', length={len(ancestor)}") # Alignment reference reference = read_monofasta(snakemake.input.reference) - logging.info(f"Reference: '{reference.description}', length={len(reference.seq)}") + logging.info(f"Reference: '{reference.description}', length={len(reference)}") + assert len(ancestor) == len(reference) logging.info("Processing ancestor variants") ancestor_table = build_ancestor_variant_table(ancestor.seq, reference.seq, reference.id, masked_sites) diff --git a/workflow/scripts/report/time_signal_data.R b/workflow/scripts/report/time_signal_data.R index 04448b2..1877d8b 100644 --- a/workflow/scripts/report/time_signal_data.R +++ b/workflow/scripts/report/time_signal_data.R @@ -60,12 +60,15 @@ write_csv(time.signal, snakemake@output$table) log_debug("Building linear model") model <- lm(distance ~ date_interval, data = time.signal) p.value <- summary(model)$coefficients[2, 4] +ci_models <- confint(model, level = snakemake@params$confidence_interval) # TREE STATS log_debug("Saving linear model") list( "sub_rate" = model$coefficients[[2]] * 365, "r2" = summary(model)$r.squared[[1]], + "ci_low" = ci_models[2, 1] * 365, + "ci_high" = ci_models[2, 2] * 365, "pvalue" = ifelse(p.value < 0.001, "< 0.001", p.value) ) %>% toJSON() %>%