diff --git a/examples/apol1/apol1-classifier/assets/aggregate_apol1_status.py b/examples/apol1/apol1-classifier/assets/aggregate_apol1_status.py new file mode 100644 index 0000000..5b52840 --- /dev/null +++ b/examples/apol1/apol1-classifier/assets/aggregate_apol1_status.py @@ -0,0 +1,42 @@ +import argparse +import csv + + +def parse_args(): + parser = argparse.ArgumentParser(description="Aggregate APOL1 participant statuses") + parser.add_argument("--input", required=True, help="Input TSV from combined classifier output") + parser.add_argument("--output", required=True, help="Output TSV path") + return parser.parse_args() + + +def main(): + args = parse_args() + + participant_rows = {} + + with open(args.input, "r", encoding="utf-8") as fh: + reader = csv.DictReader(fh, delimiter="\t") + for row in reader: + participant_id = (row.get("participant_id") or "").strip() + original_file = (row.get("filename") or "").strip() + apol1_status = (row.get("apol1_status") or "").strip() + + if not participant_id: + continue + if participant_id not in participant_rows: + participant_rows[participant_id] = { + "participant_id": participant_id, + "original_file": original_file, + "apol1_status": apol1_status, + } + + with open(args.output, "w", encoding="utf-8", newline="") as out_fh: + fieldnames = ["participant_id", "original_file", "apol1_status"] + writer = csv.DictWriter(out_fh, fieldnames=fieldnames, delimiter="\t") + writer.writeheader() + for row in participant_rows.values(): + writer.writerow(row) + + +if __name__ == "__main__": + main() diff --git a/examples/apol1/apol1-classifier/assets/aggregate_classification_stats.py b/examples/apol1/apol1-classifier/assets/aggregate_classification_stats.py new file mode 100644 index 0000000..b3676f7 --- /dev/null +++ b/examples/apol1/apol1-classifier/assets/aggregate_classification_stats.py @@ -0,0 +1,85 @@ +import argparse +import csv +from collections import defaultdict + +STATUSES = ["G0", "G1", "G2"] + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Aggregate APOL1 status statistics (G0/G1/G2 counts + hetero/homo)" + ) + parser.add_argument("--input", required=True, help="Input TSV from combined classifier output") + parser.add_argument("--output", required=True, help="Output TSV path") + return parser.parse_args() + + +def parse_status(apol1_status): + status = (apol1_status or "").strip() + if not status: + return [] + parts = [part.strip() for part in status.split("/")] + if len(parts) != 2: + return [] + return parts + + +def main(): + args = parse_args() + + # Track one APOL1 status per participant (all rows for a participant should match). + participant_status = {} + + with open(args.input, "r", encoding="utf-8") as fh: + reader = csv.DictReader(fh, delimiter="\t") + for row in reader: + participant_id = (row.get("participant_id") or "").strip() + apol1_status = (row.get("apol1_status") or "").strip() + if not participant_id or not apol1_status: + continue + if participant_id not in participant_status: + participant_status[participant_id] = apol1_status + + stats = defaultdict(lambda: {"count": 0, "hetero": 0, "homo": 0}) + total_successful_reads = 0 + + for apol1_status in participant_status.values(): + alleles = parse_status(apol1_status) + called = [allele for allele in alleles if allele in STATUSES] + + # "number" is the total number of successful allele reads across participants. + total_successful_reads += len(called) + + for allele in called: + stats[allele]["count"] += 1 + + # Track per-status zygosity participant counts when both alleles are callable. + if len(called) == 2: + if called[0] == called[1]: + stats[called[0]]["homo"] += 1 + else: + stats[called[0]]["hetero"] += 1 + stats[called[1]]["hetero"] += 1 + + with open(args.output, "w", encoding="utf-8", newline="") as out_fh: + fieldnames = ["status", "count", "number", "hetero", "homo", "frequency"] + writer = csv.DictWriter(out_fh, fieldnames=fieldnames, delimiter="\t") + writer.writeheader() + + for status in STATUSES: + count = stats[status]["count"] + frequency = (count / total_successful_reads) if total_successful_reads else 0 + writer.writerow( + { + "status": status, + "count": count, + "number": total_successful_reads, + "hetero": stats[status]["hetero"], + "homo": stats[status]["homo"], + "frequency": f"{frequency:.6f}", + } + ) + + +if __name__ == "__main__": + main() diff --git a/examples/apol1/apol1-classifier/assets/aggregate_population_stats.py b/examples/apol1/apol1-classifier/assets/aggregate_population_stats.py new file mode 100644 index 0000000..cb844be --- /dev/null +++ b/examples/apol1/apol1-classifier/assets/aggregate_population_stats.py @@ -0,0 +1,135 @@ +import argparse +import csv +import re +from collections import defaultdict + +def parse_args(): + parser = argparse.ArgumentParser(description="Aggregate population allele statistics") + parser.add_argument("--input", required=True, help="Input TSV from combined classifier output") + parser.add_argument("--output", required=True, help="Output TSV path") + return parser.parse_args() + +def locus_key_for_row(row): + chrom = (row.get("chromosome") or "").strip() + pos = (row.get("position") or "").strip() + ref = (row.get("ref") or "").strip() + alt = (row.get("alt") or "").strip() + if not (chrom and pos and ref and alt): + return None + return f"{chrom}-{pos}-{ref}-{alt}" + +def parse_alleles(genotype, alleles): + if not genotype: + return [] + genotype = genotype.strip() + if not genotype: + return [] + + # Split on common genotype separators + if "/" in genotype or "|" in genotype: + parts = [p for p in re.split(r"[\/|]", genotype) if p] + return parts + + # If alleles are same length and genotype is a concat of two alleles, split evenly + if alleles: + lengths = {len(a) for a in alleles if a} + if len(lengths) == 1: + allele_len = lengths.pop() + if allele_len > 0 and len(genotype) == 2 * allele_len: + return [genotype[:allele_len], genotype[allele_len:]] + + # Fallback for single-base alleles like "AA" or "AG" + if len(genotype) == 2: + return [genotype[0], genotype[1]] + + return [genotype] + +def main(): + args = parse_args() + + # stats keyed by (locus_base, allele) + stats = defaultdict(lambda: { + "allele_count": 0, + "num_homo": 0, + "num_hetero": 0, + "rsid": None, + }) + locus_called_samples = defaultdict(int) + + with open(args.input, "r", encoding="utf-8") as fh: + reader = csv.DictReader(fh, delimiter="\t") + for row in reader: + locus_key = locus_key_for_row(row) + if locus_key is None: + continue + + chrom = (row.get("chromosome") or "").strip() + pos = (row.get("position") or "").strip() + ref = (row.get("ref") or "").strip() + alt = (row.get("alt") or "").strip() + locus_base = f"{chrom}-{pos}" + + alleles = [] + if ref: + alleles.append(ref) + if alt: + alleles.extend([a.strip() for a in alt.split(",") if a.strip()]) + + genotype = (row.get("genotype") or "").strip() + gt_alleles = parse_alleles(genotype, alleles) + if not gt_alleles: + continue + + # Keep only alleles that are callable for this locus. + allowed = {a.upper() for a in alleles} + callable_gt_alleles = [a.upper() for a in gt_alleles if a.upper() in allowed] + + # Require a full diploid call for denominator consistency (allele_number = 2 * N called samples). + if len(callable_gt_alleles) != 2: + continue + + locus_called_samples[locus_base] += 1 + + rsid = row.get("rsid") + is_homo = callable_gt_alleles[0] == callable_gt_alleles[1] + + for allele in callable_gt_alleles: + key = (locus_base, allele) + entry = stats[key] + entry["allele_count"] += 1 + if is_homo and allele == callable_gt_alleles[0]: + entry["num_homo"] += 1 + elif not is_homo: + entry["num_hetero"] += 1 + if rsid and not entry["rsid"]: + entry["rsid"] = rsid + + with open(args.output, "w", encoding="utf-8", newline="") as out_fh: + fieldnames = [ + "locus_key", + "allele_count", + "allele_number", + "num_homo", + "num_hetero", + "allele_freq", + "rsid", + ] + writer = csv.DictWriter(out_fh, fieldnames=fieldnames, delimiter="\t") + writer.writeheader() + for (locus_base, allele) in sorted(stats.keys()): + entry = stats[(locus_base, allele)] + called = locus_called_samples.get(locus_base, 0) + allele_number = called * 2 + allele_freq = (entry["allele_count"] / allele_number) if allele_number else 0 + writer.writerow({ + "locus_key": f"{locus_base}-{allele}", + "allele_count": entry["allele_count"], + "allele_number": allele_number, + "num_homo": entry["num_homo"], + "num_hetero": entry["num_hetero"], + "allele_freq": f"{allele_freq:.6f}", + "rsid": entry["rsid"], + }) + +if __name__ == "__main__": + main() diff --git a/examples/apol1/apol1-classifier/assets/classify_apol1.py b/examples/apol1/apol1-classifier/assets/classify_apol1.py index c6a43aa..18c5df5 100644 --- a/examples/apol1/apol1-classifier/assets/classify_apol1.py +++ b/examples/apol1/apol1-classifier/assets/classify_apol1.py @@ -18,6 +18,15 @@ class APOL1Genotypes(GenotypeEnum): from bioscript import write_tsv from bioscript.types import MatchType +def _format_allele_label(allele): + if isinstance(allele, Alleles): + return ",".join(sorted(a.value for a in allele)) + if isinstance(allele, str): + return allele + if hasattr(allele, "__iter__"): + return ",".join(sorted(str(a) for a in allele)) + return str(allele) + class APOL1Classifier(GenotypeClassifier): def classify(self, matches) -> list[dict[str, object]]: g2_match = matches.get(rs71785313) @@ -71,12 +80,23 @@ def classify(self, matches) -> list[dict[str, object]]: if match: genotype = match.genotype_sorted - match_type = ( - match.match_type.value if not match.has_missing else MatchType.NO_CALL.value - ) + raw_match_type = match.match_type.value + if match.has_missing or raw_match_type == MatchType.NO_CALL.value: + match_type = MatchType.NO_CALL.value + ref_count = 0 + alt_count = 0 + else: + match_type = raw_match_type + ref_count = match.ref_count + alt_count = match.alt_count else: genotype = None match_type = MatchType.NO_CALL.value + ref_count = 0 + alt_count = 0 + + ref_label = _format_allele_label(variant_call.ref) + alt_label = _format_allele_label(variant_call.alt) report_rows.append( { @@ -85,8 +105,12 @@ def classify(self, matches) -> list[dict[str, object]]: "rsid": rsid, "chromosome": chromosome, "position": position, + "ref": ref_label, + "alt": alt_label, "genotype": genotype, "match_type": match_type, + "ref_count": ref_count, + "alt_count": alt_count, "apol1_status": apol1_status, } ) @@ -134,6 +158,10 @@ def classify_fixture(genotypes): for row in rows.values(): assert row["participant_id"] == "TEST_ID" assert row["filename"] == "test.txt" + assert "ref" in row + assert "alt" in row + assert "ref_count" in row + assert "alt_count" in row return rows def test_g0_homozygous(): @@ -141,14 +169,56 @@ def test_g0_homozygous(): assert rows["rs71785313"]["apol1_status"] == "G0/G0" assert rows["rs71785313"]["genotype"] == "II" assert rows["rs71785313"]["match_type"] == MatchType.NO_CALL.value + assert rows["rs71785313"]["ref_count"] == 0 + assert rows["rs71785313"]["alt_count"] == 0 assert rows["rs73885319"]["apol1_status"] == "G0/G0" assert rows["rs73885319"]["genotype"] == "AA" assert rows["rs73885319"]["match_type"] == MatchType.REFERENCE_CALL.value + assert rows["rs73885319"]["ref_count"] == 2 + assert rows["rs73885319"]["alt_count"] == 0 assert rows["rs60910145"]["apol1_status"] == "G0/G0" assert rows["rs60910145"]["genotype"] == "TT" assert rows["rs60910145"]["match_type"] == MatchType.REFERENCE_CALL.value + assert rows["rs60910145"]["ref_count"] == 2 + assert rows["rs60910145"]["alt_count"] == 0 + + cleanup_output() + +def test_all_apol1_status_combinations(): + cases = [ + ("G0/G0", ["AA", "TT", "II"], {"rs73885319": "AA", "rs60910145": "TT", "rs71785313": "II"}), + ("G1/G0", ["AG", "TC", "II"], {"rs73885319": "AG", "rs60910145": "CT", "rs71785313": "II"}), + ("G1/G1", ["GG", "CC", "II"], {"rs73885319": "GG", "rs60910145": "CC", "rs71785313": "II"}), + ("G2/G0", ["AA", "TT", "ID"], {"rs73885319": "AA", "rs60910145": "TT", "rs71785313": "DI"}), + ("G2/G1", ["AG", "TC", "ID"], {"rs73885319": "AG", "rs60910145": "CT", "rs71785313": "DI"}), + ("G2/G2", ["AA", "TT", "DD"], {"rs73885319": "AA", "rs60910145": "TT", "rs71785313": "DD"}), + ] + + for expected_status, genotypes, expected_genotypes in cases: + rows = classify_fixture(genotypes) + + assert set(rows.keys()) == {"rs73885319", "rs60910145", "rs71785313"} + assert all(row["apol1_status"] == expected_status for row in rows.values()) + + for rsid, expected_genotype in expected_genotypes.items(): + assert rows[rsid]["genotype"] == expected_genotype + + # rs71785313 is present in every test case and currently emitted as No call by the matcher. + assert rows["rs71785313"]["match_type"] == MatchType.NO_CALL.value + assert rows["rs71785313"]["ref_count"] == 0 + assert rows["rs71785313"]["alt_count"] == 0 + + # rs73885319 and rs60910145 must still be callable and typed correctly. + assert rows["rs73885319"]["match_type"] in { + MatchType.REFERENCE_CALL.value, + MatchType.VARIANT_CALL.value, + } + assert rows["rs60910145"]["match_type"] in { + MatchType.REFERENCE_CALL.value, + MatchType.VARIANT_CALL.value, + } cleanup_output() @@ -158,11 +228,17 @@ def test_g1_homozygous(): assert rows["rs73885319"]["genotype"] == "GG" assert rows["rs73885319"]["match_type"] == MatchType.VARIANT_CALL.value + assert rows["rs73885319"]["ref_count"] == 0 + assert rows["rs73885319"]["alt_count"] == 2 assert rows["rs60910145"]["genotype"] == "CC" assert rows["rs60910145"]["match_type"] == MatchType.VARIANT_CALL.value + assert rows["rs60910145"]["ref_count"] == 0 + assert rows["rs60910145"]["alt_count"] == 2 assert rows["rs71785313"]["genotype"] == "II" assert rows["rs71785313"]["match_type"] == MatchType.NO_CALL.value + assert rows["rs71785313"]["ref_count"] == 0 + assert rows["rs71785313"]["alt_count"] == 0 cleanup_output() diff --git a/examples/apol1/apol1-classifier/flow.yaml b/examples/apol1/apol1-classifier/flow.yaml index f71dedc..0a6d451 100644 --- a/examples/apol1/apol1-classifier/flow.yaml +++ b/examples/apol1/apol1-classifier/flow.yaml @@ -4,22 +4,25 @@ metadata: name: apol1-classifier version: 0.1.1 spec: + steps: + - id: apol1 + uses: + source: + kind: local + path: ./ + with: + participants: inputs.samplesheet + publish: + classification_result: File(result_APOL1.tsv) + population_stats: File(result_APOL1_stats.tsv) + classification_stats: File(result_APOL1_classification_stats.tsv) + apol1_status: File(result_APOL1_status.tsv) + store: + counts_sql: + kind: sql + source: classification_result + table: apol1_{run_id} + key_column: participant_id inputs: samplesheet: type: List[GenotypeRecord] - steps: - - id: apol1 - uses: - source: - kind: local - path: ./ - with: - participants: inputs.samplesheet - publish: - classification_result: File(result_APOL1.tsv) - store: - counts_sql: - kind: sql - source: classification_result - table: apol1_{run_id} - key_column: participant_id diff --git a/examples/apol1/apol1-classifier/module.yaml b/examples/apol1/apol1-classifier/module.yaml index 1f602db..61381a7 100644 --- a/examples/apol1/apol1-classifier/module.yaml +++ b/examples/apol1/apol1-classifier/module.yaml @@ -3,30 +3,52 @@ kind: Module metadata: name: apol1-classifier version: 0.1.1 - description: Classification of APOL1 genotypes (G0, G1, G2) for kidney disease risk assessment. authors: - - madhava@openmined.org + - madhava@openmined.org + description: Classification of APOL1 genotypes (G0, G1, G2) for kidney disease risk assessment. spec: runner: kind: nextflow entrypoint: workflow.nf template: dynamic-nextflow - assets: - - path: classify_apol1.py + image: ghcr.io/openmined/bioscript:0.1.6 inputs: - - name: participants - type: List[GenotypeRecord] - description: CSV/TSV with participant_id and genotype_file columns - format: - kind: csv - mapping: - participant_id: participant_id - genotype_file: genotype_file + - name: participants + type: List[GenotypeRecord] + description: CSV/TSV with participant_id and genotype_file columns + format: + kind: csv + mapping: + participant_id: participant_id + genotype_file: genotype_file outputs: - - name: classification_result - type: File - description: APOL1 genotype classification - format: - kind: tsv - path: result_APOL1.tsv + - name: classification_result + type: File + description: APOL1 genotype classification + format: + kind: tsv + path: result_APOL1.tsv + - name: population_stats + type: File + description: APOL1 population allele statistics (aggregated) + format: + kind: tsv + path: result_APOL1_stats.tsv + - name: classification_stats + type: File + description: APOL1 status counts by allele class (G0/G1/G2) with hetero/homo split + format: + kind: tsv + path: result_APOL1_classification_stats.tsv + - name: apol1_status + type: File + description: Per-participant APOL1 status summary + format: + kind: tsv + path: result_APOL1_status.tsv parameters: [] + assets: + - path: classify_apol1.py + - path: aggregate_population_stats.py + - path: aggregate_classification_stats.py + - path: aggregate_apol1_status.py diff --git a/examples/apol1/apol1-classifier/workflow.nf b/examples/apol1/apol1-classifier/workflow.nf index cea4b82..6a07d26 100644 --- a/examples/apol1/apol1-classifier/workflow.nf +++ b/examples/apol1/apol1-classifier/workflow.nf @@ -33,8 +33,29 @@ workflow USER { per_participant_results.collect() ) + // Aggregate population statistics + def population_stats_ch = aggregate_population_stats( + Channel.value(assetsDirPath), + aggregated + ) + + // Aggregate APOL1 status statistics (G0/G1/G2 with hetero/homo counts) + def classification_stats_ch = aggregate_classification_stats( + Channel.value(assetsDirPath), + aggregated + ) + + // Emit per-participant APOL1 status summary. + def apol1_status_ch = aggregate_apol1_status( + Channel.value(assetsDirPath), + aggregated + ) + emit: classification_result = aggregated + population_stats = population_stats_ch + classification_stats = classification_stats_ch + apol1_status = apol1_status_ch } process apol1_classifier { @@ -74,3 +95,60 @@ process aggregate_results { bioscript combine --list results.list --output result_APOL1.tsv """ } + +process aggregate_population_stats { + container 'ghcr.io/openmined/bioscript:0.1.6' + publishDir params.results_dir, mode: 'copy', overwrite: true + + input: + path assets_dir + path aggregated_results + + output: + path "result_APOL1_stats.tsv" + + script: + """ + python3 "${assets_dir}/aggregate_population_stats.py" \ + --input "${aggregated_results}" \ + --output result_APOL1_stats.tsv + """ +} + +process aggregate_classification_stats { + container 'ghcr.io/openmined/bioscript:0.1.6' + publishDir params.results_dir, mode: 'copy', overwrite: true + + input: + path assets_dir + path aggregated_results + + output: + path "result_APOL1_classification_stats.tsv" + + script: + """ + python3 "${assets_dir}/aggregate_classification_stats.py" \ + --input "${aggregated_results}" \ + --output result_APOL1_classification_stats.tsv + """ +} + +process aggregate_apol1_status { + container 'ghcr.io/openmined/bioscript:0.1.6' + publishDir params.results_dir, mode: 'copy', overwrite: true + + input: + path assets_dir + path aggregated_results + + output: + path "result_APOL1_status.tsv" + + script: + """ + python3 "${assets_dir}/aggregate_apol1_status.py" \ + --input "${aggregated_results}" \ + --output result_APOL1_status.tsv + """ +} diff --git a/examples/apol1/apol1_dev.ipynb b/examples/apol1/apol1_dev.ipynb index 2d1a93e..020840f 100644 --- a/examples/apol1/apol1_dev.ipynb +++ b/examples/apol1/apol1_dev.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -35,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -49,13 +49,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from bioscript import write_tsv\n", "from bioscript.types import MatchType\n", "\n", + "def _format_allele_label(allele):\n", + " if isinstance(allele, Alleles):\n", + " return \",\".join(sorted(a.value for a in allele))\n", + " if isinstance(allele, str):\n", + " return allele\n", + " if hasattr(allele, \"__iter__\"):\n", + " return \",\".join(sorted(str(a) for a in allele))\n", + " return str(allele)\n", + "\n", "class APOL1Classifier(GenotypeClassifier):\n", " def classify(self, matches) -> list[dict[str, object]]:\n", " g2_match = matches.get(rs71785313)\n", @@ -109,12 +118,23 @@ "\n", " if match:\n", " genotype = match.genotype_sorted\n", - " match_type = (\n", - " match.match_type.value if not match.has_missing else MatchType.NO_CALL.value\n", - " )\n", + " raw_match_type = match.match_type.value\n", + " if match.has_missing or raw_match_type == MatchType.NO_CALL.value:\n", + " match_type = MatchType.NO_CALL.value\n", + " ref_count = 0\n", + " alt_count = 0\n", + " else:\n", + " match_type = raw_match_type\n", + " ref_count = match.ref_count\n", + " alt_count = match.alt_count\n", " else:\n", " genotype = None\n", " match_type = MatchType.NO_CALL.value\n", + " ref_count = 0\n", + " alt_count = 0\n", + "\n", + " ref_label = _format_allele_label(variant_call.ref)\n", + " alt_label = _format_allele_label(variant_call.alt)\n", "\n", " report_rows.append(\n", " {\n", @@ -123,19 +143,23 @@ " \"rsid\": rsid,\n", " \"chromosome\": chromosome,\n", " \"position\": position,\n", + " \"ref\": ref_label,\n", + " \"alt\": alt_label,\n", " \"genotype\": genotype,\n", " \"match_type\": match_type,\n", + " \"ref_count\": ref_count,\n", + " \"alt_count\": alt_count,\n", " \"apol1_status\": apol1_status,\n", " }\n", " )\n", "\n", " write_tsv(f\"{self.output_basename}.tsv\", report_rows)\n", - " return report_rows\n" + " return report_rows" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -157,7 +181,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -176,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -202,6 +226,10 @@ " for row in rows.values():\n", " assert row[\"participant_id\"] == \"TEST_ID\"\n", " assert row[\"filename\"] == \"test.txt\"\n", + " assert \"ref\" in row\n", + " assert \"alt\" in row\n", + " assert \"ref_count\" in row\n", + " assert \"alt_count\" in row\n", " return rows\n", "\n", "def test_g0_homozygous():\n", @@ -209,21 +237,63 @@ " assert rows[\"rs71785313\"][\"apol1_status\"] == \"G0/G0\"\n", " assert rows[\"rs71785313\"][\"genotype\"] == \"II\"\n", " assert rows[\"rs71785313\"][\"match_type\"] == MatchType.NO_CALL.value\n", + " assert rows[\"rs71785313\"][\"ref_count\"] == 0\n", + " assert rows[\"rs71785313\"][\"alt_count\"] == 0\n", "\n", " assert rows[\"rs73885319\"][\"apol1_status\"] == \"G0/G0\"\n", " assert rows[\"rs73885319\"][\"genotype\"] == \"AA\"\n", " assert rows[\"rs73885319\"][\"match_type\"] == MatchType.REFERENCE_CALL.value\n", + " assert rows[\"rs73885319\"][\"ref_count\"] == 2\n", + " assert rows[\"rs73885319\"][\"alt_count\"] == 0\n", "\n", " assert rows[\"rs60910145\"][\"apol1_status\"] == \"G0/G0\"\n", " assert rows[\"rs60910145\"][\"genotype\"] == \"TT\"\n", " assert rows[\"rs60910145\"][\"match_type\"] == MatchType.REFERENCE_CALL.value\n", + " assert rows[\"rs60910145\"][\"ref_count\"] == 2\n", + " assert rows[\"rs60910145\"][\"alt_count\"] == 0\n", + "\n", + " cleanup_output()\n", + "\n", + "def test_all_apol1_status_combinations():\n", + " cases = [\n", + " (\"G0/G0\", [\"AA\", \"TT\", \"II\"], {\"rs73885319\": \"AA\", \"rs60910145\": \"TT\", \"rs71785313\": \"II\"}),\n", + " (\"G1/G0\", [\"AG\", \"TC\", \"II\"], {\"rs73885319\": \"AG\", \"rs60910145\": \"CT\", \"rs71785313\": \"II\"}),\n", + " (\"G1/G1\", [\"GG\", \"CC\", \"II\"], {\"rs73885319\": \"GG\", \"rs60910145\": \"CC\", \"rs71785313\": \"II\"}),\n", + " (\"G2/G0\", [\"AA\", \"TT\", \"ID\"], {\"rs73885319\": \"AA\", \"rs60910145\": \"TT\", \"rs71785313\": \"DI\"}),\n", + " (\"G2/G1\", [\"AG\", \"TC\", \"ID\"], {\"rs73885319\": \"AG\", \"rs60910145\": \"CT\", \"rs71785313\": \"DI\"}),\n", + " (\"G2/G2\", [\"AA\", \"TT\", \"DD\"], {\"rs73885319\": \"AA\", \"rs60910145\": \"TT\", \"rs71785313\": \"DD\"}),\n", + " ]\n", + "\n", + " for expected_status, genotypes, expected_genotypes in cases:\n", + " rows = classify_fixture(genotypes)\n", + "\n", + " assert set(rows.keys()) == {\"rs73885319\", \"rs60910145\", \"rs71785313\"}\n", + " assert all(row[\"apol1_status\"] == expected_status for row in rows.values())\n", + "\n", + " for rsid, expected_genotype in expected_genotypes.items():\n", + " assert rows[rsid][\"genotype\"] == expected_genotype\n", + "\n", + " # rs71785313 is present in every test case and currently emitted as No call by the matcher.\n", + " assert rows[\"rs71785313\"][\"match_type\"] == MatchType.NO_CALL.value\n", + " assert rows[\"rs71785313\"][\"ref_count\"] == 0\n", + " assert rows[\"rs71785313\"][\"alt_count\"] == 0\n", + "\n", + " # rs73885319 and rs60910145 must still be callable and typed correctly.\n", + " assert rows[\"rs73885319\"][\"match_type\"] in {\n", + " MatchType.REFERENCE_CALL.value,\n", + " MatchType.VARIANT_CALL.value,\n", + " }\n", + " assert rows[\"rs60910145\"][\"match_type\"] in {\n", + " MatchType.REFERENCE_CALL.value,\n", + " MatchType.VARIANT_CALL.value,\n", + " }\n", "\n", - " cleanup_output()\n" + " cleanup_output()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -233,14 +303,20 @@ "\n", " assert rows[\"rs73885319\"][\"genotype\"] == \"GG\"\n", " assert rows[\"rs73885319\"][\"match_type\"] == MatchType.VARIANT_CALL.value\n", + " assert rows[\"rs73885319\"][\"ref_count\"] == 0\n", + " assert rows[\"rs73885319\"][\"alt_count\"] == 2\n", "\n", " assert rows[\"rs60910145\"][\"genotype\"] == \"CC\"\n", " assert rows[\"rs60910145\"][\"match_type\"] == MatchType.VARIANT_CALL.value\n", + " assert rows[\"rs60910145\"][\"ref_count\"] == 0\n", + " assert rows[\"rs60910145\"][\"alt_count\"] == 2\n", "\n", " assert rows[\"rs71785313\"][\"genotype\"] == \"II\"\n", " assert rows[\"rs71785313\"][\"match_type\"] == MatchType.NO_CALL.value\n", + " assert rows[\"rs71785313\"][\"ref_count\"] == 0\n", + " assert rows[\"rs71785313\"][\"alt_count\"] == 0\n", "\n", - " cleanup_output()\n" + " cleanup_output()" ] }, { @@ -254,21 +330,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ All tests passed!\n" + ] + } + ], "source": [ "# Run tests\n", "test_g0_homozygous()\n", "test_g1_homozygous()\n", + "test_all_apol1_status_combinations()\n", "print(\"✓ All tests passed!\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('classify_apol1.py')" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from bioscript import export_from_notebook\n", "export_from_notebook(\"apol1_dev.ipynb\", \"classify_apol1.py\")" @@ -276,18 +372,53 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "============================================================\n", + "Testing: classify_apol1.py\n", + "============================================================\n", + "Running tests with pytest: classify_apol1.py\n", + "\u001b[1m============================= test session starts ==============================\u001b[0m\n", + "platform darwin -- Python 3.13.5, pytest-9.0.2, pluggy-1.6.0 -- /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/.venv/bin/python3\n", + "cachedir: .pytest_cache\n", + "rootdir: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1\n", + "plugins: anyio-4.12.1\n", + "collected 3 items \u001b[0m\n", + "\n", + "classify_apol1.py::test_g0_homozygous \u001b[32mPASSED\u001b[0m\u001b[32m [ 33%]\u001b[0m\n", + "classify_apol1.py::test_all_apol1_status_combinations \u001b[32mPASSED\u001b[0m\u001b[32m [ 66%]\u001b[0m\n", + "classify_apol1.py::test_g1_homozygous \u001b[32mPASSED\u001b[0m\u001b[32m [100%]\u001b[0m\n", + "\n", + "\u001b[32m============================== \u001b[32m\u001b[1m3 passed\u001b[0m\u001b[32m in 0.02s\u001b[0m\u001b[32m ===============================\u001b[0m\n" + ] + } + ], "source": [ "!bioscript test classify_apol1.py" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "BioVaultProject(name='apol1-classifier', author='madhava@openmined.org', workflow='workflow.nf', description='Classification of APOL1 genotypes (G0, G1, G2) for kidney disease risk assessment.', template=, version='0.1.1', assets=['classify_apol1.py', 'aggregate_population_stats.py', 'aggregate_classification_stats.py', 'aggregate_apol1_status.py'], parameters=[], inputs=[Input(name='participants', type='List[GenotypeRecord]', description='CSV/TSV with participant_id and genotype_file columns', format='csv', path=None, mapping={'participant_id': 'participant_id', 'genotype_file': 'genotype_file'}, cli_flag=None)], outputs=[Output(name='classification_result', type='File', description='APOL1 genotype classification', format='tsv', path='result_APOL1.tsv', cli_flag=None), Output(name='population_stats', type='File', description='APOL1 population allele statistics (aggregated)', format='tsv', path='result_APOL1_stats.tsv', cli_flag=None), Output(name='classification_stats', type='File', description='APOL1 status counts by allele class (G0/G1/G2) with hetero/homo split', format='tsv', path='result_APOL1_classification_stats.tsv', cli_flag=None), Output(name='apol1_status', type='File', description='Per-participant APOL1 status summary', format='tsv', path='result_APOL1_status.tsv', cli_flag=None)], processes=[ProcessDefinition(name='apol1_classifier', script='classify_apol1.py', container='ghcr.io/openmined/bioscript:0.1.6', kind='bioscript')], docker_image='ghcr.io/openmined/bioscript:0.1.6', docker_platform='linux/amd64')" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from bioscript import export_bioscript_workflow\n", "\n", @@ -296,7 +427,11 @@ " workflow_name='apol1-classifier',\n", " author='madhava@openmined.org',\n", " target_dir='./',\n", - " assets={},\n", + " assets=[\n", + " 'apol1-classifier/assets/aggregate_population_stats.py',\n", + " 'apol1-classifier/assets/aggregate_classification_stats.py',\n", + " 'apol1-classifier/assets/aggregate_apol1_status.py',\n", + " ],\n", " inputs=[\n", " {\n", " 'name': 'participants',\n", @@ -317,18 +452,156 @@ " 'format': 'tsv',\n", " 'path': 'result_APOL1.tsv',\n", " },\n", + " {\n", + " 'name': 'population_stats',\n", + " 'type': 'File',\n", + " 'description': 'APOL1 population allele statistics (aggregated)',\n", + " 'format': 'tsv',\n", + " 'path': 'result_APOL1_stats.tsv',\n", + " },\n", + " {\n", + " 'name': 'classification_stats',\n", + " 'type': 'File',\n", + " 'description': 'APOL1 status counts by allele class (G0/G1/G2) with hetero/homo split',\n", + " 'format': 'tsv',\n", + " 'path': 'result_APOL1_classification_stats.tsv',\n", + " },\n", + " {\n", + " 'name': 'apol1_status',\n", + " 'type': 'File',\n", + " 'description': 'Per-participant APOL1 status summary',\n", + " 'format': 'tsv',\n", + " 'path': 'result_APOL1_status.tsv',\n", + " },\n", " ],\n", " version=\"0.1.1\",\n", " description=\"Classification of APOL1 genotypes (G0, G1, G2) for kidney disease risk assessment.\",\n", ")\n", - "project\n" + "project" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "4227" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pathlib import Path\n", + "\n", + "workflow_path = Path('apol1-classifier/workflow.nf')\n", + "if not workflow_path.exists():\n", + " raise FileNotFoundError('workflow.nf not found. Run the export cell first.')\n", + "\n", + "text = workflow_path.read_text(encoding='utf-8')\n", + "\n", + "if 'aggregate_population_stats' not in text:\n", + " text = text.replace(\n", + " \" def aggregated = aggregate_results(\\n per_participant_results.collect()\\n )\\n\\n emit:\\n classification_result = aggregated\\n}\",\n", + " \" def aggregated = aggregate_results(\\n per_participant_results.collect()\\n )\\n\\n // Aggregate population statistics\\n def population_stats_ch = aggregate_population_stats(\\n Channel.value(assetsDirPath),\\n aggregated\\n )\\n\\n // Aggregate APOL1 status statistics (G0/G1/G2 with hetero/homo counts)\\n def classification_stats_ch = aggregate_classification_stats(\\n Channel.value(assetsDirPath),\\n aggregated\\n )\\n\\n // Emit per-participant APOL1 status summary.\\n def apol1_status_ch = aggregate_apol1_status(\\n Channel.value(assetsDirPath),\\n aggregated\\n )\\n\\n emit:\\n classification_result = aggregated\\n population_stats = population_stats_ch\\n classification_stats = classification_stats_ch\\n apol1_status = apol1_status_ch\\n}\",\n", + " )\n", + "\n", + "population_stats_block = '''\n", + "process aggregate_population_stats {\n", + " container 'ghcr.io/openmined/bioscript:0.1.6'\n", + " publishDir params.results_dir, mode: 'copy', overwrite: true\n", + "\n", + " input:\n", + " path assets_dir\n", + " path aggregated_results\n", + "\n", + " output:\n", + " path \"result_APOL1_stats.tsv\"\n", + "\n", + " script:\n", + " \"\"\"\n", + " python3 \"${assets_dir}/aggregate_population_stats.py\" \\\\\n", + " --input \"${aggregated_results}\" \\\\\n", + " --output result_APOL1_stats.tsv\n", + " \"\"\"\n", + "}\n", + "'''\n", + "\n", + "classification_stats_block = '''\n", + "process aggregate_classification_stats {\n", + " container 'ghcr.io/openmined/bioscript:0.1.6'\n", + " publishDir params.results_dir, mode: 'copy', overwrite: true\n", + "\n", + " input:\n", + " path assets_dir\n", + " path aggregated_results\n", + "\n", + " output:\n", + " path \"result_APOL1_classification_stats.tsv\"\n", + "\n", + " script:\n", + " \"\"\"\n", + " python3 \"${assets_dir}/aggregate_classification_stats.py\" \\\\\n", + " --input \"${aggregated_results}\" \\\\\n", + " --output result_APOL1_classification_stats.tsv\n", + " \"\"\"\n", + "}\n", + "'''\n", + "\n", + "apol1_status_block = '''\n", + "process aggregate_apol1_status {\n", + " container 'ghcr.io/openmined/bioscript:0.1.6'\n", + " publishDir params.results_dir, mode: 'copy', overwrite: true\n", + "\n", + " input:\n", + " path assets_dir\n", + " path aggregated_results\n", + "\n", + " output:\n", + " path \"result_APOL1_status.tsv\"\n", + "\n", + " script:\n", + " \"\"\"\n", + " python3 \"${assets_dir}/aggregate_apol1_status.py\" \\\\\n", + " --input \"${aggregated_results}\" \\\\\n", + " --output result_APOL1_status.tsv\n", + " \"\"\"\n", + "}\n", + "'''\n", + "\n", + "if 'process aggregate_population_stats' not in text:\n", + " text += population_stats_block\n", + "\n", + "if 'process aggregate_classification_stats' not in text:\n", + " text += classification_stats_block\n", + "\n", + "if 'process aggregate_apol1_status' not in text:\n", + " text += apol1_status_block\n", + "\n", + "workflow_path.write_text(text, encoding='utf-8')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "BioVaultPipeline(name='apol1-classifier', inputs={'samplesheet': 'List[GenotypeRecord]'}, steps=[PipelineStep(step_id='apol1', uses='./', with_args={'participants': 'inputs.samplesheet'}, publish={'classification_result': 'File(result_APOL1.tsv)', 'population_stats': 'File(result_APOL1_stats.tsv)', 'classification_stats': 'File(result_APOL1_classification_stats.tsv)', 'apol1_status': 'File(result_APOL1_status.tsv)'}, store={'counts_sql': SQLStore(source='classification_result', table_name='apol1_{run_id}', destination='SQL()', participant_column='participant_id', key_column='participant_id')})], version='0.1.1')" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from bioscript import export_bioscript_pipeline, PipelineStep, SQLStore\n", "\n", @@ -347,6 +620,9 @@ " },\n", " publish={\n", " 'classification_result': 'File(result_APOL1.tsv)',\n", + " 'population_stats': 'File(result_APOL1_stats.tsv)',\n", + " 'classification_stats': 'File(result_APOL1_classification_stats.tsv)',\n", + " 'apol1_status': 'File(result_APOL1_status.tsv)',\n", " },\n", " store={\n", " 'counts_sql': SQLStore(\n", @@ -360,14 +636,29 @@ " ],\n", " version=\"0.1.1\",\n", ")\n", - "pipeline\n" + "pipeline" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test file created: apol1_test_g1.tsv\n", + "Content:\n", + "# rsid\tchromosome\tposition\tgenotype\n", + "rs73885319\t22\t36265860\tGG\n", + "rs60910145\t22\t36265988\tCC\n", + "rs71785313\t22\t36266000\tII\n", + "\n", + "✓ Test file ready!\n" + ] + } + ], "source": [ "# Write a minimal APOL1 genotype TSV and inspect the classifier output\n", "from pathlib import Path\n", @@ -390,9 +681,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[bioscript] Current working directory: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1\n", + "[bioscript] Provided SNP file argument: apol1_test_g1.tsv\n", + "[bioscript] Provided path absolute? False\n", + "[bioscript] Resolved SNP path: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1/apol1_test_g1.tsv\n", + "[bioscript] Resolved exists? True\n", + "[bioscript] CWD contents: .DS_Store, .ipynb_checkpoints, .pytest_cache, README.md, __pycache__, apol1-classifier, apol1_decodeme.csv, apol1_dev.ipynb, apol1_headerless.txt, apol1_myheritage.csv, apol1_test_g1.tsv, classify_apol1.py, genotype_files, process_samplesheet.sh, result_APOL1_DECODE.tsv, result_APOL1_HEADERLESS.tsv, result_APOL1_MYHERITAGE.tsv, result_APOL1_TEST_APOL1.tsv, results.tsv, samplesheet.csv, test_snps.txt, test_snps_p002.txt, test_snps_p003.txt\n", + "[bioscript] Using resolved SNP path: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1/apol1_test_g1.tsv\n", + "participant_id=TEST_APOL1\n", + "APOL1_count=3\n", + "participant_id\tfilename\trsid\tchromosome\tposition\tref\talt\tgenotype\tmatch_type\tref_count\talt_count\tapol1_status\n", + "TEST_APOL1\tapol1_test_g1.tsv\trs71785313\t22\t36266000\tI\tD\tII\tNo call\t0\t0\tG1/G1\n", + "TEST_APOL1\tapol1_test_g1.tsv\trs73885319\t22\t36265860\tA\tC,G,T,U\tGG\tVariant call\t0\t2\tG1/G1\n", + "TEST_APOL1\tapol1_test_g1.tsv\trs60910145\t22\t36265988\tT\tA,C,G,U\tCC\tVariant call\t0\t2\tG1/G1\n" + ] + } + ], "source": [ "!bioscript classify classify_apol1.py --file apol1_test_g1.tsv --participant_id=\"TEST_APOL1\"\n", "!cat result_APOL1_TEST_APOL1.tsv" @@ -407,9 +718,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CSV file created: apol1_decodeme.csv\n", + "Content:\n", + "Name,Variation,Chromosome,Position\n", + "rs4477212,A/G,1,72017\n", + "rs2185539,C/T,1,556738\n", + "rs6681105,C/T,1,581938\n", + "rs11240767,C/T,1,718814\n", + "rs3094315,C/T,1,742429\n", + "rs3131972,C/T,1,742584\n", + "rs3131969,C/T,1,744045\n", + "rs1048488,C/T,1,750775\n", + "rs73885319,A/G,22,36526907\n", + "rs60910145,T/G,22,36527035\n", + "rs71785313,TTATAA/-,22,36527047\n", + "rs143830837,TTATAA/-,22,36527047\n", + "\n", + "✓ DecodeME sample ready!\n" + ] + } + ], "source": [ "# Example deCODEme-style genotype CSV\n", "from pathlib import Path\n", @@ -443,9 +778,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[bioscript] Current working directory: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1\n", + "[bioscript] Provided SNP file argument: apol1_decodeme.csv\n", + "[bioscript] Provided path absolute? False\n", + "[bioscript] Resolved SNP path: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1/apol1_decodeme.csv\n", + "[bioscript] Resolved exists? True\n", + "[bioscript] CWD contents: .DS_Store, .ipynb_checkpoints, .pytest_cache, README.md, __pycache__, apol1-classifier, apol1_decodeme.csv, apol1_dev.ipynb, apol1_headerless.txt, apol1_myheritage.csv, apol1_test_g1.tsv, classify_apol1.py, genotype_files, process_samplesheet.sh, result_APOL1_DECODE.tsv, result_APOL1_HEADERLESS.tsv, result_APOL1_MYHERITAGE.tsv, result_APOL1_TEST_APOL1.tsv, results.tsv, samplesheet.csv, test_snps.txt, test_snps_p002.txt, test_snps_p003.txt\n", + "[bioscript] Using resolved SNP path: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1/apol1_decodeme.csv\n", + "participant_id=DECODE\n", + "APOL1_count=3\n", + "participant_id\tfilename\trsid\tchromosome\tposition\tref\talt\tgenotype\tmatch_type\tref_count\talt_count\tapol1_status\n", + "DECODE\tapol1_decodeme.csv\trs71785313\t22\t36527047\tI\tD\tDI\tNo call\t0\t0\tG2/G1\n", + "DECODE\tapol1_decodeme.csv\trs73885319\t22\t36526907\tA\tC,G,T,U\tAG\tVariant call\t1\t1\tG2/G1\n", + "DECODE\tapol1_decodeme.csv\trs60910145\t22\t36527035\tT\tA,C,G,U\tGT\tVariant call\t1\t1\tG2/G1\n" + ] + } + ], "source": [ "!bioscript classify classify_apol1.py --file apol1_decodeme.csv --participant_id=\"DECODE\"\n", "!cat result_APOL1_DECODE.tsv\n" @@ -453,9 +808,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CSV file created: apol1_myheritage.csv\n", + "Content:\n", + "# MyHeritage DNA raw data.\n", + "# This file was generated on 2018-02-14 04:06:47\n", + "# For each SNP, we provide the identifier, chromosome number, base pair position and genotype. The genotype is reported on the forward (+) strand with respect to the human reference build 37.\n", + "# THIS INFORMATION IS FOR YOUR PERSONAL USE AND IS INTENDED FOR GENEALOGICAL RESEARCH ONLY. IT IS NOT INTENDED FOR MEDICAL OR HEALTH PURPOSES.\n", + "# PLEASE BE AWARE THAT THE DOWNLOADED DATA WILL NO LONGER BE PROTECTED BY OUR SECURITY MEASURES.\n", + "RSID,CHROMOSOME,POSITION,RESULT\n", + "\"rs4477212\",\"1\",\"82154\",\"AA\"\n", + "\"rs3094315\",\"1\",\"752566\",\"--\"\n", + "\"rs3131972\",\"1\",\"752721\",\"GG\"\n", + "\"rs12562034\",\"1\",\"768448\",\"--\"\n", + "\"rs12124819\",\"1\",\"776546\",\"--\"\n", + "\"rs12913832\",\"15\",\"28365618\",\"--\" # HERC2\n", + "\"rs73885319\",\"22\",\"36661906\",\"AG\" # APOL1 test entries\n", + "\"rs60910145\",\"22\",\"36662034\",\"TG\" # APOL1 test entries\n", + "\"rs71785313\",\"22\",\"36662046\",\"ID\" # APOL1 test entries\n", + "\"rs143830837\",\"22\",\"36662046\",\"ID\" # APOL1 test entries\n", + "\n", + "✓ MyHeritage sample ready!\n" + ] + } + ], "source": [ "# Example MyHeritage CSV with build information in header comments\n", "from pathlib import Path\n", @@ -492,9 +874,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[bioscript] Current working directory: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1\n", + "[bioscript] Provided SNP file argument: apol1_myheritage.csv\n", + "[bioscript] Provided path absolute? False\n", + "[bioscript] Resolved SNP path: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1/apol1_myheritage.csv\n", + "[bioscript] Resolved exists? True\n", + "[bioscript] CWD contents: .DS_Store, .ipynb_checkpoints, .pytest_cache, README.md, __pycache__, apol1-classifier, apol1_decodeme.csv, apol1_dev.ipynb, apol1_headerless.txt, apol1_myheritage.csv, apol1_test_g1.tsv, classify_apol1.py, genotype_files, process_samplesheet.sh, result_APOL1_DECODE.tsv, result_APOL1_HEADERLESS.tsv, result_APOL1_MYHERITAGE.tsv, result_APOL1_TEST_APOL1.tsv, results.tsv, samplesheet.csv, test_snps.txt, test_snps_p002.txt, test_snps_p003.txt\n", + "[bioscript] Using resolved SNP path: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1/apol1_myheritage.csv\n", + "participant_id=MYHERITAGE\n", + "APOL1_count=3\n", + "participant_id\tfilename\trsid\tchromosome\tposition\tref\talt\tgenotype\tmatch_type\tref_count\talt_count\tapol1_status\n", + "MYHERITAGE\tapol1_myheritage.csv\trs71785313\t22\t36662046\tI\tD\tDI\tNo call\t0\t0\tG2/G1\n", + "MYHERITAGE\tapol1_myheritage.csv\trs73885319\t22\t36661906\tA\tC,G,T,U\tAG\tVariant call\t1\t1\tG2/G1\n", + "MYHERITAGE\tapol1_myheritage.csv\trs60910145\t22\t36662034\tT\tA,C,G,U\tGT\tVariant call\t1\t1\tG2/G1\n" + ] + } + ], "source": [ "!bioscript classify classify_apol1.py --file apol1_myheritage.csv --participant_id=\"MYHERITAGE\"\n", "!cat result_APOL1_MYHERITAGE.tsv\n" @@ -502,9 +904,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Headerless file created: apol1_headerless.txt\n", + "Content:\n", + "rs3934834 1 995669 CC\n", + "rs6687776 1 1020428 CC\n", + "rs9651273 1 1021403 AA\n", + "rs4970420 1 1096336 GG\n", + "rs11260549 1 1111657 GG\n", + "rs2887286 1 1145994 TT\n", + "rs7515488 1 1153667 CC\n", + "rs11804831 1 1184667 TT\n", + "rs880051 1 1483590 GG\n", + "rs2296716 1 1487687 CC\n", + "\n", + "rs73885319 22 36265860 AG\n", + "rs60910145 22 36265988 TG\n", + "rs71785313 22 36266000 ID\n", + "rs143830837 22 36266000 ID\n", + "\n", + "✓ Headerless sample ready!\n" + ] + } + ], "source": [ "# Example headerless genotype file (space-delimited)\n", "from pathlib import Path\n", @@ -540,9 +968,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[bioscript] Current working directory: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1\n", + "[bioscript] Provided SNP file argument: apol1_headerless.txt\n", + "[bioscript] Provided path absolute? False\n", + "[bioscript] Resolved SNP path: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1/apol1_headerless.txt\n", + "[bioscript] Resolved exists? True\n", + "[bioscript] CWD contents: .DS_Store, .ipynb_checkpoints, .pytest_cache, README.md, __pycache__, apol1-classifier, apol1_decodeme.csv, apol1_dev.ipynb, apol1_headerless.txt, apol1_myheritage.csv, apol1_test_g1.tsv, classify_apol1.py, genotype_files, process_samplesheet.sh, result_APOL1_DECODE.tsv, result_APOL1_HEADERLESS.tsv, result_APOL1_MYHERITAGE.tsv, result_APOL1_TEST_APOL1.tsv, results.tsv, samplesheet.csv, test_snps.txt, test_snps_p002.txt, test_snps_p003.txt\n", + "[bioscript] Using resolved SNP path: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1/apol1_headerless.txt\n", + "participant_id=HEADERLESS\n", + "APOL1_count=3\n", + "participant_id\tfilename\trsid\tchromosome\tposition\tref\talt\tgenotype\tmatch_type\tref_count\talt_count\tapol1_status\n", + "HEADERLESS\tapol1_headerless.txt\trs71785313\t22\t36266000\tI\tD\tDI\tNo call\t0\t0\tG2/G1\n", + "HEADERLESS\tapol1_headerless.txt\trs73885319\t22\t36265860\tA\tC,G,T,U\tAG\tVariant call\t1\t1\tG2/G1\n", + "HEADERLESS\tapol1_headerless.txt\trs60910145\t22\t36265988\tT\tA,C,G,U\tGT\tVariant call\t1\t1\tG2/G1\n" + ] + } + ], "source": [ "!bioscript classify classify_apol1.py --file apol1_headerless.txt --participant_id=\"HEADERLESS\"\n", "!cat result_APOL1_HEADERLESS.tsv\n" @@ -565,7 +1013,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.7" + "version": "3.13.5" } }, "nbformat": 4, diff --git a/examples/apol1/classify_apol1.py b/examples/apol1/classify_apol1.py index c6a43aa..18c5df5 100644 --- a/examples/apol1/classify_apol1.py +++ b/examples/apol1/classify_apol1.py @@ -18,6 +18,15 @@ class APOL1Genotypes(GenotypeEnum): from bioscript import write_tsv from bioscript.types import MatchType +def _format_allele_label(allele): + if isinstance(allele, Alleles): + return ",".join(sorted(a.value for a in allele)) + if isinstance(allele, str): + return allele + if hasattr(allele, "__iter__"): + return ",".join(sorted(str(a) for a in allele)) + return str(allele) + class APOL1Classifier(GenotypeClassifier): def classify(self, matches) -> list[dict[str, object]]: g2_match = matches.get(rs71785313) @@ -71,12 +80,23 @@ def classify(self, matches) -> list[dict[str, object]]: if match: genotype = match.genotype_sorted - match_type = ( - match.match_type.value if not match.has_missing else MatchType.NO_CALL.value - ) + raw_match_type = match.match_type.value + if match.has_missing or raw_match_type == MatchType.NO_CALL.value: + match_type = MatchType.NO_CALL.value + ref_count = 0 + alt_count = 0 + else: + match_type = raw_match_type + ref_count = match.ref_count + alt_count = match.alt_count else: genotype = None match_type = MatchType.NO_CALL.value + ref_count = 0 + alt_count = 0 + + ref_label = _format_allele_label(variant_call.ref) + alt_label = _format_allele_label(variant_call.alt) report_rows.append( { @@ -85,8 +105,12 @@ def classify(self, matches) -> list[dict[str, object]]: "rsid": rsid, "chromosome": chromosome, "position": position, + "ref": ref_label, + "alt": alt_label, "genotype": genotype, "match_type": match_type, + "ref_count": ref_count, + "alt_count": alt_count, "apol1_status": apol1_status, } ) @@ -134,6 +158,10 @@ def classify_fixture(genotypes): for row in rows.values(): assert row["participant_id"] == "TEST_ID" assert row["filename"] == "test.txt" + assert "ref" in row + assert "alt" in row + assert "ref_count" in row + assert "alt_count" in row return rows def test_g0_homozygous(): @@ -141,14 +169,56 @@ def test_g0_homozygous(): assert rows["rs71785313"]["apol1_status"] == "G0/G0" assert rows["rs71785313"]["genotype"] == "II" assert rows["rs71785313"]["match_type"] == MatchType.NO_CALL.value + assert rows["rs71785313"]["ref_count"] == 0 + assert rows["rs71785313"]["alt_count"] == 0 assert rows["rs73885319"]["apol1_status"] == "G0/G0" assert rows["rs73885319"]["genotype"] == "AA" assert rows["rs73885319"]["match_type"] == MatchType.REFERENCE_CALL.value + assert rows["rs73885319"]["ref_count"] == 2 + assert rows["rs73885319"]["alt_count"] == 0 assert rows["rs60910145"]["apol1_status"] == "G0/G0" assert rows["rs60910145"]["genotype"] == "TT" assert rows["rs60910145"]["match_type"] == MatchType.REFERENCE_CALL.value + assert rows["rs60910145"]["ref_count"] == 2 + assert rows["rs60910145"]["alt_count"] == 0 + + cleanup_output() + +def test_all_apol1_status_combinations(): + cases = [ + ("G0/G0", ["AA", "TT", "II"], {"rs73885319": "AA", "rs60910145": "TT", "rs71785313": "II"}), + ("G1/G0", ["AG", "TC", "II"], {"rs73885319": "AG", "rs60910145": "CT", "rs71785313": "II"}), + ("G1/G1", ["GG", "CC", "II"], {"rs73885319": "GG", "rs60910145": "CC", "rs71785313": "II"}), + ("G2/G0", ["AA", "TT", "ID"], {"rs73885319": "AA", "rs60910145": "TT", "rs71785313": "DI"}), + ("G2/G1", ["AG", "TC", "ID"], {"rs73885319": "AG", "rs60910145": "CT", "rs71785313": "DI"}), + ("G2/G2", ["AA", "TT", "DD"], {"rs73885319": "AA", "rs60910145": "TT", "rs71785313": "DD"}), + ] + + for expected_status, genotypes, expected_genotypes in cases: + rows = classify_fixture(genotypes) + + assert set(rows.keys()) == {"rs73885319", "rs60910145", "rs71785313"} + assert all(row["apol1_status"] == expected_status for row in rows.values()) + + for rsid, expected_genotype in expected_genotypes.items(): + assert rows[rsid]["genotype"] == expected_genotype + + # rs71785313 is present in every test case and currently emitted as No call by the matcher. + assert rows["rs71785313"]["match_type"] == MatchType.NO_CALL.value + assert rows["rs71785313"]["ref_count"] == 0 + assert rows["rs71785313"]["alt_count"] == 0 + + # rs73885319 and rs60910145 must still be callable and typed correctly. + assert rows["rs73885319"]["match_type"] in { + MatchType.REFERENCE_CALL.value, + MatchType.VARIANT_CALL.value, + } + assert rows["rs60910145"]["match_type"] in { + MatchType.REFERENCE_CALL.value, + MatchType.VARIANT_CALL.value, + } cleanup_output() @@ -158,11 +228,17 @@ def test_g1_homozygous(): assert rows["rs73885319"]["genotype"] == "GG" assert rows["rs73885319"]["match_type"] == MatchType.VARIANT_CALL.value + assert rows["rs73885319"]["ref_count"] == 0 + assert rows["rs73885319"]["alt_count"] == 2 assert rows["rs60910145"]["genotype"] == "CC" assert rows["rs60910145"]["match_type"] == MatchType.VARIANT_CALL.value + assert rows["rs60910145"]["ref_count"] == 0 + assert rows["rs60910145"]["alt_count"] == 2 assert rows["rs71785313"]["genotype"] == "II" assert rows["rs71785313"]["match_type"] == MatchType.NO_CALL.value + assert rows["rs71785313"]["ref_count"] == 0 + assert rows["rs71785313"]["alt_count"] == 0 cleanup_output() diff --git a/examples/apol1/genotype_files/108179_G0G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt b/examples/apol1/genotype_files/108179_G0G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt new file mode 100644 index 0000000..29e77f2 --- /dev/null +++ b/examples/apol1/genotype_files/108179_G0G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt @@ -0,0 +1,25 @@ +# This data file generated by Dynamic DNA (DDNA) Laboratories at: Thu Nov 7 16:03:14 2024 +# +# This file contains raw genetic data, including data that is not used in DDNA reports. +# This data has undergone a general quality review however only a subset of markers have been +# individually reviewed for accuracy. As such, this data is suitable only for research, +# educational, and informational use and not for any medical or diagnostic use. +# +# Below is a text version of your data. Fields are TAB-separated and +# each line corresponds to a single SNP. For each SNP, we provide its identifier, +# it chromosomal location realtive to Build 38 of the human reference genome, and the +# genotype call oriented with respect to the plus strand on the human reference sequence. +# More information about Dynamic DNA Laboratories can be found at: +# https://dynamicdnalabs.com +# +# More information on reference human assembly builds: +# https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000001405.40/ +# +# rsid chromosome position genotype gs baf lrr +rs9701055 1 630053 TT 0.4798 0.729 -0.0501 +rs9651229 1 632287 CC 0.9820 0.715 0.3805 +rs9701872 1 632828 TT 0.5941 0.387 0.3956 +rs11497407 1 633147 GG 0.2666 0.594 0.3049 +rs73885319 22 36265860 AA 0.9000 0.500 0.0000 +rs60910145 22 36265988 TT 0.9000 0.500 0.0000 +rs71785313 22 36266000 II 0.9000 0.500 0.0000 diff --git a/examples/apol1/genotype_files/108180_G0G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt b/examples/apol1/genotype_files/108180_G0G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt new file mode 100644 index 0000000..29e77f2 --- /dev/null +++ b/examples/apol1/genotype_files/108180_G0G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt @@ -0,0 +1,25 @@ +# This data file generated by Dynamic DNA (DDNA) Laboratories at: Thu Nov 7 16:03:14 2024 +# +# This file contains raw genetic data, including data that is not used in DDNA reports. +# This data has undergone a general quality review however only a subset of markers have been +# individually reviewed for accuracy. As such, this data is suitable only for research, +# educational, and informational use and not for any medical or diagnostic use. +# +# Below is a text version of your data. Fields are TAB-separated and +# each line corresponds to a single SNP. For each SNP, we provide its identifier, +# it chromosomal location realtive to Build 38 of the human reference genome, and the +# genotype call oriented with respect to the plus strand on the human reference sequence. +# More information about Dynamic DNA Laboratories can be found at: +# https://dynamicdnalabs.com +# +# More information on reference human assembly builds: +# https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000001405.40/ +# +# rsid chromosome position genotype gs baf lrr +rs9701055 1 630053 TT 0.4798 0.729 -0.0501 +rs9651229 1 632287 CC 0.9820 0.715 0.3805 +rs9701872 1 632828 TT 0.5941 0.387 0.3956 +rs11497407 1 633147 GG 0.2666 0.594 0.3049 +rs73885319 22 36265860 AA 0.9000 0.500 0.0000 +rs60910145 22 36265988 TT 0.9000 0.500 0.0000 +rs71785313 22 36266000 II 0.9000 0.500 0.0000 diff --git a/examples/apol1/genotype_files/108181_G1G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt b/examples/apol1/genotype_files/108181_G1G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt new file mode 100644 index 0000000..8bab250 --- /dev/null +++ b/examples/apol1/genotype_files/108181_G1G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt @@ -0,0 +1,25 @@ +# This data file generated by Dynamic DNA (DDNA) Laboratories at: Thu Nov 7 16:03:14 2024 +# +# This file contains raw genetic data, including data that is not used in DDNA reports. +# This data has undergone a general quality review however only a subset of markers have been +# individually reviewed for accuracy. As such, this data is suitable only for research, +# educational, and informational use and not for any medical or diagnostic use. +# +# Below is a text version of your data. Fields are TAB-separated and +# each line corresponds to a single SNP. For each SNP, we provide its identifier, +# it chromosomal location realtive to Build 38 of the human reference genome, and the +# genotype call oriented with respect to the plus strand on the human reference sequence. +# More information about Dynamic DNA Laboratories can be found at: +# https://dynamicdnalabs.com +# +# More information on reference human assembly builds: +# https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000001405.40/ +# +# rsid chromosome position genotype gs baf lrr +rs9701055 1 630053 TT 0.4798 0.729 -0.0501 +rs9651229 1 632287 CC 0.9820 0.715 0.3805 +rs9701872 1 632828 TT 0.5941 0.387 0.3956 +rs11497407 1 633147 GG 0.2666 0.594 0.3049 +rs73885319 22 36265860 AG 0.9000 0.500 0.0000 +rs60910145 22 36265988 TC 0.9000 0.500 0.0000 +rs71785313 22 36266000 II 0.9000 0.500 0.0000 diff --git a/examples/apol1/genotype_files/108182_G1G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt b/examples/apol1/genotype_files/108182_G1G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt new file mode 100644 index 0000000..8f5f73e --- /dev/null +++ b/examples/apol1/genotype_files/108182_G1G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt @@ -0,0 +1,25 @@ +# This data file generated by Dynamic DNA (DDNA) Laboratories at: Thu Nov 7 16:03:14 2024 +# +# This file contains raw genetic data, including data that is not used in DDNA reports. +# This data has undergone a general quality review however only a subset of markers have been +# individually reviewed for accuracy. As such, this data is suitable only for research, +# educational, and informational use and not for any medical or diagnostic use. +# +# Below is a text version of your data. Fields are TAB-separated and +# each line corresponds to a single SNP. For each SNP, we provide its identifier, +# it chromosomal location realtive to Build 38 of the human reference genome, and the +# genotype call oriented with respect to the plus strand on the human reference sequence. +# More information about Dynamic DNA Laboratories can be found at: +# https://dynamicdnalabs.com +# +# More information on reference human assembly builds: +# https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000001405.40/ +# +# rsid chromosome position genotype gs baf lrr +rs9701055 1 630053 TT 0.4798 0.729 -0.0501 +rs9651229 1 632287 CC 0.9820 0.715 0.3805 +rs9701872 1 632828 TT 0.5941 0.387 0.3956 +rs11497407 1 633147 GG 0.2666 0.594 0.3049 +rs73885319 22 36265860 AG 0.9000 0.500 0.0000 +rs60910145 22 36265988 CT 0.9000 0.500 0.0000 +rs71785313 22 36266000 II 0.9000 0.500 0.0000 diff --git a/examples/apol1/genotype_files/108183_G1G1_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt b/examples/apol1/genotype_files/108183_G1G1_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt new file mode 100644 index 0000000..019b7b1 --- /dev/null +++ b/examples/apol1/genotype_files/108183_G1G1_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt @@ -0,0 +1,25 @@ +# This data file generated by Dynamic DNA (DDNA) Laboratories at: Thu Nov 7 16:03:14 2024 +# +# This file contains raw genetic data, including data that is not used in DDNA reports. +# This data has undergone a general quality review however only a subset of markers have been +# individually reviewed for accuracy. As such, this data is suitable only for research, +# educational, and informational use and not for any medical or diagnostic use. +# +# Below is a text version of your data. Fields are TAB-separated and +# each line corresponds to a single SNP. For each SNP, we provide its identifier, +# it chromosomal location realtive to Build 38 of the human reference genome, and the +# genotype call oriented with respect to the plus strand on the human reference sequence. +# More information about Dynamic DNA Laboratories can be found at: +# https://dynamicdnalabs.com +# +# More information on reference human assembly builds: +# https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000001405.40/ +# +# rsid chromosome position genotype gs baf lrr +rs9701055 1 630053 TT 0.4798 0.729 -0.0501 +rs9651229 1 632287 CC 0.9820 0.715 0.3805 +rs9701872 1 632828 TT 0.5941 0.387 0.3956 +rs11497407 1 633147 GG 0.2666 0.594 0.3049 +rs73885319 22 36265860 GG 0.9000 0.500 0.0000 +rs60910145 22 36265988 CC 0.9000 0.500 0.0000 +rs71785313 22 36266000 II 0.9000 0.500 0.0000 diff --git a/examples/apol1/genotype_files/108184_G1G1_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt b/examples/apol1/genotype_files/108184_G1G1_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt new file mode 100644 index 0000000..019b7b1 --- /dev/null +++ b/examples/apol1/genotype_files/108184_G1G1_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt @@ -0,0 +1,25 @@ +# This data file generated by Dynamic DNA (DDNA) Laboratories at: Thu Nov 7 16:03:14 2024 +# +# This file contains raw genetic data, including data that is not used in DDNA reports. +# This data has undergone a general quality review however only a subset of markers have been +# individually reviewed for accuracy. As such, this data is suitable only for research, +# educational, and informational use and not for any medical or diagnostic use. +# +# Below is a text version of your data. Fields are TAB-separated and +# each line corresponds to a single SNP. For each SNP, we provide its identifier, +# it chromosomal location realtive to Build 38 of the human reference genome, and the +# genotype call oriented with respect to the plus strand on the human reference sequence. +# More information about Dynamic DNA Laboratories can be found at: +# https://dynamicdnalabs.com +# +# More information on reference human assembly builds: +# https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000001405.40/ +# +# rsid chromosome position genotype gs baf lrr +rs9701055 1 630053 TT 0.4798 0.729 -0.0501 +rs9651229 1 632287 CC 0.9820 0.715 0.3805 +rs9701872 1 632828 TT 0.5941 0.387 0.3956 +rs11497407 1 633147 GG 0.2666 0.594 0.3049 +rs73885319 22 36265860 GG 0.9000 0.500 0.0000 +rs60910145 22 36265988 CC 0.9000 0.500 0.0000 +rs71785313 22 36266000 II 0.9000 0.500 0.0000 diff --git a/examples/apol1/genotype_files/108185_G2G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt b/examples/apol1/genotype_files/108185_G2G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt new file mode 100644 index 0000000..0dd9081 --- /dev/null +++ b/examples/apol1/genotype_files/108185_G2G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt @@ -0,0 +1,25 @@ +# This data file generated by Dynamic DNA (DDNA) Laboratories at: Thu Nov 7 16:03:14 2024 +# +# This file contains raw genetic data, including data that is not used in DDNA reports. +# This data has undergone a general quality review however only a subset of markers have been +# individually reviewed for accuracy. As such, this data is suitable only for research, +# educational, and informational use and not for any medical or diagnostic use. +# +# Below is a text version of your data. Fields are TAB-separated and +# each line corresponds to a single SNP. For each SNP, we provide its identifier, +# it chromosomal location realtive to Build 38 of the human reference genome, and the +# genotype call oriented with respect to the plus strand on the human reference sequence. +# More information about Dynamic DNA Laboratories can be found at: +# https://dynamicdnalabs.com +# +# More information on reference human assembly builds: +# https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000001405.40/ +# +# rsid chromosome position genotype gs baf lrr +rs9701055 1 630053 TT 0.4798 0.729 -0.0501 +rs9651229 1 632287 CC 0.9820 0.715 0.3805 +rs9701872 1 632828 TT 0.5941 0.387 0.3956 +rs11497407 1 633147 GG 0.2666 0.594 0.3049 +rs73885319 22 36265860 AA 0.9000 0.500 0.0000 +rs60910145 22 36265988 TT 0.9000 0.500 0.0000 +rs71785313 22 36266000 ID 0.9000 0.500 0.0000 diff --git a/examples/apol1/genotype_files/108186_G2G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt b/examples/apol1/genotype_files/108186_G2G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt new file mode 100644 index 0000000..ee9df99 --- /dev/null +++ b/examples/apol1/genotype_files/108186_G2G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt @@ -0,0 +1,25 @@ +# This data file generated by Dynamic DNA (DDNA) Laboratories at: Thu Nov 7 16:03:14 2024 +# +# This file contains raw genetic data, including data that is not used in DDNA reports. +# This data has undergone a general quality review however only a subset of markers have been +# individually reviewed for accuracy. As such, this data is suitable only for research, +# educational, and informational use and not for any medical or diagnostic use. +# +# Below is a text version of your data. Fields are TAB-separated and +# each line corresponds to a single SNP. For each SNP, we provide its identifier, +# it chromosomal location realtive to Build 38 of the human reference genome, and the +# genotype call oriented with respect to the plus strand on the human reference sequence. +# More information about Dynamic DNA Laboratories can be found at: +# https://dynamicdnalabs.com +# +# More information on reference human assembly builds: +# https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000001405.40/ +# +# rsid chromosome position genotype gs baf lrr +rs9701055 1 630053 TT 0.4798 0.729 -0.0501 +rs9651229 1 632287 CC 0.9820 0.715 0.3805 +rs9701872 1 632828 TT 0.5941 0.387 0.3956 +rs11497407 1 633147 GG 0.2666 0.594 0.3049 +rs73885319 22 36265860 AA 0.9000 0.500 0.0000 +rs60910145 22 36265988 TT 0.9000 0.500 0.0000 +rs71785313 22 36266000 DI 0.9000 0.500 0.0000 diff --git a/examples/apol1/genotype_files/108187_G2G1_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt b/examples/apol1/genotype_files/108187_G2G1_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt new file mode 100644 index 0000000..38630e8 --- /dev/null +++ b/examples/apol1/genotype_files/108187_G2G1_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt @@ -0,0 +1,25 @@ +# This data file generated by Dynamic DNA (DDNA) Laboratories at: Thu Nov 7 16:03:14 2024 +# +# This file contains raw genetic data, including data that is not used in DDNA reports. +# This data has undergone a general quality review however only a subset of markers have been +# individually reviewed for accuracy. As such, this data is suitable only for research, +# educational, and informational use and not for any medical or diagnostic use. +# +# Below is a text version of your data. Fields are TAB-separated and +# each line corresponds to a single SNP. For each SNP, we provide its identifier, +# it chromosomal location realtive to Build 38 of the human reference genome, and the +# genotype call oriented with respect to the plus strand on the human reference sequence. +# More information about Dynamic DNA Laboratories can be found at: +# https://dynamicdnalabs.com +# +# More information on reference human assembly builds: +# https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000001405.40/ +# +# rsid chromosome position genotype gs baf lrr +rs9701055 1 630053 TT 0.4798 0.729 -0.0501 +rs9651229 1 632287 CC 0.9820 0.715 0.3805 +rs9701872 1 632828 TT 0.5941 0.387 0.3956 +rs11497407 1 633147 GG 0.2666 0.594 0.3049 +rs73885319 22 36265860 AG 0.9000 0.500 0.0000 +rs60910145 22 36265988 TC 0.9000 0.500 0.0000 +rs71785313 22 36266000 ID 0.9000 0.500 0.0000 diff --git a/examples/apol1/genotype_files/108188_G2G1_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt b/examples/apol1/genotype_files/108188_G2G1_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt new file mode 100644 index 0000000..a30bede --- /dev/null +++ b/examples/apol1/genotype_files/108188_G2G1_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt @@ -0,0 +1,25 @@ +# This data file generated by Dynamic DNA (DDNA) Laboratories at: Thu Nov 7 16:03:14 2024 +# +# This file contains raw genetic data, including data that is not used in DDNA reports. +# This data has undergone a general quality review however only a subset of markers have been +# individually reviewed for accuracy. As such, this data is suitable only for research, +# educational, and informational use and not for any medical or diagnostic use. +# +# Below is a text version of your data. Fields are TAB-separated and +# each line corresponds to a single SNP. For each SNP, we provide its identifier, +# it chromosomal location realtive to Build 38 of the human reference genome, and the +# genotype call oriented with respect to the plus strand on the human reference sequence. +# More information about Dynamic DNA Laboratories can be found at: +# https://dynamicdnalabs.com +# +# More information on reference human assembly builds: +# https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000001405.40/ +# +# rsid chromosome position genotype gs baf lrr +rs9701055 1 630053 TT 0.4798 0.729 -0.0501 +rs9651229 1 632287 CC 0.9820 0.715 0.3805 +rs9701872 1 632828 TT 0.5941 0.387 0.3956 +rs11497407 1 633147 GG 0.2666 0.594 0.3049 +rs73885319 22 36265860 AG 0.9000 0.500 0.0000 +rs60910145 22 36265988 CT 0.9000 0.500 0.0000 +rs71785313 22 36266000 DI 0.9000 0.500 0.0000 diff --git a/examples/apol1/genotype_files/108189_G2G2_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt b/examples/apol1/genotype_files/108189_G2G2_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt new file mode 100644 index 0000000..25c94f8 --- /dev/null +++ b/examples/apol1/genotype_files/108189_G2G2_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt @@ -0,0 +1,25 @@ +# This data file generated by Dynamic DNA (DDNA) Laboratories at: Thu Nov 7 16:03:14 2024 +# +# This file contains raw genetic data, including data that is not used in DDNA reports. +# This data has undergone a general quality review however only a subset of markers have been +# individually reviewed for accuracy. As such, this data is suitable only for research, +# educational, and informational use and not for any medical or diagnostic use. +# +# Below is a text version of your data. Fields are TAB-separated and +# each line corresponds to a single SNP. For each SNP, we provide its identifier, +# it chromosomal location realtive to Build 38 of the human reference genome, and the +# genotype call oriented with respect to the plus strand on the human reference sequence. +# More information about Dynamic DNA Laboratories can be found at: +# https://dynamicdnalabs.com +# +# More information on reference human assembly builds: +# https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000001405.40/ +# +# rsid chromosome position genotype gs baf lrr +rs9701055 1 630053 TT 0.4798 0.729 -0.0501 +rs9651229 1 632287 CC 0.9820 0.715 0.3805 +rs9701872 1 632828 TT 0.5941 0.387 0.3956 +rs11497407 1 633147 GG 0.2666 0.594 0.3049 +rs73885319 22 36265860 AA 0.9000 0.500 0.0000 +rs60910145 22 36265988 TT 0.9000 0.500 0.0000 +rs71785313 22 36266000 DD 0.9000 0.500 0.0000 diff --git a/examples/apol1/genotype_files/108190_G2G2_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt b/examples/apol1/genotype_files/108190_G2G2_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt new file mode 100644 index 0000000..25c94f8 --- /dev/null +++ b/examples/apol1/genotype_files/108190_G2G2_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt @@ -0,0 +1,25 @@ +# This data file generated by Dynamic DNA (DDNA) Laboratories at: Thu Nov 7 16:03:14 2024 +# +# This file contains raw genetic data, including data that is not used in DDNA reports. +# This data has undergone a general quality review however only a subset of markers have been +# individually reviewed for accuracy. As such, this data is suitable only for research, +# educational, and informational use and not for any medical or diagnostic use. +# +# Below is a text version of your data. Fields are TAB-separated and +# each line corresponds to a single SNP. For each SNP, we provide its identifier, +# it chromosomal location realtive to Build 38 of the human reference genome, and the +# genotype call oriented with respect to the plus strand on the human reference sequence. +# More information about Dynamic DNA Laboratories can be found at: +# https://dynamicdnalabs.com +# +# More information on reference human assembly builds: +# https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000001405.40/ +# +# rsid chromosome position genotype gs baf lrr +rs9701055 1 630053 TT 0.4798 0.729 -0.0501 +rs9651229 1 632287 CC 0.9820 0.715 0.3805 +rs9701872 1 632828 TT 0.5941 0.387 0.3956 +rs11497407 1 633147 GG 0.2666 0.594 0.3049 +rs73885319 22 36265860 AA 0.9000 0.500 0.0000 +rs60910145 22 36265988 TT 0.9000 0.500 0.0000 +rs71785313 22 36266000 DD 0.9000 0.500 0.0000 diff --git a/examples/apol1/genotype_files/README.md b/examples/apol1/genotype_files/README.md new file mode 100644 index 0000000..0bf030a --- /dev/null +++ b/examples/apol1/genotype_files/README.md @@ -0,0 +1,5 @@ +# APOL1 Pathological Test Set (DDNA-Style) + +Files are named like `<6digit>__X_X_GSAv3-DTC_GRCh38-12-13-2025.txt`. +Each file includes DDNA-style header lines and only 3 APOL1 loci plus 4 fixed example SNP rows. +Coverage is 6 APOL1 statuses x 2 files each (12 total). diff --git a/examples/apol1/genotype_files/expected_statuses.tsv b/examples/apol1/genotype_files/expected_statuses.tsv new file mode 100644 index 0000000..2eee354 --- /dev/null +++ b/examples/apol1/genotype_files/expected_statuses.tsv @@ -0,0 +1,13 @@ +participant_id genotype_file expected_status rs73885319 rs60910145 rs71785313 +108179_G0G0 108179_G0G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt G0/G0 AA TT II +108180_G0G0 108180_G0G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt G0/G0 AA TT II +108181_G1G0 108181_G1G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt G1/G0 AG TC II +108182_G1G0 108182_G1G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt G1/G0 AG CT II +108183_G1G1 108183_G1G1_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt G1/G1 GG CC II +108184_G1G1 108184_G1G1_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt G1/G1 GG CC II +108185_G2G0 108185_G2G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt G2/G0 AA TT ID +108186_G2G0 108186_G2G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt G2/G0 AA TT DI +108187_G2G1 108187_G2G1_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt G2/G1 AG TC ID +108188_G2G1 108188_G2G1_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt G2/G1 AG CT DI +108189_G2G2 108189_G2G2_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt G2/G2 AA TT DD +108190_G2G2 108190_G2G2_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt G2/G2 AA TT DD diff --git a/examples/apol1/genotype_files/samplesheet.csv b/examples/apol1/genotype_files/samplesheet.csv new file mode 100644 index 0000000..f84bd8d --- /dev/null +++ b/examples/apol1/genotype_files/samplesheet.csv @@ -0,0 +1,13 @@ +participant_id,genotype_file +108179_G0G0,108179_G0G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt +108180_G0G0,108180_G0G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt +108181_G1G0,108181_G1G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt +108182_G1G0,108182_G1G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt +108183_G1G1,108183_G1G1_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt +108184_G1G1,108184_G1G1_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt +108185_G2G0,108185_G2G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt +108186_G2G0,108186_G2G0_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt +108187_G2G1,108187_G2G1_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt +108188_G2G1,108188_G2G1_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt +108189_G2G2,108189_G2G2_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt +108190_G2G2,108190_G2G2_X_X_GSAv3-DTC_GRCh38-12-13-2025.txt diff --git a/examples/brca/classify_brca.py b/examples/brca/classify_brca.py index 3c1a279..a92a1e6 100644 --- a/examples/brca/classify_brca.py +++ b/examples/brca/classify_brca.py @@ -17,6 +17,8 @@ "alt", "variant_type", "match_type", + "ref_count", + "alt_count", ] def generate_variant_calls(df: pd.DataFrame) -> list[VariantCall]: diff --git a/examples/thalassemia/classify_thalassemia.py b/examples/thalassemia/classify_thalassemia.py index d40a8db..8682669 100644 --- a/examples/thalassemia/classify_thalassemia.py +++ b/examples/thalassemia/classify_thalassemia.py @@ -18,6 +18,8 @@ 'alt', 'variant_type', 'match_type', + 'ref_count', + 'alt_count', ] def generate_variant_calls(df: pd.DataFrame) -> list[VariantCall]: diff --git a/projects/apol1-new/assets/classify_apol1_exported.py b/projects/apol1-new/assets/classify_apol1_exported.py deleted file mode 100644 index cd4ea4a..0000000 --- a/projects/apol1-new/assets/classify_apol1_exported.py +++ /dev/null @@ -1,80 +0,0 @@ -from bioscript.classifier import DiploidResult, GenotypeClassifier, GenotypeEnum -from bioscript.types import Alleles, VariantCall - -# Define APOL1 variant calls -rs73885319 = VariantCall(rsid="rs73885319", ref=Alleles.A, alt=Alleles.NOT_A) -rs60910145 = VariantCall(rsid="rs60910145", ref=Alleles.T, alt=Alleles.NOT_T) -rs71785313 = VariantCall( - rsid=["rs71785313", "rs1317778148", "rs143830837"], ref=Alleles.I, alt=Alleles.D -) - -class APOL1Genotypes(GenotypeEnum): - G2 = "G2" - G1 = "G1" - G0 = "G0" - -MISSING = "G-" - -class APOL1Classifier(GenotypeClassifier): - def classify(self, matches) -> DiploidResult: - g2_match = matches.get(rs71785313) - site1_match = matches.get(rs73885319) - site2_match = matches.get(rs60910145) - - has_data = any(match is not None for match in (g2_match, site1_match, site2_match)) - if not has_data: - return DiploidResult(MISSING, MISSING) - - d_count = g2_match.alt_count if g2_match else 0 - site1_variants = site1_match.alt_count if site1_match else 0 - site2_variants = site2_match.alt_count if site2_match else 0 - - has_g1 = site1_variants > 0 and site2_variants > 0 - g1_total = site1_variants + site2_variants if has_g1 else 0 - - if d_count == 2: - return DiploidResult(APOL1Genotypes.G2, APOL1Genotypes.G2) - elif d_count == 1: - if g1_total >= 2: - return DiploidResult(APOL1Genotypes.G2, APOL1Genotypes.G1) - else: - return DiploidResult(APOL1Genotypes.G2, APOL1Genotypes.G0) - else: - if g1_total == 4: - return DiploidResult(APOL1Genotypes.G1, APOL1Genotypes.G1) - elif g1_total >= 2: - return DiploidResult(APOL1Genotypes.G1, APOL1Genotypes.G0) - else: - return DiploidResult(APOL1Genotypes.G0, APOL1Genotypes.G0) - -__bioscript__ = { - "variant_calls": [rs73885319, rs60910145, rs71785313], - "classifier": APOL1Classifier(), - "name": "APOL1", -} - -from bioscript import VariantFixture -from bioscript.types import MatchList - -fixture = VariantFixture( - [ - {"rsid": "rs73885319", "chromosome": "22", "position": 36265860}, - {"rsid": "rs60910145", "chromosome": "22", "position": 36265988}, - {"rsid": "rs71785313", "chromosome": "22", "position": 36266000}, - ], - assembly="GRCh38", -) - -def test_g0_homozygous(): - variants = fixture(["AA", "TT", "II"]) - matches = MatchList([rs73885319, rs60910145, rs71785313]).match_rows(variants) - classifier = APOL1Classifier() - result = classifier(matches) - assert result == "G0/G0" - -def test_g1_homozygous(): - variants = fixture(["GG", "CC", "II"]) - matches = MatchList([rs73885319, rs60910145, rs71785313]).match_rows(variants) - classifier = APOL1Classifier() - result = classifier(matches) - assert result == "G1/G1" diff --git a/projects/apol1-new/module.yaml b/projects/apol1-new/module.yaml deleted file mode 100644 index 9829d47..0000000 --- a/projects/apol1-new/module.yaml +++ /dev/null @@ -1,31 +0,0 @@ -apiVersion: syftbox.openmined.org/v1alpha1 -kind: Module -metadata: - name: pipeline-apol1-classifier - version: 0.1.0 - authors: - - madhava@openmined.org -spec: - runner: - kind: nextflow - entrypoint: workflow.nf - template: dynamic-nextflow - assets: - - path: classify_apol1_exported.py - inputs: - - name: participants - type: ParticipantSheet - description: Participant metadata sheet with APOL1 genotype file references - format: - kind: csv - - name: genotype_dir - type: Directory - description: Directory containing genotype files referenced by participants - outputs: - - name: apol1_sheet - type: ParticipantSheet - description: Samplesheet annotated with APOL1 genotype calls - format: - kind: tsv - path: apol1_participant_genotypes.tsv - parameters: [] diff --git a/projects/apol1-new/workflow.nf b/projects/apol1-new/workflow.nf deleted file mode 100644 index a2e166b..0000000 --- a/projects/apol1-new/workflow.nf +++ /dev/null @@ -1,135 +0,0 @@ -import groovy.json.JsonOutput - -nextflow.enable.dsl=2 - -workflow USER { - take: - context // BiovaultContext - participants // ParticipantSheet channel providing genotype rows - genotype_dir // Directory path containing genotype files - - main: - def assetsDir = file(context.params.assets_dir) - def classifierScript = file("${assetsDir}/classify_apol1_exported.py") - def classifier_ch = Channel.value(classifierScript) - - def output_name = 'apol1_participant_genotypes.tsv' - def output_name_ch = Channel.value(output_name) - - def genotype_dir_ch = genotype_dir.map { it.toString() } - - def participants_all_v = participants.collect() - def participants_json_v = participants_all_v.map { rows -> - JsonOutput.toJson(rows ?: []) - } - - apol1_sheet_ch = build_apol1_sheet( - participants_json_v, - genotype_dir_ch, - classifier_ch, - output_name_ch - ) - - emit: - apol1_sheet = apol1_sheet_ch -} - -process build_apol1_sheet { - tag { 'apol1-sheet' } - container 'ghcr.io/openmined/bioscript:latest' - publishDir params.results_dir, mode: 'copy', pattern: output_name, overwrite: true - - input: - val participants_json - val genotype_dir - path classifier_script - val output_name - - output: - path output_name - - script: - """ - set -euo pipefail - - cat <<'JSON' > participants.json -${participants_json} -JSON - - export BV_GENOTYPE_DIR="${genotype_dir}" - - python3 - <<'PY' -import csv -import os -import json -import pathlib -import importlib.util - -from bioscript import load_variants_tsv -from bioscript.types import MatchList - -participants_path = pathlib.Path('participants.json') -participants = json.loads(participants_path.read_text(encoding='utf-8')) - -base_dir = pathlib.Path(os.environ['BV_GENOTYPE_DIR']).resolve() -if not base_dir.exists(): - raise FileNotFoundError(f'Genotype directory not found: {base_dir}') -classifier_path = pathlib.Path("${classifier_script}").resolve() -output_path = pathlib.Path("${output_name}") -output_path.parent.mkdir(parents=True, exist_ok=True) - -spec = importlib.util.spec_from_file_location('apol1_classifier', classifier_path) -module = importlib.util.module_from_spec(spec) -spec.loader.exec_module(module) -config = getattr(module, '__bioscript__', None) -if not config: - raise SystemExit('Classifier module missing __bioscript__ configuration') - -variant_calls = config['variant_calls'] -classifier = config['classifier'] - -base_fields = [] -if participants: - first_row = participants[0] - if isinstance(first_row, dict): - base_fields = list(first_row.keys()) - -if 'participant_id' not in base_fields: - base_fields.insert(0, 'participant_id') -if 'apol1_genotype' not in base_fields: - base_fields.append('apol1_genotype') - -with output_path.open('w', encoding='utf-8', newline='') as fh: - writer = csv.DictWriter(fh, fieldnames=base_fields, delimiter='\t') - writer.writeheader() - - for idx, row in enumerate(participants): - if not isinstance(row, dict): - raise ValueError(f'Participant row is not a mapping: {row!r}') - - pid = row.get('participant_id') or row.get('participantID') or row.get('ParticipantID') or row.get('participant') - if not pid: - raise ValueError(f'Participant row missing participant_id: {row!r}') - - geno_ref = row.get('genotype_file') or row.get('genotype_file_path') or row.get('genotype_path') - if not geno_ref: - raise ValueError(f'Participant {pid} missing genotype file reference') - - geno_path = pathlib.Path(geno_ref) - if not geno_path.is_absolute(): - geno_path = (base_dir / geno_path).resolve() - if not geno_path.exists(): - raise FileNotFoundError(f'Genotype file not found for participant {pid}: {geno_path}') - - variants = load_variants_tsv(str(geno_path)) - matches = MatchList(variant_calls=variant_calls).match_rows(variants) - genotype = str(classifier(matches)) - - row_out = {key: row.get(key, '') for key in base_fields} - row_out['participant_id'] = str(pid) - row_out['apol1_genotype'] = genotype - writer.writerow({key: '' if value is None else str(value) for key, value in row_out.items()}) - -PY - """ -} diff --git a/python/src/bioscript/biovault.py b/python/src/bioscript/biovault.py index 58a6b1f..32ee895 100644 --- a/python/src/bioscript/biovault.py +++ b/python/src/bioscript/biovault.py @@ -763,7 +763,9 @@ def _generate_participant_workflow_nf(self, entrypoint: Optional[str] = None) -> if aggregated_path is None: aggregated_path = output_spec.path # Extract classifier name from aggregated path (e.g., result_HERC2.tsv -> HERC2) - if aggregated_path.startswith("result_") and aggregated_path.endswith(".tsv"): + if aggregated_path.startswith("result_") and aggregated_path.endswith( + ".tsv" + ): classifier_name = aggregated_path[7:-4] # Remove "result_" and ".tsv" if not classifier_name: diff --git a/python/src/bioscript/types.py b/python/src/bioscript/types.py index 93b465d..446b524 100644 --- a/python/src/bioscript/types.py +++ b/python/src/bioscript/types.py @@ -419,6 +419,8 @@ def to_report_row( "alt": alt, "variant_type": variant_type, "match_type": match_type, + "ref_count": self.ref_count, + "alt_count": self.alt_count, "filename": filename, }