From d6c76f7660642477af5c7ad4661f959bee169d1e Mon Sep 17 00:00:00 2001 From: Chunhong Mao Date: Tue, 24 Mar 2026 19:54:31 -0400 Subject: [PATCH] Added synthetic data download --- .../src/build_benchmark_data.py | 32 ++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/variantiq-benchmark/src/build_benchmark_data.py b/variantiq-benchmark/src/build_benchmark_data.py index eeaa368..0a8c430 100755 --- a/variantiq-benchmark/src/build_benchmark_data.py +++ b/variantiq-benchmark/src/build_benchmark_data.py @@ -11,6 +11,7 @@ import tarfile vcf_download_url="https://nssac.bii.virginia.edu/~dm8qs/variantiq_data" +synthetic_data_download_url="https://nssac.bii.virginia.edu/~cm4su/variantiq_data" FASTQDUMP=["scif","run","fastq-dump"] build_benchmark_data=App() @@ -56,6 +57,35 @@ def default_action( before re-downloading them """ print(f"Building Benchmark Data: {dataset}") + + # ---------------------------------------------------------------------- + # SPECIAL CASE: variantiq_synthetic + # ---------------------------------------------------------------------- + if dataset == "variantiq-synthetic": + print("Special dataset detected: variantiq-synthetic") + + archive_name = "variantiq-synthetic.bz2" + download_url = f"{synthetic_data_download_url}/{archive_name}" + target_file = os.path.join(output_directory, archive_name) + + # Download only if not already expanded + if not os.path.exists(target_file) and not os.path.exists(f"{target_file}.expanded"): + print(f"Downloading: {download_url}") + download_url_to_file(download_url, target_file, True) + + print("Extracting dataset...") + with tarfile.open(target_file, "r:bz2") as tar: + tar.extractall(path=output_directory) + + # Mark as expanded + with open(f"{target_file}.expanded", "w") as fp: + pass + + print(f"Removing {target_file}") + os.remove(target_file) + + print(f"Dataset ready in: {output_directory}") + return try: os.mkdir( output_directory, 0o755 ) @@ -337,4 +367,4 @@ def process_samples(sample_file,data_def_folder,pipeline_inputs_only,comparison_ populate_sample_folder(row["sample_name"],row['read_files'],row['reference_genome'],row['truth_genome'],row,pipeline_inputs_only,comparison_vcf_only,output_directory,data_def_folder,data_def,dataset) -build_benchmark_data() \ No newline at end of file +build_benchmark_data()