From 552de149b21bdf5ceede0607ea3553c220db80f3 Mon Sep 17 00:00:00 2001 From: ddcatania Date: Tue, 17 May 2016 21:15:01 +0200 Subject: [PATCH 01/11] inserito lo script python che a partire da un file di allineamenti e un file di annotazioni su SNPs restituisce calcola alcune statistiche. --- src/input-bam/statistics.py | 264 ++++++++++++++++++++++++++++++++++++ 1 file changed, 264 insertions(+) create mode 100644 src/input-bam/statistics.py diff --git a/src/input-bam/statistics.py b/src/input-bam/statistics.py new file mode 100644 index 00000000..7c5df217 --- /dev/null +++ b/src/input-bam/statistics.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python3 +# - * -coding: utf - 8 - * - + +import argparse +import logging +import os +import re +import sys + +import progressbar +import pysam + + +def generate_statistics(annotation_file, file_delimiter, region, alignment_file, out_dir): + in_sam = pysam.AlignmentFile(alignment_file, 'rb') + + # return the number of SNPs in the file + snp_total_number = file_len(annotation_file) + logging.info('Number of Total SNPs: {0}'.format(snp_total_number)) + ann_file = open(annotation_file) + + # skip first line (comment if the file starts with the header) + ann_file.readline() + + # read header info + header = ann_file.readline().split(file_delimiter) + + # the fields to read + snp_header_index_start = header.index('chromStart') + snp_header_index_stop = header.index('chromEnd') + snp_header_index_name = header.index('name') + snp_header_observed = header.index('observed') + + statistics_file = open(out_dir + 'statistics.txt', 'w') + + # read annotation file + snp_valid = 0 # SNPs with almost 1 read aligned + snp_not_valid = 0 # SNPs with no read aligned + snp_count = 0 + read_with_first_allele = 0 + read_with_second_allele = 0 + read_with_different_allele = 0 + prgbar = progressbar.ProgressBar(maxval=snp_total_number).start() + for line in ann_file: + snp_count += 1 + prgbar.update(snp_count) + + splitted_line = line.split(file_delimiter) + 
+ snp_start = int(splitted_line[snp_header_index_start]) + snp_stop = int(splitted_line[snp_header_index_stop]) + snp_name = splitted_line[snp_header_index_name] + first_observed, second_observed = check_observed( + splitted_line[snp_header_observed]) + + frequences_observed = {'first': 0, 'second': 0, 'others': 0} + + logging.info('Loaded snp: {0}'.format(snp_name)) + statistics_file.write('\nLoaded snp: {0}'.format(snp_name)) + frequences = {'A': 0, 'T': 0, 'C': 0, 'G': 0} + count_read = 0 + for pileupcolumn in in_sam.pileup(region[0], snp_start, snp_stop): + + logging.debug('Coverage at base {0} = {1}'.format( + pileupcolumn.pos, pileupcolumn.n)) + for pileupread in pileupcolumn.pileups: + if not pileupread.is_del and not pileupread.is_refskip: + count_read += 1 + base_value = pileupread.alignment.query_sequence[ + pileupread.query_position] + logging.debug('Base in read = {0}, First observed = {1}, Second observed = {2}'.format( + base_value, first_observed, second_observed)) + frequences[base_value] += 1 + if base_value == first_observed: + frequences_observed['first'] += 1 + read_with_first_allele += 1 + elif base_value == second_observed: + frequences_observed['second'] += 1 + read_with_second_allele += 1 + else: + frequences_observed['others'] += 1 + read_with_different_allele += 1 + logging.debug(frequences) + + if count_read != 0: + snp_valid += 1 + # number of reads aligned with the SNP + logging.info( + '\nTotal number of read aligned with SNP: {0}'.format(count_read)) + statistics_file.write( + '\nTotal number of read aligned with SNP: {0}'.format(count_read)) + + # number of reads aligned with the SNP in the first allele + logging.info('Number of read aligned with the first allele: {0}'.format( + frequences_observed['first'])) + statistics_file.write('\nNumber of read aligned with the first allele: {0}'.format( + frequences_observed['first'])) + + # reads aligned with the SNP in the first allele over all + logging.info('Percentual of read aligned with 
the first allele: {0}'.format(float( + frequences_observed['first']) / count_read * 100)) + statistics_file.write('\nPercentual of read aligned with the first allele: {0}'.format(float( + frequences_observed['first']) / count_read * 100)) + + # number of reads aligned with the SNP in the second allele + logging.info('Number of read aligned with the second allele: {0}'.format( + frequences_observed['second'])) + statistics_file.write('\nNumber of read aligned with the second allele: {0}'.format( + frequences_observed['second'])) + + # reads aligned with the SNP in the second allele over all + logging.info('Percentual of read aligned with the second allele: {0}'.format(float( + frequences_observed['second']) / count_read * 100)) + statistics_file.write('\nPercentual of read aligned with the second allele: {0}'.format(float( + frequences_observed['second']) / count_read * 100)) + + # reads aligned with the SNP - no first or second allele + logging.info('Number of read aligned with different allele: {0}'.format( + frequences_observed['others'])) + statistics_file.write('\nPercentual of read aligned with different allele:: {0}'.format( + float(frequences_observed['others']) / count_read) * 100) + + else: + snp_not_valid += 1 + logging.info( + '\nNo reads aligned with this snp: {0}'.format(snp_name)) + statistics_file.write( + '\nNo reads aligned with this snp: {0}'.format(snp_name)) + + prgbar.finish() + logging.info( + '\n*************************************************************') + logging.info('Processed {0} SNPs'.format(snp_count)) + statistics_file.write('\nProcessed {0} SNPs'.format(snp_count)) + + logging.info('SNPs with almost one read aligned (%): {0}'.format( + float(snp_valid) / snp_count) * 100) + statistics_file.write('\nSNPs with almost one read aligned (%): {0}'.format( + float(snp_valid) / snp_count) * 100) + logging.info('SNPs with no read aligned (%): {0}'.format( + float(snp_not_valid) / snp_count) * 100) + statistics_file.write('\nSNPs with no 
read aligned (%): {0}'.format( + float(snp_not_valid) / snp_count) * 100) + + logging.info('Total number of read aligned with the first allele: {0}'.format( + read_with_first_allele)) + statistics_file.write( + '\nTotal number of read aligned with the first allele: {0}'.format(read_with_first_allele)) + + logging.info('Total number of read aligned with the second allele: {0}'.format( + read_with_second_allele)) + statistics_file.write( + '\nTotal number of read aligned with the second allele: {0}'.format(read_with_second_allele)) + logging.info('Total number of read aligned with different allele: {0}'.format( + read_with_different_allele)) + statistics_file.write( + '\nTotal number of read aligned with different allele: {0}'.format(read_with_different_allele)) + + logging.info('Generation of statistics: Completed') + statistics_file.close() + ann_file.close() + in_sam.close() + + +def main(): + + parser = argparse.ArgumentParser( + description='Generate statistics using alignment BAM file and SNPs annotation file') + parser.add_argument('-b', + action='store', + dest='fileBAM', + help='Alignment file in BAM format.', + required=True) + parser.add_argument('-a', + action='store', + dest='annotationFile', + help='SNPs location annotation file.', + required=True) + parser.add_argument('-r', + action='store', + dest='region', + help='Region to be examined in the form: chr:start-stop. Ex. 
1:100-200.') + parser.add_argument('-o', + action='store', + dest='output_dir', + help='Output (root) directory.', + required=True) + parser.add_argument('-v', + help='increase output verbosity', + action='count') + + args = parser.parse_args() + + in_bam_file = args.fileBAM + in_region = args.region + out_root_dir = args.output_dir + + if args.v is None: + log_level = logging.INFO + else: + log_level = logging.DEBUG + + logging.basicConfig(level=log_level, + format='%(levelname)-8s [%(asctime)s] %(message)s', + datefmt='%y%m%d %H%M%S') + + if not os.path.exists(out_root_dir): + logging.error('Output dir not found.') + sys.exit(1) + + logging.info('StatisticsGenerator: Program Started') + + region = check_region(in_region) + + out_dir = out_root_dir + '/' + + if not os.path.exists(out_dir): + os.mkdir(out_dir) + + ann_file_delimiter = '\t' + + generate_statistics(args.annotationFile, + ann_file_delimiter, region, in_bam_file, out_dir) + + logging.info('StatisticsGenerator: Program Completed') + + +# utility + +def check_observed(observed_couple): + # observed are in the form A/T, T/-, -/C + regexpr_snp = re.compile(r'^(\-|\w+)\/(\-|\w+)') + couple = regexpr_snp.search(observed_couple) + return couple.group(1), couple.group(2) + + +def file_len(file): + # return the length (number of rows) of a file + with open(file) as f: + for i, l in enumerate(f): + pass + return i + + +def check_region(input_region): + # if no region is set, is used all the 1st chromosome (for test purpose) + if input_region is not None: + regexp_reg = re.compile(r'^(\w+):(\d+)-(\d+)') + region = re.search(regexp_reg, input_region) + if region is not None: + in_chr = region.group(1) + in_start = int(region.group(2)) + in_stop = int(region.group(3)) + if in_start > in_stop: + logging.error('Wrong input region.') + sys.exit(1) + else: + return in_chr, in_start, in_stop + else: + return '1' + + +if __name__ == '__main__': + main() From 22d3c35bf54d7abdcfadebd28313128904268fe8 Mon Sep 17 00:00:00 
2001 From: ddcatania Date: Tue, 17 May 2016 23:03:45 +0200 Subject: [PATCH 02/11] Inserimento script per conversione BAM to WIF. Da completare! --- src/input-bam/bamToWif.py | 193 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100644 src/input-bam/bamToWif.py diff --git a/src/input-bam/bamToWif.py b/src/input-bam/bamToWif.py new file mode 100644 index 00000000..961947f3 --- /dev/null +++ b/src/input-bam/bamToWif.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python3 +# - * -coding: utf - 8 - * - + +import logging +import os +import sys +import argparse +import progressbar +import re +import pprint + +import pysam + + +def check_observed(observed_couple): + # observed are in the form A/T, T/-, -/C + regexpr_snp = re.compile(r'^(\-|\w+)\/(\-|\w+)') + couple = regexpr_snp.search(observed_couple) + return couple.group(1), couple.group(2) + + +def file_len(file): + # return the length (number of rows) of a file + with open(file) as f: + for i, l in enumerate(f): + pass + return i + + +def snpsInRead(alignment_file, annotation_file, file_delimiter): + + in_sam = pysam.AlignmentFile(alignment_file, 'rb') + + # return the number of SNPs in the file + # snp_total_number = file_len(annotation_file) + + total_fetch_alignment = in_sam.fetch() + total_read_number = in_sam.count() + + read_count = 0 + read_bar = progressbar.ProgressBar(maxval=total_read_number).start() + snp_read_list = {} + for read in total_fetch_alignment: + read_count += 1 + read_bar.update(read_count) + + read_qname = read.qname + read_start = read.reference_start + read_end = read.reference_end + logging.debug('read {0}, start: {1}, end: {2}'.format( + read_qname, read_start, read_end)) + + ann_file = open(annotation_file) + + # skip first line (comment if the file starts with the header) + # ann_file.readline() + + # Header info - first line + header = ann_file.readline().split(file_delimiter) + + # the fields to read + snp_header_index_chromosome = header.index('chrom') + 
snp_header_index_start = header.index('chromStart') + snp_header_index_stop = header.index('chromEnd') + snp_header_index_observed = header.index('observed') + snp_read_list[read_qname] = [] + # snp_bar = progressbar.ProgressBar(maxval=snp_total_number).start() + # snp_count = 0 + for line in ann_file: + # snp_count += 1 + # snp_bar.update(snp_count) + + splitted_line = line.split(file_delimiter) + + snp_chrom = splitted_line[snp_header_index_chromosome] + snp_start = int(splitted_line[snp_header_index_start]) + snp_stop = int(splitted_line[snp_header_index_stop]) + first_observed, second_observed = check_observed( + splitted_line[snp_header_index_observed]) + + # Check every read aligned with the SNPs + for pileupcolumn in in_sam.pileup(snp_chrom, snp_start, snp_stop): + for pileupread in pileupcolumn.pileups: + read_snp_name = pileupread.alignment.query_name + logging.debug('Found read: {0}'.format(read_snp_name)) + if read_snp_name == read_qname: + if not pileupread.is_del and not pileupread.is_refskip: + read_snp_position = pileupread.query_position + base_value = pileupread.alignment.query_sequence[ + pileupread.query_position] + if base_value == first_observed: + allele = '0' + elif base_value == second_observed: + allele = '1' + else: + # skip the SNP if not in first or second allele + continue + base_quality = pileupread.alignment.query_alignment_qualities[ + read_snp_position] + # if read_snp_name == read_qname and + # read_snp_position >= read_start and + # read_snp_position <= read_end: + + snp_read_list[read_qname].append( + [read_snp_position, base_value, allele, base_quality]) + + ann_file.close() + snp_read_list[read_qname].append(read.mapping_quality) + # snp_bar.finish() + read_bar.finish() + return snp_read_list + + +def convertBamToWif(snp_read_list, out_dir): + + output_file = open(out_dir + 'alignment.wif', 'w') + + for read, snp_list in snp_read_list.items(): + if len(snp_list) <= 1: + continue + else: + snp_mapping_quality = snp_list[-1] + 
for snp in snp_list[:-1]: + output_file.write('{0} {1} {2} {3} : '.format( + snp[0], snp[1], snp[2], snp[3])) + output_file.write('# {0} : NA\n'.format(snp_mapping_quality)) + + +def main(): + + parser = argparse.ArgumentParser( + description='Convert an alignment file .bam in a .wif Hapcol compatible file') + parser.add_argument('-b', + action='store', + dest='fileBAM', + help='Alignment file in BAM format.', + required=True) + parser.add_argument('-a', + action='store', + dest='annotationFile', + help='SNPs annotation file.', + required=True) + parser.add_argument('-o', + action='store', + dest='output_dir', + help='Output (root) directory.', + required=True) + parser.add_argument('-v', + help='increase output verbosity', + action='count') + + args = parser.parse_args() + + logging.info("#### STEP 1 - PROCESSING INPUT ARGUMENTS ####") + + in_bam_file = args.fileBAM + out_root_dir = args.output_dir + + if args.v == 0: + log_level = logging.INFO + else: + log_level = logging.DEBUG + + logging.basicConfig(level=log_level, + format='%(levelname)-8s [%(asctime)s] %(message)s', + datefmt='%y%m%d %H%M%S') + + if not os.path.exists(out_root_dir): + logging.error('Output dir not found.') + sys.exit(1) + + logging.info('BamToWif: Program Started') + + out_dir = out_root_dir + '/' + + if not os.path.exists(out_dir): + os.mkdir(out_dir) + + ann_file_delimiter = '\t' + + logging.info("#### STEP 2 - RETREIVING SNPs FOR READS ####") + + snps_list = snpsInRead(in_bam_file, args.annotationFile, + ann_file_delimiter) + + logging.info("#### STEP 3 - CONVERTING BAM TO WIF ####") + + convertBamToWif(snps_list, out_dir) + + logging.info('BamToWif: Program Completed') + +if __name__ == '__main__': + main() From 9c219b26230df544340d875f828c7d0660d6c530 Mon Sep 17 00:00:00 2001 From: ddcatania Date: Wed, 18 May 2016 21:56:21 +0200 Subject: [PATCH 03/11] qualche correzione ai log --- src/input-bam/statistics.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) 
diff --git a/src/input-bam/statistics.py b/src/input-bam/statistics.py index 7c5df217..9c81e13c 100644 --- a/src/input-bam/statistics.py +++ b/src/input-bam/statistics.py @@ -86,7 +86,7 @@ def generate_statistics(annotation_file, file_delimiter, region, alignment_file, snp_valid += 1 # number of reads aligned with the SNP logging.info( - '\nTotal number of read aligned with SNP: {0}'.format(count_read)) + 'Total number of read aligned with SNP: {0}'.format(count_read)) statistics_file.write( '\nTotal number of read aligned with SNP: {0}'.format(count_read)) @@ -96,7 +96,7 @@ def generate_statistics(annotation_file, file_delimiter, region, alignment_file, statistics_file.write('\nNumber of read aligned with the first allele: {0}'.format( frequences_observed['first'])) - # reads aligned with the SNP in the first allele over all + # reads aligned with the SNP in the first allele logging.info('Percentual of read aligned with the first allele: {0}'.format(float( frequences_observed['first']) / count_read * 100)) statistics_file.write('\nPercentual of read aligned with the first allele: {0}'.format(float( @@ -108,28 +108,36 @@ def generate_statistics(annotation_file, file_delimiter, region, alignment_file, statistics_file.write('\nNumber of read aligned with the second allele: {0}'.format( frequences_observed['second'])) - # reads aligned with the SNP in the second allele over all + # reads aligned with the SNP in the second allele logging.info('Percentual of read aligned with the second allele: {0}'.format(float( frequences_observed['second']) / count_read * 100)) statistics_file.write('\nPercentual of read aligned with the second allele: {0}'.format(float( frequences_observed['second']) / count_read * 100)) - # reads aligned with the SNP - no first or second allele + # reads aligned with the SNP - different allele logging.info('Number of read aligned with different allele: {0}'.format( frequences_observed['others'])) - statistics_file.write('\nPercentual of read 
aligned with different allele:: {0}'.format( + statistics_file.info('\nNumber of read aligned with different allele: {0}'.format( + frequences_observed['others'])) + + logging.info('Percentual of read aligned with different allele: {0}'.format( + float(frequences_observed['others']) / count_read) * 100) + statistics_file.write('\nPercentual of read aligned with different allele: {0}'.format( float(frequences_observed['others']) / count_read) * 100) else: snp_not_valid += 1 logging.info( - '\nNo reads aligned with this snp: {0}'.format(snp_name)) + 'No reads aligned with this snp: {0}'.format(snp_name)) statistics_file.write( '\nNo reads aligned with this snp: {0}'.format(snp_name)) prgbar.finish() logging.info( - '\n*************************************************************') + '#### TOTAL RESULTS ####') + statistics_file.write( + '\n#### TOTAL RESULTS ####') + logging.info('Processed {0} SNPs'.format(snp_count)) statistics_file.write('\nProcessed {0} SNPs'.format(snp_count)) From dcf3cc3c305cdc637f94cce4acb08de8564e8b93 Mon Sep 17 00:00:00 2001 From: ddcatania Date: Wed, 18 May 2016 21:57:50 +0200 Subject: [PATCH 04/11] qualche correzione ai log --- src/input-bam/bamToWif.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/input-bam/bamToWif.py b/src/input-bam/bamToWif.py index 961947f3..98f93cf9 100644 --- a/src/input-bam/bamToWif.py +++ b/src/input-bam/bamToWif.py @@ -151,7 +151,7 @@ def main(): args = parser.parse_args() - logging.info("#### STEP 1 - PROCESSING INPUT ARGUMENTS ####") + logging.info('#### STEP 1 - PROCESSING INPUT ARGUMENTS ####') in_bam_file = args.fileBAM out_root_dir = args.output_dir @@ -178,12 +178,12 @@ def main(): ann_file_delimiter = '\t' - logging.info("#### STEP 2 - RETREIVING SNPs FOR READS ####") + logging.info('#### STEP 2 - RETREIVING SNPs FOR READS ####') snps_list = snpsInRead(in_bam_file, args.annotationFile, ann_file_delimiter) - logging.info("#### STEP 3 - CONVERTING BAM TO WIF ####") + 
logging.info('#### STEP 3 - CONVERTING BAM TO WIF ####') convertBamToWif(snps_list, out_dir) From 3bcd3a11072b3c7bd9b90462d712f5787543ea64 Mon Sep 17 00:00:00 2001 From: ddcatania Date: Wed, 18 May 2016 22:16:48 +0200 Subject: [PATCH 05/11] =?UTF-8?q?Aggiunta=20possibilit=C3=A0=20di=20decide?= =?UTF-8?q?re=20in=20fase=20di=20esecuzione=20dello=20script=20il=20delimi?= =?UTF-8?q?tatore=20dei=20campi=20dei=20file=20di=20annotazione=20e=20lo?= =?UTF-8?q?=20skip=20della=20prima=20riga?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/input-bam/bamToWif.py | 20 ++++++++++++++++---- src/input-bam/statistics.py | 20 ++++++++++++++++---- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/src/input-bam/bamToWif.py b/src/input-bam/bamToWif.py index 98f93cf9..c7c24e53 100644 --- a/src/input-bam/bamToWif.py +++ b/src/input-bam/bamToWif.py @@ -27,7 +27,7 @@ def file_len(file): return i -def snpsInRead(alignment_file, annotation_file, file_delimiter): +def snpsInRead(alignment_file, annotation_file, file_delimiter, skip_first_line): in_sam = pysam.AlignmentFile(alignment_file, 'rb') @@ -53,7 +53,8 @@ def snpsInRead(alignment_file, annotation_file, file_delimiter): ann_file = open(annotation_file) # skip first line (comment if the file starts with the header) - # ann_file.readline() + if skip_first_line: + ann_file.readline() # Header info - first line header = ann_file.readline().split(file_delimiter) @@ -145,6 +146,14 @@ def main(): dest='output_dir', help='Output (root) directory.', required=True) + parser.add_argument('-skip-first-line', + action='store', + dest='skip_first_line', + help='set this flag if the first line IS NOT the header') + parser.add_argument('-file-delimiter', + action='store', + dest='file_delim', + help='set the file delimiter for the annotation file. 
Default: \\t') parser.add_argument('-v', help='increase output verbosity', action='count') @@ -176,12 +185,15 @@ def main(): if not os.path.exists(out_dir): os.mkdir(out_dir) - ann_file_delimiter = '\t' + if args.file_delim is None: + ann_file_delimiter = '\t' + else: + ann_file_delimiter = args.file_delimiter logging.info('#### STEP 2 - RETREIVING SNPs FOR READS ####') snps_list = snpsInRead(in_bam_file, args.annotationFile, - ann_file_delimiter) + ann_file_delimiter, args.skip_first_line) logging.info('#### STEP 3 - CONVERTING BAM TO WIF ####') diff --git a/src/input-bam/statistics.py b/src/input-bam/statistics.py index 9c81e13c..af123a39 100644 --- a/src/input-bam/statistics.py +++ b/src/input-bam/statistics.py @@ -11,7 +11,7 @@ import pysam -def generate_statistics(annotation_file, file_delimiter, region, alignment_file, out_dir): +def generate_statistics(annotation_file, file_delimiter, region, alignment_file, out_dir, skip_first_line): in_sam = pysam.AlignmentFile(alignment_file, 'rb') # return the number of SNPs in the file @@ -20,7 +20,8 @@ def generate_statistics(annotation_file, file_delimiter, region, alignment_file, ann_file = open(annotation_file) # skip first line (comment if the file starts with the header) - ann_file.readline() + if skip_first_line: + ann_file.readline() # read header info header = ann_file.readline().split(file_delimiter) @@ -193,6 +194,14 @@ def main(): dest='output_dir', help='Output (root) directory.', required=True) + parser.add_argument('-skip-first-line', + action='store', + dest='skip_first_line', + help='set this flag if the first line IS NOT the header') + parser.add_argument('-file-delimiter', + action='store', + dest='file_delim', + help='set the file delimiter for the annotation file. 
Default: \\t') parser.add_argument('-v', help='increase output verbosity', action='count') @@ -225,10 +234,13 @@ def main(): if not os.path.exists(out_dir): os.mkdir(out_dir) - ann_file_delimiter = '\t' + if args.file_delim is None: + ann_file_delimiter = '\t' + else: + ann_file_delimiter = args.file_delimiter generate_statistics(args.annotationFile, - ann_file_delimiter, region, in_bam_file, out_dir) + ann_file_delimiter, region, in_bam_file, out_dir, args.skip_first_line) logging.info('StatisticsGenerator: Program Completed') From 1f0a0ef6abbcd53904471609d779189700292ffa Mon Sep 17 00:00:00 2001 From: ddcatania Date: Sun, 22 May 2016 23:43:12 +0200 Subject: [PATCH 06/11] fix sui log --- src/input-bam/statistics.py | 107 ++++++++++++++---------------------- 1 file changed, 41 insertions(+), 66 deletions(-) diff --git a/src/input-bam/statistics.py b/src/input-bam/statistics.py index af123a39..a4225062 100644 --- a/src/input-bam/statistics.py +++ b/src/input-bam/statistics.py @@ -11,7 +11,7 @@ import pysam -def generate_statistics(annotation_file, file_delimiter, region, alignment_file, out_dir, skip_first_line): +def generate_statistics(annotation_file, file_delimiter, region, alignment_file, skip_first_line): in_sam = pysam.AlignmentFile(alignment_file, 'rb') # return the number of SNPs in the file @@ -32,8 +32,6 @@ def generate_statistics(annotation_file, file_delimiter, region, alignment_file, snp_header_index_name = header.index('name') snp_header_observed = header.index('observed') - statistics_file = open(out_dir + 'statistics.txt', 'w') - # read annotation file snp_valid = 0 # SNPs with almost 1 read aligned snp_not_valid = 0 # SNPs with no read aligned @@ -57,11 +55,10 @@ def generate_statistics(annotation_file, file_delimiter, region, alignment_file, frequences_observed = {'first': 0, 'second': 0, 'others': 0} logging.info('Loaded snp: {0}'.format(snp_name)) - statistics_file.write('\nLoaded snp: {0}'.format(snp_name)) + frequences = {'A': 0, 'T': 0, 'C': 
0, 'G': 0} count_read = 0 for pileupcolumn in in_sam.pileup(region[0], snp_start, snp_stop): - logging.debug('Coverage at base {0} = {1}'.format( pileupcolumn.pos, pileupcolumn.n)) for pileupread in pileupcolumn.pileups: @@ -88,85 +85,51 @@ def generate_statistics(annotation_file, file_delimiter, region, alignment_file, # number of reads aligned with the SNP logging.info( 'Total number of read aligned with SNP: {0}'.format(count_read)) - statistics_file.write( - '\nTotal number of read aligned with SNP: {0}'.format(count_read)) # number of reads aligned with the SNP in the first allele - logging.info('Number of read aligned with the first allele: {0}'.format( - frequences_observed['first'])) - statistics_file.write('\nNumber of read aligned with the first allele: {0}'.format( - frequences_observed['first'])) - - # reads aligned with the SNP in the first allele - logging.info('Percentual of read aligned with the first allele: {0}'.format(float( - frequences_observed['first']) / count_read * 100)) - statistics_file.write('\nPercentual of read aligned with the first allele: {0}'.format(float( - frequences_observed['first']) / count_read * 100)) + logging.info('Number of read aligned with the first allele: {0} ({1}%)'.format( + frequences_observed['first'], round((float( + frequences_observed['first']) / count_read) * 100, 2))) # number of reads aligned with the SNP in the second allele - logging.info('Number of read aligned with the second allele: {0}'.format( - frequences_observed['second'])) - statistics_file.write('\nNumber of read aligned with the second allele: {0}'.format( - frequences_observed['second'])) - - # reads aligned with the SNP in the second allele - logging.info('Percentual of read aligned with the second allele: {0}'.format(float( - frequences_observed['second']) / count_read * 100)) - statistics_file.write('\nPercentual of read aligned with the second allele: {0}'.format(float( - frequences_observed['second']) / count_read * 100)) + 
logging.info('Number of read aligned with the second allele: {0} ({1}%)'.format( + frequences_observed['second'], round((float( + frequences_observed['second']) / count_read) * 100, 2))) # reads aligned with the SNP - different allele - logging.info('Number of read aligned with different allele: {0}'.format( - frequences_observed['others'])) - statistics_file.info('\nNumber of read aligned with different allele: {0}'.format( - frequences_observed['others'])) - - logging.info('Percentual of read aligned with different allele: {0}'.format( - float(frequences_observed['others']) / count_read) * 100) - statistics_file.write('\nPercentual of read aligned with different allele: {0}'.format( - float(frequences_observed['others']) / count_read) * 100) + logging.info('Number of read aligned with different allele: {0} ({1}%)'.format( + frequences_observed['others'], round((float( + frequences_observed['others']) / count_read) * 100, 2))) else: snp_not_valid += 1 logging.info( 'No reads aligned with this snp: {0}'.format(snp_name)) - statistics_file.write( - '\nNo reads aligned with this snp: {0}'.format(snp_name)) prgbar.finish() logging.info( '#### TOTAL RESULTS ####') - statistics_file.write( - '\n#### TOTAL RESULTS ####') logging.info('Processed {0} SNPs'.format(snp_count)) - statistics_file.write('\nProcessed {0} SNPs'.format(snp_count)) - logging.info('SNPs with almost one read aligned (%): {0}'.format( - float(snp_valid) / snp_count) * 100) - statistics_file.write('\nSNPs with almost one read aligned (%): {0}'.format( - float(snp_valid) / snp_count) * 100) - logging.info('SNPs with no read aligned (%): {0}'.format( - float(snp_not_valid) / snp_count) * 100) - statistics_file.write('\nSNPs with no read aligned (%): {0}'.format( - float(snp_not_valid) / snp_count) * 100) + logging.info('SNPs with almost one read aligned: {0} ({1}%)'.format( + snp_valid, round(((float( + snp_valid) / snp_count) * 100), 2))) + + logging.info('SNPs with no read aligned: {0} ({1}%)'.format( 
+ snp_not_valid, round(((float( + snp_not_valid) / snp_count) * 100), 2))) logging.info('Total number of read aligned with the first allele: {0}'.format( read_with_first_allele)) - statistics_file.write( - '\nTotal number of read aligned with the first allele: {0}'.format(read_with_first_allele)) logging.info('Total number of read aligned with the second allele: {0}'.format( read_with_second_allele)) - statistics_file.write( - '\nTotal number of read aligned with the second allele: {0}'.format(read_with_second_allele)) + logging.info('Total number of read aligned with different allele: {0}'.format( read_with_different_allele)) - statistics_file.write( - '\nTotal number of read aligned with different allele: {0}'.format(read_with_different_allele)) logging.info('Generation of statistics: Completed') - statistics_file.close() ann_file.close() in_sam.close() @@ -195,7 +158,7 @@ def main(): help='Output (root) directory.', required=True) parser.add_argument('-skip-first-line', - action='store', + action='store_true', dest='skip_first_line', help='set this flag if the first line IS NOT the header') parser.add_argument('-file-delimiter', @@ -217,30 +180,28 @@ def main(): else: log_level = logging.DEBUG - logging.basicConfig(level=log_level, - format='%(levelname)-8s [%(asctime)s] %(message)s', - datefmt='%y%m%d %H%M%S') - if not os.path.exists(out_root_dir): logging.error('Output dir not found.') sys.exit(1) - logging.info('StatisticsGenerator: Program Started') - - region = check_region(in_region) - out_dir = out_root_dir + '/' if not os.path.exists(out_dir): os.mkdir(out_dir) + prepare_loggers(out_dir, log_level) + + logging.info('StatisticsGenerator: Program Started') + + region = check_region(in_region) + if args.file_delim is None: ann_file_delimiter = '\t' else: ann_file_delimiter = args.file_delimiter generate_statistics(args.annotationFile, - ann_file_delimiter, region, in_bam_file, out_dir, args.skip_first_line) + ann_file_delimiter, region, in_bam_file, 
args.skip_first_line) logging.info('StatisticsGenerator: Program Completed') @@ -262,6 +223,20 @@ def file_len(file): return i +def prepare_loggers(out_dir, log_level): + logging.basicConfig(filename=out_dir + 'statistics.txt', + filemode='w', + level=log_level, + format='%(levelname)-8s [%(asctime)s] %(message)s', + datefmt='%y%m%d %H:%M:%S') + console = logging.StreamHandler() + console.setLevel(logging.INFO) + formatter = logging.Formatter( + '[%(levelname)-8s] %(asctime)s - %(message)s') + console.setFormatter(formatter) + logging.getLogger('').addHandler(console) + + def check_region(input_region): # if no region is set, is used all the 1st chromosome (for test purpose) if input_region is not None: From 1cb00ed95d36f8f7d434389af71b031748bf9d99 Mon Sep 17 00:00:00 2001 From: ddcatania Date: Mon, 13 Jun 2016 23:44:48 +0200 Subject: [PATCH 07/11] nuova versione. Carica in memoria la mappa di SNPs, la scorre e per ogni read allineato allo snp controlla se questo si allinea con l'allele di maggioranza, di minoranza o altro. 
def generate_statistics(snp_map, alignment_file):
    """Count, over all SNPs, how many overlapping reads carry the first
    observed allele, the second observed allele, or some other base, and
    log the totals.

    Parameters:
        snp_map: dict mapping snp_name -> [chrom, start, end,
                 first_observed, second_observed] (as built by reading_snp).
        alignment_file: path to an indexed BAM file (fetch() needs the index).
    """
    logging.debug('Started method generate_statistics')
    start_time = time.time()

    in_sam = pysam.AlignmentFile(alignment_file, 'rb')

    snp_total_number = len(snp_map)

    snp_count = 0
    prgbar = progressbar.ProgressBar(maxval=snp_total_number).start()
    count_first_allele = 0
    count_second_allele = 0
    count_other_allele = 0

    read_count = 0
    for snp_name, snp_values in snp_map.items():
        snp_count += 1
        prgbar.update(snp_count)
        logging.debug('Loaded snp: {0}'.format(snp_name))
        snp_chrom, snp_start, snp_end, first_observed, second_observed = snp_values
        logging.debug(
            'Reading chromosome: {0}, Starting: {1} - Ending{2}'.format(
                snp_chrom, snp_start, snp_end))
        fetch_aln = in_sam.fetch(snp_chrom, snp_start, snp_end)
        tot_fetch_aln = in_sam.count(snp_chrom, snp_start, snp_end)
        logging.info(
            'Number of reads aligned with snp: {0}'.format(tot_fetch_aln))

        # FIX: 'tot_fetch_aln == 0 or tot_fetch_aln < 10' was redundant;
        # '< 10' already covers the zero case.
        if tot_fetch_aln < 10:
            logging.debug('SNP without at least ten items will be discarded.')
            continue

        for read in fetch_aln:
            read_count += 1
            logging.debug('Loaded read: {0}'.format(read.query_name))
            read_start = read.reference_start
            read_end = read.reference_end
            # FIX: reference_end is None for unmapped/unaligned reads; the
            # original raised TypeError on the comparison below.
            if read_end is None:
                continue
            read_sequence = read.query_sequence

            if snp_start >= read_start and snp_end <= read_end:
                # BUG FIX: the original sliced the read with absolute
                # reference coordinates (read_sequence[snp_start:snp_end]);
                # the SNP interval must first be translated into read-local
                # offsets. NOTE(review): this still assumes an ungapped
                # alignment across the SNP — an indel-aware version should
                # use read.get_aligned_pairs().
                offset = snp_start - read_start
                observed_base = read_sequence[offset:offset + (snp_end - snp_start)]
                logging.debug('In the read: {0}, in the SNP (first): {1}, in the SNP (second): {2}'.format(
                    observed_base, first_observed, second_observed))
                if observed_base == first_observed:
                    count_first_allele += 1
                elif observed_base == second_observed:
                    count_second_allele += 1
                elif not observed_base.strip():
                    logging.debug('No valid read found.')
                else:
                    count_other_allele += 1
    prgbar.finish()

    logging.info('Total number of SNPs: {0}'.format(snp_total_number))

    logging.info('Total number of reads: {0}'.format(read_count))

    # FIX: guard against ZeroDivisionError when no read overlapped any SNP.
    if read_count:
        logging.info('Total number of read aligned with the first allele: {0} ({1}%)'.format(
            count_first_allele, (float(count_first_allele) / read_count) * 100))

        logging.info('Total number of read aligned with the second allele: {0} ({1}%)'.format(
            count_second_allele, (float(count_second_allele) / read_count) * 100))

        logging.info('Total number of read aligned with other allele: {0} ({1}%)'.format(
            count_other_allele, (float(count_other_allele) / read_count) * 100))

    logging.debug('Finished method generate_statistics in {0} seconds'.format(
        time.time() - start_time))
    logging.info('Generation of statistics: Completed')
    in_sam.close()
def check_observed(observed_couple):
    """Split an 'observed' annotation value (e.g. 'A/T', 'T/-', '-/C')
    into its two alleles and return them as a pair of strings."""
    # Each side of the slash is either '-' (a deletion) or a run of
    # word characters; only the first two alternatives are kept.
    match = re.match(r'^(\-|\w+)\/(\-|\w+)', observed_couple)
    return match.groups()
snp_header_observed = header.index('observed') + snp_map = {} + for line in ann_file: + splitted_line = line.split(file_delimiter) + + snp_chrom = splitted_line[snp_header_index_chrom].split('chr', 1)[1] + snp_start = int(splitted_line[snp_header_index_start]) + snp_stop = int(splitted_line[snp_header_index_stop]) + snp_name = splitted_line[snp_header_index_name] + first_observed, second_observed = check_observed( + splitted_line[snp_header_observed]) + logging.debug('{0},{1},{2},{3}'.format( + snp_chrom, snp_start, snp_stop, snp_name)) + snp_map[snp_name] = [snp_chrom, snp_start, snp_stop, + first_observed, second_observed] + + ann_file.close() + + logging.debug('Finished method reading_snp in: {0} seconds'.format( + time.time() - start_time)) + + return snp_map + + +if __name__ == '__main__': + main() From 4ee1395b1c4aac7ddb311645ac8c9246b5ee6560 Mon Sep 17 00:00:00 2001 From: ddcatania Date: Tue, 21 Jun 2016 22:59:09 +0200 Subject: [PATCH 08/11] Calcola i due alleli con maggior frequenza e secondo una certa soglia 'epsilon' classifica gli SNP in omozigote o eterozigote --- src/input-bam/statistics_v2.py | 210 ++++++++++++++++++--------------- 1 file changed, 117 insertions(+), 93 deletions(-) diff --git a/src/input-bam/statistics_v2.py b/src/input-bam/statistics_v2.py index 87867e1b..41169dc9 100644 --- a/src/input-bam/statistics_v2.py +++ b/src/input-bam/statistics_v2.py @@ -6,85 +6,11 @@ import os import re import sys +from collections import Counter +import time import progressbar import pysam -import time - - -def generate_statistics(snp_map, alignment_file): - logging.debug('Started method generate_statistics') - start_time = time.time() - - in_sam = pysam.AlignmentFile(alignment_file, 'rb') - - snp_total_number = len(snp_map) - - snp_count = 0 - prgbar = progressbar.ProgressBar(maxval=snp_total_number).start() - count_first_allele = 0 - count_second_allele = 0 - count_other_allele = 0 - - read_count = 0 - for snp_name, snp_values in snp_map.items(): - 
def reading_snp(annotation_file, file_delimiter, flag_first_line):
    """Load the SNP annotation file into a dict.

    Parameters:
        annotation_file: path to a delimited annotation file with a header
            row containing at least 'chrom', 'chromStart', 'chromEnd', 'name'.
        file_delimiter: field separator (usually '\\t').
        flag_first_line: truthy when the FIRST line is a comment (not the
            header) and must be skipped.

    Returns:
        dict mapping snp name -> [chrom (without the 'chr' prefix),
        start (int), stop (int)].
    """
    logging.debug('Started method reading_snp')
    start_time = time.time()

    snp_map = {}
    # FIX: 'with' guarantees the file is closed even if a malformed line
    # raises; the original leaked the handle on error.
    with open(annotation_file) as ann_file:
        if flag_first_line:
            ann_file.readline()

        # Locate the needed columns from the header row.
        header = ann_file.readline().split(file_delimiter)
        snp_header_index_chrom = header.index('chrom')
        snp_header_index_start = header.index('chromStart')
        snp_header_index_stop = header.index('chromEnd')
        snp_header_index_name = header.index('name')

        for line in ann_file:
            splitted_line = line.split(file_delimiter)

            # FIX: use [-1] so a chromosome name without the 'chr' prefix
            # is kept as-is instead of raising IndexError ([1] required
            # the prefix to be present).
            snp_chrom = splitted_line[snp_header_index_chrom].split('chr', 1)[-1]
            snp_start = int(splitted_line[snp_header_index_start])
            snp_stop = int(splitted_line[snp_header_index_stop])
            snp_name = splitted_line[snp_header_index_name]

            snp_map[snp_name] = [snp_chrom, snp_start, snp_stop]

    logging.debug('Finished method reading_snp in: {0} seconds'.format(
        time.time() - start_time))
    return snp_map
def classify_snp(alignment_file, snp_map, threshold, out_dir):
    """Classify every SNP in snp_map as Homozygous or Heterozygous.

    For each SNP the bases of the overlapping reads are tallied; when the
    most frequent base accounts for at least `threshold` percent of the
    aligned reads the SNP is called Homozygous, otherwise Heterozygous.
    Per-SNP counts, frequencies and the final totals are written to
    <out_dir>statistics2.txt.

    Parameters:
        alignment_file: path to an indexed BAM file.
        snp_map: dict of snp_name -> [chrom, start, stop] (from reading_snp).
        threshold: homozygosity cutoff in percent (e.g. 80).
        out_dir: output directory, with trailing separator.
    """
    logging.debug('Started method classifySNP')
    start_time = time.time()
    count_homo = 0
    count_hete = 0
    snp_total_number = len(snp_map)

    # FIX: context managers guarantee the report file and the BAM are
    # closed even if a fetch fails midway (the original leaked both).
    with open(out_dir + 'statistics2.txt', 'w') as statistics_file, \
            pysam.AlignmentFile(alignment_file, 'rb') as in_sam:
        prgbar = progressbar.ProgressBar(maxval=snp_total_number).start()
        cnt = 0

        for snp_name, snp_values in snp_map.items():
            statistics_file.write('Reading SNP: {0}\n'.format(snp_name))
            cnt += 1
            prgbar.update(cnt)
            snp_chrom, snp_start, snp_end = snp_values

            logging.debug(
                'SNP start: {0} - SNP end:{1}'.format(snp_start, snp_end))

            fetch_aln = in_sam.fetch(snp_chrom, snp_start, snp_end)
            count_aln = in_sam.count(snp_chrom, snp_start, snp_end)

            alleles = Counter()

            for read in fetch_aln:
                logging.debug('Reading read: {0}'.format(read.query_name))
                read_sequence = read.query_sequence
                if read_sequence is None:
                    continue
                # BUG FIX: translate the SNP interval into read-local
                # offsets; the original indexed the read sequence with
                # absolute reference coordinates
                # (read_sequence[snp_start:snp_end]).
                # NOTE(review): assumes an ungapped alignment across the
                # SNP — an indel-aware version should use
                # read.get_aligned_pairs().
                offset = snp_start - read.reference_start
                if offset < 0:
                    # read starts inside the SNP interval: the full allele
                    # is not present in this read
                    continue
                read_base = read_sequence[offset:offset + (snp_end - snp_start)]
                if not read_base:
                    logging.debug('Empty read - indel')
                    continue
                logging.debug('Read base: {0}'.format(read_base))
                alleles[read_base] += 1

            logging.debug(alleles)
            statistics_file.write('Allele counter for the SNP:\n')
            for key, value in alleles.items():
                statistics_file.write('{0} : {1}\n'.format(key, value))

            if alleles:
                ranked = alleles.most_common(2)  # compute the ranking once
                first_allele = ranked[0]
                logging.debug(first_allele)
                # BUG FIX: the original read most_common(2)[1]
                # unconditionally and raised IndexError whenever only one
                # distinct base was observed at a SNP.
                if len(ranked) > 1:
                    logging.debug(ranked[1])

                if (first_allele[1] / count_aln) * 100 >= threshold:
                    statistics_file.write('Snp is Homozygous\n')
                    logging.debug('The snp {0} is: Homozygous'.format(snp_name))
                    count_homo += 1
                else:
                    statistics_file.write('Snp is Heterozygous\n')
                    logging.debug('The snp {0} is: Heterozygous'.format(snp_name))
                    count_hete += 1

            logging.debug(
                'Number of read aligned on SNP: {0}'.format(count_aln))
            statistics_file.write(
                'Number of read aligned on SNP: {0}\n'.format(count_aln))

            # Per-base percentage of the reads covering this SNP.
            out = {}
            if count_aln > 0:
                for key in alleles:
                    out[key] = (float(alleles[key]) / count_aln) * 100

            logging.debug(out)
            # FIX: 'Frequences' -> 'Frequencies' (typo in the report).
            statistics_file.write('Base Frequencies for the SNP:\n')
            for key, value in out.items():
                statistics_file.write('{0} : {1}\n'.format(key, value))

        prgbar.finish()

        logging.debug(
            'Total number of SNPs: {0}'.format(snp_total_number))
        statistics_file.write(
            'Total number of SNPs: {0}\n'.format(snp_total_number))

        # FIX: guard the summary percentages against an empty snp_map
        # (ZeroDivisionError in the original).
        if snp_total_number:
            homo_pct = (float(count_homo) / snp_total_number) * 100
            hete_pct = (float(count_hete) / snp_total_number) * 100

            logging.debug(
                'Number of SNPs: Homozygous {0} ({1}%)'.format(count_homo, homo_pct))
            statistics_file.write('Number of SNPs: Homozygous {0} ({1}%)\n'.format(
                count_homo, homo_pct))

            logging.debug(
                'Number of SNPs: Heterozygous {0} ({1}%)'.format(count_hete, hete_pct))
            statistics_file.write('Number of SNPs: Heterozygous {0} ({1}%)\n'.format(
                count_hete, hete_pct))

    logging.debug('Finished method classifySNP in: {0} seconds'.format(
        time.time() - start_time))
changed, 105 insertions(+) create mode 100644 Vcf_conversion.rb diff --git a/Vcf_conversion.rb b/Vcf_conversion.rb new file mode 100644 index 00000000..d661f72f --- /dev/null +++ b/Vcf_conversion.rb @@ -0,0 +1,105 @@ +#Methods + +def get_line_from_file(path, line) +result = nil + File.open(path, "r") do |f| + while line > 0 + line -= 1 + result = f.gets + result ||= '' + result = result.chomp + end + end + return result +end + +def write_header(output_path) + formato = "##fileformat=VCFv4.2\n" + colonne = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n" + File.open(output_path, "w") { |file| file.write(formato + colonne) } +end + +def write_line(output_path, chrom, pos, id, ref, alt, qual, filter, info, format, sample) + File.open(output_path, "a") { |file| file.write(chrom + "\t" + pos + "\t" + id + "\t" + ref + "\t" + alt + "\t" + qual + "\t" + filter + "\t" + info + "\t" + format + "\t" + sample + "\n") } +end + +def find_alt(snp_line, ref) + + alternatives = snp_line[9].to_s.split "/" + alt = "." + if (alternatives.length == 2) + alternatives.each do |a| + if a.to_s != ref + alt = a.to_s + end + end + else + puts "Error: bihallelic field" + ref = "X" + end + + return alt + +end + + +def create_line(snp_path, current_line, current_father, current_mother, output) + + current_snp_line = current_line + 3 + + snp_line = get_line_from_file(snp_path, current_snp_line).split "\t" + + chrom = snp_line[1].to_s + + pos = (snp_line[2].to_i + 1).to_s + + id = "." + + ref = snp_line[7].to_s + + alt = find_alt(snp_line, ref) + + qual = "." + + filter = "PASS" + + info = "." 
+ + format = "GT" + + sample = current_father.to_s + "|" + current_mother.to_s + + write_line(output, chrom, pos, id, ref, alt, qual, filter, info, format, sample) + +end + +def write_body(haplotypes, snp, output) + + current_line = 0 + father = get_line_from_file(haplotypes, 1).split('') + mother = get_line_from_file(haplotypes, 2).split('') + + father.each do |i| + create_line(snp, current_line, father[current_line], mother[current_line], output) + current_line += 1 + end + +end + +def create_vcf(haplotypes_path, snp_path, output_path) + + snp_lines = `wc -l "#{snp_path}"`.strip.split(' ')[0].to_i - 3 + + if (get_line_from_file(haplotypes_path, 1).length == snp_lines) + write_header(output_path) + write_body(haplotypes_path, snp_path, output_path) + else + puts "Error: non-matching files" + end + +end + + +#Vcf file creation + +create_vcf(haplotypes_path, snp_path, output_path) From 268aa70e34422bbb3f95c29b61e3d802c61e4e92 Mon Sep 17 00:00:00 2001 From: Nara88 Date: Sun, 2 Oct 2016 20:35:50 +0200 Subject: [PATCH 10/11] Delete Vcf_conversion.rb --- Vcf_conversion.rb | 105 ---------------------------------------------- 1 file changed, 105 deletions(-) delete mode 100644 Vcf_conversion.rb diff --git a/Vcf_conversion.rb b/Vcf_conversion.rb deleted file mode 100644 index d661f72f..00000000 --- a/Vcf_conversion.rb +++ /dev/null @@ -1,105 +0,0 @@ -#Methods - -def get_line_from_file(path, line) -result = nil - File.open(path, "r") do |f| - while line > 0 - line -= 1 - result = f.gets - result ||= '' - result = result.chomp - end - end - return result -end - -def write_header(output_path) - formato = "##fileformat=VCFv4.2\n" - colonne = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n" - File.open(output_path, "w") { |file| file.write(formato + colonne) } -end - -def write_line(output_path, chrom, pos, id, ref, alt, qual, filter, info, format, sample) - File.open(output_path, "a") { |file| file.write(chrom + "\t" + pos + "\t" + id + "\t" + ref + "\t" + alt 
+ "\t" + qual + "\t" + filter + "\t" + info + "\t" + format + "\t" + sample + "\n") } -end - -def find_alt(snp_line, ref) - - alternatives = snp_line[9].to_s.split "/" - alt = "." - if (alternatives.length == 2) - alternatives.each do |a| - if a.to_s != ref - alt = a.to_s - end - end - else - puts "Error: bihallelic field" - ref = "X" - end - - return alt - -end - - -def create_line(snp_path, current_line, current_father, current_mother, output) - - current_snp_line = current_line + 3 - - snp_line = get_line_from_file(snp_path, current_snp_line).split "\t" - - chrom = snp_line[1].to_s - - pos = (snp_line[2].to_i + 1).to_s - - id = "." - - ref = snp_line[7].to_s - - alt = find_alt(snp_line, ref) - - qual = "." - - filter = "PASS" - - info = "." - - format = "GT" - - sample = current_father.to_s + "|" + current_mother.to_s - - write_line(output, chrom, pos, id, ref, alt, qual, filter, info, format, sample) - -end - -def write_body(haplotypes, snp, output) - - current_line = 0 - father = get_line_from_file(haplotypes, 1).split('') - mother = get_line_from_file(haplotypes, 2).split('') - - father.each do |i| - create_line(snp, current_line, father[current_line], mother[current_line], output) - current_line += 1 - end - -end - -def create_vcf(haplotypes_path, snp_path, output_path) - - snp_lines = `wc -l "#{snp_path}"`.strip.split(' ')[0].to_i - 3 - - if (get_line_from_file(haplotypes_path, 1).length == snp_lines) - write_header(output_path) - write_body(haplotypes_path, snp_path, output_path) - else - puts "Error: non-matching files" - end - -end - - -#Vcf file creation - -create_vcf(haplotypes_path, snp_path, output_path) From 5f32b743f7ea22e09f4c8264daec38947569ab07 Mon Sep 17 00:00:00 2001 From: Nara88 Date: Sun, 2 Oct 2016 20:38:59 +0200 Subject: [PATCH 11/11] src/output-vcf/Vcf_conversion.rb --- src/Vcf_conversion.rb | 105 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 src/Vcf_conversion.rb diff --git 
# Methods

# Return line number `line` (1-based) of the file at `path`, chomped.
# Returns '' when the file has fewer lines than requested.
def get_line_from_file(path, line)
  result = nil
  File.open(path, "r") do |f|
    while line > 0
      line -= 1
      result = f.gets
      result ||= ''
      result = result.chomp
    end
  end
  return result
end

# Write the fixed VCF v4.2 header, truncating any existing output file.
def write_header(output_path)
  formato = "##fileformat=VCFv4.2\n"
  colonne = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n"
  File.open(output_path, "w") { |file| file.write(formato + colonne) }
end

# Append one tab-separated VCF record to the output file.
def write_line(output_path, chrom, pos, id, ref, alt, qual, filter, info, format, sample)
  File.open(output_path, "a") { |file| file.write(chrom + "\t" + pos + "\t" + id + "\t" + ref + "\t" + alt + "\t" + qual + "\t" + filter + "\t" + info + "\t" + format + "\t" + sample + "\n") }
end

# Pick the allele of the 'X/Y' pair (column 9) that differs from `ref`.
# Returns '.' when no alternative is found.
def find_alt(snp_line, ref)
  alternatives = snp_line[9].to_s.split "/"
  alt = "."
  if (alternatives.length == 2)
    alternatives.each do |a|
      if a.to_s != ref
        alt = a.to_s
      end
    end
  else
    # FIX: typo in the error message ('bihallelic' -> 'biallelic').
    puts "Error: biallelic field"
    # NOTE(review): this assignment only mutates the local copy of `ref`
    # and has no effect on the caller; kept for compatibility.
    ref = "X"
  end

  return alt
end

# Build and append one VCF record from SNP-file line `current_line`
# (data lines start 3 lines into the SNP file) plus the two haplotype
# alleles for that position.
def create_line(snp_path, current_line, current_father, current_mother, output)
  current_snp_line = current_line + 3

  snp_line = get_line_from_file(snp_path, current_snp_line).split "\t"

  chrom = snp_line[1].to_s
  pos = (snp_line[2].to_i + 1).to_s  # SNP file is 0-based, VCF is 1-based
  id = "."
  ref = snp_line[7].to_s
  alt = find_alt(snp_line, ref)
  qual = "."
  filter = "PASS"
  info = "."
  format = "GT"
  sample = current_father.to_s + "|" + current_mother.to_s

  write_line(output, chrom, pos, id, ref, alt, qual, filter, info, format, sample)
end

# Emit one VCF record per haplotype position. Lines 1 and 2 of the
# haplotype file are the father and mother allele strings respectively.
def write_body(haplotypes, snp, output)
  father = get_line_from_file(haplotypes, 1).split('')
  mother = get_line_from_file(haplotypes, 2).split('')

  # FIX: the original iterated with an unused block variable and a manual
  # counter; each_index yields the same 0-based positions directly.
  father.each_index do |current_line|
    create_line(snp, current_line, father[current_line], mother[current_line], output)
  end
end

# Validate that the haplotype length matches the SNP count, then write
# the full VCF file.
def create_vcf(haplotypes_path, snp_path, output_path)
  # NOTE(review): the path is interpolated into a shell command; do not
  # pass untrusted input here (shell injection risk).
  snp_lines = `wc -l "#{snp_path}"`.strip.split(' ')[0].to_i - 3

  if (get_line_from_file(haplotypes_path, 1).length == snp_lines)
    write_header(output_path)
    write_body(haplotypes_path, snp_path, output_path)
  else
    puts "Error: non-matching files"
  end
end


# Vcf file creation
#
# BUG FIX: the original called create_vcf with three local variables that
# were never defined anywhere in the script (haplotypes_path, snp_path,
# output_path), so it always raised NameError. Read them from the command
# line instead: ruby Vcf_conversion.rb HAPLOTYPES SNP_FILE OUTPUT_VCF
haplotypes_path, snp_path, output_path = ARGV
create_vcf(haplotypes_path, snp_path, output_path)