diff --git a/src/Vcf_conversion.rb b/src/Vcf_conversion.rb
new file mode 100644
index 00000000..d661f72f
--- /dev/null
+++ b/src/Vcf_conversion.rb
@@ -0,0 +1,105 @@
+# Methods
+
+# Returns line number `line` (1-based) of the file at `path`, chomped.
+# Returns '' when the file has fewer than `line` lines.
+def get_line_from_file(path, line)
+  result = nil
+  File.open(path, "r") do |f|
+    while line > 0
+      line -= 1
+      result = f.gets
+      result ||= ''
+      result = result.chomp
+    end
+  end
+  return result
+end
+
+# Writes the fixed VCF v4.2 header (format line + column line) to output_path.
+def write_header(output_path)
+  formato = "##fileformat=VCFv4.2\n"
+  colonne = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n"
+  File.open(output_path, "w") { |file| file.write(formato + colonne) }
+end
+
+# Appends one tab-separated VCF data row to output_path.
+def write_line(output_path, chrom, pos, id, ref, alt, qual, filter, info, format, sample)
+  File.open(output_path, "a") { |file| file.write([chrom, pos, id, ref, alt, qual, filter, info, format, sample].join("\t") + "\n") }
+end
+
+# Picks the ALT allele out of the "X/Y" observed field (column 10 of the
+# SNP line): the allele that differs from ref. Returns "." when the field
+# is not biallelic.
+def find_alt(snp_line, ref)
+  alternatives = snp_line[9].to_s.split "/"
+  alt = "."
+  if alternatives.length == 2
+    alternatives.each do |a|
+      alt = a.to_s if a.to_s != ref
+    end
+  else
+    # FIX: the original also assigned `ref = "X"` here, but `ref` is a
+    # local copy, so the assignment had no effect and has been removed.
+    puts "Error: not a biallelic field"
+  end
+  return alt
+end
+
+# Builds and appends one VCF row for SNP number `current_line` (0-based),
+# using the parental alleles for the sample genotype.
+def create_line(snp_path, current_line, current_father, current_mother, output)
+  # NOTE(review): +3 assumes SNP data begins at line 3 for current_line 0;
+  # verify against the SNP file's actual header count.
+  current_snp_line = current_line + 3
+
+  snp_line = get_line_from_file(snp_path, current_snp_line).split "\t"
+
+  chrom = snp_line[1].to_s
+  pos = (snp_line[2].to_i + 1).to_s # source position is 0-based, VCF is 1-based
+  id = "."
+  ref = snp_line[7].to_s
+  alt = find_alt(snp_line, ref)
+  qual = "."
+  filter = "PASS"
+  info = "."
+
+  format = "GT"
+
+  sample = current_father.to_s + "|" + current_mother.to_s
+
+  write_line(output, chrom, pos, id, ref, alt, qual, filter, info, format, sample)
+end
+
+# Writes one VCF row per haplotype position. Line 1 of the haplotypes file
+# is the paternal haplotype, line 2 the maternal one (one char per SNP).
+def write_body(haplotypes, snp, output)
+  father = get_line_from_file(haplotypes, 1).split('')
+  mother = get_line_from_file(haplotypes, 2).split('')
+
+  # FIX: iterate by index; the original carried a separate counter plus an
+  # unused block variable.
+  father.each_index do |current_line|
+    create_line(snp, current_line, father[current_line], mother[current_line], output)
+  end
+end
+
+# Creates the VCF file, checking first that the number of SNPs (lines of
+# the SNP file minus 3 header lines) matches the haplotype length.
+def create_vcf(haplotypes_path, snp_path, output_path)
+  snp_lines = `wc -l "#{snp_path}"`.strip.split(' ')[0].to_i - 3
+
+  if get_line_from_file(haplotypes_path, 1).length == snp_lines
+    write_header(output_path)
+    write_body(haplotypes_path, snp_path, output_path)
+  else
+    puts "Error: non-matching files"
+  end
+end
+
+# Vcf file creation
+
+# FIX: the original invoked create_vcf(haplotypes_path, snp_path, output_path)
+# with variables that are never defined (NameError); read them from ARGV.
+create_vcf(ARGV[0], ARGV[1], ARGV[2])
diff --git a/src/input-bam/bamToWif.py b/src/input-bam/bamToWif.py
new file mode 100644
index 00000000..c7c24e53
--- /dev/null
+++ b/src/input-bam/bamToWif.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import logging
+import os
+import sys
+import argparse
+import progressbar
+import re
+import pprint
+
+import pysam
+
+
+def check_observed(observed_couple):
+    # observed are in the form A/T, T/-, -/C
+    regexpr_snp = re.compile(r'^(\-|\w+)\/(\-|\w+)')
+    couple = regexpr_snp.search(observed_couple)
+    return couple.group(1), couple.group(2)
+
+
+def file_len(file):
+    # return the length (number of rows) of a file
+    # FIX: initialise the counter so an empty file returns 0 instead of
+    # raising NameError, and return i + 1 (enumerate is 0-based).
+    i = -1
+    with open(file) as f:
+        for i, l in enumerate(f):
+            pass
+    return i + 1
+
+
+def snpsInRead(alignment_file, annotation_file, file_delimiter, skip_first_line):
+
+    in_sam = pysam.AlignmentFile(alignment_file, 'rb')
+
+    # return the number of SNPs in the file
+    # snp_total_number = file_len(annotation_file)
+
+    # NOTE(review): count() opens a second iterator over the same file as
+    # fetch(); confirm pysam tolerates this for the inputs used here.
+    total_fetch_alignment = in_sam.fetch()
+    total_read_number = in_sam.count()
+
+    read_count = 0
+    read_bar = 
progressbar.ProgressBar(maxval=total_read_number).start()
+    snp_read_list = {}
+    for read in total_fetch_alignment:
+        read_count += 1
+        read_bar.update(read_count)
+
+        read_qname = read.qname
+        read_start = read.reference_start
+        read_end = read.reference_end
+        logging.debug('read {0}, start: {1}, end: {2}'.format(
+            read_qname, read_start, read_end))
+
+        ann_file = open(annotation_file)
+
+        # skip first line (comment if the file starts with the header)
+        if skip_first_line:
+            ann_file.readline()
+
+        # Header info - first line
+        header = ann_file.readline().split(file_delimiter)
+
+        # the fields to read
+        snp_header_index_chromosome = header.index('chrom')
+        snp_header_index_start = header.index('chromStart')
+        snp_header_index_stop = header.index('chromEnd')
+        snp_header_index_observed = header.index('observed')
+        snp_read_list[read_qname] = []
+        # snp_bar = progressbar.ProgressBar(maxval=snp_total_number).start()
+        # snp_count = 0
+        for line in ann_file:
+            # snp_count += 1
+            # snp_bar.update(snp_count)
+
+            splitted_line = line.split(file_delimiter)
+
+            snp_chrom = splitted_line[snp_header_index_chromosome]
+            snp_start = int(splitted_line[snp_header_index_start])
+            snp_stop = int(splitted_line[snp_header_index_stop])
+            first_observed, second_observed = check_observed(
+                splitted_line[snp_header_index_observed])
+
+            # Check every read aligned with the SNPs
+            for pileupcolumn in in_sam.pileup(snp_chrom, snp_start, snp_stop):
+                for pileupread in pileupcolumn.pileups:
+                    read_snp_name = pileupread.alignment.query_name
+                    logging.debug('Found read: {0}'.format(read_snp_name))
+                    if read_snp_name == read_qname:
+                        if not pileupread.is_del and not pileupread.is_refskip:
+                            read_snp_position = pileupread.query_position
+                            base_value = pileupread.alignment.query_sequence[
+                                pileupread.query_position]
+                            if base_value == first_observed:
+                                allele = '0'
+                            elif base_value == second_observed:
+                                allele = '1'
+                            else:
+                                # skip the SNP if not in first or second allele
+                                continue
+                            # FIX: query_position indexes the full query
+                            # sequence, so pair it with query_qualities; the
+                            # original indexed query_alignment_qualities,
+                            # which is trimmed to the aligned portion.
+                            base_quality = pileupread.alignment.query_qualities[
+                                read_snp_position]
+                            # if read_snp_name == read_qname and
+                            # read_snp_position >= read_start and
+                            # read_snp_position <= read_end:
+
+                            snp_read_list[read_qname].append(
+                                [read_snp_position, base_value, allele, base_quality])
+
+        ann_file.close()
+        snp_read_list[read_qname].append(read.mapping_quality)
+        # snp_bar.finish()
+    read_bar.finish()
+    return snp_read_list
+
+
+def convertBamToWif(snp_read_list, out_dir):
+
+    # FIX: use a context manager so the output file is always closed/flushed
+    # (the original never closed it).
+    with open(out_dir + 'alignment.wif', 'w') as output_file:
+        for read, snp_list in snp_read_list.items():
+            if len(snp_list) <= 1:
+                continue
+            else:
+                snp_mapping_quality = snp_list[-1]
+                for snp in snp_list[:-1]:
+                    output_file.write('{0} {1} {2} {3} : '.format(
+                        snp[0], snp[1], snp[2], snp[3]))
+                output_file.write('# {0} : NA\n'.format(snp_mapping_quality))
+
+
+def main():
+
+    parser = argparse.ArgumentParser(
+        description='Convert an alignment file .bam in a .wif Hapcol compatible file')
+    parser.add_argument('-b',
+                        action='store',
+                        dest='fileBAM',
+                        help='Alignment file in BAM format.',
+                        required=True)
+    parser.add_argument('-a',
+                        action='store',
+                        dest='annotationFile',
+                        help='SNPs annotation file.',
+                        required=True)
+    parser.add_argument('-o',
+                        action='store',
+                        dest='output_dir',
+                        help='Output (root) directory.',
+                        required=True)
+    # FIX: this is a boolean flag; action='store' (the original) would
+    # swallow the next argument as its value. statistics.py already uses
+    # store_true for the same option.
+    parser.add_argument('-skip-first-line',
+                        action='store_true',
+                        dest='skip_first_line',
+                        help='set this flag if the first line IS NOT the header')
+    parser.add_argument('-file-delimiter',
+                        action='store',
+                        dest='file_delim',
+                        help='set the file delimiter for the annotation file. Default: \\t')
+    parser.add_argument('-v',
+                        help='increase output verbosity',
+                        action='count')
+
+    args = parser.parse_args()
+
+    in_bam_file = args.fileBAM
+    out_root_dir = args.output_dir
+
+    # FIX: an absent 'count' option is None, not 0; the original test
+    # `args.v == 0` was never true, so DEBUG was always selected.
+    if args.v is None:
+        log_level = logging.INFO
+    else:
+        log_level = logging.DEBUG
+
+    logging.basicConfig(level=log_level,
+                        format='%(levelname)-8s [%(asctime)s] %(message)s',
+                        datefmt='%y%m%d %H%M%S')
+
+    # moved after basicConfig so the message is actually emitted
+    logging.info('#### STEP 1 - PROCESSING INPUT ARGUMENTS ####')
+
+    if not os.path.exists(out_root_dir):
+        logging.error('Output dir not found.')
+        sys.exit(1)
+
+    logging.info('BamToWif: Program Started')
+
+    out_dir = out_root_dir + '/'
+
+    if not os.path.exists(out_dir):
+        os.mkdir(out_dir)
+
+    if args.file_delim is None:
+        ann_file_delimiter = '\t'
+    else:
+        # FIX: the argparse dest is 'file_delim'; the original read the
+        # non-existent attribute 'file_delimiter' (AttributeError).
+        ann_file_delimiter = args.file_delim
+
+    logging.info('#### STEP 2 - RETREIVING SNPs FOR READS ####')
+
+    snps_list = snpsInRead(in_bam_file, args.annotationFile,
+                           ann_file_delimiter, args.skip_first_line)
+
+    logging.info('#### STEP 3 - CONVERTING BAM TO WIF ####')
+
+    convertBamToWif(snps_list, out_dir)
+
+    logging.info('BamToWif: Program Completed')
+
+if __name__ == '__main__':
+    main()
diff --git a/src/input-bam/statistics.py b/src/input-bam/statistics.py
new file mode 100644
index 00000000..a4225062
--- /dev/null
+++ b/src/input-bam/statistics.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import argparse
+import logging
+import os
+import re
+import sys
+
+import progressbar
+import pysam
+
+
+def generate_statistics(annotation_file, file_delimiter, region, alignment_file, skip_first_line):
+    in_sam = pysam.AlignmentFile(alignment_file, 'rb')
+
+    # return the number of SNPs in the file
+    snp_total_number = file_len(annotation_file)
+    logging.info('Number of Total SNPs: {0}'.format(snp_total_number))
+    ann_file = open(annotation_file)
+
+    # skip first line (comment if the file starts with the header)
+    if skip_first_line:
+        ann_file.readline()
+
+    # read header info
+    header 
= ann_file.readline().split(file_delimiter)
+
+    # the fields to read
+    snp_header_index_start = header.index('chromStart')
+    snp_header_index_stop = header.index('chromEnd')
+    snp_header_index_name = header.index('name')
+    snp_header_observed = header.index('observed')
+
+    # read annotation file
+    snp_valid = 0  # SNPs with almost 1 read aligned
+    snp_not_valid = 0  # SNPs with no read aligned
+    snp_count = 0
+    read_with_first_allele = 0
+    read_with_second_allele = 0
+    read_with_different_allele = 0
+    prgbar = progressbar.ProgressBar(maxval=snp_total_number).start()
+    for line in ann_file:
+        snp_count += 1
+        prgbar.update(snp_count)
+
+        splitted_line = line.split(file_delimiter)
+
+        snp_start = int(splitted_line[snp_header_index_start])
+        snp_stop = int(splitted_line[snp_header_index_stop])
+        snp_name = splitted_line[snp_header_index_name]
+        first_observed, second_observed = check_observed(
+            splitted_line[snp_header_observed])
+
+        frequences_observed = {'first': 0, 'second': 0, 'others': 0}
+
+        logging.info('Loaded snp: {0}'.format(snp_name))
+
+        frequences = {'A': 0, 'T': 0, 'C': 0, 'G': 0}
+        count_read = 0
+        for pileupcolumn in in_sam.pileup(region[0], snp_start, snp_stop):
+            logging.debug('Coverage at base {0} = {1}'.format(
+                pileupcolumn.pos, pileupcolumn.n))
+            for pileupread in pileupcolumn.pileups:
+                if not pileupread.is_del and not pileupread.is_refskip:
+                    count_read += 1
+                    base_value = pileupread.alignment.query_sequence[
+                        pileupread.query_position]
+                    logging.debug('Base in read = {0}, First observed = {1}, Second observed = {2}'.format(
+                        base_value, first_observed, second_observed))
+                    # FIX: bases outside A/T/C/G (e.g. N, lowercase) raised
+                    # KeyError in the original; count them defensively.
+                    frequences[base_value] = frequences.get(base_value, 0) + 1
+                    if base_value == first_observed:
+                        frequences_observed['first'] += 1
+                        read_with_first_allele += 1
+                    elif base_value == second_observed:
+                        frequences_observed['second'] += 1
+                        read_with_second_allele += 1
+                    else:
+                        frequences_observed['others'] += 1
+                        read_with_different_allele += 1
+        logging.debug(frequences)
+
+        if count_read != 0:
+            snp_valid += 1
+            # number of reads aligned with the SNP
+            logging.info(
+                'Total number of read aligned with SNP: {0}'.format(count_read))
+
+            # number of reads aligned with the SNP in the first allele
+            logging.info('Number of read aligned with the first allele: {0} ({1}%)'.format(
+                frequences_observed['first'], round((float(
+                    frequences_observed['first']) / count_read) * 100, 2)))
+
+            # number of reads aligned with the SNP in the second allele
+            logging.info('Number of read aligned with the second allele: {0} ({1}%)'.format(
+                frequences_observed['second'], round((float(
+                    frequences_observed['second']) / count_read) * 100, 2)))
+
+            # reads aligned with the SNP - different allele
+            logging.info('Number of read aligned with different allele: {0} ({1}%)'.format(
+                frequences_observed['others'], round((float(
+                    frequences_observed['others']) / count_read) * 100, 2)))
+
+        else:
+            snp_not_valid += 1
+            logging.info(
+                'No reads aligned with this snp: {0}'.format(snp_name))
+
+    prgbar.finish()
+    logging.info(
+        '#### TOTAL RESULTS ####')
+
+    logging.info('Processed {0} SNPs'.format(snp_count))
+
+    # FIX: guard the percentages against an annotation file with no SNP rows
+    # (the original raised ZeroDivisionError).
+    if snp_count > 0:
+        logging.info('SNPs with almost one read aligned: {0} ({1}%)'.format(
+            snp_valid, round(((float(
+                snp_valid) / snp_count) * 100), 2)))
+
+        logging.info('SNPs with no read aligned: {0} ({1}%)'.format(
+            snp_not_valid, round(((float(
+                snp_not_valid) / snp_count) * 100), 2)))
+
+    logging.info('Total number of read aligned with the first allele: {0}'.format(
+        read_with_first_allele))
+
+    logging.info('Total number of read aligned with the second allele: {0}'.format(
+        read_with_second_allele))
+
+    logging.info('Total number of read aligned with different allele: {0}'.format(
+        read_with_different_allele))
+
+    logging.info('Generation of statistics: Completed')
+    ann_file.close()
+    in_sam.close()
+
+
+def main():
+
+    parser = argparse.ArgumentParser(
+        description='Generate statistics using alignment BAM file and SNPs annotation file')
+    parser.add_argument('-b',
+                        action='store',
+                        dest='fileBAM',
+                        help='Alignment file in BAM format.',
+                        required=True)
+    parser.add_argument('-a',
+                        action='store',
+                        dest='annotationFile',
+                        help='SNPs location annotation file.',
+                        required=True)
+    parser.add_argument('-r',
+                        action='store',
+                        dest='region',
+                        help='Region to be examined in the form: chr:start-stop. Ex. 1:100-200.')
+    parser.add_argument('-o',
+                        action='store',
+                        dest='output_dir',
+                        help='Output (root) directory.',
+                        required=True)
+    parser.add_argument('-skip-first-line',
+                        action='store_true',
+                        dest='skip_first_line',
+                        help='set this flag if the first line IS NOT the header')
+    parser.add_argument('-file-delimiter',
+                        action='store',
+                        dest='file_delim',
+                        help='set the file delimiter for the annotation file. Default: \\t')
+    parser.add_argument('-v',
+                        help='increase output verbosity',
+                        action='count')
+
+    args = parser.parse_args()
+
+    in_bam_file = args.fileBAM
+    in_region = args.region
+    out_root_dir = args.output_dir
+
+    if args.v is None:
+        log_level = logging.INFO
+    else:
+        log_level = logging.DEBUG
+
+    if not os.path.exists(out_root_dir):
+        logging.error('Output dir not found.')
+        sys.exit(1)
+
+    out_dir = out_root_dir + '/'
+
+    if not os.path.exists(out_dir):
+        os.mkdir(out_dir)
+
+    prepare_loggers(out_dir, log_level)
+
+    logging.info('StatisticsGenerator: Program Started')
+
+    region = check_region(in_region)
+
+    if args.file_delim is None:
+        ann_file_delimiter = '\t'
+    else:
+        # FIX: the argparse dest is 'file_delim'; the original read the
+        # non-existent attribute 'file_delimiter' (AttributeError).
+        ann_file_delimiter = args.file_delim
+
+    generate_statistics(args.annotationFile,
+                        ann_file_delimiter, region, in_bam_file, args.skip_first_line)
+
+    logging.info('StatisticsGenerator: Program Completed')
+
+
+# utility
+
+def check_observed(observed_couple):
+    # observed are in the form A/T, T/-, -/C
+    regexpr_snp = re.compile(r'^(\-|\w+)\/(\-|\w+)')
+    couple = regexpr_snp.search(observed_couple)
+    return couple.group(1), couple.group(2)
+
+
+def file_len(file):
+    # return the length (number of rows) of a file
+    # FIX: initialise the counter so an empty file returns 0 instead of
+    # raising NameError, and return i + 1 (enumerate is 0-based).
+    i = -1
+    with open(file) as f:
+        for i, l in enumerate(f):
+            pass
+    return i + 1
+
+
+def prepare_loggers(out_dir, log_level):
+    logging.basicConfig(filename=out_dir + 'statistics.txt',
+                        filemode='w',
+                        level=log_level,
+                        format='%(levelname)-8s [%(asctime)s] %(message)s',
+                        datefmt='%y%m%d %H:%M:%S')
+    console = logging.StreamHandler()
+    console.setLevel(logging.INFO)
+    formatter = logging.Formatter(
+        '[%(levelname)-8s] %(asctime)s - %(message)s')
+    console.setFormatter(formatter)
+    logging.getLogger('').addHandler(console)
+
+
+def check_region(input_region):
+    # if no region is set, is used all the 1st chromosome (for test purpose)
+    if input_region is not None:
+        regexp_reg = re.compile(r'^(\w+):(\d+)-(\d+)')
+        region = re.search(regexp_reg, input_region)
+        if region is not None:
+            in_chr = region.group(1)
+            in_start = int(region.group(2))
+            in_stop = int(region.group(3))
+            if in_start > in_stop:
+                logging.error('Wrong input region.')
+                sys.exit(1)
+            else:
+                return in_chr, in_start, in_stop
+        # FIX: the original fell through and returned None on a malformed
+        # region string, crashing generate_statistics later; fail fast.
+        logging.error('Wrong input region.')
+        sys.exit(1)
+    else:
+        return '1'
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/input-bam/statistics_v2.py b/src/input-bam/statistics_v2.py
new file mode 100644
index 00000000..41169dc9
--- /dev/null
+++ b/src/input-bam/statistics_v2.py
@@ -0,0 +1,244 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import argparse
+import logging
+import os
+import re
+import sys
+from collections import Counter
+import time
+
+import progressbar
+import pysam
+
+
+def main():
+
+    parser = argparse.ArgumentParser(
+        description='Generate statistics using alignment BAM file and SNPs annotation file')
+    parser.add_argument('-b',
+                        action='store',
+                        dest='fileBAM',
+                        help='Alignment file in BAM format.',
+                        required=True)
+    parser.add_argument('-a',
+                        action='store',
+                        dest='annotationFile',
+                        help='SNPs location annotation file.',
+                        required=True)
+    parser.add_argument('-o',
+                        action='store',
+                        dest='output_dir',
+                        help='Output (root) directory.',
+                        required=True)
+    parser.add_argument('-skip-first-line',
action='store_true',
+                        dest='skip_first_line',
+                        help='set this flag if the first line IS NOT the header')
+    parser.add_argument('-epsilon',
+                        action='store',
+                        dest='threshold',
+                        help='set the threshold for Homozygous or Heterozygous. Default: 80')
+    parser.add_argument('-file-delimiter',
+                        action='store',
+                        dest='file_delim',
+                        help='set the file delimiter for the annotation file. Default: \\t')
+    parser.add_argument('-v',
+                        help='increase output verbosity',
+                        dest='verbosity',
+                        action='store_true')
+
+    args = parser.parse_args()
+
+    in_bam_file = args.fileBAM
+    out_root_dir = args.output_dir
+
+    if args.verbosity:
+        log_level = logging.DEBUG
+    else:
+        log_level = logging.INFO
+
+    if not os.path.exists(out_root_dir):
+        logging.error('Output dir not found.')
+        sys.exit(1)
+
+    out_dir = out_root_dir + '/'
+
+    if not os.path.exists(out_dir):
+        os.mkdir(out_dir)
+
+    prepare_loggers(log_level)
+
+    logging.info('Program Started')
+
+    if args.file_delim is None:
+        ann_file_delimiter = '\t'
+    else:
+        # FIX: the argparse dest is 'file_delim'; the original read the
+        # non-existent attribute 'file_delimiter' (AttributeError).
+        ann_file_delimiter = args.file_delim
+
+    threshold = 80
+    if args.threshold is not None:
+        threshold = int(args.threshold)
+
+    logging.info('#### STEP 1 - Loading SNPs ####')
+
+    snp_map = reading_snp(args.annotationFile,
+                          ann_file_delimiter, args.skip_first_line)
+
+    logging.info('#### STEP 2 - Generating Statistics ####')
+
+    classify_snp(in_bam_file, snp_map, threshold, out_dir)
+
+    logging.info('Program Completed')
+
+
+# utility
+def prepare_loggers(log_level):
+    logging.basicConfig(level=log_level,
+                        format='%(levelname)-8s [%(asctime)s] %(message)s',
+                        datefmt='%y%m%d %H:%M:%S')
+
+
+def reading_snp(annotation_file, file_delimiter, flag_first_line):
+    # Loads the SNP annotation file into {name: [chrom, start, stop]}.
+    logging.debug('Started method reading_snp')
+    start_time = time.time()
+    ann_file = open(annotation_file)
+
+    # skip first line (comment if the file starts with the header)
+    if flag_first_line:
+        ann_file.readline()
+
+    # read header info
+    header = ann_file.readline().split(file_delimiter)
+
+    # the fields to read
+    snp_header_index_chrom = header.index('chrom')
+    snp_header_index_start = header.index('chromStart')
+    snp_header_index_stop = header.index('chromEnd')
+    snp_header_index_name = header.index('name')
+    snp_map = {}
+    for line in ann_file:
+
+        splitted_line = line.split(file_delimiter)
+
+        # FIX: tolerate chromosome names without the 'chr' prefix; the
+        # original split('chr', 1)[1] raised IndexError on e.g. '1'.
+        chrom_field = splitted_line[snp_header_index_chrom]
+        if chrom_field.startswith('chr'):
+            snp_chrom = chrom_field.split('chr', 1)[1]
+        else:
+            snp_chrom = chrom_field
+        snp_start = int(splitted_line[snp_header_index_start])
+        snp_stop = int(splitted_line[snp_header_index_stop])
+        snp_name = splitted_line[snp_header_index_name]
+
+        snp_map[snp_name] = [snp_chrom, snp_start, snp_stop]
+
+    ann_file.close()
+
+    logging.debug('Finished method reading_snp in: {0} seconds'.format(
+        time.time() - start_time))
+    return snp_map
+
+
+def classify_snp(alignment_file, snp_map, threshold, out_dir):
+    # Classifies each SNP as Homozygous or Heterozygous from the allele
+    # counts of the reads overlapping it; writes out_dir/statistics2.txt.
+    logging.debug('Started method classifySNP')
+    start_time = time.time()
+    count_homo = 0
+    count_hete = 0
+    snp_total_number = len(snp_map)
+    statistics_file = open(out_dir + 'statistics2.txt', 'w')
+
+    # FIX: avoid ZeroDivisionError in the summary (and a zero-width progress
+    # bar) when the annotation yields no SNPs.
+    if snp_total_number == 0:
+        statistics_file.write('Total number of SNPs: 0\n')
+        statistics_file.close()
+        logging.debug('No SNPs to classify')
+        return
+
+    in_sam = pysam.AlignmentFile(alignment_file, 'rb')
+    prgbar = progressbar.ProgressBar(maxval=snp_total_number).start()
+    cnt = 0
+
+    for snp_name, snp_values in snp_map.items():
+        statistics_file.write('Reading SNP: {0}\n'.format(snp_name))
+        cnt += 1
+        prgbar.update(cnt)
+        snp_chrom = snp_values[0]
+        snp_start = snp_values[1]
+        snp_end = snp_values[2]
+
+        logging.debug(
+            'SNP start: {0} - SNP end:{1}'.format(snp_start, snp_end))
+
+        fetch_aln = in_sam.fetch(snp_chrom, snp_start, snp_end)
+        count_aln = in_sam.count(snp_chrom, snp_start, snp_end)
+
+        alleles = Counter()
+
+        for read in fetch_aln:
+            read_name = read.query_name
+            logging.debug('Reading read: {0}'.format(read_name))
+            read_sequence = read.query_sequence
+            # FIX: the original sliced the read with reference coordinates
+            # (read_sequence[snp_start:snp_end]), extracting the wrong base;
+            # convert to an offset within the read.
+            # NOTE(review): this ignores indels/soft-clips in the CIGAR —
+            # a pileup-based lookup would be exact.
+            offset = snp_start - read.reference_start
+            if offset < 0 or offset + (snp_end - snp_start) > len(read_sequence):
+                logging.debug('SNP outside read - skipped')
+                continue
+            read_base = read_sequence[offset:offset + (snp_end - snp_start)]
+            if not read_base:
+                logging.debug('Empty read - indel')
+                continue
+            logging.debug('Read base: {0}'.format(read_base))
+            alleles[read_base] += 1
+
+        logging.debug(alleles)
+        statistics_file.write('Allele counter for the SNP:\n')
+        for key, value in alleles.items():
+            statistics_file.write('{0} : {1}\n'.format(key, value))
+
+        if alleles:
+            # FIX: most_common(2) has a single element when only one allele
+            # was observed; the original unconditional [1] raised IndexError.
+            top_alleles = alleles.most_common(2)
+            first_allele = top_alleles[0]
+            logging.debug(first_allele)
+            if len(top_alleles) > 1:
+                logging.debug(top_alleles[1])
+
+            if (first_allele[1] / count_aln) * 100 >= threshold:
+                statistics_file.write('Snp is Homozygous\n')
+                logging.debug('The snp {0} is: Homozygous'.format(snp_name))
+                count_homo += 1
+            else:
+                statistics_file.write('Snp is Heterozygous\n')
+                logging.debug('The snp {0} is: Heterozygous'.format(snp_name))
+                count_hete += 1
+
+        logging.debug(
+            'Number of read aligned on SNP: {0}'.format(count_aln))
+        statistics_file.write(
+            'Number of read aligned on SNP: {0}\n'.format(count_aln))
+
+        out = {}
+        if count_aln > 0:
+            for key in alleles:
+                out[key] = (float(alleles[key]) / count_aln) * 100
+
+        logging.debug(out)
+        statistics_file.write('Base Frequences for the SNP:\n')
+        for key, value in out.items():
+            statistics_file.write('{0} : {1}\n'.format(key, value))
+
+    prgbar.finish()
+    in_sam.close()
+
+    logging.debug(
+        'Total number of SNPs: {0}'.format(snp_total_number))
+
+    statistics_file.write(
+        'Total number of SNPs: {0}\n'.format(snp_total_number))
+
+    logging.debug(
+        'Number of SNPs: Homozygous {0} ({1}%)'.format(
+            count_homo, ((float(count_homo) / snp_total_number) * 100)))
+
+    statistics_file.write('Number of SNPs: Homozygous {0} ({1}%)\n'.format(
+        count_homo, ((float(count_homo) / snp_total_number) * 100)))
+
+    logging.debug(
+        'Number of SNPs: Heterozygous {0} ({1}%)'.format(
+            count_hete, ((float(count_hete) / snp_total_number) * 100)))
+
+    statistics_file.write('Number of SNPs: Heterozygous {0} ({1}%)\n'.format(
+        count_hete, ((float(count_hete) / snp_total_number) * 100)))
+
+    logging.debug('Finished method classifySNP in: {0} seconds'.format(
+        time.time() - start_time))
+
+    statistics_file.close()
+
+
+if __name__ == '__main__':
+    main()