-
Notifications
You must be signed in to change notification settings - Fork 4
Output vcf #1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
Nara88
wants to merge
11
commits into
AlgoLab:master
Choose a base branch
from
Nara88:output-vcf
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Output vcf #1
Changes from all commits
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
552de14
inserito lo script python che a partire da un file di allineamenti e …
ddcatania 22d3c35
Inserimento script per conversione BAM to WIF. Da completare!
ddcatania 9c219b2
qualche correzione ai log
ddcatania dcf3cc3
qualche correzione ai log
ddcatania 3bcd3a1
Aggiunta possibilità di decidere in fase di esecuzione dello script i…
ddcatania 1f0a0ef
fix sui log
ddcatania 1cb00ed
nuova versione. Carica in memoria la mappa di SNPs, la scorre e per o…
ddcatania 4ee1395
Calcola i due alleli con maggior frequenza e secondo una certa soglia…
ddcatania d6d64bc
Add files via upload
Nara88 268aa70
Delete Vcf_conversion.rb
Nara88 5f32b74
src/output-vcf/Vcf_conversion.rb
Nara88 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,105 @@ | ||
| #Methods | ||
|
|
||
| def get_line_from_file(path, line) | ||
| result = nil | ||
| File.open(path, "r") do |f| | ||
| while line > 0 | ||
| line -= 1 | ||
| result = f.gets | ||
| result ||= '' | ||
| result = result.chomp | ||
| end | ||
| end | ||
| return result | ||
| end | ||
|
|
||
| def write_header(output_path) | ||
| formato = "##fileformat=VCFv4.2\n" | ||
| colonne = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n" | ||
| File.open(output_path, "w") { |file| file.write(formato + colonne) } | ||
| end | ||
|
|
||
| def write_line(output_path, chrom, pos, id, ref, alt, qual, filter, info, format, sample) | ||
| File.open(output_path, "a") { |file| file.write(chrom + "\t" + pos + "\t" + id + "\t" + ref + "\t" + alt + "\t" + qual + "\t" + filter + "\t" + info + "\t" + format + "\t" + sample + "\n") } | ||
| end | ||
|
|
||
| def find_alt(snp_line, ref) | ||
|
|
||
| alternatives = snp_line[9].to_s.split "/" | ||
| alt = "." | ||
| if (alternatives.length == 2) | ||
| alternatives.each do |a| | ||
| if a.to_s != ref | ||
| alt = a.to_s | ||
| end | ||
| end | ||
| else | ||
| puts "Error: bihallelic field" | ||
| ref = "X" | ||
| end | ||
|
|
||
| return alt | ||
|
|
||
| end | ||
|
|
||
|
|
||
| def create_line(snp_path, current_line, current_father, current_mother, output) | ||
|
|
||
| current_snp_line = current_line + 3 | ||
|
|
||
| snp_line = get_line_from_file(snp_path, current_snp_line).split "\t" | ||
|
|
||
| chrom = snp_line[1].to_s | ||
|
|
||
| pos = (snp_line[2].to_i + 1).to_s | ||
|
|
||
| id = "." | ||
|
|
||
| ref = snp_line[7].to_s | ||
|
|
||
| alt = find_alt(snp_line, ref) | ||
|
|
||
| qual = "." | ||
|
|
||
| filter = "PASS" | ||
|
|
||
| info = "." | ||
|
|
||
| format = "GT" | ||
|
|
||
| sample = current_father.to_s + "|" + current_mother.to_s | ||
|
|
||
| write_line(output, chrom, pos, id, ref, alt, qual, filter, info, format, sample) | ||
|
|
||
| end | ||
|
|
||
| def write_body(haplotypes, snp, output) | ||
|
|
||
| current_line = 0 | ||
| father = get_line_from_file(haplotypes, 1).split('') | ||
| mother = get_line_from_file(haplotypes, 2).split('') | ||
|
|
||
| father.each do |i| | ||
| create_line(snp, current_line, father[current_line], mother[current_line], output) | ||
| current_line += 1 | ||
| end | ||
|
|
||
| end | ||
|
|
||
| def create_vcf(haplotypes_path, snp_path, output_path) | ||
|
|
||
| snp_lines = `wc -l "#{snp_path}"`.strip.split(' ')[0].to_i - 3 | ||
|
|
||
| if (get_line_from_file(haplotypes_path, 1).length == snp_lines) | ||
| write_header(output_path) | ||
| write_body(haplotypes_path, snp_path, output_path) | ||
| else | ||
| puts "Error: non-matching files" | ||
| end | ||
|
|
||
| end | ||
|
|
||
|
|
||
| #Vcf file creation | ||
|
|
||
| create_vcf(haplotypes_path, snp_path, output_path) | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,205 @@ | ||
| #!/usr/bin/env python3 | ||
| # - * -coding: utf - 8 - * - | ||
|
|
||
| import logging | ||
| import os | ||
| import sys | ||
| import argparse | ||
| import progressbar | ||
| import re | ||
| import pprint | ||
|
|
||
| import pysam | ||
|
|
||
|
|
||
| def check_observed(observed_couple): | ||
| # observed are in the form A/T, T/-, -/C | ||
| regexpr_snp = re.compile(r'^(\-|\w+)\/(\-|\w+)') | ||
| couple = regexpr_snp.search(observed_couple) | ||
| return couple.group(1), couple.group(2) | ||
|
|
||
|
|
||
| def file_len(file): | ||
| # return the length (number of rows) of a file | ||
| with open(file) as f: | ||
| for i, l in enumerate(f): | ||
| pass | ||
| return i | ||
|
|
||
|
|
||
| def snpsInRead(alignment_file, annotation_file, file_delimiter, skip_first_line): | ||
|
|
||
| in_sam = pysam.AlignmentFile(alignment_file, 'rb') | ||
|
|
||
| # return the number of SNPs in the file | ||
| # snp_total_number = file_len(annotation_file) | ||
|
|
||
| total_fetch_alignment = in_sam.fetch() | ||
| total_read_number = in_sam.count() | ||
|
|
||
| read_count = 0 | ||
| read_bar = progressbar.ProgressBar(maxval=total_read_number).start() | ||
| snp_read_list = {} | ||
| for read in total_fetch_alignment: | ||
| read_count += 1 | ||
| read_bar.update(read_count) | ||
|
|
||
| read_qname = read.qname | ||
| read_start = read.reference_start | ||
| read_end = read.reference_end | ||
| logging.debug('read {0}, start: {1}, end: {2}'.format( | ||
| read_qname, read_start, read_end)) | ||
|
|
||
| ann_file = open(annotation_file) | ||
|
|
||
| # skip first line (comment if the file starts with the header) | ||
| if skip_first_line: | ||
| ann_file.readline() | ||
|
|
||
| # Header info - first line | ||
| header = ann_file.readline().split(file_delimiter) | ||
|
|
||
| # the fields to read | ||
| snp_header_index_chromosome = header.index('chrom') | ||
| snp_header_index_start = header.index('chromStart') | ||
| snp_header_index_stop = header.index('chromEnd') | ||
| snp_header_index_observed = header.index('observed') | ||
| snp_read_list[read_qname] = [] | ||
| # snp_bar = progressbar.ProgressBar(maxval=snp_total_number).start() | ||
| # snp_count = 0 | ||
| for line in ann_file: | ||
| # snp_count += 1 | ||
| # snp_bar.update(snp_count) | ||
|
|
||
| splitted_line = line.split(file_delimiter) | ||
|
|
||
| snp_chrom = splitted_line[snp_header_index_chromosome] | ||
| snp_start = int(splitted_line[snp_header_index_start]) | ||
| snp_stop = int(splitted_line[snp_header_index_stop]) | ||
| first_observed, second_observed = check_observed( | ||
| splitted_line[snp_header_index_observed]) | ||
|
|
||
| # Check every read aligned with the SNPs | ||
| for pileupcolumn in in_sam.pileup(snp_chrom, snp_start, snp_stop): | ||
| for pileupread in pileupcolumn.pileups: | ||
| read_snp_name = pileupread.alignment.query_name | ||
| logging.debug('Found read: {0}'.format(read_snp_name)) | ||
| if read_snp_name == read_qname: | ||
| if not pileupread.is_del and not pileupread.is_refskip: | ||
| read_snp_position = pileupread.query_position | ||
| base_value = pileupread.alignment.query_sequence[ | ||
| pileupread.query_position] | ||
| if base_value == first_observed: | ||
| allele = '0' | ||
| elif base_value == second_observed: | ||
| allele = '1' | ||
| else: | ||
| # skip the SNP if not in first or second allele | ||
| continue | ||
| base_quality = pileupread.alignment.query_alignment_qualities[ | ||
| read_snp_position] | ||
| # if read_snp_name == read_qname and | ||
| # read_snp_position >= read_start and | ||
| # read_snp_position <= read_end: | ||
|
|
||
| snp_read_list[read_qname].append( | ||
| [read_snp_position, base_value, allele, base_quality]) | ||
|
|
||
| ann_file.close() | ||
| snp_read_list[read_qname].append(read.mapping_quality) | ||
| # snp_bar.finish() | ||
| read_bar.finish() | ||
| return snp_read_list | ||
|
|
||
|
|
||
| def convertBamToWif(snp_read_list, out_dir): | ||
|
|
||
| output_file = open(out_dir + 'alignment.wif', 'w') | ||
|
|
||
| for read, snp_list in snp_read_list.items(): | ||
| if len(snp_list) <= 1: | ||
| continue | ||
| else: | ||
| snp_mapping_quality = snp_list[-1] | ||
| for snp in snp_list[:-1]: | ||
| output_file.write('{0} {1} {2} {3} : '.format( | ||
| snp[0], snp[1], snp[2], snp[3])) | ||
| output_file.write('# {0} : NA\n'.format(snp_mapping_quality)) | ||
|
|
||
|
|
||
| def main(): | ||
|
|
||
| parser = argparse.ArgumentParser( | ||
| description='Convert an alignment file .bam in a .wif Hapcol compatible file') | ||
| parser.add_argument('-b', | ||
| action='store', | ||
| dest='fileBAM', | ||
| help='Alignment file in BAM format.', | ||
| required=True) | ||
| parser.add_argument('-a', | ||
| action='store', | ||
| dest='annotationFile', | ||
| help='SNPs annotation file.', | ||
| required=True) | ||
| parser.add_argument('-o', | ||
| action='store', | ||
| dest='output_dir', | ||
| help='Output (root) directory.', | ||
| required=True) | ||
| parser.add_argument('-skip-first-line', | ||
| action='store', | ||
| dest='skip_first_line', | ||
| help='set this flag if the first line IS NOT the header') | ||
| parser.add_argument('-file-delimiter', | ||
| action='store', | ||
| dest='file_delim', | ||
| help='set the file delimiter for the annotation file. Default: \\t') | ||
| parser.add_argument('-v', | ||
| help='increase output verbosity', | ||
| action='count') | ||
|
|
||
| args = parser.parse_args() | ||
|
|
||
| logging.info('#### STEP 1 - PROCESSING INPUT ARGUMENTS ####') | ||
|
|
||
| in_bam_file = args.fileBAM | ||
| out_root_dir = args.output_dir | ||
|
|
||
| if args.v == 0: | ||
| log_level = logging.INFO | ||
| else: | ||
| log_level = logging.DEBUG | ||
|
|
||
| logging.basicConfig(level=log_level, | ||
| format='%(levelname)-8s [%(asctime)s] %(message)s', | ||
| datefmt='%y%m%d %H%M%S') | ||
|
|
||
| if not os.path.exists(out_root_dir): | ||
| logging.error('Output dir not found.') | ||
| sys.exit(1) | ||
|
|
||
| logging.info('BamToWif: Program Started') | ||
|
|
||
| out_dir = out_root_dir + '/' | ||
|
|
||
| if not os.path.exists(out_dir): | ||
| os.mkdir(out_dir) | ||
|
|
||
| if args.file_delim is None: | ||
| ann_file_delimiter = '\t' | ||
| else: | ||
| ann_file_delimiter = args.file_delimiter | ||
|
|
||
| logging.info('#### STEP 2 - RETREIVING SNPs FOR READS ####') | ||
|
|
||
| snps_list = snpsInRead(in_bam_file, args.annotationFile, | ||
| ann_file_delimiter, args.skip_first_line) | ||
|
|
||
| logging.info('#### STEP 3 - CONVERTING BAM TO WIF ####') | ||
|
|
||
| convertBamToWif(snps_list, out_dir) | ||
|
|
||
| logging.info('BamToWif: Program Completed') | ||
|
|
||
| if __name__ == '__main__': | ||
| main() |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Variable names, commit messages, and everything else must be written in English