From 552de149b21bdf5ceede0607ea3553c220db80f3 Mon Sep 17 00:00:00 2001 From: ddcatania Date: Tue, 17 May 2016 21:15:01 +0200 Subject: [PATCH 01/11] inserito lo script python che a partire da un file di allineamenti e un file di annotazioni su SNPs restituisce calcola alcune statistiche. --- src/input-bam/statistics.py | 264 ++++++++++++++++++++++++++++++++++++ 1 file changed, 264 insertions(+) create mode 100644 src/input-bam/statistics.py diff --git a/src/input-bam/statistics.py b/src/input-bam/statistics.py new file mode 100644 index 00000000..7c5df217 --- /dev/null +++ b/src/input-bam/statistics.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python3 +# - * -coding: utf - 8 - * - + +import argparse +import logging +import os +import re +import sys + +import progressbar +import pysam + + +def generate_statistics(annotation_file, file_delimiter, region, alignment_file, out_dir): + in_sam = pysam.AlignmentFile(alignment_file, 'rb') + + # return the number of SNPs in the file + snp_total_number = file_len(annotation_file) + logging.info('Number of Total SNPs: {0}'.format(snp_total_number)) + ann_file = open(annotation_file) + + # skip first line (comment if the file starts with the header) + ann_file.readline() + + # read header info + header = ann_file.readline().split(file_delimiter) + + # the fields to read + snp_header_index_start = header.index('chromStart') + snp_header_index_stop = header.index('chromEnd') + snp_header_index_name = header.index('name') + snp_header_observed = header.index('observed') + + statistics_file = open(out_dir + 'statistics.txt', 'w') + + # read annotation file + snp_valid = 0 # SNPs with almost 1 read aligned + snp_not_valid = 0 # SNPs with no read aligned + snp_count = 0 + read_with_first_allele = 0 + read_with_second_allele = 0 + read_with_different_allele = 0 + prgbar = progressbar.ProgressBar(maxval=snp_total_number).start() + for line in ann_file: + snp_count += 1 + prgbar.update(snp_count) + + splitted_line = line.split(file_delimiter) + 
+ snp_start = int(splitted_line[snp_header_index_start]) + snp_stop = int(splitted_line[snp_header_index_stop]) + snp_name = splitted_line[snp_header_index_name] + first_observed, second_observed = check_observed( + splitted_line[snp_header_observed]) + + frequences_observed = {'first': 0, 'second': 0, 'others': 0} + + logging.info('Loaded snp: {0}'.format(snp_name)) + statistics_file.write('\nLoaded snp: {0}'.format(snp_name)) + frequences = {'A': 0, 'T': 0, 'C': 0, 'G': 0} + count_read = 0 + for pileupcolumn in in_sam.pileup(region[0], snp_start, snp_stop): + + logging.debug('Coverage at base {0} = {1}'.format( + pileupcolumn.pos, pileupcolumn.n)) + for pileupread in pileupcolumn.pileups: + if not pileupread.is_del and not pileupread.is_refskip: + count_read += 1 + base_value = pileupread.alignment.query_sequence[ + pileupread.query_position] + logging.debug('Base in read = {0}, First observed = {1}, Second observed = {2}'.format( + base_value, first_observed, second_observed)) + frequences[base_value] += 1 + if base_value == first_observed: + frequences_observed['first'] += 1 + read_with_first_allele += 1 + elif base_value == second_observed: + frequences_observed['second'] += 1 + read_with_second_allele += 1 + else: + frequences_observed['others'] += 1 + read_with_different_allele += 1 + logging.debug(frequences) + + if count_read != 0: + snp_valid += 1 + # number of reads aligned with the SNP + logging.info( + '\nTotal number of read aligned with SNP: {0}'.format(count_read)) + statistics_file.write( + '\nTotal number of read aligned with SNP: {0}'.format(count_read)) + + # number of reads aligned with the SNP in the first allele + logging.info('Number of read aligned with the first allele: {0}'.format( + frequences_observed['first'])) + statistics_file.write('\nNumber of read aligned with the first allele: {0}'.format( + frequences_observed['first'])) + + # reads aligned with the SNP in the first allele over all + logging.info('Percentual of read aligned with 
the first allele: {0}'.format(float( + frequences_observed['first']) / count_read * 100)) + statistics_file.write('\nPercentual of read aligned with the first allele: {0}'.format(float( + frequences_observed['first']) / count_read * 100)) + + # number of reads aligned with the SNP in the second allele + logging.info('Number of read aligned with the second allele: {0}'.format( + frequences_observed['second'])) + statistics_file.write('\nNumber of read aligned with the second allele: {0}'.format( + frequences_observed['second'])) + + # reads aligned with the SNP in the second allele over all + logging.info('Percentual of read aligned with the second allele: {0}'.format(float( + frequences_observed['second']) / count_read * 100)) + statistics_file.write('\nPercentual of read aligned with the second allele: {0}'.format(float( + frequences_observed['second']) / count_read * 100)) + + # reads aligned with the SNP - no first or second allele + logging.info('Number of read aligned with different allele: {0}'.format( + frequences_observed['others'])) + statistics_file.write('\nPercentual of read aligned with different allele:: {0}'.format( + float(frequences_observed['others']) / count_read) * 100) + + else: + snp_not_valid += 1 + logging.info( + '\nNo reads aligned with this snp: {0}'.format(snp_name)) + statistics_file.write( + '\nNo reads aligned with this snp: {0}'.format(snp_name)) + + prgbar.finish() + logging.info( + '\n*************************************************************') + logging.info('Processed {0} SNPs'.format(snp_count)) + statistics_file.write('\nProcessed {0} SNPs'.format(snp_count)) + + logging.info('SNPs with almost one read aligned (%): {0}'.format( + float(snp_valid) / snp_count) * 100) + statistics_file.write('\nSNPs with almost one read aligned (%): {0}'.format( + float(snp_valid) / snp_count) * 100) + logging.info('SNPs with no read aligned (%): {0}'.format( + float(snp_not_valid) / snp_count) * 100) + statistics_file.write('\nSNPs with no 
read aligned (%): {0}'.format( + float(snp_not_valid) / snp_count) * 100) + + logging.info('Total number of read aligned with the first allele: {0}'.format( + read_with_first_allele)) + statistics_file.write( + '\nTotal number of read aligned with the first allele: {0}'.format(read_with_first_allele)) + + logging.info('Total number of read aligned with the second allele: {0}'.format( + read_with_second_allele)) + statistics_file.write( + '\nTotal number of read aligned with the second allele: {0}'.format(read_with_second_allele)) + logging.info('Total number of read aligned with different allele: {0}'.format( + read_with_different_allele)) + statistics_file.write( + '\nTotal number of read aligned with different allele: {0}'.format(read_with_different_allele)) + + logging.info('Generation of statistics: Completed') + statistics_file.close() + ann_file.close() + in_sam.close() + + +def main(): + + parser = argparse.ArgumentParser( + description='Generate statistics using alignment BAM file and SNPs annotation file') + parser.add_argument('-b', + action='store', + dest='fileBAM', + help='Alignment file in BAM format.', + required=True) + parser.add_argument('-a', + action='store', + dest='annotationFile', + help='SNPs location annotation file.', + required=True) + parser.add_argument('-r', + action='store', + dest='region', + help='Region to be examined in the form: chr:start-stop. Ex. 
1:100-200.') + parser.add_argument('-o', + action='store', + dest='output_dir', + help='Output (root) directory.', + required=True) + parser.add_argument('-v', + help='increase output verbosity', + action='count') + + args = parser.parse_args() + + in_bam_file = args.fileBAM + in_region = args.region + out_root_dir = args.output_dir + + if args.v is None: + log_level = logging.INFO + else: + log_level = logging.DEBUG + + logging.basicConfig(level=log_level, + format='%(levelname)-8s [%(asctime)s] %(message)s', + datefmt='%y%m%d %H%M%S') + + if not os.path.exists(out_root_dir): + logging.error('Output dir not found.') + sys.exit(1) + + logging.info('StatisticsGenerator: Program Started') + + region = check_region(in_region) + + out_dir = out_root_dir + '/' + + if not os.path.exists(out_dir): + os.mkdir(out_dir) + + ann_file_delimiter = '\t' + + generate_statistics(args.annotationFile, + ann_file_delimiter, region, in_bam_file, out_dir) + + logging.info('StatisticsGenerator: Program Completed') + + +# utility + +def check_observed(observed_couple): + # observed are in the form A/T, T/-, -/C + regexpr_snp = re.compile(r'^(\-|\w+)\/(\-|\w+)') + couple = regexpr_snp.search(observed_couple) + return couple.group(1), couple.group(2) + + +def file_len(file): + # return the length (number of rows) of a file + with open(file) as f: + for i, l in enumerate(f): + pass + return i + + +def check_region(input_region): + # if no region is set, is used all the 1st chromosome (for test purpose) + if input_region is not None: + regexp_reg = re.compile(r'^(\w+):(\d+)-(\d+)') + region = re.search(regexp_reg, input_region) + if region is not None: + in_chr = region.group(1) + in_start = int(region.group(2)) + in_stop = int(region.group(3)) + if in_start > in_stop: + logging.error('Wrong input region.') + sys.exit(1) + else: + return in_chr, in_start, in_stop + else: + return '1' + + +if __name__ == '__main__': + main() From 22d3c35bf54d7abdcfadebd28313128904268fe8 Mon Sep 17 00:00:00 
2001 From: ddcatania Date: Tue, 17 May 2016 23:03:45 +0200 Subject: [PATCH 02/11] Inserimento script per conversione BAM to WIF. Da completare! --- src/input-bam/bamToWif.py | 193 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100644 src/input-bam/bamToWif.py diff --git a/src/input-bam/bamToWif.py b/src/input-bam/bamToWif.py new file mode 100644 index 00000000..961947f3 --- /dev/null +++ b/src/input-bam/bamToWif.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python3 +# - * -coding: utf - 8 - * - + +import logging +import os +import sys +import argparse +import progressbar +import re +import pprint + +import pysam + + +def check_observed(observed_couple): + # observed are in the form A/T, T/-, -/C + regexpr_snp = re.compile(r'^(\-|\w+)\/(\-|\w+)') + couple = regexpr_snp.search(observed_couple) + return couple.group(1), couple.group(2) + + +def file_len(file): + # return the length (number of rows) of a file + with open(file) as f: + for i, l in enumerate(f): + pass + return i + + +def snpsInRead(alignment_file, annotation_file, file_delimiter): + + in_sam = pysam.AlignmentFile(alignment_file, 'rb') + + # return the number of SNPs in the file + # snp_total_number = file_len(annotation_file) + + total_fetch_alignment = in_sam.fetch() + total_read_number = in_sam.count() + + read_count = 0 + read_bar = progressbar.ProgressBar(maxval=total_read_number).start() + snp_read_list = {} + for read in total_fetch_alignment: + read_count += 1 + read_bar.update(read_count) + + read_qname = read.qname + read_start = read.reference_start + read_end = read.reference_end + logging.debug('read {0}, start: {1}, end: {2}'.format( + read_qname, read_start, read_end)) + + ann_file = open(annotation_file) + + # skip first line (comment if the file starts with the header) + # ann_file.readline() + + # Header info - first line + header = ann_file.readline().split(file_delimiter) + + # the fields to read + snp_header_index_chromosome = header.index('chrom') + 
snp_header_index_start = header.index('chromStart') + snp_header_index_stop = header.index('chromEnd') + snp_header_index_observed = header.index('observed') + snp_read_list[read_qname] = [] + # snp_bar = progressbar.ProgressBar(maxval=snp_total_number).start() + # snp_count = 0 + for line in ann_file: + # snp_count += 1 + # snp_bar.update(snp_count) + + splitted_line = line.split(file_delimiter) + + snp_chrom = splitted_line[snp_header_index_chromosome] + snp_start = int(splitted_line[snp_header_index_start]) + snp_stop = int(splitted_line[snp_header_index_stop]) + first_observed, second_observed = check_observed( + splitted_line[snp_header_index_observed]) + + # Check every read aligned with the SNPs + for pileupcolumn in in_sam.pileup(snp_chrom, snp_start, snp_stop): + for pileupread in pileupcolumn.pileups: + read_snp_name = pileupread.alignment.query_name + logging.debug('Found read: {0}'.format(read_snp_name)) + if read_snp_name == read_qname: + if not pileupread.is_del and not pileupread.is_refskip: + read_snp_position = pileupread.query_position + base_value = pileupread.alignment.query_sequence[ + pileupread.query_position] + if base_value == first_observed: + allele = '0' + elif base_value == second_observed: + allele = '1' + else: + # skip the SNP if not in first or second allele + continue + base_quality = pileupread.alignment.query_alignment_qualities[ + read_snp_position] + # if read_snp_name == read_qname and + # read_snp_position >= read_start and + # read_snp_position <= read_end: + + snp_read_list[read_qname].append( + [read_snp_position, base_value, allele, base_quality]) + + ann_file.close() + snp_read_list[read_qname].append(read.mapping_quality) + # snp_bar.finish() + read_bar.finish() + return snp_read_list + + +def convertBamToWif(snp_read_list, out_dir): + + output_file = open(out_dir + 'alignment.wif', 'w') + + for read, snp_list in snp_read_list.items(): + if len(snp_list) <= 1: + continue + else: + snp_mapping_quality = snp_list[-1] + 
for snp in snp_list[:-1]: + output_file.write('{0} {1} {2} {3} : '.format( + snp[0], snp[1], snp[2], snp[3])) + output_file.write('# {0} : NA\n'.format(snp_mapping_quality)) + + +def main(): + + parser = argparse.ArgumentParser( + description='Convert an alignment file .bam in a .wif Hapcol compatible file') + parser.add_argument('-b', + action='store', + dest='fileBAM', + help='Alignment file in BAM format.', + required=True) + parser.add_argument('-a', + action='store', + dest='annotationFile', + help='SNPs annotation file.', + required=True) + parser.add_argument('-o', + action='store', + dest='output_dir', + help='Output (root) directory.', + required=True) + parser.add_argument('-v', + help='increase output verbosity', + action='count') + + args = parser.parse_args() + + logging.info("#### STEP 1 - PROCESSING INPUT ARGUMENTS ####") + + in_bam_file = args.fileBAM + out_root_dir = args.output_dir + + if args.v == 0: + log_level = logging.INFO + else: + log_level = logging.DEBUG + + logging.basicConfig(level=log_level, + format='%(levelname)-8s [%(asctime)s] %(message)s', + datefmt='%y%m%d %H%M%S') + + if not os.path.exists(out_root_dir): + logging.error('Output dir not found.') + sys.exit(1) + + logging.info('BamToWif: Program Started') + + out_dir = out_root_dir + '/' + + if not os.path.exists(out_dir): + os.mkdir(out_dir) + + ann_file_delimiter = '\t' + + logging.info("#### STEP 2 - RETREIVING SNPs FOR READS ####") + + snps_list = snpsInRead(in_bam_file, args.annotationFile, + ann_file_delimiter) + + logging.info("#### STEP 3 - CONVERTING BAM TO WIF ####") + + convertBamToWif(snps_list, out_dir) + + logging.info('BamToWif: Program Completed') + +if __name__ == '__main__': + main() From 9c219b26230df544340d875f828c7d0660d6c530 Mon Sep 17 00:00:00 2001 From: ddcatania Date: Wed, 18 May 2016 21:56:21 +0200 Subject: [PATCH 03/11] qualche correzione ai log --- src/input-bam/statistics.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) 
diff --git a/src/input-bam/statistics.py b/src/input-bam/statistics.py index 7c5df217..9c81e13c 100644 --- a/src/input-bam/statistics.py +++ b/src/input-bam/statistics.py @@ -86,7 +86,7 @@ def generate_statistics(annotation_file, file_delimiter, region, alignment_file, snp_valid += 1 # number of reads aligned with the SNP logging.info( - '\nTotal number of read aligned with SNP: {0}'.format(count_read)) + 'Total number of read aligned with SNP: {0}'.format(count_read)) statistics_file.write( '\nTotal number of read aligned with SNP: {0}'.format(count_read)) @@ -96,7 +96,7 @@ def generate_statistics(annotation_file, file_delimiter, region, alignment_file, statistics_file.write('\nNumber of read aligned with the first allele: {0}'.format( frequences_observed['first'])) - # reads aligned with the SNP in the first allele over all + # reads aligned with the SNP in the first allele logging.info('Percentual of read aligned with the first allele: {0}'.format(float( frequences_observed['first']) / count_read * 100)) statistics_file.write('\nPercentual of read aligned with the first allele: {0}'.format(float( @@ -108,28 +108,36 @@ def generate_statistics(annotation_file, file_delimiter, region, alignment_file, statistics_file.write('\nNumber of read aligned with the second allele: {0}'.format( frequences_observed['second'])) - # reads aligned with the SNP in the second allele over all + # reads aligned with the SNP in the second allele logging.info('Percentual of read aligned with the second allele: {0}'.format(float( frequences_observed['second']) / count_read * 100)) statistics_file.write('\nPercentual of read aligned with the second allele: {0}'.format(float( frequences_observed['second']) / count_read * 100)) - # reads aligned with the SNP - no first or second allele + # reads aligned with the SNP - different allele logging.info('Number of read aligned with different allele: {0}'.format( frequences_observed['others'])) - statistics_file.write('\nPercentual of read 
aligned with different allele:: {0}'.format( + statistics_file.info('\nNumber of read aligned with different allele: {0}'.format( + frequences_observed['others'])) + + logging.info('Percentual of read aligned with different allele: {0}'.format( + float(frequences_observed['others']) / count_read) * 100) + statistics_file.write('\nPercentual of read aligned with different allele: {0}'.format( float(frequences_observed['others']) / count_read) * 100) else: snp_not_valid += 1 logging.info( - '\nNo reads aligned with this snp: {0}'.format(snp_name)) + 'No reads aligned with this snp: {0}'.format(snp_name)) statistics_file.write( '\nNo reads aligned with this snp: {0}'.format(snp_name)) prgbar.finish() logging.info( - '\n*************************************************************') + '#### TOTAL RESULTS ####') + statistics_file.write( + '\n#### TOTAL RESULTS ####') + logging.info('Processed {0} SNPs'.format(snp_count)) statistics_file.write('\nProcessed {0} SNPs'.format(snp_count)) From dcf3cc3c305cdc637f94cce4acb08de8564e8b93 Mon Sep 17 00:00:00 2001 From: ddcatania Date: Wed, 18 May 2016 21:57:50 +0200 Subject: [PATCH 04/11] qualche correzione ai log --- src/input-bam/bamToWif.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/input-bam/bamToWif.py b/src/input-bam/bamToWif.py index 961947f3..98f93cf9 100644 --- a/src/input-bam/bamToWif.py +++ b/src/input-bam/bamToWif.py @@ -151,7 +151,7 @@ def main(): args = parser.parse_args() - logging.info("#### STEP 1 - PROCESSING INPUT ARGUMENTS ####") + logging.info('#### STEP 1 - PROCESSING INPUT ARGUMENTS ####') in_bam_file = args.fileBAM out_root_dir = args.output_dir @@ -178,12 +178,12 @@ def main(): ann_file_delimiter = '\t' - logging.info("#### STEP 2 - RETREIVING SNPs FOR READS ####") + logging.info('#### STEP 2 - RETREIVING SNPs FOR READS ####') snps_list = snpsInRead(in_bam_file, args.annotationFile, ann_file_delimiter) - logging.info("#### STEP 3 - CONVERTING BAM TO WIF ####") + 
logging.info('#### STEP 3 - CONVERTING BAM TO WIF ####') convertBamToWif(snps_list, out_dir) From 3bcd3a11072b3c7bd9b90462d712f5787543ea64 Mon Sep 17 00:00:00 2001 From: ddcatania Date: Wed, 18 May 2016 22:16:48 +0200 Subject: [PATCH 05/11] =?UTF-8?q?Aggiunta=20possibilit=C3=A0=20di=20decide?= =?UTF-8?q?re=20in=20fase=20di=20esecuzione=20dello=20script=20il=20delimi?= =?UTF-8?q?tatore=20dei=20campi=20dei=20file=20di=20annotazione=20e=20lo?= =?UTF-8?q?=20skip=20della=20prima=20riga?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/input-bam/bamToWif.py | 20 ++++++++++++++++---- src/input-bam/statistics.py | 20 ++++++++++++++++---- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/src/input-bam/bamToWif.py b/src/input-bam/bamToWif.py index 98f93cf9..c7c24e53 100644 --- a/src/input-bam/bamToWif.py +++ b/src/input-bam/bamToWif.py @@ -27,7 +27,7 @@ def file_len(file): return i -def snpsInRead(alignment_file, annotation_file, file_delimiter): +def snpsInRead(alignment_file, annotation_file, file_delimiter, skip_first_line): in_sam = pysam.AlignmentFile(alignment_file, 'rb') @@ -53,7 +53,8 @@ def snpsInRead(alignment_file, annotation_file, file_delimiter): ann_file = open(annotation_file) # skip first line (comment if the file starts with the header) - # ann_file.readline() + if skip_first_line: + ann_file.readline() # Header info - first line header = ann_file.readline().split(file_delimiter) @@ -145,6 +146,14 @@ def main(): dest='output_dir', help='Output (root) directory.', required=True) + parser.add_argument('-skip-first-line', + action='store', + dest='skip_first_line', + help='set this flag if the first line IS NOT the header') + parser.add_argument('-file-delimiter', + action='store', + dest='file_delim', + help='set the file delimiter for the annotation file. 
Default: \\t') parser.add_argument('-v', help='increase output verbosity', action='count') @@ -176,12 +185,15 @@ def main(): if not os.path.exists(out_dir): os.mkdir(out_dir) - ann_file_delimiter = '\t' + if args.file_delim is None: + ann_file_delimiter = '\t' + else: + ann_file_delimiter = args.file_delimiter logging.info('#### STEP 2 - RETREIVING SNPs FOR READS ####') snps_list = snpsInRead(in_bam_file, args.annotationFile, - ann_file_delimiter) + ann_file_delimiter, args.skip_first_line) logging.info('#### STEP 3 - CONVERTING BAM TO WIF ####') diff --git a/src/input-bam/statistics.py b/src/input-bam/statistics.py index 9c81e13c..af123a39 100644 --- a/src/input-bam/statistics.py +++ b/src/input-bam/statistics.py @@ -11,7 +11,7 @@ import pysam -def generate_statistics(annotation_file, file_delimiter, region, alignment_file, out_dir): +def generate_statistics(annotation_file, file_delimiter, region, alignment_file, out_dir, skip_first_line): in_sam = pysam.AlignmentFile(alignment_file, 'rb') # return the number of SNPs in the file @@ -20,7 +20,8 @@ def generate_statistics(annotation_file, file_delimiter, region, alignment_file, ann_file = open(annotation_file) # skip first line (comment if the file starts with the header) - ann_file.readline() + if skip_first_line: + ann_file.readline() # read header info header = ann_file.readline().split(file_delimiter) @@ -193,6 +194,14 @@ def main(): dest='output_dir', help='Output (root) directory.', required=True) + parser.add_argument('-skip-first-line', + action='store', + dest='skip_first_line', + help='set this flag if the first line IS NOT the header') + parser.add_argument('-file-delimiter', + action='store', + dest='file_delim', + help='set the file delimiter for the annotation file. 
Default: \\t') parser.add_argument('-v', help='increase output verbosity', action='count') @@ -225,10 +234,13 @@ def main(): if not os.path.exists(out_dir): os.mkdir(out_dir) - ann_file_delimiter = '\t' + if args.file_delim is None: + ann_file_delimiter = '\t' + else: + ann_file_delimiter = args.file_delimiter generate_statistics(args.annotationFile, - ann_file_delimiter, region, in_bam_file, out_dir) + ann_file_delimiter, region, in_bam_file, out_dir, args.skip_first_line) logging.info('StatisticsGenerator: Program Completed') From 1f0a0ef6abbcd53904471609d779189700292ffa Mon Sep 17 00:00:00 2001 From: ddcatania Date: Sun, 22 May 2016 23:43:12 +0200 Subject: [PATCH 06/11] fix sui log --- src/input-bam/statistics.py | 107 ++++++++++++++---------------------- 1 file changed, 41 insertions(+), 66 deletions(-) diff --git a/src/input-bam/statistics.py b/src/input-bam/statistics.py index af123a39..a4225062 100644 --- a/src/input-bam/statistics.py +++ b/src/input-bam/statistics.py @@ -11,7 +11,7 @@ import pysam -def generate_statistics(annotation_file, file_delimiter, region, alignment_file, out_dir, skip_first_line): +def generate_statistics(annotation_file, file_delimiter, region, alignment_file, skip_first_line): in_sam = pysam.AlignmentFile(alignment_file, 'rb') # return the number of SNPs in the file @@ -32,8 +32,6 @@ def generate_statistics(annotation_file, file_delimiter, region, alignment_file, snp_header_index_name = header.index('name') snp_header_observed = header.index('observed') - statistics_file = open(out_dir + 'statistics.txt', 'w') - # read annotation file snp_valid = 0 # SNPs with almost 1 read aligned snp_not_valid = 0 # SNPs with no read aligned @@ -57,11 +55,10 @@ def generate_statistics(annotation_file, file_delimiter, region, alignment_file, frequences_observed = {'first': 0, 'second': 0, 'others': 0} logging.info('Loaded snp: {0}'.format(snp_name)) - statistics_file.write('\nLoaded snp: {0}'.format(snp_name)) + frequences = {'A': 0, 'T': 0, 'C': 
0, 'G': 0} count_read = 0 for pileupcolumn in in_sam.pileup(region[0], snp_start, snp_stop): - logging.debug('Coverage at base {0} = {1}'.format( pileupcolumn.pos, pileupcolumn.n)) for pileupread in pileupcolumn.pileups: @@ -88,85 +85,51 @@ def generate_statistics(annotation_file, file_delimiter, region, alignment_file, # number of reads aligned with the SNP logging.info( 'Total number of read aligned with SNP: {0}'.format(count_read)) - statistics_file.write( - '\nTotal number of read aligned with SNP: {0}'.format(count_read)) # number of reads aligned with the SNP in the first allele - logging.info('Number of read aligned with the first allele: {0}'.format( - frequences_observed['first'])) - statistics_file.write('\nNumber of read aligned with the first allele: {0}'.format( - frequences_observed['first'])) - - # reads aligned with the SNP in the first allele - logging.info('Percentual of read aligned with the first allele: {0}'.format(float( - frequences_observed['first']) / count_read * 100)) - statistics_file.write('\nPercentual of read aligned with the first allele: {0}'.format(float( - frequences_observed['first']) / count_read * 100)) + logging.info('Number of read aligned with the first allele: {0} ({1}%)'.format( + frequences_observed['first'], round((float( + frequences_observed['first']) / count_read) * 100, 2))) # number of reads aligned with the SNP in the second allele - logging.info('Number of read aligned with the second allele: {0}'.format( - frequences_observed['second'])) - statistics_file.write('\nNumber of read aligned with the second allele: {0}'.format( - frequences_observed['second'])) - - # reads aligned with the SNP in the second allele - logging.info('Percentual of read aligned with the second allele: {0}'.format(float( - frequences_observed['second']) / count_read * 100)) - statistics_file.write('\nPercentual of read aligned with the second allele: {0}'.format(float( - frequences_observed['second']) / count_read * 100)) + 
logging.info('Number of read aligned with the second allele: {0} ({1}%)'.format( + frequences_observed['second'], round((float( + frequences_observed['second']) / count_read) * 100, 2))) # reads aligned with the SNP - different allele - logging.info('Number of read aligned with different allele: {0}'.format( - frequences_observed['others'])) - statistics_file.info('\nNumber of read aligned with different allele: {0}'.format( - frequences_observed['others'])) - - logging.info('Percentual of read aligned with different allele: {0}'.format( - float(frequences_observed['others']) / count_read) * 100) - statistics_file.write('\nPercentual of read aligned with different allele: {0}'.format( - float(frequences_observed['others']) / count_read) * 100) + logging.info('Number of read aligned with different allele: {0} ({1}%)'.format( + frequences_observed['others'], round((float( + frequences_observed['others']) / count_read) * 100, 2))) else: snp_not_valid += 1 logging.info( 'No reads aligned with this snp: {0}'.format(snp_name)) - statistics_file.write( - '\nNo reads aligned with this snp: {0}'.format(snp_name)) prgbar.finish() logging.info( '#### TOTAL RESULTS ####') - statistics_file.write( - '\n#### TOTAL RESULTS ####') logging.info('Processed {0} SNPs'.format(snp_count)) - statistics_file.write('\nProcessed {0} SNPs'.format(snp_count)) - logging.info('SNPs with almost one read aligned (%): {0}'.format( - float(snp_valid) / snp_count) * 100) - statistics_file.write('\nSNPs with almost one read aligned (%): {0}'.format( - float(snp_valid) / snp_count) * 100) - logging.info('SNPs with no read aligned (%): {0}'.format( - float(snp_not_valid) / snp_count) * 100) - statistics_file.write('\nSNPs with no read aligned (%): {0}'.format( - float(snp_not_valid) / snp_count) * 100) + logging.info('SNPs with almost one read aligned: {0} ({1}%)'.format( + snp_valid, round(((float( + snp_valid) / snp_count) * 100), 2))) + + logging.info('SNPs with no read aligned: {0} ({1}%)'.format( 
+ snp_not_valid, round(((float( + snp_not_valid) / snp_count) * 100), 2))) logging.info('Total number of read aligned with the first allele: {0}'.format( read_with_first_allele)) - statistics_file.write( - '\nTotal number of read aligned with the first allele: {0}'.format(read_with_first_allele)) logging.info('Total number of read aligned with the second allele: {0}'.format( read_with_second_allele)) - statistics_file.write( - '\nTotal number of read aligned with the second allele: {0}'.format(read_with_second_allele)) + logging.info('Total number of read aligned with different allele: {0}'.format( read_with_different_allele)) - statistics_file.write( - '\nTotal number of read aligned with different allele: {0}'.format(read_with_different_allele)) logging.info('Generation of statistics: Completed') - statistics_file.close() ann_file.close() in_sam.close() @@ -195,7 +158,7 @@ def main(): help='Output (root) directory.', required=True) parser.add_argument('-skip-first-line', - action='store', + action='store_true', dest='skip_first_line', help='set this flag if the first line IS NOT the header') parser.add_argument('-file-delimiter', @@ -217,30 +180,28 @@ def main(): else: log_level = logging.DEBUG - logging.basicConfig(level=log_level, - format='%(levelname)-8s [%(asctime)s] %(message)s', - datefmt='%y%m%d %H%M%S') - if not os.path.exists(out_root_dir): logging.error('Output dir not found.') sys.exit(1) - logging.info('StatisticsGenerator: Program Started') - - region = check_region(in_region) - out_dir = out_root_dir + '/' if not os.path.exists(out_dir): os.mkdir(out_dir) + prepare_loggers(out_dir, log_level) + + logging.info('StatisticsGenerator: Program Started') + + region = check_region(in_region) + if args.file_delim is None: ann_file_delimiter = '\t' else: ann_file_delimiter = args.file_delimiter generate_statistics(args.annotationFile, - ann_file_delimiter, region, in_bam_file, out_dir, args.skip_first_line) + ann_file_delimiter, region, in_bam_file, 
args.skip_first_line) logging.info('StatisticsGenerator: Program Completed') @@ -262,6 +223,20 @@ def file_len(file): return i +def prepare_loggers(out_dir, log_level): + logging.basicConfig(filename=out_dir + 'statistics.txt', + filemode='w', + level=log_level, + format='%(levelname)-8s [%(asctime)s] %(message)s', + datefmt='%y%m%d %H:%M:%S') + console = logging.StreamHandler() + console.setLevel(logging.INFO) + formatter = logging.Formatter( + '[%(levelname)-8s] %(asctime)s - %(message)s') + console.setFormatter(formatter) + logging.getLogger('').addHandler(console) + + def check_region(input_region): # if no region is set, is used all the 1st chromosome (for test purpose) if input_region is not None: From 1cb00ed95d36f8f7d434389af71b031748bf9d99 Mon Sep 17 00:00:00 2001 From: ddcatania Date: Mon, 13 Jun 2016 23:44:48 +0200 Subject: [PATCH 07/11] nuova versione. Carica in memoria la mappa di SNPs, la scorre e per ogni read allineato allo snp controlla se questo si allinea con l'allele di maggioranza, di minoranza o altro. 
def generate_statistics(snp_map, alignment_file):
    """Count, over all SNPs, how many overlapping reads carry the first
    observed allele, the second observed allele, or some other base, and
    log the totals.

    Parameters:
        snp_map: dict mapping snp_name -> [chrom, start, end,
                 first_observed, second_observed] (as built by reading_snp).
        alignment_file: path to an indexed BAM file (fetch() needs the index).
    """
    logging.debug('Started method generate_statistics')
    start_time = time.time()

    in_sam = pysam.AlignmentFile(alignment_file, 'rb')

    snp_total_number = len(snp_map)

    snp_count = 0
    prgbar = progressbar.ProgressBar(maxval=snp_total_number).start()
    count_first_allele = 0
    count_second_allele = 0
    count_other_allele = 0

    read_count = 0
    for snp_name, snp_values in snp_map.items():
        snp_count += 1
        prgbar.update(snp_count)
        logging.debug('Loaded snp: {0}'.format(snp_name))
        snp_chrom, snp_start, snp_end, first_observed, second_observed = snp_values
        logging.debug(
            'Reading chromosome: {0}, Starting: {1} - Ending{2}'.format(
                snp_chrom, snp_start, snp_end))
        fetch_aln = in_sam.fetch(snp_chrom, snp_start, snp_end)
        tot_fetch_aln = in_sam.count(snp_chrom, snp_start, snp_end)
        logging.info(
            'Number of reads aligned with snp: {0}'.format(tot_fetch_aln))

        # FIX: 'tot_fetch_aln == 0 or tot_fetch_aln < 10' was redundant;
        # '< 10' already covers the zero case.
        if tot_fetch_aln < 10:
            logging.debug('SNP without at least ten items will be discarded.')
            continue

        for read in fetch_aln:
            read_count += 1
            logging.debug('Loaded read: {0}'.format(read.query_name))
            read_start = read.reference_start
            read_end = read.reference_end
            # FIX: reference_end is None for unmapped/unaligned reads; the
            # original raised TypeError on the comparison below.
            if read_end is None:
                continue
            read_sequence = read.query_sequence

            if snp_start >= read_start and snp_end <= read_end:
                # BUG FIX: the original sliced the read with absolute
                # reference coordinates (read_sequence[snp_start:snp_end]);
                # the SNP interval must first be translated into read-local
                # offsets. NOTE(review): this still assumes an ungapped
                # alignment across the SNP — an indel-aware version should
                # use read.get_aligned_pairs().
                offset = snp_start - read_start
                observed_base = read_sequence[offset:offset + (snp_end - snp_start)]
                logging.debug('In the read: {0}, in the SNP (first): {1}, in the SNP (second): {2}'.format(
                    observed_base, first_observed, second_observed))
                if observed_base == first_observed:
                    count_first_allele += 1
                elif observed_base == second_observed:
                    count_second_allele += 1
                elif not observed_base.strip():
                    logging.debug('No valid read found.')
                else:
                    count_other_allele += 1
    prgbar.finish()

    logging.info('Total number of SNPs: {0}'.format(snp_total_number))

    logging.info('Total number of reads: {0}'.format(read_count))

    # FIX: guard against ZeroDivisionError when no read overlapped any SNP.
    if read_count:
        logging.info('Total number of read aligned with the first allele: {0} ({1}%)'.format(
            count_first_allele, (float(count_first_allele) / read_count) * 100))

        logging.info('Total number of read aligned with the second allele: {0} ({1}%)'.format(
            count_second_allele, (float(count_second_allele) / read_count) * 100))

        logging.info('Total number of read aligned with other allele: {0} ({1}%)'.format(
            count_other_allele, (float(count_other_allele) / read_count) * 100))

    logging.debug('Finished method generate_statistics in {0} seconds'.format(
        time.time() - start_time))
    logging.info('Generation of statistics: Completed')
    in_sam.close()
def check_observed(observed_couple):
    """Split an 'observed' annotation value (e.g. 'A/T', 'T/-', '-/C')
    into its two alleles and return them as a pair of strings."""
    # Each side of the slash is either '-' (a deletion) or a run of
    # word characters; only the first two alternatives are kept.
    match = re.match(r'^(\-|\w+)\/(\-|\w+)', observed_couple)
    return match.groups()
snp_header_observed = header.index('observed') + snp_map = {} + for line in ann_file: + splitted_line = line.split(file_delimiter) + + snp_chrom = splitted_line[snp_header_index_chrom].split('chr', 1)[1] + snp_start = int(splitted_line[snp_header_index_start]) + snp_stop = int(splitted_line[snp_header_index_stop]) + snp_name = splitted_line[snp_header_index_name] + first_observed, second_observed = check_observed( + splitted_line[snp_header_observed]) + logging.debug('{0},{1},{2},{3}'.format( + snp_chrom, snp_start, snp_stop, snp_name)) + snp_map[snp_name] = [snp_chrom, snp_start, snp_stop, + first_observed, second_observed] + + ann_file.close() + + logging.debug('Finished method reading_snp in: {0} seconds'.format( + time.time() - start_time)) + + return snp_map + + +if __name__ == '__main__': + main() From 4ee1395b1c4aac7ddb311645ac8c9246b5ee6560 Mon Sep 17 00:00:00 2001 From: ddcatania Date: Tue, 21 Jun 2016 22:59:09 +0200 Subject: [PATCH 08/11] Calcola i due alleli con maggior frequenza e secondo una certa soglia 'epsilon' classifica gli SNP in omozigote o eterozigote --- src/input-bam/statistics_v2.py | 210 ++++++++++++++++++--------------- 1 file changed, 117 insertions(+), 93 deletions(-) diff --git a/src/input-bam/statistics_v2.py b/src/input-bam/statistics_v2.py index 87867e1b..41169dc9 100644 --- a/src/input-bam/statistics_v2.py +++ b/src/input-bam/statistics_v2.py @@ -6,85 +6,11 @@ import os import re import sys +from collections import Counter +import time import progressbar import pysam -import time - - -def generate_statistics(snp_map, alignment_file): - logging.debug('Started method generate_statistics') - start_time = time.time() - - in_sam = pysam.AlignmentFile(alignment_file, 'rb') - - snp_total_number = len(snp_map) - - snp_count = 0 - prgbar = progressbar.ProgressBar(maxval=snp_total_number).start() - count_first_allele = 0 - count_second_allele = 0 - count_other_allele = 0 - - read_count = 0 - for snp_name, snp_values in snp_map.items(): - 
def reading_snp(annotation_file, file_delimiter, flag_first_line):
    """Load the SNP annotation file into a dict.

    Parameters:
        annotation_file: path to a delimited annotation file with a header
            row containing at least 'chrom', 'chromStart', 'chromEnd', 'name'.
        file_delimiter: field separator (usually '\\t').
        flag_first_line: truthy when the FIRST line is a comment (not the
            header) and must be skipped.

    Returns:
        dict mapping snp name -> [chrom (without the 'chr' prefix),
        start (int), stop (int)].
    """
    logging.debug('Started method reading_snp')
    start_time = time.time()

    snp_map = {}
    # FIX: 'with' guarantees the file is closed even if a malformed line
    # raises; the original leaked the handle on error.
    with open(annotation_file) as ann_file:
        if flag_first_line:
            ann_file.readline()

        # Locate the needed columns from the header row.
        header = ann_file.readline().split(file_delimiter)
        snp_header_index_chrom = header.index('chrom')
        snp_header_index_start = header.index('chromStart')
        snp_header_index_stop = header.index('chromEnd')
        snp_header_index_name = header.index('name')

        for line in ann_file:
            splitted_line = line.split(file_delimiter)

            # FIX: use [-1] so a chromosome name without the 'chr' prefix
            # is kept as-is instead of raising IndexError ([1] required
            # the prefix to be present).
            snp_chrom = splitted_line[snp_header_index_chrom].split('chr', 1)[-1]
            snp_start = int(splitted_line[snp_header_index_start])
            snp_stop = int(splitted_line[snp_header_index_stop])
            snp_name = splitted_line[snp_header_index_name]

            snp_map[snp_name] = [snp_chrom, snp_start, snp_stop]

    logging.debug('Finished method reading_snp in: {0} seconds'.format(
        time.time() - start_time))
    return snp_map
def classify_snp(alignment_file, snp_map, threshold, out_dir):
    """Classify every SNP in snp_map as Homozygous or Heterozygous.

    For each SNP the bases of the overlapping reads are tallied; when the
    most frequent base accounts for at least `threshold` percent of the
    aligned reads the SNP is called Homozygous, otherwise Heterozygous.
    Per-SNP counts, frequencies and the final totals are written to
    <out_dir>statistics2.txt.

    Parameters:
        alignment_file: path to an indexed BAM file.
        snp_map: dict of snp_name -> [chrom, start, stop] (from reading_snp).
        threshold: homozygosity cutoff in percent (e.g. 80).
        out_dir: output directory, with trailing separator.
    """
    logging.debug('Started method classifySNP')
    start_time = time.time()
    count_homo = 0
    count_hete = 0
    snp_total_number = len(snp_map)

    # FIX: context managers guarantee the report file and the BAM are
    # closed even if a fetch fails midway (the original leaked both).
    with open(out_dir + 'statistics2.txt', 'w') as statistics_file, \
            pysam.AlignmentFile(alignment_file, 'rb') as in_sam:
        prgbar = progressbar.ProgressBar(maxval=snp_total_number).start()
        cnt = 0

        for snp_name, snp_values in snp_map.items():
            statistics_file.write('Reading SNP: {0}\n'.format(snp_name))
            cnt += 1
            prgbar.update(cnt)
            snp_chrom, snp_start, snp_end = snp_values

            logging.debug(
                'SNP start: {0} - SNP end:{1}'.format(snp_start, snp_end))

            fetch_aln = in_sam.fetch(snp_chrom, snp_start, snp_end)
            count_aln = in_sam.count(snp_chrom, snp_start, snp_end)

            alleles = Counter()

            for read in fetch_aln:
                logging.debug('Reading read: {0}'.format(read.query_name))
                read_sequence = read.query_sequence
                if read_sequence is None:
                    continue
                # BUG FIX: translate the SNP interval into read-local
                # offsets; the original indexed the read sequence with
                # absolute reference coordinates
                # (read_sequence[snp_start:snp_end]).
                # NOTE(review): assumes an ungapped alignment across the
                # SNP — an indel-aware version should use
                # read.get_aligned_pairs().
                offset = snp_start - read.reference_start
                if offset < 0:
                    # read starts inside the SNP interval: the full allele
                    # is not present in this read
                    continue
                read_base = read_sequence[offset:offset + (snp_end - snp_start)]
                if not read_base:
                    logging.debug('Empty read - indel')
                    continue
                logging.debug('Read base: {0}'.format(read_base))
                alleles[read_base] += 1

            logging.debug(alleles)
            statistics_file.write('Allele counter for the SNP:\n')
            for key, value in alleles.items():
                statistics_file.write('{0} : {1}\n'.format(key, value))

            if alleles:
                ranked = alleles.most_common(2)  # compute the ranking once
                first_allele = ranked[0]
                logging.debug(first_allele)
                # BUG FIX: the original read most_common(2)[1]
                # unconditionally and raised IndexError whenever only one
                # distinct base was observed at a SNP.
                if len(ranked) > 1:
                    logging.debug(ranked[1])

                if (first_allele[1] / count_aln) * 100 >= threshold:
                    statistics_file.write('Snp is Homozygous\n')
                    logging.debug('The snp {0} is: Homozygous'.format(snp_name))
                    count_homo += 1
                else:
                    statistics_file.write('Snp is Heterozygous\n')
                    logging.debug('The snp {0} is: Heterozygous'.format(snp_name))
                    count_hete += 1

            logging.debug(
                'Number of read aligned on SNP: {0}'.format(count_aln))
            statistics_file.write(
                'Number of read aligned on SNP: {0}\n'.format(count_aln))

            # Per-base percentage of the reads covering this SNP.
            out = {}
            if count_aln > 0:
                for key in alleles:
                    out[key] = (float(alleles[key]) / count_aln) * 100

            logging.debug(out)
            # FIX: 'Frequences' -> 'Frequencies' (typo in the report).
            statistics_file.write('Base Frequencies for the SNP:\n')
            for key, value in out.items():
                statistics_file.write('{0} : {1}\n'.format(key, value))

        prgbar.finish()

        logging.debug(
            'Total number of SNPs: {0}'.format(snp_total_number))
        statistics_file.write(
            'Total number of SNPs: {0}\n'.format(snp_total_number))

        # FIX: guard the summary percentages against an empty snp_map
        # (ZeroDivisionError in the original).
        if snp_total_number:
            homo_pct = (float(count_homo) / snp_total_number) * 100
            hete_pct = (float(count_hete) / snp_total_number) * 100

            logging.debug(
                'Number of SNPs: Homozygous {0} ({1}%)'.format(count_homo, homo_pct))
            statistics_file.write('Number of SNPs: Homozygous {0} ({1}%)\n'.format(
                count_homo, homo_pct))

            logging.debug(
                'Number of SNPs: Heterozygous {0} ({1}%)'.format(count_hete, hete_pct))
            statistics_file.write('Number of SNPs: Heterozygous {0} ({1}%)\n'.format(
                count_hete, hete_pct))

    logging.debug('Finished method classifySNP in: {0} seconds'.format(
        time.time() - start_time))
changed, 105 insertions(+) create mode 100644 Vcf_conversion.rb diff --git a/Vcf_conversion.rb b/Vcf_conversion.rb new file mode 100644 index 00000000..d661f72f --- /dev/null +++ b/Vcf_conversion.rb @@ -0,0 +1,105 @@ +#Methods + +def get_line_from_file(path, line) +result = nil + File.open(path, "r") do |f| + while line > 0 + line -= 1 + result = f.gets + result ||= '' + result = result.chomp + end + end + return result +end + +def write_header(output_path) + formato = "##fileformat=VCFv4.2\n" + colonne = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n" + File.open(output_path, "w") { |file| file.write(formato + colonne) } +end + +def write_line(output_path, chrom, pos, id, ref, alt, qual, filter, info, format, sample) + File.open(output_path, "a") { |file| file.write(chrom + "\t" + pos + "\t" + id + "\t" + ref + "\t" + alt + "\t" + qual + "\t" + filter + "\t" + info + "\t" + format + "\t" + sample + "\n") } +end + +def find_alt(snp_line, ref) + + alternatives = snp_line[9].to_s.split "/" + alt = "." + if (alternatives.length == 2) + alternatives.each do |a| + if a.to_s != ref + alt = a.to_s + end + end + else + puts "Error: bihallelic field" + ref = "X" + end + + return alt + +end + + +def create_line(snp_path, current_line, current_father, current_mother, output) + + current_snp_line = current_line + 3 + + snp_line = get_line_from_file(snp_path, current_snp_line).split "\t" + + chrom = snp_line[1].to_s + + pos = (snp_line[2].to_i + 1).to_s + + id = "." + + ref = snp_line[7].to_s + + alt = find_alt(snp_line, ref) + + qual = "." + + filter = "PASS" + + info = "." 
+ + format = "GT" + + sample = current_father.to_s + "|" + current_mother.to_s + + write_line(output, chrom, pos, id, ref, alt, qual, filter, info, format, sample) + +end + +def write_body(haplotypes, snp, output) + + current_line = 0 + father = get_line_from_file(haplotypes, 1).split('') + mother = get_line_from_file(haplotypes, 2).split('') + + father.each do |i| + create_line(snp, current_line, father[current_line], mother[current_line], output) + current_line += 1 + end + +end + +def create_vcf(haplotypes_path, snp_path, output_path) + + snp_lines = `wc -l "#{snp_path}"`.strip.split(' ')[0].to_i - 3 + + if (get_line_from_file(haplotypes_path, 1).length == snp_lines) + write_header(output_path) + write_body(haplotypes_path, snp_path, output_path) + else + puts "Error: non-matching files" + end + +end + + +#Vcf file creation + +create_vcf(haplotypes_path, snp_path, output_path) From 268aa70e34422bbb3f95c29b61e3d802c61e4e92 Mon Sep 17 00:00:00 2001 From: Nara88 Date: Sun, 2 Oct 2016 20:35:50 +0200 Subject: [PATCH 10/11] Delete Vcf_conversion.rb --- Vcf_conversion.rb | 105 ---------------------------------------------- 1 file changed, 105 deletions(-) delete mode 100644 Vcf_conversion.rb diff --git a/Vcf_conversion.rb b/Vcf_conversion.rb deleted file mode 100644 index d661f72f..00000000 --- a/Vcf_conversion.rb +++ /dev/null @@ -1,105 +0,0 @@ -#Methods - -def get_line_from_file(path, line) -result = nil - File.open(path, "r") do |f| - while line > 0 - line -= 1 - result = f.gets - result ||= '' - result = result.chomp - end - end - return result -end - -def write_header(output_path) - formato = "##fileformat=VCFv4.2\n" - colonne = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n" - File.open(output_path, "w") { |file| file.write(formato + colonne) } -end - -def write_line(output_path, chrom, pos, id, ref, alt, qual, filter, info, format, sample) - File.open(output_path, "a") { |file| file.write(chrom + "\t" + pos + "\t" + id + "\t" + ref + "\t" + alt 
+ "\t" + qual + "\t" + filter + "\t" + info + "\t" + format + "\t" + sample + "\n") } -end - -def find_alt(snp_line, ref) - - alternatives = snp_line[9].to_s.split "/" - alt = "." - if (alternatives.length == 2) - alternatives.each do |a| - if a.to_s != ref - alt = a.to_s - end - end - else - puts "Error: bihallelic field" - ref = "X" - end - - return alt - -end - - -def create_line(snp_path, current_line, current_father, current_mother, output) - - current_snp_line = current_line + 3 - - snp_line = get_line_from_file(snp_path, current_snp_line).split "\t" - - chrom = snp_line[1].to_s - - pos = (snp_line[2].to_i + 1).to_s - - id = "." - - ref = snp_line[7].to_s - - alt = find_alt(snp_line, ref) - - qual = "." - - filter = "PASS" - - info = "." - - format = "GT" - - sample = current_father.to_s + "|" + current_mother.to_s - - write_line(output, chrom, pos, id, ref, alt, qual, filter, info, format, sample) - -end - -def write_body(haplotypes, snp, output) - - current_line = 0 - father = get_line_from_file(haplotypes, 1).split('') - mother = get_line_from_file(haplotypes, 2).split('') - - father.each do |i| - create_line(snp, current_line, father[current_line], mother[current_line], output) - current_line += 1 - end - -end - -def create_vcf(haplotypes_path, snp_path, output_path) - - snp_lines = `wc -l "#{snp_path}"`.strip.split(' ')[0].to_i - 3 - - if (get_line_from_file(haplotypes_path, 1).length == snp_lines) - write_header(output_path) - write_body(haplotypes_path, snp_path, output_path) - else - puts "Error: non-matching files" - end - -end - - -#Vcf file creation - -create_vcf(haplotypes_path, snp_path, output_path) From 5f32b743f7ea22e09f4c8264daec38947569ab07 Mon Sep 17 00:00:00 2001 From: Nara88 Date: Sun, 2 Oct 2016 20:38:59 +0200 Subject: [PATCH 11/11] src/output-vcf/Vcf_conversion.rb --- src/Vcf_conversion.rb | 105 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 src/Vcf_conversion.rb diff --git 
# Methods

# Return line number `line` (1-based) of the file at `path`, chomped.
# Returns '' when the file has fewer lines than requested.
def get_line_from_file(path, line)
  result = nil
  File.open(path, "r") do |f|
    while line > 0
      line -= 1
      result = f.gets
      result ||= ''
      result = result.chomp
    end
  end
  return result
end

# Write the fixed VCF v4.2 header, truncating any existing output file.
def write_header(output_path)
  formato = "##fileformat=VCFv4.2\n"
  colonne = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n"
  File.open(output_path, "w") { |file| file.write(formato + colonne) }
end

# Append one tab-separated VCF record to the output file.
def write_line(output_path, chrom, pos, id, ref, alt, qual, filter, info, format, sample)
  File.open(output_path, "a") { |file| file.write(chrom + "\t" + pos + "\t" + id + "\t" + ref + "\t" + alt + "\t" + qual + "\t" + filter + "\t" + info + "\t" + format + "\t" + sample + "\n") }
end

# Pick the allele of the 'X/Y' pair (column 9) that differs from `ref`.
# Returns '.' when no alternative is found.
def find_alt(snp_line, ref)
  alternatives = snp_line[9].to_s.split "/"
  alt = "."
  if (alternatives.length == 2)
    alternatives.each do |a|
      if a.to_s != ref
        alt = a.to_s
      end
    end
  else
    # FIX: typo in the error message ('bihallelic' -> 'biallelic').
    puts "Error: biallelic field"
    # NOTE(review): this assignment only mutates the local copy of `ref`
    # and has no effect on the caller; kept for compatibility.
    ref = "X"
  end

  return alt
end

# Build and append one VCF record from SNP-file line `current_line`
# (data lines start 3 lines into the SNP file) plus the two haplotype
# alleles for that position.
def create_line(snp_path, current_line, current_father, current_mother, output)
  current_snp_line = current_line + 3

  snp_line = get_line_from_file(snp_path, current_snp_line).split "\t"

  chrom = snp_line[1].to_s
  pos = (snp_line[2].to_i + 1).to_s  # SNP file is 0-based, VCF is 1-based
  id = "."
  ref = snp_line[7].to_s
  alt = find_alt(snp_line, ref)
  qual = "."
  filter = "PASS"
  info = "."
  format = "GT"
  sample = current_father.to_s + "|" + current_mother.to_s

  write_line(output, chrom, pos, id, ref, alt, qual, filter, info, format, sample)
end

# Emit one VCF record per haplotype position. Lines 1 and 2 of the
# haplotype file are the father and mother allele strings respectively.
def write_body(haplotypes, snp, output)
  father = get_line_from_file(haplotypes, 1).split('')
  mother = get_line_from_file(haplotypes, 2).split('')

  # FIX: the original iterated with an unused block variable and a manual
  # counter; each_index yields the same 0-based positions directly.
  father.each_index do |current_line|
    create_line(snp, current_line, father[current_line], mother[current_line], output)
  end
end

# Validate that the haplotype length matches the SNP count, then write
# the full VCF file.
def create_vcf(haplotypes_path, snp_path, output_path)
  # NOTE(review): the path is interpolated into a shell command; do not
  # pass untrusted input here (shell injection risk).
  snp_lines = `wc -l "#{snp_path}"`.strip.split(' ')[0].to_i - 3

  if (get_line_from_file(haplotypes_path, 1).length == snp_lines)
    write_header(output_path)
    write_body(haplotypes_path, snp_path, output_path)
  else
    puts "Error: non-matching files"
  end
end


# Vcf file creation
#
# BUG FIX: the original called create_vcf with three local variables that
# were never defined anywhere in the script (haplotypes_path, snp_path,
# output_path), so it always raised NameError. Read them from the command
# line instead: ruby Vcf_conversion.rb HAPLOTYPES SNP_FILE OUTPUT_VCF
haplotypes_path, snp_path, output_path = ARGV
create_vcf(haplotypes_path, snp_path, output_path)