#! /usr/bin/env python3
# coding=utf-8

# Copyright (C) 2022  Vahid Akbari

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""
PatMat: Parent-of-origin (Paternal and Maternal) resolved chromosome-scale haplotyping
"""

__author__ = "Vahid Akbari"
__email__ = "vakbari@bcgsc.ca"
__copyright__ = "Copyright (C) 2022, " + __author__
__license__ = "GPLv3"

import subprocess
import os
import gzip
import bz2
import argparse
import warnings
import multiprocessing as mp
from collections import defaultdict
from itertools import repeat
import pysam
import tabix
from tqdm import tqdm

def get_variant_info(feed_list,
                     alignment_file,
                     chrom,
                     MappingQuality,
                     IncludeSupplementary):
    '''
    This function maps each read to heterozygous variants and returns a list of  
    haplotype 1, haplotype 2, and unphased variants mapped to each read.
    '''
    read_var_list= list()
    read_hap_list= list()
    read_var_cov= defaultdict(int)
    samfile = pysam.AlignmentFile(alignment_file, 'rb')
    for varinfo in feed_list:
        gt,position,ref,alt= varinfo
        try:
            sam_pileup= samfile.pileup(chrom,
                                        position,
                                        position+1,
                                        truncate=True)
        except:#The cordiniate is not found so ignore it
            warnings.warn("Variant {} {} did not find or do not have any map "
                          "reads in the alignment file. Check if correct"
                          "bam file is given or bam is indexed and not corrupted."
                          " Skipping it.".format(chrom,
                                                    position+1))
            continue
        for pileupcolumn in sam_pileup:
            pileupcolumn.set_min_base_quality(0)
            if pileupcolumn.pos == position:
                for pileupread in pileupcolumn.pileups:
                    if pileupread.alignment.mapping_quality < MappingQuality:
                        continue
                    if IncludeSupplementary and pileupread.alignment.is_supplementary:
                        continue
                    read_var_cov[(chrom,position)]+=1
                    read_id= pileupread.alignment.query_name
                    if pileupread.is_del or pileupread.is_refskip:
                        continue
                    read_base= None
                    read_base1= None
                    read_base2= None
                    if gt != "1/2":                    
                        if len(ref) == 1 and len(alt)==1: # dealing with mismatches
                            read_base= pileupread.alignment.query_sequence[
                                    pileupread.query_position]
                        elif len(ref) > 1 and len(alt) == 1: #dealing with deletions
                            if pileupread.indel < 0 and abs(pileupread.indel) == len(ref) -1:
                                read_base= pileupread.alignment.query_sequence[
                                        pileupread.query_position]
                            elif pileupread.indel == 0:
                                read_base = pileupread.alignment.query_sequence[
                                                pileupread.query_position:pileupread.query_position+len(ref)]
                            else:
                                continue
                        elif len(ref) == 1 and len(alt) > 1: #dealing with insertions
                            if pileupread.indel > 0 and pileupread.indel == len(alt) -1:
                                read_base= pileupread.alignment.query_sequence[
                                                pileupread.query_position:pileupread.query_position+len(alt)]
                            elif pileupread.indel == 0:
                                read_base= pileupread.alignment.query_sequence[
                                        pileupread.query_position]
                            else:
                                continue
                        if read_base == ref:
                            read_var_list.append((chrom,position,read_base.upper(),read_id))
                            if gt == '1|0':
                                read_hap_list.append((chrom,read_id,2))
                            elif gt == '0|1' and read_base == ref:
                                read_hap_list.append((chrom,read_id,1))
                        elif read_base == alt:
                            read_var_list.append((chrom,position,read_base.upper(),read_id))
                            if gt == '1|0' and read_base == alt:
                                read_hap_list.append((chrom,read_id,1))
                            elif gt == '0|1' and read_base == alt:
                                read_hap_list.append((chrom,read_id,2))
                    else:
                        alt1,alt2 = alt
                        if len(ref) > 1 and len(alt1) > 1 and len(alt2)  > 1:
                            continue
                        if len(ref) == 1 and len(alt1) == 1: # dealing with mismatches
                            read_base1= pileupread.alignment.query_sequence[
                                    pileupread.query_position]
                        elif len(ref) > 1 and len(ref) > len(alt1): #dealing with deletions
                            if pileupread.indel < 0 and abs(pileupread.indel) == len(ref) - len(alt1):
                                read_base1= pileupread.alignment.query_sequence[
                                        pileupread.query_position:pileupread.query_position+len(ref)-abs(pileupread.indel)]
                            elif pileupread.indel == 0:
                                read_base1 = pileupread.alignment.query_sequence[
                                                pileupread.query_position:pileupread.query_position+len(ref)]
                        elif len(alt1) > 1 and len(alt1) > len(ref): #dealing with insertions
                            if pileupread.indel > 0 and pileupread.indel == len(alt1) - len(ref):
                                read_base1= pileupread.alignment.query_sequence[
                                                pileupread.query_position:pileupread.query_position+len(alt1)]
                            elif pileupread.indel == 0:
                                read_base1= pileupread.alignment.query_sequence[
                                        pileupread.query_position:pileupread.query_position+len(ref)]
                        if len(ref) == 1 and len(alt2) == 1: # dealing with mismatches
                            read_base2= pileupread.alignment.query_sequence[
                                    pileupread.query_position]
                        elif len(ref) > 1 and len(ref) > len(alt2): #dealing with deletions
                            if pileupread.indel < 0 and abs(pileupread.indel) == len(ref) - len(alt2):
                                read_base2= pileupread.alignment.query_sequence[
                                        pileupread.query_position:pileupread.query_position+len(ref)-abs(pileupread.indel)]
                            elif pileupread.indel == 0:
                                read_base2 = pileupread.alignment.query_sequence[
                                                pileupread.query_position:pileupread.query_position+len(ref)]
                        elif len(alt2) > 1 and len(alt2) > len(ref): #dealing with insertions
                            if pileupread.indel > 0 and pileupread.indel == len(alt2) - len(ref):
                                read_base2= pileupread.alignment.query_sequence[
                                                pileupread.query_position:pileupread.query_position+len(alt2)]
                            elif pileupread.indel == 0:
                                read_base2= pileupread.alignment.query_sequence[
                                        pileupread.query_position:pileupread.query_position+len(ref)]
                        if read_base1 == alt1 or read_base1 == alt2:
                            read_var_list.append((chrom,position,read_base1.upper(),read_id))
                        elif read_base2 == alt1 or read_base2 == alt2:
                            read_var_list.append((chrom,position,read_base2.upper(),read_id))
                            
    return read_var_list, read_hap_list, read_var_cov


def getChromsFromBAM(filename):
    chroms = set()
    stats = pysam.idxstats(filename)
    for row in stats.split("\n"):
        fields = row.split("\t")
        if fields[0] != '*' and fields[0] != '':
            chroms.add(fields[0])
    return chroms


def get_sv_pofo(feed_list,
                   alignment_file):
    '''
    This function maps each read to heterozygous variants and returns a list of  
    haplotype 1, haplotype 2, and unphased variants mapped to each read.
    '''
    sv_hp_dict1= defaultdict(int)
    sv_hp_dict2= defaultdict(int)
    samfile = pysam.AlignmentFile(alignment_file, 'rb')
    for varinfo in feed_list:
        chrom,position,sv_reads = varinfo
        try:
            sam_pileup= samfile.pileup(chrom,
                                        position,
                                        position+1,
                                        truncate=True)
        except:#The cordiniate is not found so ignore it
            warnings.warn("Variant {} {} did not find or do not have any map "
                          "reads in the alignment file. Check if correct"
                          "bam file is given or bam is indexed and not corrupted."
                          " Skipping it.".format(chrom,
                                                    position+1))
            continue
        for pileupcolumn in sam_pileup:
            pileupcolumn.set_min_base_quality(0)
            if pileupcolumn.pos == position:
                for pileupread in pileupcolumn.pileups:
                    read_id= pileupread.alignment.query_name
                    try:
                        hp_tag= pileupread.alignment.get_tag("HP")
                    except:
                        continue
                    if read_id in sv_reads:
                        if hp_tag==1:
                            sv_hp_dict1[(chrom,position)] += 1
                            sv_hp_dict2[(chrom,position)] += 0
                        elif hp_tag==2:
                            sv_hp_dict1[(chrom,position)] += 0
                            sv_hp_dict2[(chrom,position)] += 1
                
    return sv_hp_dict1, sv_hp_dict2


def openalignment(alignment_file,
                  window):
    '''
    Opens an alignment file and creates bam iterator
    '''
    bam = pysam.AlignmentFile(alignment_file, 'rb')
    if window is not None:
        window_chrom = window.split(':')[0]
        if len(window.split(':')) == 2:
            window_margin= window.split(':')[1].split('-')
            if len(window_margin) == 2:
                window_start = int(window_margin[0])
                window_end = int(window_margin[1])
                bamiter = bam.fetch(window_chrom, window_start, window_end)
                count= bam.count(window_chrom, window_start, window_end)
            else:
                window_start = int(window_margin[0])
                bamiter = bam.fetch(window_chrom, window_start)
                count= bam.count(window_chrom, window_start)
        else:
            try:
                bamiter = bam.fetch(window_chrom)
                count= bam.count(window_chrom)
            except:
                count= 0
                bamiter= ""
    else:
        bamiter = bam.fetch(until_eof=True)
        count = 0
    return bamiter, bam, count


def openfile(file):
    '''
    Opens a file
    '''
    if file.endswith('.gz'):
        opened_file = gzip.open(file,'rt')
    elif file.endswith('bz') or file.endswith('bz2'):
        opened_file = bz2.open(file,'rt')
    else:
        opened_file = open(file,'rt')
    return opened_file


def PofO_dmr(known_dmr, 
             chrom,
             out,
             min_cg,
             cpg_difference):
    '''
    This function maps differentially methylated CpGs 
    to the known iDMRs for PofO assignment 
    '''
    tb_calldml= tabix.open(out+"_temp_callDML.tsv.gz")
    tb_dmltest= tabix.open(out+"_temp_DMLtest.tsv.gz")
    out_meth= open(out+"_CpG-Methylation-Status-at-DMRs.tsv",'a')
    dmr_file= openfile(known_dmr)
    header= next(dmr_file).rstrip()
    chrom_hp_origin_count= defaultdict(lambda: defaultdict(int))
    chrom_hp_origin_count_all_cg= defaultdict(lambda: defaultdict(int))
    chrom_hp_origin_count_diff_cg= defaultdict(lambda: defaultdict(int))
    chrom_hp_origin_count_dmr= defaultdict(lambda: defaultdict(int))
    for line in dmr_file:
        line= line.rstrip().split('\t')
        dmr_chrom= line[0]
        if dmr_chrom!=chrom:
            continue
        dmr_start= int(line[1]) -1
        dmr_end= int(line[2])
        origin= line[3].lower()
        if origin not in ["maternal","paternal"]:
            warnings.warn("iDMR: {} does not have a valid origin "
                          "(must be paternal or maternal. case insensitive). "
                          "This iDMR will not be used for PofO assignment and PofO"
                          " assignment score calculation.".format('\t'.join(line)))
        try:
            records_all = tb_dmltest.query(dmr_chrom, dmr_start, dmr_end)
        except:
            warnings.warn("iDMR {}:{}-{} was ignored because it does not have any CpG in "
                          "DMLtest file.".format(dmr_chrom, dmr_start, dmr_end))
            records_all = "NA"
        try:
            records = tb_calldml.query(dmr_chrom, dmr_start, dmr_end)
        except:
            warnings.warn("iDMR {}:{}-{} was ignored because it does not have any CpG in "
                          "DMLtest file.".format(dmr_chrom, dmr_start, dmr_end))
            records = "NA"
        num_cg = 0
        hp1_freq= 0
        hp2_freq= 0
        diff_cg_hp1 = 0
        diff_cg_hp2 = 0
        if records_all != "NA":
            for record_all in records_all:
                num_cg += 1
                hp1_freq += float(record_all[3])
                hp2_freq += float(record_all[4])
        if records != "NA":
            for record in records:
                if (float(record[3]) - float(record[4])) > 0:
                    diff_cg_hp1 += 1
                elif (float(record[4]) - float(record[3])) > 0:
                    diff_cg_hp2 += 1
        if num_cg < 1:
            out_meth.write('\t'.join(line)+'\t'+str(num_cg)+
                           '\tNA\tNA\tNA\tNA\tIgnored:No_CpG\n')
            continue
        hp1_freq= hp1_freq/num_cg
        hp2_freq= hp2_freq/num_cg
        if num_cg < min_cg and abs(diff_cg_hp1 - diff_cg_hp2) / num_cg < cpg_difference:
            out_meth.write('\t'.join(line)+'\t'+str(num_cg)+'\t'+
                           str(diff_cg_hp1)+'\t'+str(diff_cg_hp2)+
                           '\t'+str(hp1_freq) + '\t' + str(hp2_freq)+'\t' + 
                           'Ignored:DidNotMeet_min_cg_And_cpg_difference\n')
            continue
        elif num_cg < min_cg and diff_cg_hp1 == 0 and diff_cg_hp2 == 0:
            out_meth.write('\t'.join(line)+'\t'+str(num_cg)+'\t'+
                           str(diff_cg_hp1)+'\t'+str(diff_cg_hp2)+
                           '\t'+str(hp1_freq) + '\t' + str(hp2_freq)+'\t' + 
                           'Ignored:DidNotMeet_min_cg_And_DifferentiallyMethylated'
                           'CpGsIsZeroInBothHaplotypes\n')
            continue
        elif num_cg < min_cg:
            out_meth.write('\t'.join(line)+'\t'+str(num_cg)+'\t'+
                           str(diff_cg_hp1)+'\t'+str(diff_cg_hp2)+
                           '\t'+str(hp1_freq) + '\t' + str(hp2_freq)+'\t' +
                           'Ignored:DidNotMeet_min_cg\n')
            continue
        elif abs(diff_cg_hp1 - diff_cg_hp2) / num_cg < cpg_difference:
            out_meth.write('\t'.join(line)+'\t'+str(num_cg)+'\t'+
                           str(diff_cg_hp1)+'\t'+str(diff_cg_hp2)+
                           '\t'+str(hp1_freq) + '\t' + str(hp2_freq)+'\t' +
                           'Ignored:DidNotMeet_cpg_difference\n')
            continue
        elif diff_cg_hp1 == 0 and diff_cg_hp2 == 0: 
            out_meth.write('\t'.join(line)+'\t'+str(num_cg)+'\t'+
                           str(diff_cg_hp1)+'\t'+str(diff_cg_hp2)+ 
                           '\t'+str(hp1_freq) + '\t' + str(hp2_freq)+'\t' + 
                           'Ignored:DifferentiallyMethylatedCpGsIsZeroInBothHaplotypes\n')
            continue
        out_meth.write('\t'.join(line)+'\t'+str(num_cg)+'\t'+
                       str(diff_cg_hp1)+'\t'+str(diff_cg_hp2)+
                       '\t'+str(hp1_freq) + '\t' + str(hp2_freq)+'\t' + 
                       'Included\n')  
        if origin == 'maternal':
            chrom_hp_origin_count[(dmr_chrom, 'maternal')]['HP1'] += diff_cg_hp1
            chrom_hp_origin_count[(dmr_chrom, 'maternal')]['HP2'] += diff_cg_hp2
            chrom_hp_origin_count[(dmr_chrom, 'paternal')]['HP2'] += diff_cg_hp1
            chrom_hp_origin_count[(dmr_chrom, 'paternal')]['HP1'] += diff_cg_hp2
            if diff_cg_hp1 > diff_cg_hp2:
                chrom_hp_origin_count_dmr[(dmr_chrom, 'maternal')]['HP1'] += 1
                chrom_hp_origin_count_dmr[(dmr_chrom, 'paternal')]['HP2'] += 1
                chrom_hp_origin_count_all_cg[(dmr_chrom, 'maternal')]['HP1'] += num_cg
                chrom_hp_origin_count_all_cg[(dmr_chrom, 'paternal')]['HP2'] += num_cg
                chrom_hp_origin_count_diff_cg[(dmr_chrom, 'maternal')]['HP1'] += diff_cg_hp1+diff_cg_hp2
                chrom_hp_origin_count_diff_cg[(dmr_chrom, 'paternal')]['HP2'] += diff_cg_hp1+diff_cg_hp2
            elif diff_cg_hp1 < diff_cg_hp2:
                chrom_hp_origin_count_dmr[(dmr_chrom, 'maternal')]['HP2'] += 1
                chrom_hp_origin_count_dmr[(dmr_chrom, 'paternal')]['HP1'] += 1
                chrom_hp_origin_count_all_cg[(dmr_chrom, 'maternal')]['HP2'] += num_cg
                chrom_hp_origin_count_all_cg[(dmr_chrom, 'paternal')]['HP1'] += num_cg
                chrom_hp_origin_count_diff_cg[(dmr_chrom, 'maternal')]['HP2'] += diff_cg_hp1+diff_cg_hp2
                chrom_hp_origin_count_diff_cg[(dmr_chrom, 'paternal')]['HP1'] += diff_cg_hp1+diff_cg_hp2
        if origin == 'paternal':
            chrom_hp_origin_count[(dmr_chrom, 'maternal')]['HP1'] += diff_cg_hp2 
            chrom_hp_origin_count[(dmr_chrom, 'maternal')]['HP2'] += diff_cg_hp1
            chrom_hp_origin_count[(dmr_chrom, 'paternal')]['HP2'] += diff_cg_hp2 
            chrom_hp_origin_count[(dmr_chrom, 'paternal')]['HP1'] += diff_cg_hp1
            if diff_cg_hp1 > diff_cg_hp2:
                chrom_hp_origin_count_dmr[(dmr_chrom, 'maternal')]['HP2'] += 1
                chrom_hp_origin_count_dmr[(dmr_chrom, 'paternal')]['HP1'] += 1
                chrom_hp_origin_count_all_cg[(dmr_chrom, 'maternal')]['HP2'] += num_cg
                chrom_hp_origin_count_all_cg[(dmr_chrom, 'paternal')]['HP1'] += num_cg
                chrom_hp_origin_count_diff_cg[(dmr_chrom, 'maternal')]['HP2'] += diff_cg_hp1+diff_cg_hp2
                chrom_hp_origin_count_diff_cg[(dmr_chrom, 'paternal')]['HP1'] += diff_cg_hp1+diff_cg_hp2
            elif diff_cg_hp1 < diff_cg_hp2:
                chrom_hp_origin_count_dmr[(dmr_chrom, 'maternal')]['HP1'] += 1
                chrom_hp_origin_count_dmr[(dmr_chrom, 'paternal')]['HP2'] += 1
                chrom_hp_origin_count_all_cg[(dmr_chrom, 'maternal')]['HP1'] += num_cg
                chrom_hp_origin_count_all_cg[(dmr_chrom, 'paternal')]['HP2'] += num_cg
                chrom_hp_origin_count_diff_cg[(dmr_chrom, 'maternal')]['HP1'] += diff_cg_hp1+diff_cg_hp2
                chrom_hp_origin_count_diff_cg[(dmr_chrom, 'paternal')]['HP2'] += diff_cg_hp1+diff_cg_hp2
    dmr_file.close()
    out_meth.close()
    return (chrom_hp_origin_count, 
            chrom_hp_origin_count_all_cg, 
            chrom_hp_origin_count_diff_cg,
            chrom_hp_origin_count_dmr)


def out_freq_methbam(out,
                     processes,
                     reference,
                     pbcg,
                     pb_tech):
    out_freqhp1= out + '_temp_NonPofO_HP1-HP2_MethylationHP1.tsv'
    out_freqhp2= out + '_temp_NonPofO_HP1-HP2_MethylationHP2.tsv'
    out_dir=os.path.dirname(out)
    out_pref= os.path.basename(out)
    if not pb_tech:
        subprocess.run("modkit pileup -t {} --prefix {} "
                       "--partition-tag HP --combine-strands --cpg -r {} "
                       "{} {}".format(processes,out_pref+"_temp-NonPofO_CpGModFreq",
                                      reference,out+"_temp-NonPofO.bam",out_dir),
                       shell=True,
                       check=True)
        
        subprocess.run("awk -F'\t' '$4==\"m\" {{print $1,$2,$3,$10,$12,$12/$10}}' OFS='\t' {} | "
                       "sed '1i Chromosome\tStart\tEnd\tCov\tMod\tFreq' > {} "
                        "".format(out+"_temp-NonPofO_CpGModFreq_1.bed",
                                          out_freqhp1),
                        shell=True,
                        check=True)
        subprocess.run("awk -F'\t' '$4==\"m\" {{print $1,$2,$3,$10,$12,$12/$10}}' OFS='\t' {} | "
                       "sed '1i Chromosome\tStart\tEnd\tCov\tMod\tFreq' > {} "
                        "".format(out+"_temp-NonPofO_CpGModFreq_2.bed",
                                          out_freqhp2),
                        shell=True,
                        check=True)
    else:
        
        subprocess.run("aligned_bam_to_cpg_scores --bam {} --output-prefix {}"
                       " --model {} --threads {} --modsites-mode reference "
                       "--ref {}".format(out+"_temp-NonPofO.bam",
                                         out+"_temp-NonPofO_CpGModFreq",
                                         pbcg,
                                         processes,reference),
                       shell=True,
                       check=True)
        
        subprocess.run("awk -F'\t' '{{print $1,$2,$3,$6,$7,$7/$6}}' OFS='\t' {} | "
                       "sed '1i Chromosome\tStart\tEnd\tCov\tMod\tFreq' > {} "
                        "".format(out+"_temp-NonPofO_CpGModFreq.hap1.bed",
                                          out_freqhp1),
                        shell=True,
                        check=True)
        subprocess.run("awk -F'\t' '{{print $1,$2,$3,$6,$7,$7/$6}}' OFS='\t' {} | "
                       "sed '1i Chromosome\tStart\tEnd\tCov\tMod\tFreq' > {} "
                        "".format(out+"_temp-NonPofO_CpGModFreq.hap2.bed",
                                          out_freqhp2),
                        shell=True,
                        check=True)
    subprocess.run("rm {}*".format(out+"_temp-NonPofO_CpGModFreq"),
                    shell=True,
                    check=True)
                  

def out_pofo_freq(hp_fre,
                  chrom,
                  output):
    '''
    Outputs PofO assigned methylation frequency files.
    '''
    if os.path.exists(hp_fre):
        with openfile(hp_fre) as file:
            next(file)
            for line in file:
                line=line.rstrip().split('\t')
                if line[0]==chrom:
                    output.write('\t'.join(line)+'\n')
    else:
        with openfile(hp_fre+".gz") as file:
            next(file)
            for line in file:
                line=line.rstrip().split('\t')
                if line[0]==chrom:
                    output.write('\t'.join(line)+'\n')


def vcf2dict(vcf_strand,
             chrom):
    """
    Process and converts the strand-seq vcf file to a dictionary
    for downstream use.
    """
    strand_phased_vars= 0
    vcf_dict= defaultdict(dict)
    vcf_file = openfile(vcf_strand)
    for line in vcf_file:
        line = line.rstrip().split('\t')
        if line[0]!=chrom:
            continue
        if line[9].startswith(('.|0','0|.','1|.','.|1')):
            line[9] = line[9].replace(".|0","1|0"
                                                ).replace(".|1","0|1"
                                                          ).replace("1|.","1|0"
                                                                    ).replace("0|.","0|1")
            warnings.warn("{}:{} variant in strand-seq vcf has .|0 or 0|. "
                          "or .|1 or 1|. phased genotype. Note that it "
                          "will be interpreted as 1|0 or 0|1 or 0|1 or "
                          "1|0".format(line[0], line[1]))
        if line[9].startswith(('1|0','0|1')):
            strand_phased_vars += 1
            vcf_dict[line[0]][line[1]] = line[9].split(':')[0]
    vcf_file.close()
    return vcf_dict, strand_phased_vars


def check_vcfs(vcf,
               vcf_strand,
               chrom):
    '''
    Checks how many of the phased variants in strand-seq vcf have the same   
    position in input vcf.
    '''
    common_strand= 0
    vcf_dict, strand_phased_vars= vcf2dict(vcf_strand, chrom)
    with openfile(vcf) as vf:
        for line in vf:
            line=line.rstrip().split('\t')
            if line[0]!=chrom:
                continue
            if (line[9].startswith(('0/1','1/0','0|1','1|0'))
                and
                (line[0] in vcf_dict and 
                 line[1] in vcf_dict[line[0]])):
                common_strand += 1
    print("Out of {} reference heterozygous phased variants in {} in "
            "strand-seq vcf, {} variants had the same position "
            "in input vcf file."
            "".format(strand_phased_vars, chrom,
                            common_strand))
   
    
def strand_vcf2dict_phased(vcf_strand,
                           vcf,
                           include_all_variants,
                           chrom):
    '''
    Intersects input vcf and strand-seq vcf and stores phased 
    and unphased heterozygous variants into a dictionary for read phasing.
    '''
    check_vcfs(vcf, vcf_strand, None, chrom)
    final_dict= defaultdict(set)
    vcf_dict, strand_phased_vars= vcf2dict(vcf_strand,chrom)
    with openfile(vcf) as vf:
        for line in vf:
            line=line.rstrip().split('\t')
            if line[0]!=chrom:
                continue
            if not include_all_variants and line[6] not in ["PASS","."]:
                continue
            if (line[9].startswith(('0/1','1/0','0|1','1|0'))
                and
                (line[0] in vcf_dict and 
                 line[1] in vcf_dict[line[0]])):
                final_dict[line[0]].add((vcf_dict[line[0]][line[1]],
                                        int(line[1])-1,
                                        line[3].upper(),
                                        line[4].upper()))
            elif line[9].startswith(('0/1','1/0','0|1','1|0')):
                final_dict[line[0]].add(("0/1",
                                        int(line[1])-1,
                                        line[3].upper(),
                                        line[4].upper()))
            elif line[9].startswith(('1/2','1|2','2/1','2|1')):
                final_dict[line[0]].add(("1/2",
                                        int(line[1])-1,
                                        line[3].upper(),
                                        (line[4].split(',')[0].upper(),
                                        line[4].split(',')[1].upper())))
    return final_dict, strand_phased_vars
                

def vcf2dict_phased(blocks, 
                    vcf_strand,
                    hapRatio,
                    minvariantblock,
                    vcf,
                    include_all_variants,
                    chrom):
    '''
    In case --phased option is given, this function uses phased variants
    from strand-seq to correct PofO phasing switches across phased blocks
    . This function then intersects input vcf and strand-seq vcf and
    stores phased and unphased heterozygous variants into a 
    dictionary for read phasing.
    '''
    check_vcfs(vcf, vcf_strand,chrom)
    final_dict= defaultdict(set)
    phase_block_stat= defaultdict(int)
    vcf_dict, strand_phased_vars= vcf2dict(vcf_strand,chrom)
    tb_vcf= tabix.open(os.path.abspath(vcf))
    for block in blocks:
        b_chrom, b_start, b_end= block
        if b_chrom!=chrom:
            continue
        agreement_count= 0
        disagreement_count= 0
        try:
            records_whats = tb_vcf.query(b_chrom, b_start-1, b_end+1)
            records_whats = list(records_whats)
        except:
            warnings.warn("{}:{}-{} block cannot be extracted from vcf file. "
                          "Make sure file is indexed. Skipping it.".format(b_chrom, b_start, b_end))
            continue
        if b_chrom in vcf_dict:
            disagreement_sites = set()
            agreement_sites = set()
            for vcf_line in records_whats:
                if vcf_line[1] in vcf_dict[vcf_line[0]]:
                    if vcf_line[9].startswith("1|0") and vcf_dict[vcf_line[0]][vcf_line[1]] == "1|0":
                        agreement_sites.add(vcf_line[1])
                        agreement_count += 1
                    elif vcf_line[9].startswith("1|0") and vcf_dict[vcf_line[0]][vcf_line[1]] == "0|1":
                        disagreement_sites.add(vcf_line[1])
                        disagreement_count += 1
                    elif vcf_line[9].startswith("0|1") and vcf_dict[vcf_line[0]][vcf_line[1]] == "0|1":
                        agreement_sites.add(vcf_line[1])
                        agreement_count += 1
                    elif vcf_line[9].startswith("0|1") and vcf_dict[vcf_line[0]][vcf_line[1]] == "1|0":
                        disagreement_sites.add(vcf_line[1])
                        disagreement_count += 1                    
            if agreement_count > disagreement_count:
                phase_block_stat[(b_chrom, str(b_start),"agreement")]+=agreement_count
                phase_block_stat[(b_chrom, str(b_start),"disagreement")]+=disagreement_count
            else:
                phase_block_stat[(b_chrom, str(b_start),"agreement")]+=disagreement_count
                phase_block_stat[(b_chrom, str(b_start),"disagreement")]+=agreement_count
            
            for vcf_line in records_whats:
                if vcf_line[1] in vcf_dict[vcf_line[0]]:
                    continue
                if vcf_line[9].startswith(('1|0','0|1')):
                    if (agreement_count > disagreement_count and
                        agreement_count >= minvariantblock and 
                        agreement_count/(agreement_count+disagreement_count) >= hapRatio and
                        vcf_line[1] not in disagreement_sites):
                        vcf_dict[vcf_line[0]][vcf_line[1]] = vcf_line[9].split(':')[0]
                    elif (disagreement_count > agreement_count and
                          disagreement_count >= minvariantblock and 
                          disagreement_count/(agreement_count+disagreement_count) >= hapRatio and
                          vcf_line[1] not in agreement_sites):
                        vcf_dict[vcf_line[0]][vcf_line[1]] = vcf_line[9].split(':')[0][::-1]
    with openfile(vcf) as vf:
        for line in vf:
            line=line.rstrip().split('\t')
            if line[0]!=chrom:
                continue
            if not include_all_variants and line[6] not in ["PASS","."]:
                continue
            if (line[9].startswith(('0/1','1/0','0|1','1|0'))
                and
                (line[0] in vcf_dict and 
                 line[1] in vcf_dict[line[0]])):
                final_dict[line[0]].add((vcf_dict[line[0]][line[1]],
                                        int(line[1])-1,
                                        line[3].upper(),
                                        line[4].upper()))
            elif line[9].startswith(('0/1','1/0','0|1','1|0')):
                final_dict[line[0]].add(("0/1",
                                        int(line[1])-1,
                                        line[3].upper(),
                                        line[4].upper()))
            elif line[9].startswith(('1/2','1|2','2/1','2|1')):
                final_dict[line[0]].add(("1/2",
                                        int(line[1])-1,
                                        line[3].upper(),
                                        (line[4].split(',')[0].upper(),
                                        line[4].split(',')[1].upper())))

    vcf_dict.clear()
    return final_dict, strand_phased_vars, phase_block_stat


def per_read_variant(vcf_dict,
                     bam_file,
                     chunk,
                     processes,
                     MappingQuality,
                     IncludeSupplementary):
    '''
    This function extracts per-read information for variants.
    '''
    variant_dict= defaultdict(set)
    read_dict_HP1= defaultdict(int)
    read_dict_HP2= defaultdict(int)
    per_var_cov = defaultdict(int)
    for chrom,feed_list in vcf_dict.items():
        bamiter, bam, count = openalignment(bam_file, chrom)
        if count > 0:
            feed_list = [list(feed_list)[x:x+chunk]
                                  for x in range(0, len(feed_list),
                                                chunk)]
            feed_list = [feed_list[x:x+processes]
                                  for x in range(0, len(feed_list),
                                                processes)]
            description= "Tagging variants to reads from {}: ".format(chrom)
            with tqdm(total=len(feed_list),
                desc=description,
                bar_format="{l_bar}{bar} [ Estimated time left: {remaining} ]"
                                  ) as pbar:
                for vcf_info_list in feed_list:
                    p= mp.Pool(len(vcf_info_list))
                    results= p.starmap(get_variant_info,
                                        list(zip(vcf_info_list,
                                                repeat(bam_file),
                                                repeat(chrom),
                                                repeat(MappingQuality),
                                                repeat(IncludeSupplementary))))
                    p.close()
                    p.join()
                    for result in results:
                        if result is not None:
                            read_var_list, read_hap_list, read_var_cov= result
                            for var_info in read_var_list:
                                variant_dict[var_info[0:-1]].add(var_info[-1])
                            for read_hap in read_hap_list:
                                if read_hap[-1] == 1:
                                    read_dict_HP1[read_hap[0:2]] += 1
                                    read_dict_HP2[read_hap[0:2]] += 0
                                elif read_hap[-1] == 2:
                                    read_dict_HP1[read_hap[0:2]] += 0
                                    read_dict_HP2[read_hap[0:2]] += 1
                            for key,val in read_var_cov.items():
                                per_var_cov[key] += val
                            
                    pbar.update(1)
        else:
            warnings.warn("{} does not have any mapped reads in alignment "
                          "file Or alignment is truncated or corrupt indexed. "
                          "Skipping it.".format(chrom))
            
    return (variant_dict, read_dict_HP1, 
            read_dict_HP2, per_var_cov)


def get_block(vcf):
    '''
    In case --phased option is provided this function extracts "
    "phased blocks from vcf file.
    '''
    blocks_dict= defaultdict(set)
    final= list()
    with openfile(vcf) as vp:
        for line in vp:
            if line.startswith("#"):
                continue
            line=line.rstrip().split('\t')
            if "|" in line[9].split(":")[0]:
               blocks_dict[(line[0],line[9].split(":")[-1])].add(int(line[1]))
    for key, val in blocks_dict.items():
        val= sorted(val)
        final.append((key[0], val[0], val[-1]))
        blocks_dict[key]= (str(val[0]), str(val[-1]))
    return final, blocks_dict


# def get_indels(vcf):
#     '''
#     Extracts heterozygous indels from input vcf file. 
#     '''
#     indels= set()
#     with openfile(vcf) as vcf_file:
#         for line in vcf_file:
#             if line.startswith('#'):
#                 continue
#             line= line.rstrip().split('\t')
#             if line[9].startswith(("0|1","1|0","0/1","1/0")):
#                 if len(line[3]) > 1 or len(line[4]) > 1:
#                     indels.add((line[0],str(int(line[1])-1)))
#             # Non-reference het indels (e.g. 1/2) are ignored as 
#             #they will be in unphased column of per-read info file
#     return indels


def main(args):
    '''
    The main function that uses user's inputs and other functions to phase and
    assign PofO to variants and methylation.
    '''
    hapRatio = args.hapratio
    minvariant= args.min_variant
    minvariantblock= args.min_variant_block
    min_cg= args.min_cg
    bam_file = os.path.abspath(args.bam)
    processes = args.processes
    dss_processes= args.dss_processes
    if dss_processes is None:
        dss_processes= processes
    cpg_difference= args.cpg_difference
    chunk = args.chunk_size
    min_read_reassignment= args.min_read_number
    out = os.path.abspath(args.output)
    reference= os.path.abspath(args.reference)
    vcf= os.path.abspath(args.vcf)
    if not os.path.isfile(vcf+".tbi"):
        raise Exception("It seems that vcf file "
                        "is not index by tabix.")
    vcf_tb= tabix.open(vcf)
    known_dmr= os.path.abspath(args.known_dmr)
    sites_to_ignore= set()
    if args.black_list is not None:
        if not os.path.isfile(vcf+".tbi"):
            raise Exception("Black list is given but it seems that"
                            " the vcf file is not indexed (file ends with"
                            " .tbi was not found).")
        tb_vcf= tabix.open(vcf)
        black_list= os.path.abspath(args.black_list)
        with openfile(black_list) as bl:
            for line in bl:
                line= line.rstrip().split('\t')
                try:
                    records= tb_vcf.query(line[0], int(line[1]), int(line[2])+1)
                except:
                    warnings.warn("{}:{}-{} region from black list does not exist in the "
                          "vcf file. Skipping it.".format(line[0],line[1] , line[2]))
                    records= "NA"
                if records != "NA":
                    for record in records:
                        sites_to_ignore.add((record[0],str(int(record[1])-1)))
    assignment_file= open(out + '_PofO_Assignment.vcf', 'w')
    info_out_dict= defaultdict(lambda: defaultdict(int))
    chrom_hp_origin = defaultdict(dict)
    chroms= dict()
    with openfile(vcf) as vf_file:
        for line in vf_file:
            if line.startswith("##"):
                assignment_file.write(line)
            elif line.startswith("#"):
                assignment_file.write(line.rstrip()+"\tNumAllReads\t"
                                       "NumReadsHP1\tNumReadsHP2\tNumReads_HP1_Ref/Left_Allele"
                                       "\tNumReads_HP2_Ref/Left_Allele\tNumReads_HP1_Alt/Right_Allele"
                                       "\tNumReads_HP2_Alt/Right_Allele\tNumReads_Maternal_Ref/Left_Allele"
                                       "\tNumReads_Paternal_Ref/Left_Allele\tNumReads_Maternal_Alt/Right_Allele"
                                       "\tNumReads_Paternal_Alt/Right_Allele\t"
                                       "MeanNumherOfHP1-PhasedVariantsAccrossReads\t"
                                       "MeanNumherOfHP2-PhasedVariantsAccrossReads\t"
                                       "BlockStart\tBlockEnd\t"
                                       "NumberOfSupportiveStrandSeqPhasedVariantsAtThePhasedBlock\t"
                                       "NumberOfConflictingStrandSeqPhasedVariantsAtThePhasedBlock\n")
            else:
                line=line.rstrip().split('\t')
                chroms[line[0]]= int(line[1])
    assignment_file.close()
    out_meth= open(out+"_CpG-Methylation-Status-at-DMRs.tsv",'w')
    dmr_file= openfile(known_dmr)
    header= next(dmr_file).rstrip()
    out_meth.write(header+"\tAll_CpGs_At_iDMR_CouldBeExaminedInBothHaplotypes\t"
                   "DifferentiallyMethylatedCpGs_HypermethylatedOnHP1\t"
                   "DifferentiallyMethylatedCpGs_HypermethylatedOnHP2\t"
                   "MethylationFrequency_HP1\tMethylationFrequency_HP2\t"
                   "Included_Or_Ignored_For_PofO_Assignment\n")
    dmr_file.close()
    out_meth.close()
    out_dmltest= open(out+"_DMLtest.tsv",'w')
    out_dmltest.write("chr\tstart\tend\tmu1\tmu2\tdiff\tdiff.se\tstat\tphi1\t"
                      "phi2\tpval\tfdr\n")
    out_dmltest.close()
    out_calldml= open(out+"_callDML.tsv",'w')
    out_calldml.write("chr\tstart\tend\tmu1\tmu2\tdiff\tdiff.se\tstat\tphi1\t"
                      "phi2\tpval\tfdr\tpostprob.overThreshold\n")
    out_calldml.close()
    subprocess.run("samtools view -H {} | sed '/^@PG/d'"
                   " | bgzip > {}".format(bam_file,
                                          out+"_Temp_PofO_Tagged.sam.gz"),
                   shell=True,
                   check=True)
    bam_choms= getChromsFromBAM(bam_file)
    if args.sv_vcf is not None:
        all_sv_files=args.sv_vcf
        for sv_file in all_sv_files:
            sv_assignment_file= open(out +"_"+os.path.basename(sv_file) + 
                                     '_PofO_Assignment_SVs.vcf', 'w')
            with openfile(sv_file) as vf:
                for line in vf:
                    if line.startswith("##"):
                        sv_assignment_file.write(line)
                    elif line.startswith("#"):
                        sv_assignment_file.write(line.rstrip()+"\tNumHp1ReadsFromColumn8\t"
                                                 "NumHp2ReadsFromColumn8"
                                                 "\tNumMaternalReadsFromColumn8"
                                                 "\tNumPaternalReadsFromColumn8\n")
            sv_assignment_file.close()
    
    subprocess.run("samtools version | grep ^samtools"
                   " && echo longphase `longphase --version`",
                   shell=True,
                   check=True)
    bam_chrom_check= set()
    for chrom in sorted(chroms.keys()):
        print("#############  Processing chromosome {}  #############".format(chrom))
        if args.strand_vcf is not None and not args.phased:
            # print("Using strand-seq phased vcf only with"
            #               " {}.".format(known_dmr.split('/')[-1]))
            vcf_strand = os.path.abspath(args.strand_vcf)
            final_dict, strand_phased_vars= strand_vcf2dict_phased(vcf_strand, 
                                                           vcf, 
                                                           args.include_all_variants,
                                                           chrom)
        elif args.strand_vcf is not None and args.phased:
            # print("Using both strand-seq phased and phased blocks in"
            #               "vcf with {}.".format(known_dmr.split('/')[-1]))
            vcf_strand = os.path.abspath(args.strand_vcf)
            blocks_phased, blocks_dict= get_block(vcf)
            (final_dict,
             strand_phased_vars,
             phase_block_stat)= vcf2dict_phased(blocks_phased, 
                                                vcf_strand,
                                                hapRatio,
                                                minvariantblock,
                                                vcf,
                                                args.include_all_variants,
                                                chrom)
        else:
            raise Exception("No strand-seq vcf is given.")
        
        if strand_phased_vars==0:
            warnings.warn("No phased strand-seq variant in {}. Skipping it.".format(chrom))
            assignment_file= open(out + '_PofO_Assignment.vcf', 'a')
            records_chrom= vcf_tb.query(chrom, 0, chroms[chrom]+1)
            for line in records_chrom:
                if 'PS' in line[8].split(":"):
                    ps_index= line[8].split(":").index('PS')
                    new_ps= line[8].split(":")
                    new_hp= line[9].split(":")
                    new_ps.pop(ps_index)
                    new_hp.pop(ps_index)
                else:
                    new_ps= line[8].split(":")
                    new_hp= line[9].split(":")
                assignment_file.write('\t'.join(line[0:8]+
                                        [':'.join(new_ps)]+
                                        [':'.join(new_hp).replace("|", "/").
                                                          replace("1/0", "0/1").
                                                          replace("2/1", "1/2")]+
                                        ["NA"]*17)+'\n')
            assignment_file.close()
            if args.sv_vcf is not None:
                all_sv_files=args.sv_vcf
                for sv_file in all_sv_files:
                    sv_assignment_file= open(out +"_"+os.path.basename(sv_file) + 
                                             '_PofO_Assignment_SVs.vcf', 'a')
                    with openfile(sv_file) as vf:
                        for line in vf:
                            line=line.rstrip().split("\t")
                            if line[0]==chrom:
                                gt=line[9].split(":")[0]
                                if 'PS' in line[8].split(":"):
                                    ps_index= line[8].split(":").index('PS')
                                    new_ps= line[8].split(":")
                                    new_hp= line[9].split(":")
                                    new_ps.pop(ps_index)
                                    new_hp.pop(ps_index)
                                else:
                                    new_ps= line[8].split(":")
                                    new_hp= line[9].split(":")
                                sv_assignment_file.write('\t'.join(line[0:8]+
                                                              [':'.join(new_ps)]+
                                                              [line[9].replace("|", "/").
                                                                       replace("1/0", "0/1").
                                                                       replace("2/1", "1/2")]+
                                                              ["NA"]*4)+'\n')
                    sv_assignment_file.close()
            continue
        
        (variant_dict,
         read_dict_HP1, 
         read_dict_HP2,
         per_var_cov)= per_read_variant(final_dict,
                                        bam_file,
                                        chunk,
                                        processes,
                                        args.mapping_quality,
                                        args.include_supplementary)
        if not read_dict_HP1 and not read_dict_HP1:
            continue
        reads_hap= dict()
        variant_dict_HP1= defaultdict(lambda: defaultdict(int))
        variant_dict_HP2= defaultdict(lambda: defaultdict(int))
        variant_dict_HP1_ave= defaultdict(lambda: defaultdict(int))
        variant_dict_HP2_ave= defaultdict(lambda: defaultdict(int))
        for key,val in read_dict_HP1.items():
            hp1_count= val
            hp2_count= read_dict_HP2[key]
            if (hp1_count > hp2_count and 
                hp1_count/(hp1_count+hp2_count) >= hapRatio and 
                hp1_count >= minvariant):
                reads_hap[key]= 1
            elif (hp2_count > hp1_count and 
                  hp2_count/(hp1_count+hp2_count) >= hapRatio and 
                  hp2_count >= minvariant):
                reads_hap[key]= 2
        for key,val in variant_dict.items():
            read_count = 0
            hp1_count = 0
            hp2_count = 0
            for read in val:
                read_count += 1
                hp1_count += read_dict_HP1[(key[0],read)]
                hp2_count += read_dict_HP2[(key[0],read)]
                if (key[0],read) in reads_hap:
                    if reads_hap[(key[0],read)] == 1:
                        variant_dict_HP1[(key[0],key[1])][key[2]] += 1 
                    elif reads_hap[(key[0],read)] == 2:
                        variant_dict_HP2[(key[0],key[1])][key[2]] += 1
            variant_dict_HP1_ave[(key[0],key[1])] = round(hp1_count/read_count,3)
            variant_dict_HP2_ave[(key[0],key[1])] = round(hp2_count/read_count,3)
        read_dict_HP1.clear()
        read_dict_HP2.clear()
        reads_hap.clear()
        out_non_pofo = out + '_temp_NonPofO_HP1-HP2_Reassignment.vcf'
        re_assignment= open(out_non_pofo,'w')
        records_chrom= vcf_tb.query(chrom, 0, chroms[chrom]+1)
        with openfile(vcf) as vf_file:
            for line in vf_file:
                if line.startswith("#"):
                    re_assignment.write(line)
        for line in records_chrom:
            if not args.include_all_variants and line[6] not in ["PASS","."]:
                continue
            if line[9].startswith(("0/1","1/0","0|1","1|0")):
                hp1_count_alt= variant_dict_HP1[(line[0],int(line[1])-1)][line[4].upper()]
                hp2_count_alt= variant_dict_HP2[(line[0],int(line[1])-1)][line[4].upper()]
                hp1_count_ref= variant_dict_HP1[(line[0],int(line[1])-1)][line[3].upper()]
                hp2_count_ref= variant_dict_HP2[(line[0],int(line[1])-1)][line[3].upper()]
                if hp1_count_alt > 0 or hp1_count_ref > 0:
                    hp1_alt_ratio= hp1_count_alt/(hp1_count_alt+hp1_count_ref)
                    hp1_ref_ratio= hp1_count_ref/(hp1_count_alt+hp1_count_ref)
                else:
                    hp1_alt_ratio= 0
                    hp1_ref_ratio= 0
                if hp2_count_alt > 0 or hp2_count_ref > 0:
                    hp2_alt_ratio= hp2_count_alt/(hp2_count_alt+hp2_count_ref)
                    hp2_ref_ratio= hp2_count_ref/(hp2_count_alt+hp2_count_ref)
                else:
                    hp2_alt_ratio= 0
                    hp2_ref_ratio= 0
                if ((hp1_count_alt > hp2_count_alt and
                     hp1_alt_ratio > hp2_alt_ratio and
                     hp1_count_alt >= min_read_reassignment) or
                    (hp2_count_ref > hp1_count_ref and
                     hp2_ref_ratio > hp1_ref_ratio and
                     hp2_count_ref >= min_read_reassignment)):
                    re_assignment.write('\t'.join(line[0:8]+["GT","1|0"])+'\n')
                                                  # [':'.join(new_ps)+":PS"]+
                                                  # ["1|0:"+':'.join(new_hp[1:])+":HP2|HP1"])+'\n')
                elif ((hp2_count_alt > hp1_count_alt and
                       hp2_alt_ratio > hp1_alt_ratio and
                       hp2_count_alt >= min_read_reassignment) or
                      (hp1_count_ref > hp2_count_ref and
                       hp1_ref_ratio > hp2_ref_ratio and
                       hp1_count_ref >= min_read_reassignment)):
                    re_assignment.write('\t'.join(line[0:8]+["GT","0|1"])+'\n')

        re_assignment.close()
        variant_dict_HP1.clear()
        variant_dict_HP2.clear()
        subprocess.run("bgzip -f {0} && tabix -p vcf -f {0}.gz"
                       "".format(out_non_pofo),
           shell=True,
           check=True)
        try:
            if args.include_supplementary:
                subprocess.run("whatshap haplotag --ignore-read-groups --tag-supplementary "
                               "--skip-missing-contigs --regions {} "
                               "--output-threads {} --reference {} -o {} {} "
                               "{}".format(chrom,processes,reference,out+"_temp-NonPofO.bam",
                                           out_non_pofo+".gz",bam_file),
                               shell=True,
                               check=True)
            else:
                subprocess.run("whatshap haplotag --ignore-read-groups "
                               "--skip-missing-contigs --regions {} "
                               "--output-threads {} --reference {} -o {} {} "
                               "{}".format(chrom,processes,reference,out+"_temp-NonPofO.bam",
                                           out_non_pofo+".gz",bam_file),
                               shell=True,
                               check=True)
        except:
            raise Exception("python subprocess for longphase NonPofO bam failed.")
        try:
            subprocess.run("samtools index -@ {} {}".format(processes,
                                                            out+"_temp-NonPofO.bam"),
                           shell=True,
                           check=True)
        except:
            raise Exception("python subprocess for samtools index NonPofO bam failed.")    
        subprocess.run("samtools view -d HP:1 -@ {} {} | "
                       "awk -F'\t' '{{print $3,$1,1}}' OFS='\t' "
                       "> {}".format(processes,out+"_temp-NonPofO.bam",
                                                         out+"_temp-NonPofO_reads.tsv"),
                       shell=True,
                       check=True)
        subprocess.run("samtools view -d HP:2 -@ {} {} | "
                       "awk -F'\t' '{{print $3,$1,2}}' OFS='\t' "
                       ">> {}".format(processes,out+"_temp-NonPofO.bam",
                                                         out+"_temp-NonPofO_reads.tsv"),
                       shell=True,
                       check=True)
        
        reads_hap= dict()
        variant_dict_HP1= defaultdict(lambda: defaultdict(int))
        variant_dict_HP2= defaultdict(lambda: defaultdict(int))
        variant_dict_HP1_all= defaultdict(int)
        variant_dict_HP2_all= defaultdict(int)
        with openfile(out+"_temp-NonPofO_reads.tsv") as nonpofo:
            for line in nonpofo:
                line=line.rstrip().split('\t')
                if line[2]=="1":
                    reads_hap[(line[0],line[1])]= 1
                elif line[2]=="2":
                    reads_hap[(line[0],line[1])]= 2
    
        for key,val in variant_dict.items():
            for read in val:
                if (key[0],read) in reads_hap:
                    if reads_hap[(key[0],read)] == 1:
                        variant_dict_HP1[(key[0],key[1])][key[2]] += 1 
                        variant_dict_HP1_all[(key[0],key[1])] += 1
                    elif reads_hap[(key[0],read)] == 2:
                        variant_dict_HP2[(key[0],key[1])][key[2]] += 1 
                        variant_dict_HP2_all[(key[0],key[1])] += 1
        variant_dict.clear()
        
        out_freqhp1= out + '_temp_NonPofO_HP1-HP2_MethylationHP1.tsv'
        out_freqhp2= out + '_temp_NonPofO_HP1-HP2_MethylationHP2.tsv'
        out_freq_methbam(out, processes, reference,
                         os.path.abspath(args.pb_cpg_tools_model), args.pacbio)
    
        subprocess.run("{} {} {} {} {} {} {} {} {} {} {}".format("Rscript",
                                                    os.path.join(os.path.dirname(
                                                                os.path.realpath(__file__)
                                                                    ),
                                                              "DMA_UsingDSS.R"),
                                                    out_freqhp1,
                                                    out_freqhp2,
                                                    out+"_temp",
                                                    args.equal_disp,
                                                    args.smoothing_flag,
                                                    args.smoothing_span,
                                                    args.delta_cutoff,
                                                    args.pvalue,
                                                    dss_processes),
                       shell=True,
                       check=True)
        subprocess.run("sed '1d' {0} >> {1} && "
                       "bgzip -f {0} && tabix -f -S 1 -p bed {0}.gz"
                       "".format(out+"_temp_DMLtest.tsv", out+"_DMLtest.tsv"),
           shell=True,
           check=True)
        subprocess.run("sed '1d' {0} >> {1} && "
                       "bgzip -f {0} && tabix -f -S 1 -p bed {0}.gz"
                       "".format(out+"_temp_callDML.tsv",out+"_callDML.tsv"),
           shell=True,
           check=True)
        (chrom_hp_origin_count,
         chrom_hp_origin_count_all_cg,
         chrom_hp_origin_count_diff_cg,
         chrom_hp_origin_count_dmr)= PofO_dmr(known_dmr, 
                                              chrom,
                                              out,
                                              min_cg,
                                              cpg_difference)
        if chrom_hp_origin_count:
            for key, val in chrom_hp_origin_count.items():
                chrom, origin= key
                if origin.lower() == 'maternal':
                    hp1_cg_count= val['HP1']
                    hp2_cg_count= val['HP2']
                    hp1_dmr_count= chrom_hp_origin_count_dmr[(chrom, 'maternal')]['HP1']
                    hp2_dmr_count= chrom_hp_origin_count_dmr[(chrom, 'maternal')]['HP2']
                    hp1_allcg_count= chrom_hp_origin_count_all_cg[(chrom, 'maternal')]['HP1']
                    hp2_allcg_count= chrom_hp_origin_count_all_cg[(chrom, 'maternal')]['HP2']
                    hp1_alldiffcg_count= chrom_hp_origin_count_diff_cg[(chrom, 'maternal')]['HP1']
                    hp2_alldiffcg_count= chrom_hp_origin_count_diff_cg[(chrom, 'maternal')]['HP2']
                    if hp1_cg_count > hp2_cg_count:
                        chrom_hp_origin[chrom]['HP1'] = ['maternal',
                                                         hp1_cg_count, hp2_cg_count,
                                                         hp1_dmr_count, hp2_dmr_count,
                                                         hp1_alldiffcg_count, hp2_alldiffcg_count,
                                                         hp1_allcg_count, hp2_allcg_count]
                                                          
                        chrom_hp_origin[chrom]['HP2'] = ['paternal',
                                                         hp1_cg_count, hp2_cg_count,
                                                         hp1_dmr_count, hp2_dmr_count,
                                                         hp1_alldiffcg_count, hp2_alldiffcg_count,
                                                         hp1_allcg_count, hp2_allcg_count]
                    elif hp2_cg_count > hp1_cg_count:
                        chrom_hp_origin[chrom]['HP2'] = ['maternal',
                                                         hp2_cg_count, hp1_cg_count,
                                                         hp2_dmr_count, hp1_dmr_count,
                                                         hp2_alldiffcg_count, hp1_alldiffcg_count,
                                                         hp2_allcg_count, hp1_allcg_count]
                        chrom_hp_origin[chrom]['HP1'] = ['paternal',
                                                         hp2_cg_count, hp1_cg_count,
                                                         hp2_dmr_count, hp1_dmr_count,
                                                         hp2_alldiffcg_count, hp1_alldiffcg_count,
                                                         hp2_allcg_count, hp1_allcg_count]
                elif origin.lower() == 'paternal':
                    hp1_cg_count= val['HP1']
                    hp2_cg_count= val['HP2']
                    hp1_dmr_count= chrom_hp_origin_count_dmr[(chrom, 'paternal')]['HP1']
                    hp2_dmr_count= chrom_hp_origin_count_dmr[(chrom, 'paternal')]['HP2']
                    hp1_allcg_count= chrom_hp_origin_count_all_cg[(chrom, 'paternal')]['HP1']
                    hp2_allcg_count= chrom_hp_origin_count_all_cg[(chrom, 'paternal')]['HP2']
                    hp1_alldiffcg_count= chrom_hp_origin_count_diff_cg[(chrom, 'paternal')]['HP1']
                    hp2_alldiffcg_count= chrom_hp_origin_count_diff_cg[(chrom, 'paternal')]['HP2']
                    if hp1_cg_count > hp2_cg_count:
                        chrom_hp_origin[chrom]['HP1'] = ['paternal',
                                                         hp1_cg_count, hp2_cg_count,
                                                         hp1_dmr_count, hp2_dmr_count,
                                                         hp1_alldiffcg_count, hp2_alldiffcg_count,
                                                         hp1_allcg_count, hp2_allcg_count]
                        chrom_hp_origin[chrom]['HP2'] = ['maternal',
                                                         hp1_cg_count, hp2_cg_count,
                                                         hp1_dmr_count, hp2_dmr_count,
                                                         hp1_alldiffcg_count, hp2_alldiffcg_count,
                                                         hp1_allcg_count, hp2_allcg_count]
                    elif hp2_cg_count > hp1_cg_count:
                        chrom_hp_origin[chrom]['HP2'] = ['paternal',
                                                         hp2_cg_count, hp1_cg_count,
                                                         hp2_dmr_count, hp1_dmr_count,
                                                         hp2_alldiffcg_count, hp1_alldiffcg_count,
                                                         hp2_allcg_count, hp1_allcg_count]
                        chrom_hp_origin[chrom]['HP1'] = ['maternal',
                                                         hp2_cg_count, hp1_cg_count,
                                                         hp2_dmr_count, hp1_dmr_count,
                                                         hp2_alldiffcg_count, hp1_alldiffcg_count,
                                                         hp2_allcg_count, hp1_allcg_count]
        if chrom in chrom_hp_origin:
            if chrom_hp_origin[chrom]['HP1'][0] == 'maternal':
                bam_chrom_check.add(chrom)
                subprocess.run("samtools view -@ {} {}"
                               " | bgzip >> {}".format(processes, out+"_temp-NonPofO.bam",
                                                      out+"_Temp_PofO_Tagged.sam.gz"),
                               shell=True,
                               check=True)
            elif chrom_hp_origin[chrom]['HP2'][0] == 'maternal':
                bam_chrom_check.add(chrom)
                subprocess.run("samtools view -@ {} {} | "
                               "sed 's/HP:i:1/HP:i:paternal/g;s/HP:i:2/HP:i:1/g;s/HP:i:paternal/HP:i:2/g'"
                               " | bgzip >> {}".format(processes, out+"_temp-NonPofO.bam",
                                                      out+"_Temp_PofO_Tagged.sam.gz"),
                               shell=True,
                               check=True)
            
        assignment_file= open(out + '_PofO_Assignment.vcf', 'a')
        records_chrom= vcf_tb.query(chrom, 0, chroms[chrom]+1)
        for line in records_chrom:
            block_id= line[9].split(":")[-1]
            if 'PS' in line[8].split(":"):
                ps_index= line[8].split(":").index('PS')
                new_ps= line[8].split(":")
                new_hp= line[9].split(":")
                new_ps.pop(ps_index)
                new_hp.pop(ps_index)
            else:
                new_ps= line[8].split(":")
                new_hp= line[9].split(":")
            
            if not args.include_all_variants and line[6] not in ["PASS","."]:
                assignment_file.write('\t'.join(line[0:8]+
                                      [':'.join(new_ps)]+
                                      [':'.join(new_hp).replace("|", "/").
                                                       replace("1/0", "0/1").
                                                       replace("2/1", "1/2")]+
                                      ["NA"]*17)+'\n')
                continue
            
            if line[9].startswith(("0/1","1/0","0|1","1|0",
                                   "1/2","1|2","2/1","2|1")):
                if ((len(line[3]) == 1 and len(line[4]) == 1) or 
                    (len(line[3]) == 1 and len(line[4]) == 3 and "," in line[4])):
                    info_out_dict[line[0]]["all_het_snvs"] += 1
                else:
                    info_out_dict[line[0]]["all_het_indels"] += 1
            if line[9].startswith(("0/1","1/0","0|1","1|0")):
                hp1_count_alt= variant_dict_HP1[(line[0],int(line[1])-1)][line[4].upper()]
                hp2_count_alt= variant_dict_HP2[(line[0],int(line[1])-1)][line[4].upper()]
                hp1_count_ref= variant_dict_HP1[(line[0],int(line[1])-1)][line[3].upper()]
                hp2_count_ref= variant_dict_HP2[(line[0],int(line[1])-1)][line[3].upper()]
                if hp1_count_alt > 0 or hp1_count_ref > 0:
                    hp1_alt_ratio= hp1_count_alt/(hp1_count_alt+hp1_count_ref)
                    hp1_ref_ratio= hp1_count_ref/(hp1_count_alt+hp1_count_ref)
                else:
                    hp1_alt_ratio= 0
                    hp1_ref_ratio= 0
                if hp2_count_alt > 0 or hp2_count_ref > 0:
                    hp2_alt_ratio= hp2_count_alt/(hp2_count_alt+hp2_count_ref)
                    hp2_ref_ratio= hp2_count_ref/(hp2_count_alt+hp2_count_ref)
                else:
                    hp2_alt_ratio= 0
                    hp2_ref_ratio= 0
                hp1_cov= str(variant_dict_HP1_all[(line[0],int(line[1])-1)])
                hp2_cov= str(variant_dict_HP2_all[(line[0],int(line[1])-1)])
                all_cov= str(per_var_cov[(line[0],int(line[1])-1)])
                
                additional_info1=[all_cov,hp1_cov,hp2_cov,
                                 str(hp1_count_ref), str(hp2_count_ref),
                                 str(hp1_count_alt),str(hp2_count_alt)]
                if (line[0],block_id,"agreement") in phase_block_stat:
                    additional_info2= [str(variant_dict_HP1_ave[(line[0],int(line[1])-1)]),
                                    str(variant_dict_HP2_ave[(line[0],int(line[1])-1)]),
                                    blocks_dict[(line[0],block_id)][0],
                                    blocks_dict[(line[0],block_id)][1],
                                    str(phase_block_stat[(line[0],block_id,"agreement")]),
                                    str(phase_block_stat[(line[0],block_id,"disagreement")])]
                else:
                    additional_info2= [str(variant_dict_HP1_ave[(line[0],int(line[1])-1)]),
                                       str(variant_dict_HP2_ave[(line[0],int(line[1])-1)]),
                                       "NA","NA","NA","NA"]
                if line[0] in chrom_hp_origin:
                    if ((hp1_count_alt > hp2_count_alt and
                         hp1_alt_ratio > hp2_alt_ratio and
                         hp1_count_alt >= min_read_reassignment) or
                        (hp2_count_ref > hp1_count_ref and
                         hp2_ref_ratio > hp1_ref_ratio and
                         hp2_count_ref >= min_read_reassignment)):
                        if len(line[3]) == 1 and len(line[4]) == 1:
                            info_out_dict[line[0]]["pofo_het_snvs"] += 1
                        else:
                            info_out_dict[line[0]]["pofo_het_indels"] += 1
                        if chrom_hp_origin[line[0]]['HP2'][0] == 'maternal':
                            assignment_file.write('\t'.join(line[0:8]+
                                                [':'.join(new_ps)+":PS"]+
                                                ["0|1:"+':'.join(new_hp[1:])+":Mat|Pat"]+
                                                additional_info1+
                                                [str(hp1_count_ref), str(hp2_count_ref),
                                                 str(hp2_count_alt),str(hp1_count_alt)]+
                                                 additional_info2)+'\n')
                        if chrom_hp_origin[line[0]]['HP2'][0] == 'paternal':
                            assignment_file.write('\t'.join(line[0:8]+
                                                      [':'.join(new_ps)+":PS"]+
                                                      ["1|0:"+':'.join(new_hp[1:])+":Pat|Mat"]+
                                                      additional_info1+
                                                      [str(hp2_count_ref), str(hp1_count_ref),
                                                       str(hp1_count_alt),str(hp2_count_alt)]+
                                                      additional_info2)+'\n')
                    elif ((hp2_count_alt > hp1_count_alt and
                           hp2_alt_ratio > hp1_alt_ratio and
                           hp2_count_alt >= min_read_reassignment) or
                          (hp1_count_ref > hp2_count_ref and
                           hp1_ref_ratio > hp2_ref_ratio and
                           hp1_count_ref >= min_read_reassignment)):
                        if len(line[3]) == 1 and len(line[4]) == 1:
                            info_out_dict[line[0]]["pofo_het_snvs"] += 1
                        else:
                            info_out_dict[line[0]]["pofo_het_indels"] += 1
                        if chrom_hp_origin[line[0]]['HP1'][0] == 'maternal':
                            assignment_file.write('\t'.join(line[0:8]+
                                                          [':'.join(new_ps)+":PS"]+
                                                          ["0|1:"+':'.join(new_hp[1:])+":Mat|Pat"]+
                                                          additional_info1+
                                                          [str(hp2_count_ref), str(hp1_count_ref),
                                                           str(hp1_count_alt),str(hp2_count_alt)]+
                                                          additional_info2)+'\n')
                        if chrom_hp_origin[line[0]]['HP1'][0] == 'paternal':
                            assignment_file.write('\t'.join(line[0:8]+
                                                          [':'.join(new_ps)+":PS"]+
                                                          ["1|0:"+':'.join(new_hp[1:])+":Pat|Mat"]+
                                                          additional_info1 +
                                                          [str(hp1_count_ref), str(hp2_count_ref),
                                                           str(hp2_count_alt),str(hp1_count_alt)]+
                                                          additional_info2)+'\n')
                    else:
                        assignment_file.write('\t'.join(line[0:8]+
                                                      [':'.join(new_ps)]+
                                                      [':'.join(new_hp).replace("|", "/").
                                                                       replace("1/0", "0/1")]+
                                                      additional_info1 +
                                                       ["aaaaaaaaaaaaaaaNA"]*4 +
                                                       additional_info2)+'\n')
                else:
                    assignment_file.write('\t'.join(line[0:8]+
                                                  [':'.join(new_ps)]+
                                                  [':'.join(new_hp).replace("|", "/").
                                                                   replace("1/0", "0/1")]+
                                                  additional_info1 +
                                                   ["bbbbbbbbbbbbNA"]*4 +
                                                   additional_info2)+'\n')
                
            elif line[9].startswith(('1/2','1|2','2/1','2|1')):
                hp1_count_alt= variant_dict_HP1[(line[0],int(line[1])-1)][line[4].split(',')[1].upper()]
                hp2_count_alt= variant_dict_HP2[(line[0],int(line[1])-1)][line[4].split(',')[1].upper()]
                hp1_count_ref= variant_dict_HP1[(line[0],int(line[1])-1)][line[4].split(',')[0].upper()]
                hp2_count_ref= variant_dict_HP2[(line[0],int(line[1])-1)][line[4].split(',')[0].upper()]
                if hp1_count_alt > 0 or hp1_count_ref > 0:
                    hp1_alt_ratio= hp1_count_alt/(hp1_count_alt+hp1_count_ref)
                    hp1_ref_ratio= hp1_count_ref/(hp1_count_alt+hp1_count_ref)
                else:
                    hp1_alt_ratio= 0
                    hp1_ref_ratio= 0
                if hp2_count_alt > 0 or hp2_count_ref > 0:
                    hp2_alt_ratio= hp2_count_alt/(hp2_count_alt+hp2_count_ref)
                    hp2_ref_ratio= hp2_count_ref/(hp2_count_alt+hp2_count_ref)
                else:
                    hp2_alt_ratio= 0
                    hp2_ref_ratio= 0
                hp1_cov= str(variant_dict_HP1_all[(line[0],int(line[1])-1)])
                hp2_cov= str(variant_dict_HP2_all[(line[0],int(line[1])-1)])
                all_cov= str(per_var_cov[(line[0],int(line[1])-1)])
                additional_info1=[all_cov,hp1_cov,hp2_cov,
                                 str(hp1_count_ref), str(hp2_count_ref),
                                 str(hp1_count_alt),str(hp2_count_alt)]
                if (line[0],block_id,"agreement") in phase_block_stat:
                    additional_info2= [str(variant_dict_HP1_ave[(line[0],int(line[1])-1)]),
                                    str(variant_dict_HP2_ave[(line[0],int(line[1])-1)]),
                                    blocks_dict[(line[0],block_id)][0],
                                    blocks_dict[(line[0],block_id)][1],
                                    str(phase_block_stat[(line[0],block_id,"agreement")]),
                                    str(phase_block_stat[(line[0],block_id,"disagreement")])]
                else:
                    additional_info2= [str(variant_dict_HP1_ave[(line[0],int(line[1])-1)]),
                                       str(variant_dict_HP2_ave[(line[0],int(line[1])-1)]),
                                       "NA","NA","NA","NA"]
                if line[0] in chrom_hp_origin:    
                    if ((hp1_count_alt > hp2_count_alt and
                         hp1_alt_ratio > hp2_alt_ratio and
                         hp1_count_alt >= min_read_reassignment) or
                        (hp2_count_ref > hp1_count_ref and
                         hp2_ref_ratio > hp1_ref_ratio and
                         hp2_count_ref >= min_read_reassignment)):
                        if len(line[3]) == 1 and len(line[4]) == 3:
                            info_out_dict[line[0]]["pofo_het_snvs"] += 1
                        else:
                            info_out_dict[line[0]]["pofo_het_indels"] += 1
                        
                        if chrom_hp_origin[line[0]]['HP2'][0] == 'maternal':
                            assignment_file.write('\t'.join(line[0:8]+
                                                [':'.join(new_ps)+":PS"]+
                                                ["1|2:"+':'.join(new_hp[1:])+":Ref_Mat|Pat"]+
                                                additional_info1 +
                                                [str(hp1_count_ref), str(hp2_count_ref),
                                                 str(hp2_count_alt),str(hp1_count_alt)] +
                                                additional_info2)+'\n')
                        if chrom_hp_origin[line[0]]['HP2'][0] == 'paternal':
                            assignment_file.write('\t'.join(line[0:8]+
                                                [':'.join(new_ps)+":PS"]+
                                                ["1|2:"+':'.join(new_hp[1:])+":Ref_Pat|Mat"]+
                                                additional_info1 +
                                                [str(hp2_count_ref), str(hp1_count_ref),
                                                 str(hp1_count_alt),str(hp2_count_alt)] +
                                                additional_info2)+'\n')
                            
                    elif ((hp2_count_alt > hp1_count_alt and
                           hp2_alt_ratio > hp1_alt_ratio and
                           hp2_count_alt >= min_read_reassignment) or
                          (hp1_count_ref > hp2_count_ref and
                           hp1_ref_ratio > hp2_ref_ratio and
                           hp1_count_ref >= min_read_reassignment)):
                        if len(line[3]) == 1 and len(line[4]) == 3:
                            info_out_dict[line[0]]["pofo_het_snvs"] += 1
                        else:
                            info_out_dict[line[0]]["pofo_het_indels"] += 1
                        if chrom_hp_origin[line[0]]['HP2'][0] == 'maternal':
                            assignment_file.write('\t'.join(line[0:8]+
                                                [':'.join(new_ps)+":PS"]+
                                                ["1|2:"+':'.join(new_hp[1:])+":Ref_Pat|Mat"]+
                                                additional_info1 +
                                                [str(hp1_count_ref), str(hp2_count_ref),
                                                 str(hp2_count_alt),str(hp1_count_alt)] +
                                                additional_info2)+'\n')
                        if chrom_hp_origin[line[0]]['HP2'][0] == 'paternal':
                            assignment_file.write('\t'.join(line[0:8]+
                                                [':'.join(new_ps)+":PS"]+
                                                ["1|2:"+':'.join(new_hp[1:])+":Ref_Mat|Pat"]+
                                                additional_info1 +
                                                [str(hp2_count_ref), str(hp1_count_ref),
                                                 str(hp1_count_alt),str(hp2_count_alt)] +
                                                additional_info2)+'\n')
                        
                    else:
                        assignment_file.write('\t'.join(line[0:8]+
                                            [':'.join(new_ps)]+
                                            [':'.join(new_hp).replace("|", "/").
                                                             replace("2/1", "1/2")]+
                                            additional_info1 +
                                             ["ccccccccccccccNA"]*4 +
                                             additional_info2)+'\n')
                else:
                    assignment_file.write('\t'.join(line[0:8]+
                                        [':'.join(new_ps)]+
                                        [':'.join(new_hp).replace("|", "/").
                                                         replace("2/1", "1/2")]+
                                        additional_info1 +
                                         ["ddddddddddddddNA"]*4 +
                                         additional_info2)+'\n')
            else:
                assignment_file.write('\t'.join(line[0:8]+
                                              [':'.join(new_ps)]+
                                              [':'.join(new_hp).replace("|", "/")]+
                                              ["NA"]*17)+'\n')
        assignment_file.close()                
        if args.sv_vcf is not None:
            print("Assigning PofO to SVs from {}".format(chrom))
            all_sv_files=args.sv_vcf
            for sv_file in all_sv_files:
                sv_assignment_file= open(out +"_"+os.path.basename(sv_file) + 
                                         '_PofO_Assignment_SVs.vcf', 'a')
                with openfile(sv_file) as vf:
                    for line in vf:
                        line=line.rstrip().split("\t")
                        if line[0]!=chrom:
                            continue
                        gt=line[9].split(":")[0]
                        if 'PS' in line[8].split(":"):
                            ps_index= line[8].split(":").index('PS')
                            new_ps= line[8].split(":")
                            new_hp= line[9].split(":")
                            new_ps.pop(ps_index)
                            new_hp.pop(ps_index)
                        else:
                            new_ps= line[8].split(":")
                            new_hp= line[9].split(":")
                        if not args.include_all_variants and line[6] not in ["PASS","."]:
                            sv_assignment_file.write('\t'.join(line[0:8]+
                                                  [':'.join(new_ps)]+
                                                  [':'.join(new_hp).replace("|", "/").
                                                                   replace("1/0", "0/1").
                                                                   replace("2/1", "1/2")]+
                                                  ["NA"]*4)+'\n')
                            continue
                        if line[0] in chrom_hp_origin:
                            if gt in ["0/1","1/0","0|1","1|0"] and "RNAMES=" in line[7]:
                                hp1_count= 0
                                hp2_count= 0
                                for read_ID in line[7].split("RNAMES=")[1].split(";")[0].split(","):
                                    if (chrom,read_ID) in reads_hap:
                                        if reads_hap[(chrom,read_ID)]==1:
                                            hp1_count+= 1
                                        elif reads_hap[(chrom,read_ID)]==2:
                                            hp2_count+= 1
                                if (hp1_count > hp2_count and
                                    hp1_count >= min_read_reassignment):
                                    if chrom_hp_origin[line[0]]['HP2'][0] == 'maternal':
                                        sv_assignment_file.write('\t'.join(line[0:8]+
                                                                  [':'.join(new_ps)+":PS"]+
                                                                  ["0|1:"+':'.join(new_hp[1:])+":Mat|Pat"]+
                                                                  [str(hp1_count), str(hp2_count),
                                                                   str(hp2_count), str(hp1_count)])+'\n')
                                    if chrom_hp_origin[line[0]]['HP2'][0] == 'paternal':
                                        sv_assignment_file.write('\t'.join(line[0:8]+
                                                                  [':'.join(new_ps)+":PS"]+
                                                                  ["1|0:"+':'.join(new_hp[1:])+":Pat|Mat"]+
                                                                  [str(hp1_count), str(hp2_count),
                                                                   str(hp1_count), str(hp2_count)])+'\n')
                                elif (hp2_count > hp1_count and
                                      hp2_count >= min_read_reassignment):
                                    if chrom_hp_origin[line[0]]['HP1'][0] == 'maternal':
                                        sv_assignment_file.write('\t'.join(line[0:8]+
                                                                      [':'.join(new_ps)+":PS"]+
                                                                      ["0|1:"+':'.join(new_hp[1:])+":Mat|Pat"]+
                                                                      [str(hp1_count), str(hp2_count),
                                                                       str(hp1_count), str(hp2_count)])+'\n')
                                    if chrom_hp_origin[line[0]]['HP1'][0] == 'paternal':
                                        sv_assignment_file.write('\t'.join(line[0:8]+
                                                                      [':'.join(new_ps)+":PS"]+
                                                                      ["1|0:"+':'.join(new_hp[1:])+":Pat|Mat"]+
                                                                      [str(hp1_count), str(hp2_count),
                                                                       str(hp2_count), str(hp1_count)])+'\n')
                                else:
                                    sv_assignment_file.write('\t'.join(line[0:8]+
                                                                  [':'.join(new_ps)]+
                                                                  [line[9].replace("|", "/").
                                                                            replace("1/0", "0/1")]+
                                                                  [str(hp1_count), str(hp2_count),
                                                                   "NA","NA"])+'\n')
                                        
                            else:
                                sv_assignment_file.write('\t'.join(line[0:8]+
                                                              [':'.join(new_ps)]+
                                                              [line[9].replace("|", "/").
                                                               replace("1/0", "0/1").
                                                               replace("2/1", "1/2")]+
                                                              ["NA"]*4)+'\n')
                        else:
                            sv_assignment_file.write('\t'.join(line[0:8]+
                                                          [':'.join(new_ps)]+
                                                          [line[9].replace("|", "/").
                                                           replace("1/0", "0/1").
                                                           replace("2/1", "1/2")]+
                                                          ["NA"]*4)+'\n')
                sv_assignment_file.close()
        subprocess.run("rm {}*".format(out+"_temp"),
                        shell=True,
                        check=True)
    reads_hap.clear()
    subprocess.run("cut -f1-2,4-5,7,9- {} | sed '/^##/d'"
                   " > {}".format(out + '_PofO_Assignment.vcf',
                                  out + '_PofO_Assignment_Info.tsv'),
                    shell=True,
                    check=True)
    subprocess.run("cut -f1-10 {0} > {1} "
                   "&& mv {1} {0}".format(out + '_PofO_Assignment.vcf',
                                               out + '_Temp_PofO_Assignment.vcf'),
                    shell=True,
                    check=True)
    if args.sv_vcf is not None:
        subprocess.run("cut -f1-2,4-5,7,9- {} | sed '/^##/d'"
                       " > {}".format(out +"_"+os.path.basename(sv_file) + 
                                      '_PofO_Assignment_SVs.vcf',
                                      out +"_"+os.path.basename(sv_file) + 
                                      '_PofO_Assignment_SVs_Info.tsv'),
                        shell=True,
                        check=True)
        subprocess.run("cut -f1-10 {0} > {1} "
                       "&& mv {1} {0}".format(out +"_"+os.path.basename(sv_file) + 
                                              '_PofO_Assignment_SVs.vcf',
                                              out +"_Temp_"+os.path.basename(sv_file) + 
                                              '_PofO_Assignment_SVs.vcf'),
                        shell=True,
                        check=True)
    
    subprocess.run("samtools view --remove-tag HP -f4 -@ {} {}"
                        " | bgzip >> {}".format(processes, bam_file,
                                                out+"_Temp_PofO_Tagged.sam.gz"),
                        shell=True,
                        check=True)
    for chrom in bam_choms - bam_chrom_check:
        subprocess.run("samtools view --remove-tag HP -@ {} {} {}"
                            " | bgzip >> {}".format(processes, bam_file, chrom,
                                                    out+"_Temp_PofO_Tagged.sam.gz"),
                            shell=True,
                            check=True)
    subprocess.run("gunzip -c {} | samtools sort -@ {} -O CRAM --write-index "
                   "--reference {} -o {}".format(out+"_Temp_PofO_Tagged.sam.gz",
                                                 processes, reference,
                                                 out+"_PofO_Tagged.cram"),
                   shell=True,
                   check=True)
    subprocess.run("rm {}*".format(out+"_Temp"),
                    shell=True,
                    check=True)
                        
    out_scores= open(out + '_PofO_Scores.tsv','w')
    out_scores.write("Chromosome\tOrigin_HP1\tOrigin_HP2\tPofO_Assignment_Score\t"
                     "Num_Differentially_Methylated_CGs_Supported_PofO_Assignment\t"
                     "Num_Differentially_Methylated_CGs_Conflicted_PofO_Assignment\t"
                     "Num_iDMRs_Supported_PofO_Assignment\t"
                     "Num_iDMRs_Conflicted_PofO_Assignment\t"
                     "Num_All_Differentially_Methylated_CGs_At_Supporting_iDMRs\t"
                     "Num_All_Differentially_Methylated_CGs_At_Conflicting_iDMRs\t"
                     "Num_All_CGs_CouldBeExaminedInBothHaplotypes_At_Supporting_iDMRs\t"
                     "Num_All_CGs_CouldBeExaminedInBothHaplotypes_At_Conflicting_iDMRs\n")
    for chrom,val in chrom_hp_origin.items():
        for hp,score in val.items():
            if hp=="HP1":
                origin_hp1= score[0]
                if origin_hp1 == "maternal":
                    origin_hp2= "Paternal"
                elif origin_hp1 == "paternal":
                    origin_hp2= "Maternal"
                out_scores.write('\t'.join([chrom,origin_hp1,origin_hp2,
                                            str(score[1]/(score[1]+score[2]))] +
                                           list(map(str,score[1:])))+'\n')
    out_scores.close()
    if not args.include_all_variants:
        print("Per chromosome info for the variants with \"PASS\""
              " or \".\" in FILTER column in the input vcf file:")
    else:
        print("Per chromosome info for the variants in the input vcf file:")
    print("chrom\tall_het_snvs\tall_het_indels\t"
          "pofo_assigned_het_snvs\tpofo_assigned_het_indels")
    for key,val in info_out_dict.items():
        print('\t'.join(map(str,[key,val["all_het_snvs"],val["all_het_indels"],
                            val["pofo_het_snvs"],val["pofo_het_indels"]])))


"""
Specific argument parser.
"""
parser = argparse.ArgumentParser(prog='patmat.py', add_help=False,
                                 description="Phasing reads and Methylation "
                                             "using strand-seq and nanopore to determine "
                                             "PofO of each homologous chromosome "
                                             "in a single sample.")
required = parser.add_argument_group("Required arguments")
required.add_argument("--bam", "-b",
                      action="store",
                      type=str,
                      required=True,
                      help="The path to the coordinate sorted bam file with methylation"
                      " tag.")
required.add_argument("--output", "-o",
                      action="store",
                      type=str,
                      required=True,
                      help=("The path to directory and prefix to save "
                            "files. e.g path/to/directory/prefix"))
required.add_argument("--vcf", "-v",
                  action="store",
                  type=str,
                  required=True,
                  default= None,
                  help="The path to the vcf file.")
required.add_argument("--strand_vcf", "-stv",
                      action="store",
                      type=str,
                      required=True,
                      help="The path to the chromosome-scale phased vcf file."
                           " This is the input vcf file that has been phased "
                           "using strand-seq data.")
required.add_argument("--reference", "-ref",
                      action="store",
                      type=str,
                      required=False,
                      help=("If you have given a bam file with methylation tag"
                            " for the --tool_and_callthresh option, then you "
                            "must also give the path to the reference file. "
                            "File must be indexed using samtools faidx."))
optional = parser.add_argument_group("Optional arguments")

# optional.add_argument("--callthresh", "-tc",
#                            action="store",
#                            type=str,
#                            required=False,
#                            default="methbam:0.4",
#                            help=("Software you have used for methylation "
#                                  "calling:Call threshold. Supported files "
#                                  "include methbam (Methylation bam format produced by "
#                                  "guppy basecaller) and per-read CpG methylation calls from "
#                                  "nanoplish (or f5c>=v0.7), megalodon, and deepsignal. "
#                                  "For example, nanopolish:1.5 is when methylation"
#                                  " calling performed by nanopolish and a CpG with"
#                                  " llr >= 1.5 will be considered as methylated "
#                                  "and llr <= -1.5 as unmethylated, anything "
#                                  "in between will be considered as ambiguous"
#                                  " call and ignored. For methbam, megalodon, and"
#                                  " deepsignl call threshold will be delta probability (0-1)"
#                                  ". For example threshold 0.4 means any call >=0.7"
#                                  " is methylated and <=0.3 is not and between"
#                                  " 0.3-0.7 will be ignored. Default is methbam:0.4."
#                                  " If methbam is selected you must also provide path"
#                                  " to the reference file using --reference option."))
optional.add_argument("--pacbio", "-pb",
                            action="store_true",
                            required=False,
                            help="Select this if the reads are from PacBio HiFi.")

optional.add_argument("--known_dmr", "-kd",
                  action="store",
                  type=str,
                  required= False,
                  default= os.path.join(os.path.dirname(
                                                os.path.realpath(__file__)
                                                    ),
                                             "Imprinted_DMR_List_V1.tsv"),
                  help="The path to the input file for known imprinted DMRs. "
                        "File must have the following information in the "
                        "following column order: "
                        "chromosome\tstart\tend\tMethylatedAlleleOrigin "
                        "where the methylated allele origin must be either "
                        "maternal or paternal (First row must be header). "
                        "By default, we use iDMR list in repo's patmat directory.")
optional.add_argument("--phased", "-ph",
                      action="store_true",
                      required=False,
                      help=("Select this option if your input vcf is a phased vcf file"
                            " using long-reads using whatshap or longphase. "
                            "In this case phased-blocks and phased variants "
                            "inside them will be used and strand-seq phased data"
                            " are used to correct the switches accross phase blocks. "
                            "This can be useful when the chromosome-scale"
                            " phased variants from strand-seq are very sparse. "
                            "If selected, vcf file must be indexed using tabix."
                            " Note that phased blocks will be extract from vcf file and"
                            " assumption is that the number after last : sign in "
                            "column 10 is the block ID, as with vcfs phased by "
                            "whatshap or longphase."))
optional.add_argument("--pb_cpg_tools_model", "-pbcg",
                      action="store",
                            type=str,
                            required=False,
                            default=os.path.join('/'.join(os.path.dirname(
                                                    os.path.realpath(__file__)
                                                        ).split('/')[0:-1]),
                                                 "third_parties",
                                                 "pb-CpG-tools-v2.3.2-x86_64-unknown-linux-gnu",
                                                 "models/pileup_calling_model.v1.tflite"),
                      help=("If the data is from PacBio HiFi and you want to perform "
                            "enhanced methylation detection, specify the"
                            " path to the model file . By default "
                            "pileup_calling_model.v1.tflite will be used"))

optional.add_argument("--sv_vcf", "-sv_vcf",
                      action="store",
                      type=str,
                      nargs='+',
                      required=False,
                      default=None,
                      help=("Path to the Structural variation (SV) call"
                            " file if you wish to also add PofO to SVs."
                            " if multiple files are gived, they must be separated by"
                            " space and give the absolute path to each file."
                            " File(s) must include read names (e.g.RNAMES=read1,"
                            "read2,read3) in the 8th column of vcf ."))
optional.add_argument("--black_list", "-bl",
                      action="store",
                      type=str,
                      required=False,
                      default= None,
                      help="List of regions to ignore phased variants at them."
                      " Three first columns must be chromosome\tstart\tend."
                      " If a black list is given the vcf file must be indexed using tabix.")
optional.add_argument("--hapratio", "-hr",
                      action="store",
                      type=float,
                      required=False,
                      default=0.75,
                      help=("0-1. Minimum ratio of variants a read must have "
                            "from a haplotype to assign it to that haplotype. "
                            "Default is 0.75. Note that if you also provide "
                            "--phased option, this option will be "
                            "also used to correct phased-block switches"
                            " using Strand-seq phased variants. In this case,"
                            " it is the minimum ratio of phased variants "
                            "at a block that supports the decision based "
                            "on strand-seq phased variants."))
optional.add_argument("--mapping_quality", "-mq",
                      action="store",
                      type=int,
                      required=False,
                      default=20,
                      help=("An integer value to specify threshold for "
                            "filtering reads based on mapping quality for "
                            "PofO assignment to variants. "
                            "Default is >=20"))
optional.add_argument("--min_variant", "-mv",
                      action="store",
                      type=int,
                      required=False,
                      default=1,
                      help=("Minimum number of phased variants a read must have "
                            "to be considered during variant rephasing."
                            ". Default= 1."))
optional.add_argument("--min_variant_block", "-mvb",
                      action="store",
                      type=int,
                      required=False,
                      default=2,
                      help=("Minimum number of concordant phased variants must a phased block "
                            "have to be used during phase block switch correction"
                            ". Default= 2."))
optional.add_argument("--min_read_number", "-mr",
                      action="store",
                      type=int,
                      required=False,
                      default=2,
                      help=("Minimum number of reads to support a variant"
                            " to assign to each haplotype. Default= 2"))
optional.add_argument("--min_cg", "-mcg",
                      action="store",
                      type=int,
                      required=False,
                      default= 5,
                      help=("Minimum number of CpGs an iDMR must have to "
                            " consider it for PofO assignment. Default is 5."))
optional.add_argument("--cpg_difference", "-cd",
                      action="store",
                      type=float,
                      required=False,
                      default= 0.1,
                      help=("Minimum cut off for the fraction of CpGs between haplotypes "
                            "must be differentially methylated at an iDMR to "
                            "consider it for PofO assignment. Default is 0.1."))
optional.add_argument("--include_all_variants", "-iav",
                      action="store_true",
                      required=False,
                      help="By default, only variants that have \"PASS\" or \".\" "
                           " in the FILTER column of the input vcf file will be used"
                           " during phasing and PofO assignment. Select this flag "
                           "if you want to use all the variants.")
optional.add_argument("--include_supplementary", "-is",
                      action="store_true",
                      required=False,
                      help="Also include supplementary reads.")
# optional.add_argument("--include_indels", "-ind",
#                       action="store_true",
#                       required=False,
#                       help="Also include indels for read phasing to haplotypes.")
optional.add_argument("--processes", "-p",
                      action="store",
                      type=int,
                      required=False,
                      default=4,
                      help="Number of parallel processes. Default is 4.")
optional.add_argument("--chunk_size", "-cs",
                      action="store",
                      type=int,
                      required=False,
                      default=500,
                      help=("Chunk per process. Default is 500"))
optional = parser.add_argument_group("Optional arguments. The following options "
                                     "are DSS options for differential methylation"
                                     " analysis to find differentially methylated "
                                     "CpGs between haplotypes")
optional.add_argument("--delta_cutoff", "-dc",
                            action="store",
                            type=float,
                            default=0.075,
                            required=False,
                            help=("0-1. A threshold for defining differentially "
                                  "methylated loci (DML) or CpGs. In DML testing"
                                  " procedure, hypothesis test that the two groups "
                                  "means are equal is conducted at each CpG site. "
                                  "Here if delta is specified, the function will "
                                  "compute the posterior probability that the "
                                  "difference of the means are greater than delta,"
                                  " and then call DML based on that. Default is 0.075."))
optional.add_argument("--pvalue", "-pv",
                      action="store",
                      type=float,
                      required=False,
                      default= 0.001,
                      help=("0-1. When delta is not specified, this is the "
                            "threshold of p-value for defining DML and "
                            "loci with p-value less than this threshold "
                            "will be deemed DMLs. When delta is specified, "
                            "CpG sites with posterior probability greater than"
                            " 1-pvalue_threshold are deemed DML. Default is 0.001"))
optional.add_argument("--smoothing_span", "-sms",
                            action="store",
                            type=int,
                            default=500,
                            required=False,
                            help=("The size of smoothing window, in "
                                  "basepairs. Default is 500."))
optional.add_argument("--smoothing_flag", "-smf",
                        action="store",
                        type=str,
                        default="TRUE",
                        required=False,
                        help=("TRUE/FALSE. A flag to indicate whether to apply "
                              "smoothing in estimating mean methylation levels."
                              " For more instructions see the DSS R package guide. "
                              "Default is TRUE."))
optional.add_argument("--equal_disp", "-ed",
                        action="store",
                        type=str,
                        default="FALSE",
                        required=False,
                        help=("TRUE/FALSE. A flag to indicate whether the "
                              "dispersion in two groups are deemed equal or not. "
                              "For more instructions see the DSS R package guide. "
                              "Default is FALSE Because there is no biological"
                              " replicate here, you should specify either "
                              "equal_disp TRUE or smoothing_flag TRUE. "
                              "Do not specify both as FALSE."))
optional.add_argument("--dss_processes", "-dp",
                      action="store",
                      type=int,
                      required=False,
                      help="Number of parallel processes use for DSS "
                            "differential methylation analysis. If not given, it will "
                            "be the same as --processes option. Differential methylation "
                            " analysis usually requires high memory and if there"
                            " is not enough memory, specify less number processes"
                            " using this flag for DSS to allocate available memory "
                            "for less processes.")
optional = parser.add_argument_group("Help and version options")
optional.add_argument('--version', action='version', 
                      version='%(prog)s 1.4.0_dev',
                      help= "Print program's version and exit")
optional.add_argument('-h', '--help', action='help', default=argparse.SUPPRESS,
                      help="Print this help and exit.")
args = parser.parse_args()

if __name__ == '__main__':
    main(args)