#!/usr/bin/env python3
#
# Copyright 2015, Daehwan Kim <infphilo@gmail.com>
#
# This file is part of HISAT 2.
#
# HISAT 2 is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# HISAT 2 is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
#


import sys, os, subprocess, re
import inspect, random
import math
from datetime import datetime, date, time
from argparse import ArgumentParser, FileType
from copy import deepcopy
import hisatgenotype_typing_common as typing_common, hisatgenotype_gene_typing as gene_typing, hisatgenotype_assembly_graph as assembly_graph


"""
   var: ['single', 3300, 'G']
   exons: [[301, 373], [504, 822], [1084, 1417], [2019, 2301], [2404, 2520], [2965, 2997], [3140, 3187], [3357, 3361]]
"""
def var_in_exon(var, exons):
    exonic = False
    var_type, var_left, var_data = var
    var_right = var_left
    if var_type == "deletion":
        var_right = var_left + int(var_data) - 1
    for exon_left, exon_right in exons:
        if var_left >= exon_left and var_right <= exon_right:
            return True
    return False


"""
Report variant IDs whose var is within exonic regions
"""
def get_exonic_vars(Vars, exons):
    vars = set()
    for var_id, var in Vars.items():
        var_type, var_left, var_data = var
        var_right = var_left
        if var_type == "deletion":
            var_right = var_left + int(var_data) - 1
        for exon_left, exon_right in exons:
            if var_left >= exon_left and var_right <= exon_right:
                vars.add(var_id)
                
    return vars


"""
Get representative alleles among those that share the same exonic sequences
"""
def get_rep_alleles(Links, exon_vars):
    allele_vars = {}
    for var, alleles in Links.items():
        if var not in exon_vars:
            continue
        for allele in alleles:
            if allele not in allele_vars:
                allele_vars[allele] = set()
            allele_vars[allele].add(var)

    allele_groups = {}
    for allele, vars in allele_vars.items():
        vars = '-'.join(vars)
        if vars not in allele_groups:
            allele_groups[vars] = []
        allele_groups[vars].append(allele)

    allele_reps = {} # allele representatives
    allele_rep_groups = {} # allele groups by allele representatives
    for allele_members in allele_groups.values():
        assert len(allele_members) > 0
        allele_rep = allele_members[0]
        allele_rep_groups[allele_rep] = allele_members
        for allele_member in allele_members:
            assert allele_member not in allele_reps
            allele_reps[allele_member] = allele_rep

    return allele_reps, allele_rep_groups
    

"""
"""
def error_correct(ref_seq,
                  read_seq,
                  read_pos,
                  mpileup,
                  Vars,
                  Var_list,
                  cmp_list,
                  debug = False):
    if debug:
        print >> sys.stderr, cmp_list
        print >> sys.stderr, read_seq

    num_correction = 0
    i = 0
    while i < len(cmp_list):
        type, left, length = cmp_list[i][:3]
        assert length > 0
        if left >= len(ref_seq):
            break
        if type == "match":
            middle_cmp_list = []
            last_j = 0
            for j in range(length):
                if read_pos + j >= len(read_seq) or \
                   left + j >= len(ref_seq):
                    continue
                
                read_bp, ref_bp = read_seq[read_pos + j], ref_seq[left + j]
                assert left + j < len(mpileup)
                nt_set = mpileup[left + j][0]
                if len(nt_set) > 0 and read_bp not in nt_set:
                    read_bp = 'N' if len(nt_set) > 1 else nt_set[0]                    
                    read_seq = read_seq[:read_pos + j] + read_bp + read_seq[read_pos + j + 1:]
                    assert read_bp != ref_bp
                    new_cmp = ["mismatch", left + j, 1, "unknown"]
                    num_correction += 1
                    if read_bp != 'N':
                        var_idx = typing_common.lower_bound(Var_list, left + j)
                        while var_idx < len(Var_list):
                            var_pos, var_id = Var_list[var_idx]
                            if var_pos > left + j:
                                break
                            if var_pos == left + j:
                                var_type, _, var_data = Vars[var_id]
                                if var_type == "single" and read_bp == var_data:
                                    new_cmp[3] = var_id
                                    break                                                        
                            var_idx += 1
                    if j > last_j:
                        middle_cmp_list.append(["match", left + last_j, j- last_j])
                    middle_cmp_list.append(new_cmp)
                    last_j = j + 1
            if last_j < length:
                middle_cmp_list.append(["match", left + last_j, length - last_j])

            assert len(middle_cmp_list) > 0
            cmp_list = cmp_list[:i] + middle_cmp_list + cmp_list[i+1:]
            i += (len(middle_cmp_list) - 1)
        else:
            assert type == "mismatch"
            read_bp, ref_bp = read_seq[read_pos], ref_seq[left]
            assert left < len(mpileup)
            nt_set = mpileup[left][0]

            if debug:
                print >> sys.stderr, left, read_bp, ref_bp, mpileup[left]

            if len(nt_set) > 0 and read_bp not in nt_set:
                read_bp = 'N' if len(nt_set) > 1 else nt_set[0]
                read_seq = read_seq[:read_pos] + read_bp + read_seq[read_pos+1:]
                if read_bp == 'N':
                    cmp_list[i][3] = "unknown"
                elif read_bp == ref_bp:
                    cmp_list[i] = ["match", left, 1]
                    num_correction += 1
                else:
                    cmp_list[i][3] = "unknown"
                    var_idx = typing_common.lower_bound(Var_list, left)
                    while var_idx < len(Var_list):
                        var_pos, var_id = Var_list[var_idx]
                        if var_pos > left:
                            break
                        if var_pos == left:
                            var_type, _, var_data = Vars[var_id]
                            if var_type == "single" and read_bp == var_data:
                                cmp_list[i][3] = var_id
                                break                                                        
                        var_idx += 1

                if debug:
                    print >> sys.stderr, left, read_bp, ref_bp, mpileup[left]
                    print >> sys.stderr, cmp_list[i]

        read_pos += length
        i += 1

    # Combine matches
    i = 0
    while i < len(cmp_list):
        type, left, length = cmp_list[i][:3]
        if type == "match" and i + 1 < len(cmp_list):
            type2, left2, length2 = cmp_list[i+1][:3]
            if type2 == "match":
                cmp_list[i] = [type, left, length + length2]
                cmp_list = cmp_list[:i+1] + cmp_list[i+2:]
                continue
        i += 1

    if debug:
        print >> sys.stderr, cmp_list
        print >> sys.stderr, read_seq
                            
    return cmp_list, read_seq, num_correction


"""
"""
def typing(simulation,
           base_fname,
           locus_list,
           genotype_genome,
           partial,
           partial_alleles,
           refGenes,
           Genes,
           Gene_names,
           Gene_lengths,
           refGene_loci,
           Vars,
           Var_list,
           Links,
           aligners,
           num_editdist,
           assembly,
           output_base,
           error_correction,
           allow_discordant,
           display_alleles,
           fastq,
           read_fname,
           alignment_fname,
           num_frag_list,
           read_len,
           fragment_len,
           threads,
           best_alleles,
           verbose):
    if simulation:
        test_passed = {}
    report_file = open(output_base + ".report", 'w')
    for aligner, index_type in aligners:
        for f_ in [sys.stderr, report_file]:
            if index_type == "graph":
                print >> f_, "\n\t\t%s %s" % (aligner, index_type)
            else:
                print >> f_, "\n\t\t%s %s" % (aligner, index_type)

        remove_alignment_file = False
        if alignment_fname == "":
            # Align reads, and sort the alignments into a BAM file
            remove_alignment_file = True
            if simulation:
                alignment_fname = "%s_output.bam" % base_fname
            else:
                alignment_fname = read_fname[0].split('/')[-1]
                alignment_fname = alignment_fname.split('.')[0] + ".bam"
                
            typing_common.align_reads(aligner,
                                      simulation,
                                      genotype_genome if genotype_genome != "" else (base_fname + "." + index_type),
                                      index_type,
                                      base_fname,
                                      read_fname,
                                      fastq,
                                      threads,
                                      alignment_fname,
                                      verbose)
            
        for test_Gene_names in locus_list:
            if simulation:
                gene = test_Gene_names[0].split('*')[0]
            else:
                gene = test_Gene_names
            ref_allele = refGenes[gene]
            ref_seq = Genes[gene][ref_allele]
            ref_locus = refGene_loci[gene]
            ref_exons = ref_locus[-1]
            
            novel_var_count = 0        
            gene_vars, gene_var_list = deepcopy(Vars[gene]), deepcopy(Var_list[gene])
            cur_maxright = -1
            gene_var_maxrights = {}
            for var_pos, var_id in gene_var_list:
                var_type, var_pos, var_data = gene_vars[var_id]
                if var_type == "deletion":
                    var_pos = var_pos + int(var_data) - 1
                cur_maxright = max(cur_maxright, var_pos)
                gene_var_maxrights[var_id] = cur_maxright
                    
            var_count = {}
            def add_novel_var(gene_vars,
                              gene_var_list,
                              novel_var_count,
                              var_type,
                              var_pos,
                              var_data):
                var_idx = typing_common.lower_bound(gene_var_list, var_pos)
                while var_idx < len(gene_var_list):
                    pos_, id_ = gene_var_list[var_idx]
                    if pos_ > var_pos:
                        break
                    if pos_ == var_pos:
                        type_, _, data_ = gene_vars[id_]
                        assert type_ != var_type or data_ != var_data
                        if type_ != var_type:
                            if var_type == "insertion":
                                break
                            elif var_type == "single" and type_ == "deletion":
                                break
                        else:
                            if var_data < data_:
                                break
                    var_idx += 1
                var_id = "nv%d" % novel_var_count
                assert var_id not in gene_vars
                gene_vars[var_id] = [var_type, var_pos, var_data]
                gene_var_list.insert(var_idx, [var_pos, var_id])                
                return var_id, novel_var_count + 1

            if not os.path.exists(alignment_fname + ".bai"):
                os.system("samtools index %s" % alignment_fname)
            # Read alignments
            alignview_cmd = ["samtools",
                             "view",
                             alignment_fname]
            base_locus = 0
            if genotype_genome != "":
                _, chr, left, right = ref_locus[:4]
                alignview_cmd += ["%s:%d-%d" % (chr, left+1, right+1)]
                base_locus = left

            if index_type == "graph":
                alignview_cmd += [ref_allele]
                mpileup = typing_common.get_mpileup(alignview_cmd,
                                                    ref_seq,
                                                    base_locus,
                                                    gene_vars,
                                                    allow_discordant)

                if base_fname == "codis":
                    pair_interdist = typing_common.get_pair_interdist(alignview_cmd,
                                                                      simulation,
                                                                      verbose)
                else:
                    pair_interdist = None

                bamview_proc = subprocess.Popen(alignview_cmd,
                                                stdout=subprocess.PIPE,
                                                stderr=open("/dev/null", 'w'))

                sort_read_cmd = ["sort", "-k", "1,1", "-s"] # -s for stable sorting
                alignview_proc = subprocess.Popen(sort_read_cmd,
                                                  stdin=bamview_proc.stdout,
                                                  stdout=subprocess.PIPE,
                                                  stderr=open("/dev/null", 'w'))
            else:
                alignview_proc = subprocess.Popen(alignview_cmd,
                                             stdout=subprocess.PIPE,
                                             stderr=open("/dev/null", 'w'))

            # List of nodes that represent alleles
            allele_vars = {}
            for _, var_id in gene_var_list:
                allele_list = Links[var_id]
                for allele_id in allele_list:
                    if allele_id not in Genes[gene]:
                        continue
                    if allele_id not in allele_vars:
                        allele_vars[allele_id] = [var_id]
                    else:
                        allele_vars[allele_id].append(var_id)

            # Extract variants that are within exons
            exon_vars = get_exonic_vars(gene_vars, ref_exons)

            # Store nodes that represent alleles
            allele_nodes = {}
            def create_allele_node(allele_name):
                if allele_name in allele_nodes:
                    return allele_nodes[allele_name]
                if allele_name in allele_vars:
                    var_ids = allele_vars[allele_name]
                else:
                    var_ids = []
                seq = list(ref_seq)  # sequence that node represents
                var = ["" for i in range(len(ref_seq))]  # how sequence is related to backbone
                for var_id in var_ids:
                    assert var_id in gene_vars
                    var_type, var_pos, var_data = gene_vars[var_id]
                    assert var_pos >= 0 and var_pos < len(ref_seq)
                    if var_type == "single":
                        seq[var_pos] = var_data
                        var[var_pos] = var_id
                    elif var_type == "deletion":
                        del_len = int(var_data)
                        assert var_pos + del_len <= len(ref_seq)
                        seq[var_pos:var_pos + del_len] = ['D'] * del_len
                        var[var_pos:var_pos + del_len] = [var_id] * del_len
                    else:
                        # DK - to be implemented for insertions
                        assert var_type == "insertion"

                qual = ' ' * len(seq)
                allele_node = assembly_graph.Node(allele_name,
                                                  0,
                                                  seq,
                                                  qual,
                                                  var,
                                                  ref_seq,
                                                  gene_vars,
                                                  mpileup,
                                                  simulation)
                allele_nodes[allele_name] = allele_node
                return allele_node

            true_allele_nodes = {}
            if simulation:
                for allele_name in test_Gene_names:
                    true_allele_nodes[allele_name] = create_allele_node(allele_name)

            display_allele_nodes = {}
            for display_allele in display_alleles:
                display_allele_nodes[display_allele] = create_allele_node(display_allele)

            # Assembly graph
            asm_graph = assembly_graph.Graph(ref_seq,
                                             gene_vars,
                                             ref_exons,
                                             partial_alleles,
                                             true_allele_nodes,
                                             {}, # predicted_allele_nodes, which is empty for now
                                             display_allele_nodes,
                                             simulation)

            # Choose allele representives from those that share the same exonic sequences
            allele_reps, allele_rep_groups = get_rep_alleles(Links, exon_vars)
            allele_rep_set = set(allele_reps.values())

            # For checking alternative alignments near the ends of alignments
            Alts_left, Alts_right = typing_common.get_alternatives(ref_seq,
                                                                   allele_vars,
                                                                   gene_vars,
                                                                   gene_var_list,
                                                                   verbose >= 2)

            def haplotype_alts_list(haplotype_alts, left = True):
                haplotype_list = []
                for haplotype in haplotype_alts.keys():
                    if left:
                        pos = int(haplotype.split('-')[-1])
                    else:
                        pos = int(haplotype.split('-')[0])
                    haplotype_list.append([pos, haplotype])
                return sorted(haplotype_list, cmp = lambda a, b: a[0] - b[0])

            Alts_left_list, Alts_right_list = haplotype_alts_list(Alts_left, True), haplotype_alts_list(Alts_right, False)

            # Count alleles
            Gene_counts, Gene_cmpt = {}, {}
            Gene_gen_counts, Gene_gen_cmpt = {}, {}
            num_reads, num_pairs = 0, 0

            # For debugging purposes
            debug_allele_names = set(test_Gene_names) if simulation and verbose >= 2 else set()

            # Read information
            prev_read_id = None
            prev_right_pos = 0
            prev_lines = []
            left_read_ids, right_read_ids = set(), set()
            if index_type == "graph":
                # nodes for reads
                read_nodes = []
                read_vars_list = []

                # 
                def add_count(count_per_read, ht, add):
                    orig_ht = ht
                    ht = ht.split('-')

                    assert len(ht) >= 2
                    left, right = int(ht[0]), int(ht[-1])
                    assert left <= right

                    ht = ht[1:-1]
                    alleles = set(Genes[gene].keys()) - set([ref_allele])
                    for i in range(len(ht)):
                        var_id = ht[i]
                        if var_id.startswith("nv"):
                            continue
                        alleles &= set(Links[var_id])
                    ht = set(ht)

                    tmp_alleles = set()
                    var_idx = typing_common.lower_bound(gene_var_list, right + 1)
                    var_idx = min(var_idx, len(gene_var_list) - 1)
                    while var_idx >= 0:
                        _, var_id = gene_var_list[var_idx]
                        if var_id.startswith("nv") or var_id in ht:
                            var_idx -= 1
                            continue
                        if var_id in gene_var_maxrights and gene_var_maxrights[var_id] < left:
                            break
                        var_type, var_left, var_data = gene_vars[var_id]
                        var_right = var_left
                        if var_type == "deletion":
                            var_right = var_left + int(var_data) - 1
                        if (var_left >= left and var_left <= right) or \
                           (var_right >= left and var_right <= right):
                            tmp_alleles |= set(Links[var_id])
                        var_idx -= 1                        
                    alleles -= tmp_alleles
                    
                    for allele in alleles:
                        count_per_read[allele] += add

                    return len(alleles)

                # Identify best pairs
                def choose_pairs(left_positive_hts, right_positive_hts):
                    if len(left_positive_hts) > 0 and \
                       len(right_positive_hts) > 0 and \
                       max(len(left_positive_hts), len(right_positive_hts)) >= 2:
                        expected_inter_dist = pair_interdist
                        """
                        if simulation:
                            expected_inter_dist = fragment_len - read_len * 2
                        """
                            
                        best_diff = sys.maxint
                        picked = []                                
                        for left_ht_str in left_positive_hts:
                            left_ht = left_ht_str.split('-')
                            l_left, l_right = int(left_ht[0]), int(left_ht[-1])
                            for right_ht_str in right_positive_hts:
                                right_ht = right_ht_str.split('-')
                                r_left, r_right = int(right_ht[0]), int(right_ht[-1])
                                if l_right < r_right:
                                    inter_dist = r_left - l_right - 1
                                else:
                                    inter_dist = l_left - r_right - 1

                                cur_diff = abs(expected_inter_dist - inter_dist)
                                if best_diff > cur_diff:
                                    best_diff = cur_diff
                                    picked = [[left_ht_str, right_ht_str]]
                                elif best_diff == cur_diff:
                                    picked.append([left_ht_str, right_ht_str])

                        assert len(picked) > 0

                        left_positive_hts, right_positive_hts = set(), set()
                        for left_ht_str, right_ht_str in picked:
                            left_positive_hts.add(left_ht_str)
                            right_positive_hts.add(right_ht_str)

                    return left_positive_hts, right_positive_hts

                def get_exon_haplotypes(ht, exons):
                    if len(exons) <= 0:
                        return []
                    
                    debug_ht = deepcopy(ht)
                    ht = ht.split('-')
                    assert len(ht) >= 2
                    ht[0], ht[-1] = int(ht[0]), int(ht[-1])
                    exon_hts = []
                    for e_left, e_right in exons:
                        assert len(ht) >= 2
                        ht_left, ht_right = ht[0], ht[-1]
                        if e_left > ht_right or e_right < ht_left:
                            continue

                        new_ht = deepcopy(ht)
                        if ht_left < e_left:
                            split = False
                            for i in range(1, len(new_ht) - 1):
                                var_id = new_ht[i]
                                type, left, data = gene_vars[var_id]
                                if (type != "deletion" and left >= e_left) or \
                                   (type == "deletion" and left - 1 >= e_left):
                                    ht_left = e_left
                                    new_ht = [ht_left] + new_ht[i:]
                                    split = True
                                    break
                                if type == "deletion":
                                    right = left + int(data)
                                    if right >= e_left:
                                        ht_left = right
                                        new_ht = [right] + new_ht[i+1:]
                                        split = True
                                        break
                            if not split:
                                ht_left = e_left
                                new_ht = [ht_left, ht_right]
                        assert ht_left >= e_left
                        if ht_right > e_right:
                            split = False
                            for i in reversed(range(1, len(new_ht) - 1)):
                                var_id = new_ht[i]
                                type, right, data = gene_vars[var_id]
                                if type == "deletion":
                                    right = right + int(data) - 1
                                if (type != "deletion" and right <= e_right) or \
                                   (type == "deletion" and right + 1 <= e_right):
                                    ht_right = e_right
                                    new_ht = new_ht[:i+1] + [ht_right]
                                    split = True
                                    break
                                if type == "deletion":
                                    left = right - int(data)
                                    if left <= e_right:
                                        ht_right = left
                                        new_ht = new_ht[:i] + [ht_right]
                                        split = True
                                        break
                            if not split:
                                ht_right = e_right
                                new_ht = [ht_left, ht_right]

                        if len(new_ht) == 2:
                            new_ht = "%d-%d" % (new_ht[0], new_ht[-1])
                        else:
                            assert len(new_ht) > 2
                            new_ht = "%d-%s-%d" % (new_ht[0], '-'.join(new_ht[1:-1]), new_ht[-1])
                        assert ht_left <= ht_right
                        exon_hts.append(new_ht)

                    return exon_hts

                # Positive evidence for left and right reads
                left_positive_hts, right_positive_hts = set(), set()
                
                # Cigar regular expression
                cigar_re = re.compile('\d+\w')
                for line in alignview_proc.stdout:
                    line = line.strip()
                    cols = line.split()
                    read_id, flag, chr, pos, mapQ, cigar_str = cols[:6]
                    node_read_id = orig_read_id = read_id
                    if simulation:
                        read_id = read_id.split('|')[0]
                    read_seq, read_qual = cols[9], cols[10]
                    flag, pos = int(flag), int(pos)
                    pos -= (base_locus + 1)
                    if pos < 0:
                        continue

                    # Unalined?
                    if flag & 0x4 != 0:
                        if simulation and verbose >= 2:
                            print "Unaligned"
                            print "\t", line
                        continue

                    # Concordantly mapped?
                    if flag & 0x2 != 0:
                        concordant = True
                    else:
                        concordant = False

                    NM, Zs, MD, NH = "", "", "", ""
                    for i in range(11, len(cols)):
                        col = cols[i]
                        if col.startswith("Zs"):
                            Zs = col[5:]
                        elif col.startswith("MD"):
                            MD = col[5:]
                        elif col.startswith("NM"):
                            NM = int(col[5:])
                        elif col.startswith("NH"):
                            NH = int(col[5:])

                    if NM > num_editdist:
                        continue

                    # Only consider unique alignment
                    if NH > 1:
                        continue

                    # Concordantly aligned mate pairs
                    if not allow_discordant and not concordant:
                        continue

                    # Left read?
                    is_left_read = flag & 0x40 != 0
                    if is_left_read:
                        if read_id in left_read_ids:
                            continue
                        left_read_ids.add(read_id)
                        if not simulation:
                            node_read_id += '|L'
                    else: # Right read?
                        assert flag & 0x80 != 0
                        if read_id in right_read_ids:
                            continue
                        right_read_ids.add(read_id)
                        if not simulation:
                            node_read_id += '|R'

                    if Zs:
                        Zs = Zs.split(',')             

                    assert MD != ""
                    MD_str_pos, MD_len = 0, 0
                    Zs_pos, Zs_i = 0, 0
                    for _i in range(len(Zs)):
                        Zs[_i] = Zs[_i].split('|')
                        Zs[_i][0] = int(Zs[_i][0])
                    if Zs_i < len(Zs):
                        Zs_pos += Zs[Zs_i][0]
                    read_pos, left_pos = 0, pos
                    right_pos = left_pos
                    cigars = cigar_re.findall(cigar_str)
                    cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars]
                    cmp_list = []
                    num_error_correction = 0
                    likely_misalignment = False

                    # Extract variants w.r.t backbone from CIGAR string
                    softclip = [0, 0]
                    for i in range(len(cigars)):
                        cigar_op, length = cigars[i]
                        if cigar_op == 'M':
                            first = True
                            MD_len_used = 0
                            cmp_list_i = len(cmp_list)
                            while True:
                                if not first or MD_len == 0:
                                    if MD[MD_str_pos].isdigit():
                                        num = int(MD[MD_str_pos])
                                        MD_str_pos += 1
                                        while MD_str_pos < len(MD):
                                            if MD[MD_str_pos].isdigit():
                                                num = num * 10 + int(MD[MD_str_pos])
                                                MD_str_pos += 1
                                            else:
                                                break
                                        MD_len += num
                                # Insertion or full match followed
                                if MD_len >= length:
                                    MD_len -= length
                                    if length > MD_len_used:
                                        cmp_list.append(["match", right_pos + MD_len_used, length - MD_len_used])
                                    break
                                first = False
                                read_base = read_seq[read_pos + MD_len]
                                MD_ref_base = MD[MD_str_pos]
                                MD_str_pos += 1
                                assert MD_ref_base in "ACGT"
                                if MD_len > MD_len_used:
                                    cmp_list.append(["match", right_pos + MD_len_used, MD_len - MD_len_used])

                                _var_id = "unknown"
                                if read_pos + MD_len == Zs_pos and Zs_i < len(Zs):
                                    assert Zs[Zs_i][1] == 'S'
                                    _var_id = Zs[Zs_i][2]
                                    Zs_i += 1
                                    Zs_pos += 1
                                    if Zs_i < len(Zs):
                                        Zs_pos += Zs[Zs_i][0]
                                else:
                                    # Search for a known (yet not indexed) variant or a novel variant
                                    ref_pos = right_pos + MD_len
                                    var_idx = typing_common.lower_bound(gene_var_list, ref_pos)
                                    while var_idx < len(gene_var_list):
                                        var_pos, var_id = gene_var_list[var_idx]
                                        if var_pos > ref_pos:
                                            break
                                        if var_pos == ref_pos:
                                            var_type, _, var_data = gene_vars[var_id]
                                            if var_type == "single" and var_data == read_base:
                                                _var_id = var_id
                                                break
                                        var_idx += 1

                                cmp_list.append(["mismatch", right_pos + MD_len, 1, _var_id])
                                MD_len_used = MD_len + 1
                                MD_len += 1
                                # Full match
                                if MD_len == length:
                                    MD_len = 0
                                    break

                            # Correction for sequencing errors and update for cmp_list
                            if error_correction:
                                assert cmp_list_i < len(cmp_list)
                                new_cmp_list, read_seq, _num_error_correction = error_correct(ref_seq,
                                                                                              read_seq,
                                                                                              read_pos,
                                                                                              mpileup,
                                                                                              gene_vars,
                                                                                              gene_var_list,
                                                                                              cmp_list[cmp_list_i:],
                                                                                              node_read_id == "aHSQ1008:175:C0JVFACXX:5:1109:17665:21583|L")
                                cmp_list = cmp_list[:cmp_list_i] + new_cmp_list
                                num_error_correction += _num_error_correction

                        elif cigar_op == 'I':
                            _var_id = "unknown"
                            if read_pos == Zs_pos and Zs_i < len(Zs):
                                assert Zs[Zs_i][1] == 'I'
                                _var_id = Zs[Zs_i][2]
                                Zs_i += 1
                                if Zs_i < len(Zs):
                                    Zs_pos += Zs[Zs_i][0]
                            else:
                                # Search for a known (yet not indexed) variant or a novel variant
                                var_idx = typing_common.lower_bound(gene_var_list, right_pos)
                                while var_idx < len(gene_var_list):
                                    var_pos, var_id = gene_var_list[var_idx]
                                    if var_pos > right_pos:
                                        break
                                    if var_pos == right_pos:
                                        var_type, _, var_data = gene_vars[var_id]
                                        if var_type == "insertion" and len(var_data) == length:
                                            _var_id = var_id
                                            break
                                    var_idx += 1                            
                            cmp_list.append(["insertion", right_pos, length, _var_id])
                            if 'N' in read_seq[read_pos:read_pos+length]:
                                likely_misalignment = True
                                
                        elif cigar_op == 'D':
                            if MD[MD_str_pos] == '0':
                                MD_str_pos += 1
                            assert MD[MD_str_pos] == '^'
                            MD_str_pos += 1
                            while MD_str_pos < len(MD):
                                if not MD[MD_str_pos] in "ACGT":
                                    break
                                MD_str_pos += 1
                            _var_id = "unknown"
                            if read_pos == Zs_pos and \
                               Zs_i < len(Zs) and \
                               Zs[Zs_i][1] == 'D':
                                _var_id = Zs[Zs_i][2]
                                Zs_i += 1
                                if Zs_i < len(Zs):
                                    Zs_pos += Zs[Zs_i][0]
                            else:
                                # Search for a known (yet not indexed) variant or a novel variant
                                var_idx = typing_common.lower_bound(gene_var_list, right_pos)
                                while var_idx < len(gene_var_list):
                                    var_pos, var_id = gene_var_list[var_idx]
                                    if var_pos > right_pos:
                                        break
                                    if var_pos == right_pos:
                                        var_type, _, var_data = gene_vars[var_id]
                                        if var_type == "deletion" and int(var_data) == length:
                                            _var_id = var_id
                                            break
                                    var_idx += 1

                            cmp_list.append(["deletion", right_pos, length, _var_id])

                            # Check if this deletion is artificial alignment
                            if right_pos < len(mpileup):
                                del_count, nt_count = 0, 0
                                for nt, value in mpileup[right_pos][1].items():
                                    count = value[0]
                                    if nt == 'D':
                                        del_count += count
                                    else:
                                        nt_count += count

                                # DK - debugging purposes
                                if base_fname == "hla":
                                    if del_count * 6 < nt_count: # and nt_count >= 15:
                                        likely_misalignment = True
                            
                        elif cigar_op == 'S':
                            if i == 0:
                                softclip[0] = length
                                Zs_pos += length
                            else:
                                assert i + 1 == len(cigars)
                                softclip[1] = length
                        else:                    
                            assert cigar_op == 'N'
                            assert False
                            cmp_list.append(["intron", right_pos, length])

                        if cigar_op in "MND":
                            right_pos += length

                        if cigar_op in "MIS":
                            read_pos += length

                    # Remove softclip in cigar and modify read_seq and read_qual accordingly
                    if sum(softclip) > 0:
                        if softclip[0] > 0:
                            cigars = cigars[1:]
                            read_seq = read_seq[softclip[0]:]
                            read_qual = read_qual[softclip[0]:]
                        if softclip[1] > 0:
                            cigars = cigars[:-1]
                            read_seq = read_seq[:-softclip[1]]
                            read_qual = read_qual[:-softclip[1]]

                        cigar_str = ""
                        for type, length in cigars:
                            cigar_str += str(length)
                            cigar_str += type
                   
                    if right_pos > len(ref_seq):
                        continue

                    if num_error_correction > max(1, num_editdist):
                        continue
                        
                    if likely_misalignment:
                        continue

                    # Add novel variants
                    read_pos = 0
                    for cmp_i in range(len(cmp_list)):
                        type_, pos_, length_ = cmp_list[cmp_i][:3]
                        if type_ != "match":
                            var_id_ = cmp_list[cmp_i][3]
                            if var_id_ == "unknown":
                                add = True
                                if type_ == "mismatch":
                                    data_ = read_seq[read_pos]
                                    if data_ == 'N':
                                        add = False
                                elif type_ == "deletion":
                                    data_ = str(length_)
                                else:
                                    assert type_ == "insertion"
                                    data_ = read_seq[read_pos:read_pos + length_]
                                if add:
                                    var_id, novel_var_count = add_novel_var(gene_vars,
                                                                            gene_var_list,
                                                                            novel_var_count,
                                                                            type_ if type_ != "mismatch" else "single",
                                                                            pos_,
                                                                            data_)
                                    cmp_list[cmp_i][3] = var_id
                            if var_id not in var_count:
                                var_count[var_id] = 1
                            else:
                                var_count[var_id] += 1
                                
                        if type_ != "deletion":
                            read_pos += length_

                    # Count the number of reads aligned uniquely with some constraints
                    num_reads += 1

                    def add_stat(Gene_cmpt, Gene_counts, Gene_count_per_read, include_alleles = set()):
                        max_count = max(Gene_count_per_read.values())
                        cur_cmpt = set()
                        for allele, count in Gene_count_per_read.items():
                            if count < max_count:
                                continue
                            if len(include_alleles) > 0 and allele not in include_alleles:
                                continue
                            
                            cur_cmpt.add(allele)                    
                            if allele not in Gene_counts:
                                Gene_counts[allele] = 1
                            else:
                                Gene_counts[allele] += 1

                        if len(cur_cmpt) == 0:
                            return ""

                        if verbose >= 2:
                            alleles = ["", ""]
                            allele1_found, allele2_found = False, False
                            if alleles[0] != "":
                                for allele, count in Gene_count_per_read.items():
                                    if count < max_count:
                                        continue
                                    if allele == alleles[0]:
                                        allele1_found = True
                                    elif allele == alleles[1]:
                                        allele2_found = True
                                if allele1_found != allele2_found:
                                    print >> sys.stderr, alleles[0], Gene_count_per_read[alleles[0]]
                                    print >> sys.stderr, alleles[1], Gene_count_per_read[alleles[1]]
                                    if allele1_found:
                                        print >> sys.stderr, ("%s\tread_id %s - %d vs. %d]" % (alleles[0], prev_read_id, max_count, Gene_count_per_read[alleles[1]]))
                                    else:
                                        print >> sys.stderr, ("%s\tread_id %s - %d vs. %d]" % (alleles[1], prev_read_id, max_count, Gene_count_per_read[alleles[0]]))

                        cur_cmpt = sorted(list(cur_cmpt))
                        cur_cmpt = '-'.join(cur_cmpt)
                        if not cur_cmpt in Gene_cmpt:
                            Gene_cmpt[cur_cmpt] = 1
                        else:
                            Gene_cmpt[cur_cmpt] += 1

                        return cur_cmpt

                    if read_id != prev_read_id:
                        if prev_read_id != None:
                            num_pairs += 1
                            if base_fname == "codis" and gene == "D18S51":
                                left_positive_hts, right_positive_hts = choose_pairs(left_positive_hts, right_positive_hts)

                            for positive_ht in left_positive_hts | right_positive_hts:
                                exon_hts = get_exon_haplotypes(positive_ht, ref_exons)

                                if prev_read_id == "aHSQ1008:175:C0JVFACXX:5:1109:17665:21583":
                                    print "positive_ht:", positive_ht, "exon_hts:", exon_hts
                                    
                                for exon_ht in exon_hts:
                                    add_count(Gene_count_per_read, exon_ht, 1)
                                add_count(Gene_gen_count_per_read, positive_ht, 1)

                            # DK - debugging purposes
                            if prev_read_id.startswith("a30"):
                                print Gene_gen_count_per_read

                            # DK - debugging purposes
                            """
                            debug_allele_id = "A*02:406"
                            assert debug_allele_id in Gene_count_per_read
                            debug_max_read_count = max(Gene_count_per_read.values())
                            debug_read_count = Gene_count_per_read[debug_allele_id]
                            if debug_read_count == debug_max_read_count and \
                               Gene_count_per_read["A*11:01:01:01"] < debug_max_read_count and \
                               Gene_count_per_read["A*02:01:01:01"] < debug_max_read_count:
                                print prev_read_id
                                None
                            if prev_read_id == "HSQ1008:175:C0JVFACXX:5:1109:17665:21583":
                                for line in prev_lines:
                                    print line
                                print "left_positive_hts :", left_positive_hts
                                print "right_positive_hts:", right_positive_hts
                                print "exon:", debug_read_count, "max:", debug_max_read_count
                                print "gen:", Gene_gen_count_per_read[debug_allele_id], "max:", max(Gene_gen_count_per_read.values())

                                for allele_id, count in Gene_count_per_read.items():
                                    if count == debug_max_read_count:
                                        None
                                        # print "allele max:", allele_id, count
                                # sys.exit(1)
                                None
                            """
                                

                            cur_cmpt, cur_cmpt_gen = "", ""
                            if base_fname == "hla":
                                cur_cmpt = add_stat(Gene_cmpt, Gene_counts, Gene_count_per_read, allele_rep_set)
                                cur_cmpt_gen = add_stat(Gene_gen_cmpt, Gene_gen_counts, Gene_gen_count_per_read)
                            else:
                                cur_cmpt = add_stat(Gene_gen_cmpt, Gene_gen_counts, Gene_gen_count_per_read)
                            for read_id_, read_node in read_nodes:
                                asm_graph.add_node(read_id_,
                                                   read_node,
                                                   simulation)
                            read_nodes, read_var_list = [], []
                            if simulation and \
                               verbose >= 2 and \
                               base_fname in ["hla", "codis"]:
                                cur_cmpt = cur_cmpt.split('-') if cur_cmpt != "" else set()
                                cur_cmpt_gen = cur_cmpt_gen.split('-') if cur_cmpt_gen != "" else set()
                                show_debug = (partial and cur_cmpt != "" and not set(cur_cmpt) & set(test_Gene_names)) or \
                                              (not partial and cur_cmpt_gen != "" and not set(cur_cmpt_gen) & set(test_Gene_names))
                                              
                                if show_debug:
                                    print "%s are chosen instead of %s" % (cur_cmpt if partial else cur_cmpt_gen, '-'.join(test_Gene_names))
                                    for prev_line in prev_lines:
                                        print "\t", prev_line

                            prev_lines = []

                        left_positive_hts, right_positive_hts = set(), set()
                        
                        Gene_count_per_read, Gene_gen_count_per_read = {}, {}
                        for Gene_name in Gene_names[gene]:
                            if Gene_name.find("BACKBONE") != -1:
                                continue
                            Gene_count_per_read[Gene_name] = 0
                            Gene_gen_count_per_read[Gene_name] = 0

                    prev_lines.append(line)

                    # Remove mismatches due to unknown or novel variants
                    cmp_list2 = []
                    for cmp in cmp_list:
                        cmp = deepcopy(cmp)
                        type, pos, length = cmp[:3]
                        if type == "match":
                            if len(cmp_list2) > 0 and cmp_list2[-1][0] == "match":
                                cmp_list2[-1][2] += length
                            else:
                                cmp_list2.append(cmp)
                        elif type == "mismatch" and \
                             (cmp[3] == "unknown" or cmp[3].startswith("nv")):
                            if len(cmp_list2) > 0 and cmp_list2[-1][0] == "match":
                                cmp_list2[-1][2] += 1
                            else:
                                cmp_list2.append(["match", pos, 1])
                        else:
                            cmp_list2.append(cmp)
                    cmp_list_left, cmp_list_right, cmp_left_alts, cmp_right_alts = \
                    typing_common.identify_ambigious_diffs(ref_seq,
                                                           gene_vars,
                                                           Alts_left,
                                                           Alts_right,
                                                           Alts_left_list,
                                                           Alts_right_list,
                                                           cmp_list2,
                                                           verbose,
                                                           orig_read_id.startswith("a30|R"))  # debug?

                    mid_ht = []
                    for cmp in cmp_list2[cmp_list_left:cmp_list_right+1]:
                        type = cmp[0]
                        if type not in ["mismatch", "deletion", "insertion"]:
                            continue                            
                        var_id = cmp[3]
                        if var_id == "unknown" or var_id.startswith("nv"):
                            continue
                        mid_ht.append(var_id)

                    for l in range(len(cmp_left_alts)):
                        left_ht = cmp_left_alts[l].split('-')
                        left_ht += mid_ht
                        for r in range(len(cmp_right_alts)):
                            right_ht = cmp_right_alts[r].split('-')
                            ht = left_ht + right_ht
                            if len(ht) <= 0:
                                continue
                            ht_str = '-'.join(ht)
                            if is_left_read:
                                left_positive_hts.add(ht_str)
                            else:
                                right_positive_hts.add(ht_str)

                    # DK - debugging purposes
                    DK_debug = False
                    if orig_read_id.startswith("a30|R"):
                        DK_debug = True
                        print line
                        print cmp_list
                        print "positive hts:", left_positive_hts, right_positive_hts
                        print "cmp_list [%d, %d]" % (cmp_list_left, cmp_list_right)

                    # Node
                    read_node_pos, read_node_seq, read_node_qual, read_node_var = -1, [], [], []
                    read_vars = []
                    ref_pos, read_pos = left_pos, 0
                    cmp_i = 0
                    while cmp_i < len(cmp_list):
                        cmp = cmp_list[cmp_i]
                        type, length = cmp[0], cmp[2]
                        if type in ["match", "mismatch"]:
                            if read_node_pos < 0:
                                read_node_pos = ref_pos
                        if type == "match":
                            read_node_seq += list(read_seq[read_pos:read_pos+length])
                            read_node_qual += list(read_qual[read_pos:read_pos+length])
                            read_node_var += ([''] * length)
                            read_pos += length
                        elif type == "mismatch":
                            var_id = cmp[3]
                            read_base, qual = read_seq[read_pos], read_qual[read_pos]
                            read_node_seq += [read_base]
                            read_node_qual += [qual]
                            read_node_var.append(var_id)
                            read_pos += 1
                        elif type == "insertion":
                            var_id = cmp[3]
                            ins_len = length
                            ins_seq = read_seq[read_pos:read_pos+ins_len]
                            read_node_seq += ["I%s" % nt for nt in ins_seq]
                            read_node_qual += list(read_qual[read_pos:read_pos+ins_len])
                            read_node_var += ([var_id] * ins_len)                                        
                            read_pos += length
                        elif type == "deletion":
                            var_id = cmp[3]
                            del_len = length
                            read_node_seq += (['D'] * del_len)
                            read_node_qual += ([''] * del_len)
                            if len(read_node_seq) > len(read_node_var):
                                assert len(read_node_seq) == len(read_node_var) + del_len
                                read_node_var += ([var_id] * del_len)
                        else:
                            assert type == "intron"
                        cmp_i += 1

                    # Node
                    if assembly:
                        read_nodes.append([node_read_id,
                                           assembly_graph.Node(node_read_id,
                                                               read_node_pos,
                                                               read_node_seq,
                                                               read_node_qual,
                                                               read_node_var,
                                                               ref_seq,
                                                               gene_vars,
                                                               mpileup,
                                                               simulation)])

                    prev_read_id = read_id
                    prev_right_pos = right_pos

                if prev_read_id != None:
                    num_pairs += 1
                    if base_fname == "codis" and gene == "D18S51":
                        left_positive_hts, right_positive_hts = choose_pairs(left_positive_hts, right_positive_hts)                            
                    for positive_ht in left_positive_hts | right_positive_hts:
                        exon_hts = get_exon_haplotypes(positive_ht, ref_exons)
                        for exon_ht in exon_hts:
                            add_count(Gene_count_per_read, exon_ht, 1)
                        add_count(Gene_gen_count_per_read, positive_ht, 1)

                    if base_fname == "hla":
                        add_stat(Gene_cmpt, Gene_counts, Gene_count_per_read, allele_rep_set)
                    add_stat(Gene_gen_cmpt, Gene_gen_counts, Gene_gen_count_per_read)
                    for read_id_, read_node in read_nodes:
                        asm_graph.add_node(read_id_,
                                           read_node,
                                           simulation)
                    read_nodes, read_var_list = [], []

                if num_reads <= 0:
                    continue

                for f_ in [sys.stderr, report_file]:
                    print >> f_, "\t\t\t%d reads and %d pairs are aligned" % (num_reads, num_pairs)
                
            else:
                assert index_type == "linear"
                def add_alleles(alleles):
                    if not allele in Gene_counts:
                        Gene_counts[allele] = 1
                    else:
                        Gene_counts[allele] += 1

                    cur_cmpt = sorted(list(alleles))
                    cur_cmpt = '-'.join(cur_cmpt)
                    if not cur_cmpt in Gene_cmpt:
                        Gene_cmpt[cur_cmpt] = 1
                    else:
                        Gene_cmpt[cur_cmpt] += 1

                prev_read_id, prev_AS = None, None
                alleles = set()
                for line in alignview_proc.stdout:
                    cols = line[:-1].split()
                    read_id, flag, allele = cols[:3]
                    flag = int(flag)
                    if flag & 0x4 != 0:
                        continue
                    if not allele.startswith(gene):
                        continue
                    if allele.find("BACKBONE") != -1:
                        continue

                    AS = None
                    for i in range(11, len(cols)):
                        col = cols[i]
                        if col.startswith("AS"):
                            AS = int(col[5:])
                    assert AS != None
                    if read_id != prev_read_id:
                        if alleles:
                            if aligner == "hisat2" or \
                                    (aligner == "bowtie2" and len(alleles) < 10):
                                add_alleles(alleles)
                            alleles = set()
                        prev_AS = None
                    if prev_AS != None and AS < prev_AS:
                        continue
                    prev_read_id = read_id
                    prev_AS = AS
                    alleles.add(allele)

                if alleles:
                    add_alleles(alleles)

            if base_fname != "hla":
                Gene_cmpt, Gene_counts = Gene_gen_cmpt, Gene_gen_counts
                
            Gene_counts = [[allele, count] for allele, count in Gene_counts.items()]
            def Gene_count_cmp(a, b):
                if a[1] != b[1]:
                    return b[1] - a[1]
                assert a[0] != b[0]
                if a[0] < b[0]:
                    return -1
                else:
                    return 1
            Gene_counts = sorted(Gene_counts, cmp=Gene_count_cmp)
            for count_i in range(len(Gene_counts)):
                count = Gene_counts[count_i]
                if simulation:
                    found = False
                    for test_Gene_name in test_Gene_names:
                        if count[0] == test_Gene_name:
                            for f_ in [sys.stderr, report_file]:
                                print >> f_, "\t\t\t*** %d ranked %s (count: %d)" % (count_i + 1, test_Gene_name, count[1])
                            found = True
                    if count_i < 5 and not found:
                        for f_ in [sys.stderr, report_file]:
                            print >> f_, "\t\t\t\t%d %s (count: %d)" % (count_i + 1, count[0], count[1])
                else:
                    for f_ in [sys.stderr, report_file]:
                        print >> f_, "\t\t\t\t%d %s (count: %d)" % (count_i + 1, count[0], count[1])
                    if count_i >= 9:
                        break
            for f_ in [sys.stderr, report_file]:
                print >> f_

            # Calculate the abundance of representative alleles on exonic sequences
            if base_fname == "hla":
                # Incorporate non representative alleles (full length alleles)
                Gene_prob = typing_common.single_abundance(Gene_cmpt,
                                                           Gene_lengths[gene],
                                                           True) # exonic sequence

                gen_alleles = set()
                gen_prob_sum = 0.0
                for prob_i in range(len(Gene_prob)):
                    allele, prob = Gene_prob[prob_i][:2]
                    if prob_i >= 10 and prob < 0.03:
                        break
                    if allele in partial_alleles:
                        continue

                    gen_prob_sum += prob
                    gen_alleles |= set(allele_rep_groups[allele])

                if len(gen_alleles) > 0:
                    Gene_gen_cmpt2 = {}
                    for cmpt, value in Gene_gen_cmpt.items():
                        cmpt2 = []
                        for allele in cmpt.split('-'):
                            if allele in gen_alleles:
                                cmpt2.append(allele)
                        if len(cmpt2) == 0:
                            continue
                        cmpt2 = '-'.join(cmpt2)
                        if cmpt2 not in Gene_gen_cmpt2:
                            Gene_gen_cmpt2[cmpt2] = value
                        else:
                            Gene_gen_cmpt2[cmpt2] += value
                    Gene_gen_cmpt = Gene_gen_cmpt2
                    Gene_gen_prob = typing_common.single_abundance(Gene_gen_cmpt,
                                                                   Gene_lengths[gene],
                                                                   False) # whole gene sequence
                    
                    Gene_combined_prob = {}
                    for allele, prob in Gene_prob:
                        if allele not in gen_alleles:
                            Gene_combined_prob[allele] = prob
                    for allele, prob in Gene_gen_prob:
                        Gene_combined_prob[allele] = prob * gen_prob_sum
                    Gene_prob = [[allele, prob] for allele, prob in Gene_combined_prob.items()]
                    Gene_prob = sorted(Gene_prob, cmp=typing_common.Gene_prob_cmp)
            else:
                Gene_prob = typing_common.single_abundance(Gene_cmpt, Gene_lengths[gene])

            if index_type == "graph" and assembly:
                allele_node_order = []
                predicted_allele_nodes = {}
                for allele_name, prob in Gene_prob:
                    if prob < 0.1: # abundance of 10%
                        break
                    predicted_allele_nodes[allele_name] = create_allele_node(allele_name)
                    allele_node_order.append([allele_name, prob])
                    if len(predicted_allele_nodes) >= 2:
                        break
                asm_graph.predicted_allele_nodes = predicted_allele_nodes
                asm_graph.allele_node_order = allele_node_order

                # Start drawing assembly graph
                asm_graph.begin_draw(output_base)

                # Draw assembly graph
                begin_y = asm_graph.draw(0, "Initial graph")
                begin_y += 200
                
                # Apply De Bruijn graph
                asm_graph.guided_DeBruijn()

                # Draw assembly graph
                begin_y = asm_graph.draw(begin_y, "Asssembly")
                begin_y += 200

                # Draw assembly graph
                asm_graph.nodes = asm_graph.nodes2
                asm_graph.to_node, asm_graph.from_node = {}, {}
                begin_y = asm_graph.draw(begin_y, "Assembly with known alleles")

                # End drawing assembly graph
                asm_graph.end_draw()

                # Compare two alleles
                if simulation and len(test_Gene_names) == 2:
                    allele_name1, allele_name2 = test_Gene_names
                    print >> sys.stderr, allele_name1, "vs.", allele_name2
                    asm_graph.print_node_comparison(asm_graph.true_allele_nodes)

                def compare_alleles(vars1, vars2, print_output = True):
                    skip = True
                    var_i, var_j = 0, 0
                    exon_i = 0
                    mismatches = 0
                    while var_i < len(vars1) and var_j < len(vars2):
                        cmp_var_id, node_var_id = vars1[var_i], vars2[var_j]
                        cmp_var, node_var = gene_vars[cmp_var_id], gene_vars[node_var_id]

                        min_pos = min(cmp_var[1], node_var[1])
                        cmp_var_in_exon, node_var_in_exon = False, False
                        while exon_i < len(ref_exons):
                            exon_left, exon_right = ref_exons[exon_i]
                            if min_pos <= exon_right:
                                if cmp_var[1] >= exon_left and cmp_var[1] <= exon_right:
                                    cmp_var_in_exon = True
                                else:
                                    cmp_var_in_exon = False
                                if node_var[1] >= exon_left and node_var[1] <= exon_right:
                                    node_var_in_exon = True
                                else:
                                    node_var_in_exon = False                                
                                break
                            exon_i += 1
                        
                        if cmp_var_id == node_var_id:
                            skip = False
                            if print_output:
                                if cmp_var_in_exon:
                                    print >> sys.stderr, "\033[94mexon%d\033[00m" % (exon_i + 1),
                                print >> sys.stderr, cmp_var_id, cmp_var, "\t\t\t", mpileup[cmp_var[1]]
                            var_i += 1; var_j += 1
                            continue
                        if cmp_var[1] <= node_var[1]:
                            if not skip:
                                if (var_i > 0 and var_i + 1 < len(vars1)) or cmp_var[0] != "deletion":
                                    if print_output:
                                        if cmp_var_in_exon:
                                            for f_ in [sys.stderr, report_file]:
                                                print >> f_, "\033[94mexon%d\033[00m" % (exon_i + 1),
                                        for f_ in [sys.stderr, report_file]:
                                            print >> f_, "***", cmp_var_id, cmp_var, "==", "\t\t\t", mpileup[cmp_var[1]]
                                    mismatches += 1
                            var_i += 1
                        else:
                            if print_output:
                                if node_var_in_exon:
                                    for f_ in [sys.stderr, report_file]:
                                        print >> f_, "\033[94mexon%d\033[00m" % (exon_i + 1),
                                for f_ in [sys.stderr, report_file]:
                                    print >> f_, "*** ==", node_var_id, node_var, "\t\t\t", mpileup[node_var[1]]
                            mismatches += 1
                            var_j += 1
                            
                    return mismatches
                    
                tmp_nodes = asm_graph.nodes
                print >> sys.stderr, "Number of tmp nodes:", len(tmp_nodes)
                count = 0
                for id, node in tmp_nodes.items():
                    count += 1
                    if count > 10:
                        break
                    node_vars = node.get_var_ids()
                    node.print_info(); print >> sys.stderr
                    if node.id in asm_graph.to_node:
                        for id2, at in asm_graph.to_node[node.id]:
                            print >> sys.stderr, "\tat %d ==> %s" % (at, id2)

                    if simulation:
                        cmp_Gene_names = test_Gene_names
                    else:
                        cmp_Gene_names = [allele_name for allele_name, _ in allele_node_order]
                        
                    alleles, cmp_vars, max_common = [], [], -sys.maxint
                    for cmp_Gene_name in cmp_Gene_names:
                        tmp_vars = allele_nodes[cmp_Gene_name].get_var_ids(node.left, node.right)
                        tmp_common = len(set(node_vars) & set(tmp_vars))
                        tmp_common -= len(set(node_vars) | set(tmp_vars))
                        if max_common < tmp_common:
                            max_common = tmp_common
                            alleles = [[cmp_Gene_name, tmp_vars]]
                        elif max_common == tmp_common:
                            alleles.append([cmp_Gene_name, tmp_vars])

                    for allele_name, cmp_vars in alleles:
                        for f_ in [sys.stderr, report_file]:
                            print >> f_, "vs.", allele_name
                        compare_alleles(cmp_vars, node_vars)

                    print >> sys.stderr
                    print >> sys.stderr


            # Identify alleles that perfectly or closesly match assembled alleles
            for node_name, node in asm_graph.nodes.items():
                vars = set(node.get_var_ids())

                max_allele_names, max_common = [], -sys.maxint
                for allele_name, vars2 in allele_vars.items():
                    vars2 = set(vars2)
                    tmp_common = len(vars & vars2) - len(vars | vars2)
                    if tmp_common > max_common:
                        max_common = tmp_common
                        max_allele_names = [allele_name]                        
                    elif tmp_common == max_common:
                        max_allele_names.append(allele_name)

                for f_ in [sys.stderr, report_file]:
                    print >> f_, "Genomic:", node_name
                    node_vars = node.get_var_ids()
                    min_mismatches = sys.maxint
                    for max_allele_name in max_allele_names:
                        cmp_vars = allele_vars[max_allele_name]
                        cmp_vars = sorted(cmp_vars, cmp=lambda a, b: int(a[2:]) - int(b[2:]))
                        print_output = False
                        tmp_mismatches = compare_alleles(cmp_vars, node_vars, print_output)
                        print >> f_, "\t\t%s:" % max_allele_name, max_common, tmp_mismatches
                        if tmp_mismatches < min_mismatches:
                            min_mismatches = tmp_mismatches
                    if min_mismatches > 0:
                        print >> f_, "Novel allele"
                    else:
                        print >> f_, "Known allele"

            """
            allele_exon_vars = {}
            for allele_name, vars in allele_vars.items():
                allele_exon_vars[allele_name] = set(vars) & exon_vars

            for node_name, node in asm_graph.nodes.items():
                vars = []
                for left, right in ref_exons:
                    vars += node.get_var_ids(left, right)
                vars = set(vars) & exon_vars

                max_allele_names, max_common = [], -sys.maxint
                for allele_name, vars2 in allele_exon_vars.items():
                    tmp_common = len(vars & vars2) - len(vars | vars2)
                    if tmp_common > max_common:
                        max_common = tmp_common
                        max_allele_names = [allele_name]                        
                    elif tmp_common == max_common:
                        max_allele_names.append(allele_name)

                for f_ in [sys.stderr, report_file]:
                    print >> f_, "Exonic:", node_name
                    for max_allele_name in max_allele_names:
                        print >> f_, "\t\t%s:" % max_allele_name, max_common
            """
            
            success = [False for i in range(len(test_Gene_names))]
            found_list = [False for i in range(len(test_Gene_names))]
            for prob_i in range(len(Gene_prob)):
                prob = Gene_prob[prob_i]
                found = False
                _allele_rep = prob[0]
                """
                if partial and exonic_only:
                    _fields = _allele_rep.split(':')
                    if len(_fields) == 4:
                        _allele_rep = ':'.join(_fields[:-1])
                """

                if simulation:
                    for name_i in range(len(test_Gene_names)):
                        test_Gene_name = test_Gene_names[name_i]
                        if prob[0] == test_Gene_name:
                            rank_i = prob_i
                            while rank_i > 0:
                                if prob == Gene_prob[rank_i - 1][1]:
                                    rank_i -= 1
                                else:
                                    break
                            for f_ in [sys.stderr, report_file]:
                                print >> f_, "\t\t\t*** %d ranked %s (abundance: %.2f%%)" % (rank_i + 1, test_Gene_name, prob[1] * 100.0)
                            if rank_i < len(success):
                                success[rank_i] = True
                            found_list[name_i] = True
                            found = True
                    # DK - for debugging purposes
                    if not False in found_list and prob_i >= 10:
                        break
                if not found:
                    for f_ in [sys.stderr, report_file]:
                        print >> f_, "\t\t\t\t%d ranked %s (abundance: %.2f%%)" % (prob_i + 1, _allele_rep, prob[1] * 100.0)
                    if best_alleles and prob_i < 2:
                        for f_ in [sys.stderr, report_file]:
                            print >> f_, "SingleModel %s (abundance: %.2f%%)" % (_allele_rep, prob[1] * 100.0)
                if not simulation and prob_i >= 9:
                    break
                if prob_i >= 19:
                    break
            print >> sys.stderr

            if simulation and not False in success:
                aligner_type = "%s %s" % (aligner, index_type)
                if not aligner_type in test_passed:
                    test_passed[aligner_type] = 1
                else:
                    test_passed[aligner_type] += 1

        if remove_alignment_file and not simulation:
            os.system("rm %s*" % (alignment_fname))

    report_file.close()
    if simulation:
        return test_passed

    
"""
"""
def read_backbone_alleles(genotype_genome, refGene_loci, Genes):
    for gene_name in refGene_loci:
        allele_name, chr, left, right = refGene_loci[gene_name][:4]
        seq_extract_cmd = ["samtools",
                           "faidx",
                           "%s.fa" % genotype_genome,
                           "%s:%d-%d" % (chr, left+1, right+1)]

        length = right - left + 1
        proc = subprocess.Popen(seq_extract_cmd, stdout=subprocess.PIPE, stderr=open("/dev/null", 'w'))
        seq = ""
        for line in proc.stdout:
            line = line.strip()
            if line.startswith('>'):
                continue
            seq += line
        assert len(seq) == length
        assert gene_name not in Genes
        Genes[gene_name] = {}
        Genes[gene_name][allele_name] = seq

        
"""
"""
def read_Gene_alleles_from_vars(Vars, Var_list, Links, Genes):
    for gene_name in Genes:
        # Assert there is only one allele per gene, which is a backbone allele
        assert len(Genes[gene_name]) == 1
        backbone_allele_name, backbone_seq = Genes[gene_name].items()[0]
        gene_vars, gene_var_list = Vars[gene_name], Var_list[gene_name]
        allele_vars = {}
        for _, var_id in gene_var_list:
            for allele_name in Links[var_id]:
                if allele_name not in allele_vars:
                    allele_vars[allele_name] = []
                allele_vars[allele_name].append(var_id)

        for allele_name, vars in allele_vars.items():
            seq = ""
            prev_pos = 0
            for var_id in vars:
                type, pos, data = gene_vars[var_id]
                assert prev_pos <= pos
                if pos > prev_pos:
                    seq += backbone_seq[prev_pos:pos]
                if type == "single":
                    prev_pos = pos + 1
                    seq += data
                elif type == "deletion":
                    prev_pos = pos + int(data)
                else:
                    assert type == "insertion"
                    seq += data
                    prev_pos = pos
            if prev_pos < len(backbone_seq):
                seq += backbone_seq[prev_pos:]
            Genes[gene_name][allele_name] = seq
            
    
"""
"""
def read_Gene_alleles(fname, Genes):
    for line in open(fname):
        if line.startswith(">"):
            allele_name = line.strip().split()[0][1:]
            gene_name = allele_name.split('*')[0]
            if not gene_name in Genes:
                Genes[gene_name] = {}
            if not allele_name in Genes[gene_name]:
                Genes[gene_name][allele_name] = ""
        else:
            Genes[gene_name][allele_name] += line.strip()
    return Genes


"""
"""
def read_Gene_vars(fname):
    Vars, Var_list = {}, {}
    for line in open(fname):
        var_id, var_type, allele, pos, data = line.strip().split('\t')
        pos = int(pos)
        gene = allele.split('*')[0]
        if not gene in Vars:
            Vars[gene] = {}
            assert not gene in Var_list
            Var_list[gene] = []
            
        assert not var_id in Vars[gene]
        Vars[gene][var_id] = [var_type, pos, data]
        Var_list[gene].append([pos, var_id])
        
    for gene, in_var_list in Var_list.items():
        Var_list[gene] = sorted(in_var_list)

    return Vars, Var_list


"""
"""
def read_Gene_vars_genotype_genome(fname, refGene_loci):
    loci = {}
    for gene, values in refGene_loci.items():
        allele_name, chr, left, right = values[:4]
        if chr not in loci:
            loci[chr] = []
        loci[chr].append([allele_name, left, right])
        
    Vars, Var_list = {}, {}
    for line in open(fname):
        var_id, var_type, var_chr, pos, data = line.strip().split('\t')
        if var_chr not in loci:
            continue
        pos = int(pos)
        found = False
        for allele_name, left, right in loci[var_chr]:
            if pos >= left and pos <= right:
                found = True
                break
        if not found:
            continue
        
        gene = allele_name.split('*')[0]
        if not gene in Vars:
            Vars[gene] = {}
            assert not gene in Var_list
            Var_list[gene] = []
            
        assert not var_id in Vars[gene]
        Vars[gene][var_id] = [var_type, pos - left, data]
        Var_list[gene].append([pos - left, var_id])
        
    for gene, in_var_list in Var_list.items():
        Var_list[gene] = sorted(in_var_list)

    return Vars, Var_list


"""
"""
def read_Gene_links(fname):
    Links = {}
    for line in open(fname):
        var_id, alleles = line.strip().split('\t')
        alleles = alleles.split()
        assert not var_id in Links
        Links[var_id] = alleles

    return Links


"""
"""
def genotyping_locus(base_fname,
                     locus_list,
                     genotype_genome,
                     only_locus_list,
                     partial,
                     aligners,
                     read_fname,
                     fastq,
                     alignment_fname,
                     threads,
                     simulate_interval,
                     read_len,
                     fragment_len,
                     best_alleles,
                     num_editdist,
                     perbase_errorrate,
                     perbase_snprate,
                     skip_fragment_regions,
                     assembly,
                     output_base,
                     error_correction,
                     discordant,
                     display_alleles,
                     verbose,
                     debug_instr):
    if not os.path.exists("hisatgenotype_db"):
        typing_common.clone_hisatgenotype_database()

    simulation = (read_fname == [] and alignment_fname == "")

    # Download human genome and HISAT2 index
    HISAT2_fnames = ["grch38",
                     "genome.fa",
                     "genome.fa.fai"]

    if not typing_common.check_files(HISAT2_fnames):
        typing_common.download_genome_and_index()

    # Check if the pre-existing files (hla*) are compatible with the current parameter setting
    if genotype_genome != "":
        if os.path.exists("%s.locus" % base_fname):
            left = 0
            Gene_genes = []
            BACKBONE = False
            for line in open("%s.locus" % base_fname):
                Gene_name = line.strip().split()[0]
                if Gene_name.find("BACKBONE") != -1:
                    BACKBONE = True
                Gene_gene = Gene_name.split('*')[0]
                Gene_genes.append(Gene_gene)
            delete_hla_files = False
            if not BACKBONE:
                delete_hla_files = True
            if len(locus_list) == 0:
                locus_list = Gene_genes
            if not set(locus_list).issubset(set(Gene_genes)):
                delete_hla_files = True
            if delete_hla_files:
                os.system("rm %s*" % base_fname)

    # Extract variants, backbone sequence, and other sequeces  
    if genotype_genome != "":
        genome_fnames = [genotype_genome + ".fa",
                         genotype_genome + ".fa.fai",
                         genotype_genome + ".locus",
                         genotype_genome + ".snp",
                         genotype_genome + ".index.snp",
                         genotype_genome + ".haplotype",
                         genotype_genome + ".link",
                         genotype_genome + ".clnsig",
                         genotype_genome + ".coord",
                         genotype_genome + ".partial"]
        for i in range(8):
            genome_fnames.append(genotype_genome + ".%d.ht2" % (i+1))

        if not typing_common.check_files(genome_fnames):
            print >> sys.stderr, "Error: some of the following files are not available:", ' '.join(genome_fnames)
            sys.exit(1)
    else:
        typing_common.extract_database_if_not_exists(base_fname,
                                                     only_locus_list,
                                                     30,              # inter_gap
                                                     50,              # intra_gap
                                                     partial,
                                                     verbose >= 1)        
        for aligner, index_type in aligners:
            typing_common.build_index_if_not_exists(base_fname,
                                                    aligner,
                                                    index_type,
                                                    threads,
                                                    verbose >= 1)

    # Read partial alleles
    partial_alleles = set()
    if genotype_genome != "":
        for line in open("%s.partial" % genotype_genome):
            family, allele_name = line.strip().split('\t')
            if family == base_fname:
                partial_alleles.add(allele_name)

    else:
        for line in open("%s.partial" % base_fname):
            partial_alleles.add(line.strip())

    # Read alleles (names and sequences)
    refGenes, refGene_loci = {}, {}
    for line in open("%s.locus" % (genotype_genome if genotype_genome != "" else base_fname)):
        fields = line.strip().split()
        if genotype_genome != "" and base_fname != fields[0].lower():
            continue
        if genotype_genome != "":
            _, Gene_name, chr, left, right, exon_str, strand = fields
        else:
            Gene_name, chr, left, right, _, exon_str, strand = fields
        Gene_gene = Gene_name.split('*')[0]
        assert not Gene_gene in refGenes
        refGenes[Gene_gene] = Gene_name
        left, right = int(left), int(right)
        exons = []
        for exon in exon_str.split(','):
            exon_left, exon_right = exon.split('-')
            exons.append([int(exon_left), int(exon_right)])
        refGene_loci[Gene_gene] = [Gene_name, chr, left, right, exons]
    Genes = {}
    if len(locus_list) == 0:
        locus_list = refGene_loci.keys()

    # Read HLA variants, and link information
    if genotype_genome:
        Vars, Var_list = read_Gene_vars_genotype_genome("%s.snp" % genotype_genome, refGene_loci)
        Links = read_Gene_links("%s.link" % genotype_genome)
    else:
        Vars, Var_list = read_Gene_vars("%s.snp" % base_fname)
        Links = read_Gene_links("%s.link" % base_fname)

    # Read allele sequences
    if genotype_genome != "":
        read_backbone_alleles(genotype_genome, refGene_loci, Genes)
        read_Gene_alleles_from_vars(Vars, Var_list, Links, Genes)        
    else:
        read_Gene_alleles(base_fname + "_backbone.fa", Genes)
        read_Gene_alleles_from_vars(Vars, Var_list, Links, Genes)

    # Sanity Check
    if os.path.exists(base_fname + "_backbone.fa") and \
       os.path.exists(base_fname + "_sequences.fa"):
        Genes2 = {}
        read_Gene_alleles(base_fname + "_backbone.fa", Genes2)
        read_Gene_alleles(base_fname + "_sequences.fa", Genes2)
        for gene_name, alleles in Genes.items():
            assert gene_name in Genes2
            for allele_name, allele_seq in alleles.items():
                assert allele_name in Genes2[gene_name]
                allele_seq2 = Genes2[gene_name][allele_name]
                assert allele_seq == allele_seq2

    # HLA gene alleles
    Gene_names = {}
    for Gene_gene, data in Genes.items():
        Gene_names[Gene_gene] = list(data.keys())

    # HLA gene allele lengths
    Gene_lengths = {}
    for Gene_gene, Gene_alleles in Genes.items():
        Gene_lengths[Gene_gene] = {}
        for allele_name, seq in Gene_alleles.items():
            Gene_lengths[Gene_gene][allele_name] = len(seq)

    # Test HLA typing
    test_list = []
    if simulation:
        basic_test, pair_test = True, False
        if debug_instr and "pair" in debug_instr:
            basic_test, pair_test = False, True

        test_passed = {}
        test_list = []
        genes = list(set(locus_list) & set(Gene_names.keys()))
        if basic_test:
            for gene in genes:
                Gene_gene_alleles = Gene_names[gene]
                for allele in Gene_gene_alleles:
                    if allele.find("BACKBONE") != -1:
                        continue
                    test_list.append([[allele]])
                random.shuffle(test_list)
        if pair_test:
            test_size = 200
            allele_count = 2
            for test_i in range(test_size):
                test_pairs = []
                for gene in genes:
                    Gene_gene_alleles = []

                    for allele in Gene_names[gene]:
                        if allele.find("BACKBONE") != -1:
                            continue

                        if "full" in debug:
                            if allele in partial_alleles:
                                continue

                        Gene_gene_alleles.append(allele)
                    nums = [i for i in range(len(Gene_gene_alleles))]
                    random.shuffle(nums)
                    test_pairs.append(sorted([Gene_gene_alleles[nums[i]] for i in range(allele_count)]))
                test_list.append(test_pairs)

        if "test_list" in debug_instr:
            test_list = [[debug_instr["test_list"].split('-')]]
            
        for test_i in range(len(test_list)):
            if "test_id" in debug_instr:
                test_ids = debug_instr["test_id"].split('-')
                if str(test_i + 1) not in test_ids:
                    continue

            print >> sys.stderr, "Test %d" % (test_i + 1), str(datetime.now())
            test_locus_list = test_list[test_i]
            num_frag_list = typing_common.simulate_reads(Genes,
                                                         base_fname,
                                                         test_locus_list,
                                                         Vars,
                                                         Links,
                                                         simulate_interval,
                                                         read_len,
                                                         fragment_len,
                                                         perbase_errorrate,
                                                         perbase_snprate,
                                                         skip_fragment_regions)

            assert len(num_frag_list) == len(test_locus_list)
            for i_ in range(len(test_locus_list)):
                test_Gene_names = test_locus_list[i_]
                num_frag_list_i = num_frag_list[i_]
                assert len(num_frag_list_i) == len(test_Gene_names)
                for j_ in range(len(test_Gene_names)):
                    test_Gene_name = test_Gene_names[j_]
                    gene = test_Gene_name.split('*')[0]
                    test_Gene_seq = Genes[gene][test_Gene_name]
                    seq_type = "partial" if test_Gene_name in partial_alleles else "full"
                    print >> sys.stderr, "\t%s - %d bp (%s sequence, %d pairs)" % (test_Gene_name, len(test_Gene_seq), seq_type, num_frag_list_i[j_])

            if "single-end" in debug_instr:
                read_fname = ["%s_input_1.fa" % base_fname]
            else:
                read_fname = ["%s_input_1.fa" % base_fname, "%s_input_2.fa" % base_fname]

            fastq = False
            tmp_test_passed = typing(simulation,
                                     base_fname,
                                     test_locus_list,
                                     genotype_genome,
                                     partial,
                                     partial_alleles,
                                     refGenes,
                                     Genes,                       
                                     Gene_names,
                                     Gene_lengths,
                                     refGene_loci,
                                     Vars,
                                     Var_list,
                                     Links,
                                     aligners,
                                     num_editdist,
                                     assembly,
                                     output_base,
                                     error_correction,
                                     discordant,
                                     display_alleles,
                                     fastq,
                                     read_fname,
                                     alignment_fname,
                                     num_frag_list,
                                     read_len,
                                     fragment_len,
                                     threads,
                                     best_alleles,
                                     verbose)

            for aligner_type, passed in tmp_test_passed.items():
                if aligner_type in test_passed:
                    test_passed[aligner_type] += passed
                else:
                    test_passed[aligner_type] = passed

                print >> sys.stderr, "\t\tPassed so far: %d/%d (%.2f%%)" % (test_passed[aligner_type], test_i + 1, (test_passed[aligner_type] * 100.0 / (test_i + 1)))


        for aligner_type, passed in test_passed.items():
            print >> sys.stderr, "%s:\t%d/%d passed (%.2f%%)" % (aligner_type, passed, len(test_list), passed * 100.0 / len(test_list))
    
    else: # With real reads or BAMs
        print >> sys.stderr, "\t", ' '.join(locus_list)
        typing(simulation,
               base_fname,
               locus_list,
               genotype_genome,
               partial,
               partial_alleles,
               refGenes,
               Genes,                       
               Gene_names,
               Gene_lengths,
               refGene_loci,
               Vars,
               Var_list,
               Links,
               aligners,
               num_editdist,
               assembly,
               output_base,
               error_correction,
               discordant,
               display_alleles,
               fastq,
               read_fname,
               alignment_fname,
               [],
               read_len,
               fragment_len,
               threads,
               best_alleles,
               verbose)


"""
"""
if __name__ == '__main__':
    parser = ArgumentParser(
        description='hisatgenotype_locus')
    parser.add_argument("--base", "--base-fname",
                        dest="base_fname",
                        type=str,
                        default="hla",
                        help="base filename for backbone sequence, variants, and linking info (default: hla)")
    parser.add_argument("--locus-list",
                        dest="locus_list",
                        type=str,
                        default="",
                        help="A comma-separated list of genes (default: empty, all genes)")
    parser.add_argument("--genotype-genome",
                        dest="genotype_genome",
                        type=str,
                        default="",
                        help="Base name for genotype genome, which the program will use instead of region-based small indexes (default: empty)")
    parser.add_argument("-f", "--fasta",
                        dest='fastq',
                        action='store_false',
                        help='FASTA format')
    parser.add_argument("-U",
                        dest="read_fname_U",
                        type=str,
                        default="",
                        help="filename for single-end reads")
    parser.add_argument("-1",
                        dest="read_fname_1",
                        type=str,
                        default="",
                        help="filename for paired-end reads")
    parser.add_argument("-2",
                        dest="read_fname_2",
                        type=str,
                        default="",
                        help="filename for paired-end reads")    
    parser.add_argument("--alignment",
                        dest="alignment_fname",
                        type=str,
                        default="",
                        help="BAM file name")
    parser.add_argument("-p", "--threads",
                        dest="threads",
                        type=int,
                        default=1,
                        help="Number of threads")
    parser.add_argument('--no-partial',
                        dest='partial',
                        action='store_false',
                        help='Include partial alleles (e.g. A_nuc.fasta)')
    parser.add_argument("--aligner-list",
                        dest="aligners",
                        type=str,
                        default="hisat2.graph",
                        help="A comma-separated list of aligners such as hisat2.graph,hisat2.linear,bowtie2.linear (default: hisat2.graph)")
    parser.add_argument("--simulate-interval",
                        dest="simulate_interval",
                        type=int,
                        default=10,
                        help="Reads simulated at every these base pairs (default: 10)")
    parser.add_argument("--read-len",
                        dest="read_len",
                        type=int,
                        default=100,
                        help="Length of simulated reads (default: 100)")
    parser.add_argument("--fragment-len",
                        dest="fragment_len",
                        type=int,
                        default=350,
                        help="Length of fragments (default: 350)")
    parser.add_argument("--best-alleles",
                        dest="best_alleles",
                        action='store_true',
                        help="")
    parser.add_argument("--random-seed",
                        dest="random_seed",
                        type=int,
                        default=1,
                        help="A seeding number for randomness (default: 1)")
    parser.add_argument("--num-editdist",
                        dest="num_editdist",
                        type=int,
                        default=2,
                        help="Maximum number of mismatches per read alignment to be considered (default: 2)")
    parser.add_argument("--perbase-errorrate",
                        dest="perbase_errorrate",
                        type=float,
                        default=0.0,
                        help="Per basepair error rate in percentage when simulating reads (default: 0.0)")
    parser.add_argument("--perbase-snprate",
                        dest="perbase_snprate",
                        type=float,
                        default=0.0,
                        help="Per basepair SNP rate in percentage when simulating reads (default: 0.0)")
    parser.add_argument("--skip-fragment-regions",
                        dest="skip_fragment_regions",
                        type=str,
                        default="",
                        help="A comma-separated list of regions from which no reads originate, e.g., 500-600,1200-1400 (default: None).")
    parser.add_argument('-v', '--verbose',
                        dest='verbose',
                        action='store_true',
                        help='also print some statistics to stderr')
    parser.add_argument('--verbose-level',
                        dest='verbose_level',
                        type=int,
                        default=0,
                        help='also print some statistics to stderr (default: 0)')
    parser.add_argument("--debug",
                        dest="debug",
                        type=str,
                        default="",
                        help="e.g., test_id:10,read_id:10000,basic_test")
    parser.add_argument("--output-base", "--assembly-base",
                        dest="output_base",
                        type=str,
                        default="assembly_graph",
                        help="base file name (default: assembly_graph)")
    parser.add_argument("--assembly",
                        dest="assembly",
                        action="store_true",
                        help="Perform assembly")
    parser.add_argument("--no-error-correction",
                        dest="error_correction",
                        action="store_false",
                        help="Correct sequencing errors")
    parser.add_argument("--only-locus-list",
                        dest="only_locus_list",
                        type=str,
                        default="",
                        help="A comma-separated list of genes (default: empty, all genes)")
    parser.add_argument("--discordant",
                        dest="discordant",
                        action="store_true",
                        help="Allow discordantly mapped pairs or singletons")
    parser.add_argument("--display-alleles",
                        dest="display_alleles",
                        type=str,
                        default="",
                        help="A comma-separated list of alleles to display in HTML (default: empty)")

    args = parser.parse_args()
    if args.locus_list == "":
        locus_list = []
    else:
        locus_list = args.locus_list.split(',')
    if args.only_locus_list == "":
        only_locus_list = []
    else:
        locus_list = only_locus_list = args.only_locus_list.split(',')    
    if args.aligners == "":
        print >> sys.stderr, "Error: --aligners must be non-empty."
        sys.exit(1)    
    args.aligners = args.aligners.split(',')
    for i in range(len(args.aligners)):
        args.aligners[i] = args.aligners[i].split('.')
    if args.read_fname_U != "":
        args.read_fname = [args.read_fname_U]
    elif args.read_fname_1 != "" or args.read_fname_2 != "":
        if args.read_fname_1 == "" or args.read_fname_2 == "":
            print >> sys.stderr, "Error: please specify both -1 and -2."
            sys.exit(1)
        args.read_fname = [args.read_fname_1, args.read_fname_2]
    else:
        args.read_fname = []
    if args.alignment_fname != "" and \
            not os.path.exists(args.alignment_fname):
        print >> sys.stderr, "Error: %s doesn't exist." % args.alignment_fname
        sys.exit(1)

    if args.verbose and args.verbose_level == 0:
        args.verbose_level = 1
        
    debug = {}
    if args.debug != "":
        for item in args.debug.split(','):
            if ':' in item:
                fields = item.split(':')
                assert len(fields) >= 2
                key, value = fields[0], ':'.join(fields[1:])
                debug[key] = value
            else:
                debug[item] = 1

    if not args.partial:
        print >> sys.stderr, "Warning: --no-partial should be used for debugging purpose only."

    if args.read_len * 2 > args.fragment_len:
        print >> sys.stderr, "Warning: fragment might be too short (%d)" % (args.fragment_len)

    skip_fragment_regions = []
    if args.skip_fragment_regions != "":
        prev_left, prev_right = -1, -1
        for region in args.skip_fragment_regions.split(','):
            left, right = region.split('-')
            left, right = int(left), int(right)
            assert left < right
            assert prev_right < left
            prev_left, prev_right = left, right
            skip_fragment_regions.append([left, right])

    if args.display_alleles == "":
        display_alleles = []
    else:
        display_alleles = args.display_alleles.split(',')

    random.seed(args.random_seed)
    genotyping_locus(args.base_fname,
                     locus_list,
                     args.genotype_genome,
                     only_locus_list,
                     args.partial,
                     args.aligners,
                     args.read_fname,
                     args.fastq,
                     args.alignment_fname,
                     args.threads,
                     args.simulate_interval,
                     args.read_len,
                     args.fragment_len,
                     args.best_alleles,
                     args.num_editdist,
                     args.perbase_errorrate,
                     args.perbase_snprate,
                     skip_fragment_regions,
                     args.assembly,
                     args.output_base,
                     args.error_correction,
                     args.discordant,
                     display_alleles,
                     args.verbose_level,
                     debug)

