#!/usr/bin/env python3

'''
bl_hisat2.py - Given a series of paired-end read files, run hisat2 for each pair of files, and
    generate sorted BAM files for each pair.

Synopsis: bl_hisat2.py tsvfile threads ifile outdir [hisat2 options]

    For each pair of read files (ie. left and right reads) in tsvfile, create a name for the BAM 
    file to be generated from each pair of read files. The name of the BAM file is the longest
    leading string in common between the two filenames, followed by the .sam extension. Thus,

    control1_R1.fastq.gz<TAB>control1_R2.fastq.gz

    would output to 

    control1_R.bam

    tsvfile - a tab-separated value file with each pair of filenames on separate lines
        MUST be the first argument. All hisat2 arguments follow.

    ifile - a genome index file produced by hisat2-build with an extension in the form .X.ht2, where X
        X is a digit from 1-9. This will be used to create the hisat2 option -x <ht2-idx> as described
        in the hisat2 documentation.

    outdir - director for writing .bam files

    The workflow runs hisat2 first to generate temporary .sam files, and then runs samtools sort 
    to sort the files, which is required by stringtie or cufflinks.

    [hisat2 options] - options to be passed to hisat2

@modified: June 1, 2021
@author: Brian Fristensky
@contact: Brian.Fristensky@umanitoba.ca  
'''

"""
optparse is deprecated in favor of argparse as of Python 2.7. However,
 since 2.7 is not always present on many systems, at this writing,
 it is safer to stick with optparse for now. It should be easy
 to change later, since the syntax is very similar between argparse and optparse.
 from optparse import OptionParser
"""
from optparse import OptionParser

import os
import re
import stat
import subprocess
import sys

PROGRAM = "bl_hisat2.py : "
USAGE = "\n\tUSAGE: bl_hisat2.py tsvfile threads ifile outdir [hisat2 options] "

DEBUG = False
if DEBUG :
    print('bl_hisat2: Debugging mode on')

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class Parameters:
    """
      	Wrapper class for command line parameters
      	"""
    def __init__(self):
        """
     	  Initializes arguments:
                TSVFILE = ""
                THREADS = "2"
                IFILE = ""
                OUTDIR = "bamfiles"
                hisat2args = []

     	  Then calls read_args() to fill in their values from command line
          """
        self.TSVFILE = "" 
        self.THREADS = "2"
        self.IFILE = ""
        self.OUTDIR = "bamfiles"
        self.hisat2args = []             
        self.read_args()


        if DEBUG :
            print('------------ Parameters from command line ------') 
            print('    TSVFILE: ' + self.TSVFILE)
            print('    THREADS: ' + str(self.THREADS))
            print('    IFILE: ' + self.IFILE)
            print('    OUTDIR: ' + self.OUTDIR)
            print('    hisat2args: ' + str(self.hisat2args))
            print()  

    def read_args(self):
        """
        	Read command line arguments into a Parameter object
    	"""                  
        self.TSVFILE = sys.argv[1]
        self.THREADS = sys.argv[2]
        self.IFILE = sys.argv[3]
        self.OUTDIR = sys.argv[4]
        self.hisat2args = sys.argv[5:]

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class TSVFiles :
    """
    Methods for reading lists of paired read TSV files, and for
    writing lists to output.
    """
    def __init__(self):
        """
     	  Initializes arguments:
                READPAIRS = []

          """
        self.READPAIRS = []            

    def ReadTSVfile(self,FN) :
        """
        TSV file containing names of paired-end and/or single end read files.
        Paired-end files are on lines such as

        leftreadfile.fq.gz<TAB>rightreadfile.fq.gz

        Single-end files have a each file on a separate line

        reads1.fq.gz
        reads2.fq.gz
        reads3.fq.gz
        """
        TAB = '\t'
        F = open(FN,"r")
        for line in F.readlines() :
            line = line.strip()
            if len(line) > 0 and not line.startswith('#') :
                # get rid of double quotes that enclose fields when some programs write
                # output, and then split by TABs.
                tokens = line.replace('"','').split(TAB)

                # ignore blank fields. Add either single or pair of filenames
                # to list. Only process names from first two fields on a line
                # and ignore other fields. 
                if len(tokens) > 0 :
                    r1 = tokens[0].strip()
                    if len(r1) > 0 :
                        fnames = [r1]
                    else :
                        fnames = []
                    if len(tokens) > 1 :
                        r2 = tokens[1].strip()
                        if len(r2) > 0 :
                            fnames.append(r2)
                    if len(fnames) > 0 :
                        self.READPAIRS.append(fnames)
        if DEBUG :
            print(str(self.READPAIRS))
        F.close()


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def RunHisat2(PR,THREADS,IFILE,OUTDIR,HISAT2ARGS,LOGFILE) :

    # Create a basename for the hisat2 -x option by 
    # removing the .X.ht2 file extension from IFILE
    # Assumes the basename does not contain a dot (.)
    def TruncPath(IFILE) :
        TP = IFILE.split('.')[0]
        return TP 

    # Return the longest identical substring in common
    # between s1 and s2, reading from left to right.
    def FirstDiff(s1,s2) :
        I = 0
        MINLEN=min(len(s1),len(s2))
        DONE = False
        while s1[I] == s2[I] and I < MINLEN :
                I +=1

        return s1[:I]    


    # Construct the command string - - - - - - - - - - - - - - 

    # Dummy command for testing
    #COMSTR=["hisat2","--help"]
    COMSTR=["hisat2"] 

    # Add the -x option, specifying the basename for the index files.
    XPATH = TruncPath(IFILE)
    COMSTR.extend(['-x',XPATH])

    # Add the names of the left and right read files
    if len(PR) > 0 :
        if len(PR) == 1 :
            COMSTR.extend(['-s', PR[0], ' '])
        else:
            COMSTR.extend(['-1', PR[0], '-2', PR[1]])

    # Find a common basename for the SAM file, and add the .sam filename
    # to the command
    ComName = OUTDIR + os.sep + os.path.basename(FirstDiff(PR[0],PR[1]))
    if DEBUG :
        print('ComName: ' + ComName)

    TmpSamName=  '_tmp' + '.sam' # temporary file with unsorted output from hisat2
    COMSTR.extend(['-S',TmpSamName])
   
    # Append the hisat2 options to the command
    COMSTR.extend(['-p', THREADS])
    COMSTR.extend(HISAT2ARGS)

    if DEBUG :
        print('COMSTR: ' + str(COMSTR))

    # Run Hisat2 - - - - - - - - - - - - - - - - -
    LOGFILE.write('======== hisat2 ==========' + '\n')
    LOGFILE.write(str(COMSTR) + '\n')
    LOGFILE.write('\n')
    LOGFILE.flush()
    p = subprocess.Popen(COMSTR,stdout=LOGFILE,stderr=LOGFILE)
    p.wait()
    LOGFILE.write('\n')   

    # Run samtools sort to sort the bam file - - - - - - - - - - - - - - - - -
    BamName= ComName + '.bam' # softed output for use with cufflinks
    COMSTR=['samtools','sort','--threads', THREADS]
    COMSTR.extend(['-o', BamName, TmpSamName]) 
    LOGFILE.write('======== samtools sort ==========' + '\n')
    LOGFILE.write(str(COMSTR) + '\n')
    LOGFILE.write('\n')
    LOGFILE.flush()
    p = subprocess.Popen(COMSTR,stdout=LOGFILE,stderr=LOGFILE)
    p.wait()

    # Run samtools stats to create statistics for the bam file - - - - - - - - - - - -
    CLIST=['samtools','stats','--threads', THREADS, BamName]
    StatName = BamName + ".stats"
    CLIST.extend(['>', StatName]) 
    LOGFILE.write('======== samtools stats ==========' + '\n')
    LOGFILE.write(str(CLIST) + '\n')
    LOGFILE.write('\n')
    LOGFILE.flush()
    COMMAND = " ".join(CLIST)
    print(COMMAND)
    os.system(COMMAND)

    # Delete temporary sam file
    os.remove(TmpSamName)
    LOGFILE.write('\n')
    

#======================== MAIN PROCEDURE ==========================
def main():
    """
        Called when not in documentation mode.
        """
	
    # Read parameters from command line
    P = Parameters()

    TF = TSVFiles()
    if not P.TSVFILE == "" :
        TF.ReadTSVfile(P.TSVFILE)

    # Make sure that the output directory exists
    if not os.path.isdir(P.OUTDIR) :
        os.mkdir(P.OUTDIR) 
                             
    LOGFN = os.path.join(P.OUTDIR,'bl_hisat2.log')
    LOGFILE = open(LOGFN,'w')
    LOGFILE.write('\n')

    # Run hisat2
    for PR in TF.READPAIRS :
        RunHisat2(PR,P.THREADS,P.IFILE,P.OUTDIR,P.hisat2args,LOGFILE)
         
    LOGFILE.close()


if __name__ == "__main__":
    main()
#else:
    #used to generate documentation
#    import doctest
#    doctest.testmod()

#if (BM.documentor() or "-test" in sys.argv):
#    pass
#else:
#    main()
