#!/usr/bin/env python

'''
bl_seqreadlist.py - given files with names of readfiles, returns part of the spades command
    line which specifies read files

Synopsis: bl_seqreadlist.py [--tsv tsvfile] [--pe pefile] [--mp pefile] [--long singlereadfile] [--s singlereadfile] \
            --outtype spadescom --fullpaths [--outfile filename]

        --outtype Specifies format required by various programs that work with reads.
              Currently supported values: spadescom, transratecom, polluxcom, abysscom

@modified: June 18, 2018
@author: Brian Fristensky
@contact: Brian.Fristensky@umanitoba.ca  
'''

"""
optparse is deprecated in favor of argparse as of Python 2.7. However,
 since 2.7 is not always present on many systems, at this writing,
 it is safer to stick with optparse for now. It should be easy
 to change later, since the syntax is very similar between argparse and optparse.
 from optparse import OptionParser
"""
from optparse import OptionParser

import os
import re
import sys



# Program name prefix and usage summary text.
# NOTE(review): presumably intended for messages shown to the user - confirm.
PROGRAM = "bl_seqreadlist.py : "
USAGE = "\n\tUSAGE: bl_seqreadlist.py [--tsv tsvfile] [--pe pefile] [--mp pefile] [--long singlereadfile]  [--s singlereadfile]\
  --outtype filetype --fullpaths [--outfile filename]"

# Debugging switch. When True, a notice is printed as soon as this module is
# loaded (the 'if' statement just below).
DEBUG = False #Must be false when run by BioLegato
if DEBUG :
    print('bl_seqreadlist: Debugging mode on')


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class Parameters:
    """
    Container for the settings given on the command line.

    Attributes (all filled in by read_args):
        TSVFILE   - value of --tsv
        PEFILE    - value of --pe
        MPFILE    - value of --mp
        LONGFILE  - value of --long
        SEFILE    - value of --s
        OUTTYPE   - value of --outtype ("spadescom" unless overridden)
        FULLPATHS - True when --fullpaths was given
        OUTFILE   - value of --outfile
    """

    def __init__(self):
        """
        Give every attribute its default value, replace the defaults with
        the values parsed from the command line and, in debugging mode,
        print the resulting settings.
        """
        for name, value in (("TSVFILE", ""),
                            ("PEFILE", ""),
                            ("MPFILE", ""),
                            ("LONGFILE", ""),
                            ("SEFILE", ""),
                            ("OUTTYPE", "spadescom"),
                            ("FULLPATHS", False),
                            ("OUTFILE", "")) :
            setattr(self, name, value)

        self.read_args()

        if DEBUG :
            for text in ('------------ Parameters from command line ------',
                         '    TSVFILE: ' + self.TSVFILE,
                         '    PEFILE: ' + self.PEFILE,
                         '    MPFILE: ' + self.MPFILE,
                         '    LONGFILE: ' + self.LONGFILE,
                         '    SEFILE: ' + self.SEFILE,
                         '    OUTTYPE: ' + self.OUTTYPE,
                         '    FULLPATHS: ' + str(self.FULLPATHS),
                         '    OUTFILE: ' + self.OUTFILE,
                         '') :
                print(text)

    def read_args(self):
        """
        Parse the command line with optparse and copy each option into the
        attribute of the same name written in upper case
        (eg. --tsv -> dest "tsvfile" -> self.TSVFILE).
        Positional arguments are accepted by the parser but not used.
        """
        # One row per option: (flag, dest, action, default, help text)
        option_table = (
            ("--tsv", "tsvfile", "store", "",
             "TAB-separated value file with names of paired-end and or single-end read files"),
            ("--pe", "pefile", "store", "",
             "file with names of paired-end read files"),
            ("--mp", "mpfile", "store", "",
             "file with names of mate-pair read files"),
            ("--long", "longfile", "store", "",
             "file with names of long read files"),
            ("--s", "sefile", "store", "",
             "file with names of single read files"),
            ("--outtype", "outtype", "store", "spadescom",
             "type of output"),
            ("--fullpaths", "fullpaths", "store_true", False,
             "output full paths to files"),
            ("--outfile", "outfile", "store", "",
             "output file"),
        )

        parser = OptionParser()
        for flag, dest, action, default, helptext in option_table :
            parser.add_option(flag, dest=dest, action=action, default=default,
                              help=helptext)

        # parse_args() returns (options, positional args); only options are kept.
        options = parser.parse_args()[0]
        for flag, dest, action, default, helptext in option_table :
            setattr(self, dest.upper(), getattr(options, dest))

class TSVFiles :
    """
    List of read files taken from a TAB-separated value (TSV) file.

    READPAIRS holds one list per usable line of the TSV file: two names for
    a pair of read files, or one name for a single read file.
    """
    def __init__(self):
        """
        Start with an empty READPAIRS list.
        """
        self.READPAIRS = []

    def ReadTSVfile(self,FN,FULLPATHS) :
        """
        Read the names of paired-end and/or single-end read files from the
        TSV file FN and append them to READPAIRS.

        Paired-end files are on lines such as

        leftreadfile.fq.gz<TAB>rightreadfile.fq.gz

        Single-end files have each file on a separate line

        reads1.fq.gz
        reads2.fq.gz
        reads3.fq.gz

        Blank lines and lines beginning with '#' are skipped. Only the first
        two fields of a line are used, and blank fields are ignored.

        FN        - name of the TSV file
        FULLPATHS - if True, every name in READPAIRS is converted to an
                    absolute path with os.path.abspath
        """
        TAB = '\t'
        # 'with' guarantees the file is closed even if reading it fails.
        with open(FN,"r") as F :
            for line in F :
                line = line.strip()
                if len(line) == 0 or line.startswith('#') :
                    continue
                # get rid of double quotes that enclose fields when some programs
                # write output, and then split by TABs.
                tokens = line.replace('"','').split(TAB)
                # Keep the non-blank names among the first two fields.
                fnames = [T.strip() for T in tokens[:2] if len(T.strip()) > 0]
                if len(fnames) > 0 :
                    self.READPAIRS.append(fnames)

        if FULLPATHS :
            # Rewrite each entry in place, covering everything currently
            # held in READPAIRS.
            for fnames in self.READPAIRS :
                fnames[:] = [os.path.abspath(N) for N in fnames]

        if DEBUG :
            print(str(self.READPAIRS))

class PairedFiles :
    """
    List of paired read files taken from a one-line list file.

    READPAIRS holds one two-element list [left, right] per pair.
    """
    def __init__(self):
        """
        Start with an empty READPAIRS list.
        """
        self.READPAIRS = []

    def ReadPEfile(self,FN,FULLPATHS) :
        """
        Read pairs of file names from the first line of FN and append them
        to READPAIRS. The line has the form:

        left1,right1|left2,right2|left3,right3

        where the pipe character separates pairs of filenames, and
        left and right read pair files are separated by commas.

        Entries that do not supply both a left and a right name (an empty
        file, a trailing '|', a name without a comma) are skipped rather
        than raising IndexError. Only the first two names of an entry are
        kept.

        FN        - name of the file holding the list
        FULLPATHS - if True, every name in READPAIRS is converted to an
                    absolute path with os.path.abspath
        """
        # 'with' guarantees the file is closed even if reading it fails.
        with open(FN,"r") as F :
            line = F.readline() # assumes all data is on a single line

        for entry in line.split('|') :
            names = [T.strip() for T in entry.split(',')]
            if len(names) < 2 :
                continue
            if len(names[0]) > 0 and len(names[1]) > 0 :
                self.READPAIRS.append(names[:2])

        if FULLPATHS :
            # Rewrite each pair in place, covering everything currently
            # held in READPAIRS.
            for pair in self.READPAIRS :
                pair[:] = [os.path.abspath(N) for N in pair]

        if DEBUG :
            print(str(self.READPAIRS))

class SingleReadFiles :
    """
    List of single read files taken from a one-line list file.

    SREADFILES holds one file name per entry.
    """
    def __init__(self):
        """
        Start with an empty SREADFILES list.
        """
        self.SREADFILES = []

    def ReadSfile(self,FN,FULLPATHS) :
        """
        Read file names from the first line of FN and append them to
        SREADFILES. The line has the form:

        file1|file2|file3...

        where the pipe character separates filenames. Blank entries are
        skipped.

        FN        - name of the file holding the list
        FULLPATHS - if True, each name read is converted to an absolute
                    path with os.path.abspath before it is stored
        """
        # 'with' guarantees the file is closed even if reading it fails.
        with open(FN,"r") as F :
            line = F.readline() # assumes all data is on a single line

        for token in line.split('|') :
            fname = token.strip()
            if len(fname) == 0 :
                continue
            if FULLPATHS :
                fname = os.path.abspath(fname)
            self.SREADFILES.append(fname)

        if DEBUG :
            print(str(self.SREADFILES))

def WriteSpadesCom(TF,PF,SF) :
    """
    Print the read-file portion of a Spades command line.

    TF - object whose READPAIRS entries hold one name (written as -s)
         or two names (written as -1/-2)
    PF - object whose READPAIRS entries hold two names (written as -1/-2)
    SF - object whose SREADFILES entries are single names (written as -s)

    Everything is printed on one line, in the order TF, PF, SF.
    """
    pieces = []

    for names in TF.READPAIRS :
        if len(names) == 1 :
            pieces.append(' -s ' + names[0] + ' ')
        else :
            pieces.append(' -1 ' + names[0] + ' -2 ' + names[1])

    # Per the original author's note, the documented numbered syntax
    # (--pe1-1 file --pe1-2 file ...) did not work, so plain -1/-2 is used.
    for names in PF.READPAIRS :
        pieces.append(' -1 ' + names[0] + ' -2 ' + names[1])

    for fname in SF.SREADFILES :
        pieces.append(' -s ' + fname + ' ')

    print(''.join(pieces))

def WriteAbyssCom(TF,PF,MP,LF,SF,OFN) :
    """
    Write output to be included in the Abyss command line.

    TF  - object whose READPAIRS entries hold one or two names (TSV file)
    PF  - object whose READPAIRS entries hold two names, written with tag 'pe'
    MP  - object whose READPAIRS entries hold two names, written with tag 'mp'
    LF  - object whose SREADFILES entries are single names, tag 'long'
    SF  - object whose SREADFILES entries are single names, tag 'se'
    OFN - name of the output file; if "", the result is printed instead

    Two-name entries of TF are written with tag 'lib'. One-name entries of
    TF are single read files, so they are written with tag 'se', ahead of
    the names from SF. Previously such an entry raised IndexError.

    When written to OFN the text has no trailing newline; print adds one.
    """

    # Create a part of a string for a given tag eg. lib, pe, long, se
    def MakePhrase(ENTRIES,TAG) :
        if len(ENTRIES) == 0 :
            return ""
        NAMES = []    # library names used in the assembly eg. lib1, lib2...
        READLIST = [] # one definition per library
        for N, ENTRY in enumerate(ENTRIES, 1) :
            LIBNAME = TAG + str(N)
            NAMES.append(LIBNAME + ' ')
            if TAG in ['long','se'] :
                FILES = ENTRY + ' '                # entry is a single file name
            else :
                FILES = ENTRY[0] + ' ' + ENTRY[1]  # entry is a [left, right] pair
            READLIST.append(LIBNAME + "='" + FILES + "' ") # eg. lib1='exp1_L.fq exp1_R.fq'
        # eg lib='lib1 lib2 lib3' followed by the definition of each library
        return TAG + "='" + ''.join(NAMES) + "' " + ''.join(READLIST)

    # Separate the TSV entries: pairs go to 'lib', single files go to 'se'.
    PAIRED = [P for P in TF.READPAIRS if len(P) > 1]
    SINGLES = [P[0] for P in TF.READPAIRS if len(P) == 1]

    COMSTR = MakePhrase(PAIRED,'lib')
    COMSTR += MakePhrase(PF.READPAIRS,'pe')
    COMSTR += MakePhrase(MP.READPAIRS,'mp')
    COMSTR += MakePhrase(LF.SREADFILES,'long')
    COMSTR += MakePhrase(SINGLES + list(SF.SREADFILES),'se')

    if OFN == "" :
        print(COMSTR)
    else :
        # 'with' guarantees the file is closed even if the write fails.
        with open(OFN,'w') as OUTFILE :
            OUTFILE.write(COMSTR)
    
def WriteTransrateCom(TF,PF) :
    """
    Write output to be included in the Transrate command line.
    Transrate ONLY works with paired-end reads, as of v1.03

    TF - object whose READPAIRS entries hold one or two names (TSV file)
    PF - object whose READPAIRS entries hold two names

    Prints

        --left=left1,left2,... --right=right1,right2,...

    with the pairs from TF first, then those from PF. Entries of TF that
    hold a single file name have no right-hand file, so they are left out.
    Previously such an entry raised IndexError.
    """
    # Keep only entries that supply both a left and a right file.
    PAIRS = [P for P in list(TF.READPAIRS) + list(PF.READPAIRS) if len(P) > 1]

    # join() puts commas between names only, so there is no leading comma
    # to remove afterwards.
    LEFTREADS = ','.join([P[0] for P in PAIRS])
    RIGHTREADS = ','.join([P[1] for P in PAIRS])

    print('--left=' + LEFTREADS + ' --right=' + RIGHTREADS)

def WritePolluxCom(TF,PF,SF) :
    """
    Print the read-file arguments for a Pollux command line, one
    ' -i ...' line per entry.

    TF - object whose READPAIRS entries hold one or two names
    PF - object whose READPAIRS entries hold two names
    SF - object whose SREADFILES entries are single names

    A pair is printed as ' -i left right', a single file as ' -i file'.
    Lines are printed in the order TF, PF, SF.
    """
    for names in TF.READPAIRS :
        files = names[0] if len(names) == 1 else names[0] + ' ' + names[1]
        print(' -i ' + files)

    for names in PF.READPAIRS :
        print(' -i ' + names[0] + ' ' + names[1])

    for fname in SF.SREADFILES :
        print(' -i ' + fname)

#======================== MAIN PROCEDURE ==========================
def main():
    """
    Read the command line, load each list of read files that was named
    there, and write the lists out in the format chosen with --outtype.

    An unrecognized --outtype produces a message on standard error.
    Nothing is written to standard output in that case and the exit
    status is unchanged.
    """

    # Read parameters from command line
    P = Parameters()

    # Read files listed in the TSV file
    TF = TSVFiles()
    if P.TSVFILE != "" :
        TF.ReadTSVfile(P.TSVFILE,P.FULLPATHS)

    # Read the additional lists: paired-end, mate-pair, long and single reads
    PF = PairedFiles()
    if P.PEFILE != "" :
        PF.ReadPEfile(P.PEFILE,P.FULLPATHS)
    MF = PairedFiles()
    if P.MPFILE != "" :
        MF.ReadPEfile(P.MPFILE,P.FULLPATHS)
    LF = SingleReadFiles()
    if P.LONGFILE != "" :
        LF.ReadSfile(P.LONGFILE,P.FULLPATHS)
    SF = SingleReadFiles()
    if P.SEFILE != "" :
        SF.ReadSfile(P.SEFILE,P.FULLPATHS)

    # Write command line output in the specified format.
    if P.OUTTYPE == 'spadescom' :
        WriteSpadesCom(TF,PF,SF)
    elif P.OUTTYPE == 'abysscom' :
        WriteAbyssCom(TF,PF,MF,LF,SF,P.OUTFILE)
    elif P.OUTTYPE == 'transratecom' :
        WriteTransrateCom(TF,PF)
    elif P.OUTTYPE == 'polluxcom' :
        WritePolluxCom(TF,PF,SF)
    else :
        # An unknown output type used to be ignored silently, leaving the
        # caller with empty output and no hint why. Report it on stderr so
        # that standard output stays untouched.
        sys.stderr.write(PROGRAM + "unrecognized --outtype '" + P.OUTTYPE +
                         "'. Supported values: spadescom, abysscom, " +
                         "transratecom, polluxcom\n")

# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

