#!/usr/bin/env python3

"""
optparse is deprecated in favor of argparse as of Python 2.7. However,
 since 2.7 is not always present on many systems, at this writing,
 it is safer to stick with optparse for now. It should be easy
 to change later, since the syntax is very similar between argparse and optparse.
 from optparse import OptionParser
"""
from optparse import OptionParser

import os
import re
import sys

'''
guesspairs.py - Given a list of sequencing read files, make a guess as to which pairs of files should be grouped together as left and right read files. Output is a .tsv file. Pairs of files are written as two fields on a line. Unpaired files are written as output lines with a single field.

Synopsis: guesspairs.py --infile <filename> --ltag <string> --rtag <string> [--extension <string>] --outfile <filename>

    --infile - file containing one filename per line
    --ltag - part of the filename that is only found in left read files
    --rtag - part of the filename that is only found in right read files
    --extension <string> If a file extension is specified, only files with that file extension will be
         included in the output. Files with other extensions (eg. .html) will be ignored at input.
         string can be a comma-separated list of file extensions
    --outfile - output in TAB-separated (.tsv) format. Paired end files are together on an output line, separated
         by TAB. Unpaired files are each on a separate line.

EXAMPLE:

Given the input file names.in

illumina_control_L1.fq.gz
illumina_control_R2.fq.gz
illumina_treatment_L1.fq.gz
illumina_treatment_R2.fq.gz
iontorrent_control1.fq.gz
iontorrent_control2.fq.gz

Command: guesspairs.py --infile names.in --ltag L1 --rtag R2 --extension .fq.gz --outfile names.grouped

will create a file called names.grouped:

illumina_control_L1.fq.gz<TAB>illumina_control_R2.fq.gz
illumina_treatment_L1.fq.gz<TAB>illumina_treatment_R2.fq.gz
iontorrent_control1.fq.gz
iontorrent_control2.fq.gz

It may still be necessary to edit this file to get a namefile that can be used for genome or transcriptome assembly. 


@modified: April 24, 2018
@author: Brian Fristensky
@contact: Brian.Fristensky@umanitoba.ca  
'''

#blib = os.environ.get("BIRCHPYLIB")
#sys.path.append(blib)

#from birchlib import Birchmod


# Program label and usage synopsis. Neither is referenced by the active code
# visible in this file; only the commented-out Birchmod setup mentions them.
PROGRAM = "guesspairs.py : "
USAGE = (
    "\n\tUSAGE: --infile <filename> --ltag <string> --rtag <string>"
    " [--extension <string>] --outfile <filename>"
)

# When True, the classes in this file print diagnostic traces to stdout.
DEBUG = False
if DEBUG:
    print('guesspairs: Debugging mode on')

#BM = Birchmod(PROGRAM, USAGE)


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class Parameters:
    """
    Wrapper class for command line parameters.

    Attributes:
        IFN       - value of --infile: file holding one filename per line
        LTAG      - value of --ltag: string found in left read filenames
        RTAG      - value of --rtag: string found in right read filenames
        EXTENSION - list of file extensions taken from the comma-separated
                    --extension value; empty list when none were given
        OFN       - value of --outfile: name of the .tsv output file
    """
    def __init__(self):
        """
        Set every parameter to its empty default, then overwrite the
        defaults with the values found on the command line.
        """
        self.IFN = ""
        self.LTAG = ""
        self.RTAG = ""
        self.EXTENSION = []
        self.OFN = ""
        self.read_args()

        if DEBUG:
            print('------------ Parameters from command line ------')
            print('    IFN: ' + self.IFN)
            print('    LTAG: ' + self.LTAG)
            print('    RTAG: ' + self.RTAG)
            print('    EXTENSION: ' + str(self.EXTENSION))
            print('    OFN: ' + self.OFN)
            print()

    def read_args(self):
        """
        Read command line arguments (sys.argv) into this Parameters object.

        Options that are not given keep their empty default.
        """
        parser = OptionParser()
        parser.add_option("--infile", dest="ifn", action="store", default="",
                          help="input file, one filename per line")
        parser.add_option("--ltag", dest="ltag", action="store", default="",
                          help="string found in left read filenames")
        parser.add_option("--rtag", dest="rtag", action="store", default="",
                          help="string found in right read filenames")
        parser.add_option("--extension", dest="extension", action="store", default="",
                          help="common file extension for all read files")
        parser.add_option("--outfile", dest="ofn", action="store", default="",
                          help="output file in .tsv format")

        # Positional arguments are not used by this program.
        options, _ = parser.parse_args()
        self.IFN = options.ifn
        self.LTAG = options.ltag
        self.RTAG = options.rtag
        # "".split(",") yields [""], and a trailing comma yields an empty
        # item too. An empty extension matches every filename, so blank
        # items are dropped and surrounding whitespace is trimmed.
        self.EXTENSION = [E.strip() for E in options.extension.split(",") if E.strip()]
        self.OFN = options.ofn

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class SeqFiles:
    """
    Read, write and process lists of sequencing read filenames.

    Attributes:
        RawReads      - filenames read from input, not yet classified
        PairedReads   - list of [LeftName, RightName] pairs
        UnpairedReads - filenames for which no mate was found
    """
    def __init__(self):
        """
        Start with three empty lists: RawReads, PairedReads, UnpairedReads.
        """
        self.RawReads = []
        self.PairedReads = []
        self.UnpairedReads = []

    def ReadNames(self, FN, EXT):
        """
        Read input file FN with one name per line, and add the names to
        RawReads. Blank lines are skipped and surrounding whitespace is
        stripped from each name.

        EXT is a list of file extensions. When it holds at least one
        non-empty extension, only names ending with one of them are kept;
        otherwise every name is kept.
        """
        # An empty string is a suffix of every name, so it must not count
        # as an extension; str.endswith accepts a tuple of alternatives.
        Suffixes = tuple(E for E in EXT if E)

        with open(FN, "r") as F:
            for line in F:
                name = line.strip()
                if not name:
                    continue
                if not Suffixes or name.endswith(Suffixes):
                    self.RawReads.append(name)
        if DEBUG:
            print('ReadNames: ' + str(self.RawReads))

    def MovePair(self, L, R):
        """
        Move a read pair from self.RawReads to self.PairedReads.

        L and R are the positions in self.RawReads of the left and right
        read filenames.
        """
        ReadPair = [self.RawReads[L], self.RawReads[R]]
        # Delete by position, highest first, so that the first deletion
        # does not shift the second one and duplicate names cannot cause
        # a different entry to be removed.
        Count = len(self.RawReads)
        for Position in sorted({L % Count, R % Count}, reverse=True):
            del self.RawReads[Position]
        self.PairedReads.append(ReadPair)

    def MoveSingle(self, I):
        """
        Move the read at position I from self.RawReads to self.UnpairedReads.
        """
        self.UnpairedReads.append(self.RawReads.pop(I))

    def FindPairs(self, LTAG, RTAG):
        """
        Iterate through RawReads and move filenames either to PairedReads or
        UnpairedReads, until RawReads is empty.

        A name is treated as a left read if it contains LTAG, otherwise as a
        right read if it contains RTAG. Its "unique part" is everything
        before the last occurrence of that tag. Two names are mates when one
        carries LTAG, the other carries RTAG, and their unique parts are
        equal.
        """
        # Names are processed in sorted order, always starting from the
        # first remaining name.
        self.RawReads.sort()

        def SeekMate(UniquePart, TAG):
            """
            Return the position of the first name after position 0 that
            contains TAG and whose text before the last occurrence of TAG
            equals UniquePart, or -1 if there is none.
            """
            for J in range(1, len(self.RawReads)):
                SecondName = self.RawReads[J]
                # rfind, to split the mate the same way as the first name.
                TagIndex = SecondName.rfind(TAG)
                # Without the > -1 test a name lacking TAG would be sliced
                # as SecondName[:-1] and could match by accident.
                if TagIndex > -1 and SecondName[:TagIndex] == UniquePart:
                    return J
            return -1

        # The exit condition occurs when all names have been moved from
        # self.RawReads to either self.PairedReads or self.UnpairedReads
        while self.RawReads:
            FirstName = self.RawReads[0]
            LFOUND = FirstName.rfind(LTAG)
            RFOUND = FirstName.rfind(RTAG)
            if DEBUG:
                print("LFOUND: " + str(LFOUND) + " RFOUND: " + str(RFOUND))

            if LFOUND > -1:
                UniquePart, MateTag, FirstIsLeft = FirstName[:LFOUND], RTAG, True
            elif RFOUND > -1:
                UniquePart, MateTag, FirstIsLeft = FirstName[:RFOUND], LTAG, False
            else:
                # Neither tag present: cannot be part of a pair.
                self.MoveSingle(0)
                continue

            if DEBUG:
                print(UniquePart)
            J = SeekMate(UniquePart, MateTag)
            if DEBUG:
                print("J: " + str(J))

            if J == -1:
                self.MoveSingle(0)
            elif FirstIsLeft:
                self.MovePair(0, J)
            else:
                self.MovePair(J, 0)

    def WriteOutput(self, OFN):
        """
        Write the results to file OFN: first one line per pair, with the
        left and right names separated by TAB, then one line per unpaired
        name.
        """
        TAB = "\t"
        NL = "\n"
        with open(OFN, "w") as F:
            for pair in self.PairedReads:
                F.write(pair[0] + TAB + pair[1] + NL)
            for single in self.UnpairedReads:
                F.write(single + NL)
        

#======================== MAIN PROCEDURE ==========================
def main():
    """
    Run the program: take the options from the command line, load the
    filenames from the input file, sort them into pairs and singles, and
    write the result to the output file.
    """
    params = Parameters()
    seq_files = SeqFiles()
    seq_files.ReadNames(params.IFN, params.EXTENSION)
    seq_files.FindPairs(params.LTAG, params.RTAG)
    seq_files.WriteOutput(params.OFN)


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
#else:
    #used to generate documentation
#    import doctest
#    doctest.testmod()

#if (BM.documentor() or "-test" in sys.argv):
#    pass
#else:
#    main()
