#!/usr/bin/env python

'''
bl_seqkit_sample.py - given a file listing the names of read files, create files each containing a randomly chosen sample of the reads

Synopsis: bl_seqkit_sample.py --tsv tsvfile --ext file_extension --prefix string --percent integer


@modified: January 15, 2019
@author: Brian Fristensky
@contact: Brian.Fristensky@umanitoba.ca  
'''

"""
optparse is deprecated in favor of argparse as of Python 2.7. However,
 since Python 2.7 is not yet present on every system at this writing,
 it is safer to stick with optparse for now. It should be easy
 to change later, since the syntax is very similar between argparse and optparse.
 The import used below is: from optparse import OptionParser
"""
from optparse import OptionParser

import os
import random
import re
import subprocess
import sys



# Program name prefix and usage string. Neither is referenced by the code
# visible in this file; presumably intended for error messages - TODO confirm.
PROGRAM = "bl_seqkit_sample.py : "
USAGE = "\n\tUSAGE: bl_seqkit_sample.py --tsv tsvfile --ext file_extension --prefix string --percent integer"

# When DEBUG is True the script prints diagnostic output to stdout: the banner
# below (at import time), the parsed command line parameters, and the list of
# file names read from the TSV file.
DEBUG = True #Must be false when run by BioLegato
if DEBUG :
    print('bl_seqkit_sample: Debugging mode on')


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class Parameters:
    """
    Wrapper class for command line parameters.

    Attributes:
        TSVFILE: name of the TAB-separated value file given with --tsv
        EXT:     file extension of the input files, given with --ext
        PREFIX:  string given with --prefix; CreateSampleFile inserts it
                 between the base name and the extension of the output file
        PERCENT: integer from 1 to 100 given with --percent (default 5)
    """
    def __init__(self):
        """
        Set every parameter to its default, then overwrite the defaults
        with the values parsed from the command line:
                TSVFILE = ""
                EXT = ""
                PREFIX = ""
                PERCENT = 5
        """
        self.TSVFILE = ""
        self.EXT = ""
        self.PREFIX = ""
        self.PERCENT = 5
        self.read_args()

        if DEBUG :
            print('------------ Parameters from command line ------')
            print('    TSVFILE: ' + self.TSVFILE)
            print('    EXT: ' + self.EXT)
            print('    PREFIX: ' + self.PREFIX)
            print('    PERCENT: ' + str(self.PERCENT))
            print('')

    def read_args(self):
        """
        Read command line arguments (sys.argv) into this Parameters object.

        Exits through OptionParser.error() (usage message on stderr,
        SystemExit with status 2) if --percent is not an integer or is
        outside the range 1 to 100.
        """
        parser = OptionParser()
        parser.add_option("--tsv", dest="tsvfile", action="store", default="",
                          help="TAB-separated value file of file names")
        parser.add_option("--ext", dest="ext", action="store", default="",
                          help="file extension of input files")
        parser.add_option("--prefix", dest="prefix", action="store", default="",
                          help="prefix to add to file extension")
        parser.add_option("--percent", dest="percent", action="store", default=5,
                          help="percent of original file to include in the sample")

        (options, args) = parser.parse_args()

        # Convert with int() rather than optparse's type="int": optparse
        # would read a leading zero as octal ("010" -> 8). A bad value is
        # reported as a usage error instead of a ValueError traceback.
        try:
            percent = int(options.percent)
        except ValueError:
            parser.error("option --percent: invalid integer value: %r"
                         % options.percent)

        # The percentage is later divided by 100 and handed to seqkit as a
        # proportion, which only makes sense in the range (0, 1].
        if not 0 < percent <= 100 :
            parser.error("option --percent: value must be from 1 to 100")

        self.TSVFILE = options.tsvfile
        self.EXT = options.ext
        self.PREFIX = options.prefix
        self.PERCENT = percent

class TSVFiles :
    """
    Reads a TSV file that lists read files and keeps the file names.

    Attributes:
        FILENAMES: list of file names, in the order they were read
    """
    def __init__(self):
        """
        Initializes attributes:
                FILENAMES = []
        """
        self.FILENAMES = []

    def ReadTSVfile(self,FN) :
        """
        Append the file name found on each line of the file FN to
        self.FILENAMES.

        Each file is on a separate line

        reads1.fq.gz
        reads2.fq.gz
        reads3.fq.gz

        Blank lines and lines beginning with '#' are skipped. Only the
        first TAB-separated field of a line is used; any other fields
        are ignored.
        """
        TAB = '\t'
        # The with statement closes the file even if reading fails part way.
        with open(FN,"r") as F :
            for line in F :
                line = line.strip()
                if len(line) == 0 or line.startswith('#') :
                    continue

                # get rid of double quotes that enclose fields when some
                # programs write output, and then take the first field.
                fname = line.replace('"','').split(TAB)[0].strip()

                # A line such as "" leaves an empty first field; an empty
                # name is not a file, so it is not added to the list.
                if fname :
                    self.FILENAMES.append(fname)
        if DEBUG :
            print(str(self.FILENAMES))


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def CreateSampleFile(FN,EXT,PREFIX,PERCENT) :
    """
    Run 'seqkit sample' on the file FN to write a random sample of it
    to a new file named <BASENAME><PREFIX><EXT>, where BASENAME is FN
    with the trailing EXT removed.

    Parameters:
        FN:      name of the input file
        EXT:     file extension; files whose names do not end with EXT
                 are skipped without any message
        PREFIX:  string inserted between the base name and EXT
        PERCENT: percentage of the input to sample; passed to seqkit
                 as the proportion PERCENT/100

    Returns None. A message is written to stderr if the output name
    would be the same as the input name (the file is then skipped), or
    if seqkit exits with a non-zero status.
    """
    if not FN.endswith(EXT) :
        return

    # Remove EXT from the END of the name only. Searching with find()
    # would cut at the first occurrence of EXT anywhere in the name
    # (a.fq.b.fq -> a), and would give an empty base name when EXT is "".
    BaseName = FN[:len(FN) - len(EXT)]

    # Create an output filename consisting of <BASENAME><PREFIX><EXTENSION>
    OutName = BaseName + PREFIX + EXT

    # With an empty PREFIX the output would be the input file itself.
    if OutName == FN :
        sys.stderr.write('bl_seqkit_sample.py: output name is the same as '
                         'input name, skipping ' + FN + '\n')
        return

    # Generate parameters
    Proportion = PERCENT/100.0
    Seed = random.randint(-32767,32767)

    # Run seqkit sample to generate the sample file, and wait for it to finish
    COMSTR = ['seqkit','sample','--proportion',str(Proportion),'--rand-seed', str(Seed), '-o', OutName, FN]
    ReturnCode = subprocess.call(COMSTR)
    if ReturnCode != 0 :
        sys.stderr.write('bl_seqkit_sample.py: seqkit sample exited with status '
                         + str(ReturnCode) + ' for ' + FN + '\n')


#======================== MAIN PROCEDURE ==========================
def main():
    """
    Program entry point: parse the command line, read the list of file
    names from the TSV file (if one was given), and create a sample
    file for every name in the list.
    """
    # Command line parameters
    params = Parameters()

    # File names listed in the TSV file; the list stays empty when no
    # TSV file was named on the command line.
    listing = TSVFiles()
    if params.TSVFILE != "" :
        listing.ReadTSVfile(params.TSVFILE)

    # Write one sample file per listed file, each holding the requested
    # percentage of the original.
    for read_file in listing.FILENAMES :
        CreateSampleFile(read_file, params.EXT, params.PREFIX, params.PERCENT)


if __name__ == "__main__":
    main()

