#!/usr/bin/env python3

"""
    fastalen.py - separate fasta sequences by length

@modified: June 7, 2021
@author: Brian Fristensky
@contact: brian.fristensky@umanitoba.ca

"""

import argparse
import datetime
import os
import re
import sys

# - - - - - - -  GLOBAL VARIABLES - - - - - - - -
# Name of this script (basename of argv[0]) followed by ": ".
PROGRAM = os.path.basename(sys.argv[0]) + ": "   # precedes print messages
# One-line summary of the command line syntax.
USAGE = "\n\t USAGE: fastalen.py [--filelist] infile --split <int>|--gte <int>|--lt <int>|--between <int> <int>"


# When True, the parsed command line settings are echoed to stdout.
DEBUG = True
NL = "\n"
# A single space followed by a newline; initial value of Seq.Id and Seq.Seq.
BLANKLINE = " " + NL

# Compiled pattern matching a run of one or more of the characters A, G, C, T, N.
# NOTE(review): not referenced anywhere else in the visible code.
Pattern = r'[AGCTN]+'
NUCLEOTIDES = re.compile(Pattern)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
"Wrapper class for command line parameters"
class Parameters:
    """
    Wrapper class for command line parameters.

    Attributes (filled in by read_args()):
        IFN      - the positional "infile" argument
        FILELIST - True if --filelist was set
        TASK     - one of "split", "gte", "lt" or "between"
        TASKARGS - list of integers given with the task option
                   (one value, or two for "between")
    """

    def __init__(self):
        """
        Initialize every attribute to an empty default, then call
        read_args() to fill in the values from the command line.
        """
        self.IFN = ""
        self.FILELIST = False  # True if --filelist was set.
        # Always defined, so later reads can never raise AttributeError.
        self.TASK = ""
        self.TASKARGS = []
        self.read_args()

    def read_args(self):
        """
        Read command line arguments into this Parameters object.

        Exactly one of --split, --gte, --lt or --between must be given.
        On any command line error argparse prints its own usage message
        and exits (SystemExit); it never raises ValueError, so no
        exception handler is needed here.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument("--filelist", dest="filelist", action="store_true", help="if set, file contains list of filenames")
        parser.add_argument("infile", action="store", default="", help="input file")
        # required=True: without a task there is nothing to do, so let
        # argparse reject the command line with a clear message.
        group = parser.add_mutually_exclusive_group(required=True)
        group.add_argument('-s', '--split', dest="splitargs", action='store', type=int, nargs=1, default=[])
        group.add_argument('-g', '--gte', dest="gteargs", action='store', type=int, nargs=1, default=[])
        group.add_argument('-l', '--lt', dest="ltargs", action='store', type=int, nargs=1, default=[])
        group.add_argument('-b', '--between', dest="betweenargs", action='store', type=int, nargs=2, default=[])
        args = parser.parse_args()

        self.FILELIST = bool(args.filelist)
        self.IFN = args.infile
        # The options are mutually exclusive, so exactly one of these
        # lists is non-empty; its name becomes the task.
        for task in ("split", "gte", "lt", "between"):
            values = getattr(args, task + "args")
            if values:
                self.TASK = task
                self.TASKARGS = values

        if DEBUG:
            print("FILELIST: " + str(self.FILELIST))
            print("IFN: " + self.IFN)
            print("TASK: " + self.TASK + " " + str(self.TASKARGS))


# --------------------------------------------
def ReadList(F):
    """
    Read a list of filenames from the text file F, one name per line.

    Leading and trailing whitespace is stripped from each line. Blank
    lines are skipped, so a trailing empty line in the list does not
    produce an empty filename.

    Returns a list of filename strings, in file order.
    """
    # The with-block closes the file even if reading raises.
    with open(F, "r") as lfile:
        stripped = (line.strip() for line in lfile)
        return [name for name in stripped if name]

# --------------------------------------------
class Seq:
    """
    One fasta record: a header line plus its sequence, which may be
    spread over many lines in the file.

    Id  - the header line exactly as read (including its line ending)
    Seq - the sequence lines joined together, each stripped of whitespace
    Len - number of characters in Seq
    """

    def __init__(self):
        self.Id = self.Seq = BLANKLINE
        self.Len = 0

    def Read(self, line, infile):
        """
        Load the record whose header is 'line', pulling sequence lines
        from infile until the next header (a line starting with ">")
        or end of file.

        Returns (self, lookahead), where lookahead is the line that
        stopped the read: the next header, or "" at end of file.
        """
        self.Id = line
        pieces = []
        lookahead = infile.readline()
        while lookahead and not lookahead.startswith(">"):
            pieces.append(lookahead.strip())
            lookahead = infile.readline()
        self.Seq = "".join(pieces)
        self.Len = len(self.Seq)
        return self, lookahead

    def Write(self, outfile):
        """Write the header as stored, then the sequence on a single line."""
        outfile.write(self.Id)
        outfile.write(f"{self.Seq}{NL}")

# --------------------------------------------
def WriteSplit(F, MIN):
    """
    Split the fasta file F into two files by sequence length.

    Sequences whose length is >= MIN are written to
    <basename>.gte<MIN><ext>; all others are written to
    <basename>.lt<MIN><ext>, where <basename> and <ext> come from
    os.path.splitext(F). Lines before the first ">" header are ignored.
    Prints the number of sequences written to each output file.
    """
    basename, ext = os.path.splitext(F)
    # Output names are derived from the MIN argument itself, so the
    # function does not depend on any module-level state.
    OFN1 = basename + "." + "gte" + str(MIN) + ext
    OFN2 = basename + "." + "lt" + str(MIN) + ext

    gteseqs = 0
    ltseqs = 0
    # with-blocks guarantee all three files are closed, even on error.
    # The input is opened first so a missing input creates no outputs.
    with open(F, "r") as infile:
        with open(OFN1, "w") as outfile1, open(OFN2, "w") as outfile2:
            line = infile.readline()
            S = Seq()
            while line != "":
                if line[0] == ">":
                    # line acts as a "lookahead": Read() returns the line
                    # that follows the sequence it has just read.
                    S, line = S.Read(line, infile)
                    if S.Len >= MIN:
                        S.Write(outfile1)
                        gteseqs += 1
                    else:
                        S.Write(outfile2)
                        ltseqs += 1
                else:
                    line = infile.readline()

    print("Total sequences written to " + OFN1 + ": " + str(gteseqs))
    print("Total sequences written to " + OFN2 + ": " + str(ltseqs))
    print(" ")

# --------------------------------------------
def WriteGte(F, MIN):
    """
    Copy the sequences of fasta file F whose length is >= MIN.

    Output goes to <basename>.gte<MIN><ext>, where <basename> and <ext>
    come from os.path.splitext(F). Shorter sequences are dropped, and
    lines before the first ">" header are ignored.
    Prints the number of sequences written.
    """
    basename, ext = os.path.splitext(F)
    # The output name is derived from the MIN argument itself, so the
    # function does not depend on any module-level state.
    OFN1 = basename + "." + "gte" + str(MIN) + ext

    gteseqs = 0
    # with-blocks guarantee both files are closed, even on error.
    # The input is opened first so a missing input creates no output.
    with open(F, "r") as infile:
        with open(OFN1, "w") as outfile1:
            line = infile.readline()
            S = Seq()
            while line != "":
                if line[0] == ">":
                    # line acts as a "lookahead": Read() returns the line
                    # that follows the sequence it has just read.
                    S, line = S.Read(line, infile)
                    if S.Len >= MIN:
                        S.Write(outfile1)
                        gteseqs += 1
                else:
                    line = infile.readline()

    print("Total sequences written to " + OFN1 + ": " + str(gteseqs))
    print(" ")

# --------------------------------------------
def WriteLt(F, MAX):
    """
    Copy the sequences of fasta file F whose length is < MAX.

    Output goes to <basename>.lt<MAX><ext>, where <basename> and <ext>
    come from os.path.splitext(F). Sequences of length MAX or more are
    dropped, and lines before the first ">" header are ignored.
    Prints the number of sequences written.
    """
    basename, ext = os.path.splitext(F)
    # The output name is derived from the MAX argument itself, so the
    # function does not depend on any module-level state.
    OFN1 = basename + "." + "lt" + str(MAX) + ext

    ltseqs = 0
    # with-blocks guarantee both files are closed, even on error.
    # The input is opened first so a missing input creates no output.
    with open(F, "r") as infile:
        with open(OFN1, "w") as outfile1:
            line = infile.readline()
            S = Seq()
            while line != "":
                if line[0] == ">":
                    # line acts as a "lookahead": Read() returns the line
                    # that follows the sequence it has just read.
                    S, line = S.Read(line, infile)
                    if S.Len < MAX:
                        S.Write(outfile1)
                        ltseqs += 1
                else:
                    line = infile.readline()

    print("Total sequences written to " + OFN1 + ": " + str(ltseqs))
    print(" ")


# --------------------------------------------
def WriteBetween(F, MIN, MAX):
    """
    Copy the sequences of fasta file F whose length lies between MIN
    and MAX, both ends inclusive (MIN <= length <= MAX).

    Output goes to <basename>.<MIN>-<MAX><ext>, where <basename> and
    <ext> come from os.path.splitext(F). All other sequences are
    dropped, and lines before the first ">" header are ignored.
    Prints the number of sequences written.
    """
    basename, ext = os.path.splitext(F)
    # The output name is derived from the MIN/MAX arguments themselves,
    # so the function does not depend on any module-level state.
    OFN1 = basename + "." + str(MIN) + "-" + str(MAX) + ext

    written = 0
    # with-blocks guarantee both files are closed, even on error.
    # The input is opened first so a missing input creates no output.
    with open(F, "r") as infile:
        with open(OFN1, "w") as outfile1:
            line = infile.readline()
            S = Seq()
            while line != "":
                if line[0] == ">":
                    # line acts as a "lookahead": Read() returns the line
                    # that follows the sequence it has just read.
                    S, line = S.Read(line, infile)
                    if MIN <= S.Len <= MAX:
                        S.Write(outfile1)
                        written += 1
                else:
                    line = infile.readline()

    print("Total sequences written to " + OFN1 + ": " + str(written))
    print(" ")


#========================    MAIN   =============================

# Run only when executed as a script, so that importing this module
# (e.g. to reuse Seq) does not parse sys.argv or touch any files.
if __name__ == "__main__":
    print("========== " + PROGRAM + " ==========")
    # Assigned at module scope, so P remains a module-level name and any
    # function that refers to it keeps working.
    P = Parameters()

    # With --filelist the input file holds one filename per line;
    # otherwise it is itself the only file to process.
    if P.FILELIST:
        FILES = ReadList(P.IFN)
    else:
        FILES = [P.IFN]

    for F in FILES:
        print("---------- " + F + " ----------")

        # We have four separate functions for writing output files, in part
        # because it saves on a lot of if-then checking. Also, there are
        # enough differences among the four tasks that it's less confusing
        # to have four functions.
        if P.TASK == "split":
            WriteSplit(F, P.TASKARGS[0])
        elif P.TASK == "gte":
            WriteGte(F, P.TASKARGS[0])
        elif P.TASK == "lt":
            WriteLt(F, P.TASKARGS[0])
        elif P.TASK == "between":
            WriteBetween(F, P.TASKARGS[0], P.TASKARGS[1])



