#!/usr/bin/env python3

"""
 rebasecnv.py - Create subsets of REBASE based on types of cutting sites.

 Synopsis:
    rebasecnv.py --infile infile  --outfile outfile [-outfmt name|number ] [--prototypes] [--commercial] [--ends 5|3|b] [--symmetric a|s|b]

@modified: January 5, 2017
@author: Brian Fristensky
@contact: frist@cc.umanitoba.ca
"""

import os
import sys

#optparse is deprecated in favor of argparse as of Python 2.7. However,
# since 2.7 is not always present on many systems, at this writing,
# it is safer to stick with optparse for now. It should be easy
# to change later, since the syntax is very similar between argparse and optparse.
from optparse import OptionParser


blib = os.environ.get("BIRCHPYLIB")
sys.path.append(blib)

from birchlib import Birchmod

PROGRAM = "rebasecnv.py : "
USAGE = "\n\tUSAGE: rebasecnv.py --infile infile  --outfile outfile   [-outfmt name|number ] [--prototypes] [--commercial] [--ends 5|3|b] [--symmetry a|s]"


BM = Birchmod(PROGRAM, USAGE)
DEBUG = True

class Parameters:
    """
      	Wrapper class for command line parameters
      	
      	"""
    def __init__(self):
        """
     	  Initializes arguments:
     		IFN=""
     		OFN=""
                OUTFMT="bairoch"
     		PROTOTYPES=False
     		COMMERCIAL=False
                ENDS=["5","3","b"]
                SYMMETRY=["a","s"]
     		PID = str(os.getpid())
     	  Then calls read_args() to fill in their values from command line
          """
        self.IFN = ""
        self.OFN = ""
        self.OUTFMT="bairoch"
        self.PROTOTYPES=False
     	self.COMMERCIAL=False
        self.ENDS=["5","3","b"]
        self.SYMMETRY=["a","s"]
        self.PID = str(os.getpid())
        self.read_args()


    def read_args(self):
        """
        	Read command line arguments into a Parameter object

        	"""
        parser = OptionParser()
        parser.add_option("--infile", dest="infile", action="store", type="string", default="",
                          help="input file in Bairoch format")
        parser.add_option("--outfile", dest="outfile", action="store", type="string", default="",
                          help="output file")
        parser.add_option("--outfmt", dest="outfmt", action="store", type="string", default="bairoch",
                          help="name or number of output format")
        parser.add_option("--prototypes", dest="prototypes", action="store_true", default=False,
                          help="Write prototype enzymes only to output")
        parser.add_option("--commercial", dest="commercial", action="store_true", default=False,
                          help="Write commercially available enzymes only to output")
        parser.add_option("--ends", dest="ends", action="store", type="string", default="53b",
                          help="Type of ends")
        parser.add_option("--symmetry", dest="symmetry", action="store", type="string", default="as",
                          help="asymmetric or symmetric sites")

        (options, args) = parser.parse_args() 
        self.IFN = options.infile 
        self.OFN = options.outfile
        self.OUTFMT = options.outfmt
        self.PROTOTYPES = options.prototypes
        self.COMMERCIAL = options.commercial
        self.ENDS=list(options.ends) #break up string into a list of characters
        self.SYMMETRY=list(options.symmetry) #break up string into a list of characters

        if DEBUG :
            print('------------ Parameters from command line ------' + '\n')
            print('    IFN: ' + self.IFN)
            print('    OFN: ' + self.OFN)
            print('    OUTFMT: ' + self.OUTFMT)
            print('    PROTOTYPES: ' + str(self.PROTOTYPES))
            print('    COMMERCIAL: ' + str(self.COMMERCIAL))   
            print('    ENDS: ' + str(self.ENDS))
            print('    SYMMETRY: ' + str(self.SYMMETRY)) 

class Rsite :

    def __init__(self) :
        """
        Restriction site is a tuple with a recognition sequence and a cutting site.
        Asymmetric enzymes will have two Rsites, symmetric enzymes will have one.

        """
        self.Rseq = "" 
        self.Rcut = ""


class Enzyme :
 
    def __init__(self):
        """
        Initialize enzyme
          """
        self.LINES = []
        self.ID = ""
        self.PROTOTYPE = False
     	self.COMMERCIAL = False
        self.SITE = [] # List of Rsites. Asymmetric enzymes will have two Rsites,
                       # Symmetric enzymes will have one.
        self.ENDS="" # 5|3|b
        self.SYMMETRIC=True
    
    def readenz(self,INFILE,LINE,OUTFILE) :
        """
        Read the next enzyme from the file.
        """
        while (LINE != "") and (not LINE.startswith('ID')) :
            LINE=INFILE.readline()
        while not LINE.startswith('//') :
           self.LINES.append(LINE)
           LINE = INFILE.readline()

    def getID(self,ELINE) :
        """
        Get the ID from ELINE.
        """
        return ELINE[5:].strip()

    def getCOMMERCIAL(self,ELINE) :
        """
        Get COMMERCIAL from ELINE.
        """
        COM=ELINE[5:].strip()
        if COM == "." :
            return False
        else :
            return True

    def getPROTO(self,ELINE) :
        """
        Get PROTOTYPE from ELINE.
        """
        PID = ELINE[5:].strip()
        if PID == self.ID :
            return True

    def getRSlist(self,ELINE) :
        """
        Get a list of Restriction sites from ELINE.
        Symmetric cutters will have a single site eg. RS   GAATTC, 1;
        A asymmetric will have two eg. RS   GAYNNNNNVTC, 24; GABNNNNNRTC, -8;
        """
        RSlist = []
        RStr = ELINE[5:].strip()
        # remove terminal semicolon
        if RStr.endswith(';') :
            RStr = RStr[:-1]
        # parse the rest. seq and cutting site
        TOKENS1 = RStr.split(';')

        for T in TOKENS1 :
            S = Rsite ()
            TOKENS2 = T.split(',')
            S.Rseq = TOKENS2[0].strip()
            S.Rcut = TOKENS2[1].strip()
            RSlist.append(S)            
        return RSlist

    def isSymmetric(self) :
        """
        Test recognition sequence for symmetry. REBASE gives us a single recognition
        sequence for symmetric sequences, and two sequences for assymetric sites.
        """
        if len(self.SITE) == 1 :
            return True
        else :
            return False

    def EndType(self) :
        """
        Determine the type of fragment ends. Returns 5, 3 or b.
        """
        FragEnds = ""
        if self.SYMMETRIC :
            Plen = len(self.SITE[0].Rseq)
            #CutsAfter = Plen + int(self.SITE[0].Rcut)
            CutsAfter = int(self.SITE[0].Rcut)
            if CutsAfter < (Plen/2) :
                FragEnds = "5"
            elif CutsAfter > (Plen/2) :
                FragEnds = "3"
            else:
                FragEnds = "b"
        else :
            Cut = int(self.SITE[0].Rcut)
            CutOpp = int(self.SITE[1].Rcut)
            if Cut < CutOpp :
                FragEnds = "5"
            elif Cut > CutOpp :
                FragEnds = "3"
            else :                
                FragEnds = "b"
        return FragEnds

    def writeenz(self,OUTFILE) :
        """
        Write enzyme to output file.
        """
        OUTFILE.write('\n')
        for LINE in self.LINES :
             OUTFILE.write(LINE)
        OUTFILE.write('//' + '\n')

           
#======================== MAIN PROCEDURE ==========================
def main():
    """
        Called when not in documentation mode.
        """
	
    P = Parameters ()

    
    INFILE=open(P.IFN,'r')
    OUTFILE=open(P.OFN,'w')

    #Copy header comment lines from infile to outfile
    LINE = INFILE.readline()
    while LINE.startswith('CC') :
        OUTFILE.write(LINE)
        LINE = INFILE.readline()
    
    # Read an enzyme at a time. For each enzyme, find out whether it
    # meets criteria for prototypes, commercial, symmetry and ends. 
    # If so, print the enzyme to the output.
    E = Enzyme ()
    while LINE != "" :
        E.readenz(INFILE,LINE,OUTFILE)
        #OUTFILE.write(LINE)

        # Get criteria information from the enzyme
        OKAY = True
        for ELINE in E.LINES :
            FIELD=ELINE[0:2]
            if FIELD == 'ID' : # id
                E.ID = E.getID(ELINE)
            elif FIELD == 'PT' :  # prototype
                E.PROTOTYPE = E.getPROTO(ELINE)
            elif FIELD == 'RS' :  #restriction site
                E.SITE = E.getRSlist(ELINE)
            elif FIELD == 'CR' : #commercial availability
                E.COMMERCIAL = E.getCOMMERCIAL(ELINE)
        # Ignore all enzymes that have unknown recognition sequences or cutting sites
        for S in E.SITE :
            if S.Rseq == "?" or S.Rcut == "?" :
                OKAY = False

        if OKAY :
            E.SYMMETRIC = E.isSymmetric()
            E.ENDS = E.EndType()

            # OKAY is False if any of the criteria from enzyme E match fail to match
            # criteria from command line parameters P       
            if P.PROTOTYPES and not E.PROTOTYPE :
                OKAY = False  
            if P.COMMERCIAL and not E.COMMERCIAL :
                OKAY = False 
            if P.SYMMETRY == ["s"] and not E.SYMMETRIC :
                OKAY = False
            if P.SYMMETRY == ["a"] and E.SYMMETRIC :
                OKAY = False
            if not E.ENDS in P.ENDS  :
                OKAY = False

        # If enzyme meets all criteria, print it to output
        if OKAY :
            if DEBUG: 
                print(E.ID)
                for S in E.SITE :
                    print(S.Rseq + ' ' + S.Rcut)
            E.writeenz(OUTFILE)        

        # Get the next enzyme
        LINE = INFILE.readline()
        E.__init__()

    

    INFILE.close()
    OUTFILE.close()

    BM.exit_success()

if (BM.documentor() or "-test" in sys.argv):
    pass
else:
    main()
