#!/usr/bin/env python3
"""
 uniqid.py - Read a source file and replace each definition line with a unique
             identifier. Store the unique ID and original definition line
             in a .csv file as a key-value pair.

 Synopsis:
   uniqid.py [options]  --encode sourcein sourceout csvout
   uniqid.py [options]  --encodesame textin textout csvin
   uniqid.py [options]  --decode textin textout csvin

          options begin with a dash; filenames do not

        --encode (default)   
                            The first three filenames on the command line
                            are read as sourcein, the original source file;
                            sourceout, the sourcefile sequences in which the
                            description line is replaced with a unique ID;
                            and csvout, a comma-separated value file containing
                            the unique identifier and the corresponding
                            definition line

        --encodesame        Encode another file, substituting in the same random
                            names from a previous run using --encode. This makes
                            it possible to encode two or more files using the 
                            same random names, so that all output files generated
                            can be decoded with a single csv file. 

                            The first three filenames on the command line
                            are read as sourcein, the original source file;
                            sourceout, the sourcefile in which the
                            description line is replaced with a unique ID
                            generated previously by --encode;
                            and csvin, a comma-separated value file containing
                            the unique identifier and the corresponding
                            definition line

        --decode            The first three filenames on the command line
                            are read as textin, any text file containing
                            unique IDs generated from a previous run using
                            --encode; textout, the output file in which each
                            unique ID is replaced by the original name, or
                            the name plus parts of the definition line; and
                            csvin, the csv file generated by a previous run
                            using --encode.

        -f list_of_fields   similar to -f in the Unix cut
                            command. A comma-separated list of fields to be
                            written to textout when decoding files.

        -s separator        separator is a character to use as the separator
                            when parsing a definition line into fields.
                            default = " ", a blank space

        -nf string          string is one or more characters to begin the
                            unique identifier, with which the definition
                            line is replaced. (default '!%')

 Idea for more general version of program:
   An option lets you input a regular expression that is used for
   finding the original ID, rather than just hardwiring fasta format
   into the program. The program will still default to search for fasta
   sequence names, but by employing regular expressions, uniqid.py
   can perform substitutions in ANY type of file. Probably not hard
   to implement, either.

@modified: August 15 2022
@author: Brian Fristensky, Dale Hamel
@contact: brian.fristensky@umanitoba.ca
"""

import argparse
import bisect
import math
import operator
import os
import random
import re
import string
import sys

# Make the BIRCH Python library importable when BIRCHPYLIB is set.
# os.environ.get() returns None when the variable is missing; sys.path
# entries are expected to be strings, so None must not be appended.
blib = os.environ.get("BIRCHPYLIB")
if blib is not None:
    sys.path.append(blib)

# Program name prefix and the usage message printed by main() when no
# valid task was selected.
PROGRAM = "uniqid.py: "
USAGE = "\n\t USAGE: uniqid.py [options]  --encode sourcein sourceout csvout\n\t\tuniqid.py [options]  --decode textin textout csvin"

# Set to True to print the parsed parameters and each line before/after
# substitution.
DEBUG = False


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class Parameters:
    """
    Wrapper class for command line parameters.

    Creating an instance parses sys.argv (see read_args) and stores the
    results in upper-case attributes, which main() passes on to the
    function that carries out the selected task.
    """

    def __init__(self):
        """
        Initializes arguments:
                FIELDS=[1]
                SEP=" " (a blank space)
                NAMEFLAG="!%"
                ACTION=""
                TASK=""
                SOURCEIN=""
                SOURCEOUT=""
                CSVIN=""
                TEXTIN=""
                TEXTOUT=""
                CSVOUT=""
                CSVSEP="\\t" (a tab character)
        Then calls read_args() to fill in their values from command line
        """

        self.FIELDS = [1] # list of fields to parse with f option
        self.SEP = " " # separator for parsing fields from the def. line
        self.NAMEFLAG = "!%" # all IDs begin with this string
        self.ACTION = "" # never read in this file; kept for compatibility
        self.TASK = "" # "encode", "decode" or "encodesame"
        self.SOURCEIN = ""
        self.SOURCEOUT = ""
        self.CSVIN = ""
        self.TEXTIN = ""
        self.TEXTOUT = ""
        self.CSVOUT = ""
        self.CSVSEP = "\t"
        self.read_args()

    def unquote(self, S):
        """
        Remove one pair of matching leading and trailing quotes from a string.
        Quote characters inside the string, and unpaired quotes, are kept.
        @param S: The string to clean up
        @type S: str
        @return: S without its surrounding quotes
        @rtype: str
        """
        if len(S) >= 2 and S[0] == S[-1] and S[0] in ('"', "'"):
            return S[1:-1]
        return S

    def read_args(self):
        """
        Read command line arguments into this Parameters object.

        argparse prints its own usage message and exits when the command
        line is malformed; the same happens when -f is not a list of
        integers.
        """
        parser = argparse.ArgumentParser()

        parser.add_argument("-f", action="store", default="", help="comma separated list of field numbers")
        parser.add_argument("-nf", action="store", default="", help="prefix for randomized IDs")
        parser.add_argument("-s", action="store", default="", help="field separator for csv files")

        task = parser.add_mutually_exclusive_group()
        task.add_argument("--encode", action="store_true")
        task.add_argument("--decode", action="store_true")
        task.add_argument("--encodesame", action="store_true")

        parser.add_argument("infile", action="store", default="", help="input file")
        parser.add_argument("outfile", action="store", default="", help="output file")
        parser.add_argument("csvfile", action="store", default="", help="csv file")

        args = parser.parse_args()

        if args.f != "":
            # Validate here so that a bad -f is reported before any file
            # is opened, and so FIELDS holds ints like the default [1].
            try:
                self.FIELDS = [int(F) for F in args.f.split(",")]
            except ValueError:
                parser.error("-f expects a comma separated list of integers, e.g. -f 1,3")
        if args.nf != "":
            self.NAMEFLAG = self.unquote(args.nf)
        if args.s != "":
            self.SEP = self.unquote(args.s)

        if args.decode:
            self.TASK = "decode"
            self.TEXTIN = args.infile
            self.TEXTOUT = args.outfile
            self.CSVIN = args.csvfile
        elif args.encodesame:
            self.TASK = "encodesame"
            self.TEXTIN = args.infile
            self.TEXTOUT = args.outfile
            self.CSVIN = args.csvfile
        else:
            # --encode was given, or no task flag at all: encode is the
            # documented default task.
            self.TASK = "encode"
            self.SOURCEIN = args.infile
            self.SOURCEOUT = args.outfile
            self.CSVOUT = args.csvfile

        if DEBUG:
            print("FIELDS: " + str(self.FIELDS))
            print("SEP: " + self.SEP)
            print("NAMEFLAG: " + self.NAMEFLAG)
            print("TASK: " + self.TASK)
            print("SOURCEIN: " + self.SOURCEIN)
            print("SOURCEOUT: " + self.SOURCEOUT)
            print("CSVOUT: " + self.CSVOUT)
            print("TEXTIN: " + self.TEXTIN)
            print("TEXTOUT: " + self.TEXTOUT)
            print("CSVIN: " + self.CSVIN)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def EncodeNames(NAMEFLAG, SOURCEIN, SOURCEOUT, CSVOUT, CSVSEP):
    """
    For each sequence in sourcein, write the sequence to sourceout
    replacing the definition line with a unique identifier.
    Write the unique identifier and the definition line to csvout.

    Every input line is stripped of surrounding whitespace and blank
    lines are dropped. A line starting with '>' is a definition line;
    all other lines are copied to sourceout.

    @param NAMEFLAG: The string with which every unique ID begins
    @type NAMEFLAG: str
    @param SOURCEIN: The name of the source file to read
    @type SOURCEIN: str
    @param SOURCEOUT: The name of the encoded file to write
    @type SOURCEOUT: str
    @param CSVOUT: The name of the ID/definition-line file to write
    @type CSVOUT: str
    @param CSVSEP: The separator written between ID and definition line
    @type CSVSEP: str
    @raise OSError: if a file cannot be opened. When it is SOURCEIN, a
                    message is printed first and no output file is created.
    """

    # IDs handed out so far. A set gives constant-time membership tests,
    # which a list scan does not.
    USED = set()

    def GetUniqID():
        """
        Draw random numbers until one turns up that has not been
        used yet, record it in USED and return it as a string.
        """
        while True:
            ID = str(random.randint(1, 9999999))
            if ID not in USED:
                USED.add(ID)
                return ID

    # Open the input first, so that a missing source file does not leave
    # empty output files behind.
    try:
        FIN = open(SOURCEIN, 'r')
    except OSError:
        print("uniqid.py: Cannot open file: " + SOURCEIN)
        raise

    with FIN, open(SOURCEOUT, 'w') as FOUT, open(CSVOUT, 'w') as COUT:
        for LINE in FIN:
            LINE = LINE.strip()
            if not LINE:
                continue
            if LINE[0] == '>': #new sequence
                UNIQNAME = NAMEFLAG + GetUniqID()
                FOUT.write('>' + UNIQNAME + '\n')
                # > is not considered as part of the definition line
                # so we drop the first char. of LINE
                COUT.write(UNIQNAME + CSVSEP + LINE[1:] + '\n')
            else: #copy the line to output file
                FOUT.write(LINE + '\n')

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def EncodeSame(NAMEFLAG, TEXTIN, TEXTOUT, CSVIN, CSVSEP):
    """
    Copy textin to textout, replacing every occurrence of a definition
    line listed in csvin with the unique identifier stored for it.

    Each input line is stripped of surrounding whitespace before the
    substitution.

    @param NAMEFLAG: Not used by this function; kept so that the three
                     task functions take parallel arguments
    @type NAMEFLAG: str
    @param TEXTIN: The name of the file to encode
    @type TEXTIN: str
    @param TEXTOUT: The name of the encoded file to write
    @type TEXTOUT: str
    @param CSVIN: The name of the file holding ID/definition-line pairs
    @type CSVIN: str
    @param CSVSEP: The separator between ID and definition line in CSVIN
    @type CSVSEP: str
    @raise OSError: if a file cannot be opened. When it is TEXTIN, a
                    message is printed first and TEXTOUT is not created.
    """

    def ReadDefLines(CSVIN):
        """
        Read unique ids and corresponding definition lines
        from CSV into a dictionary keyed by definition line.
        Lines without a separator or without a definition line are skipped.
        @param CSVIN: The name of the file containing the CSV data
        @type CSVIN: str
        """
        DICT = {}
        with open(CSVIN, 'r') as CIN:
            for LINE in CIN:
                # Split on the first separator only: the definition line
                # itself may contain the separator.
                ID, FOUND, DEFLINE = LINE.rstrip('\r\n').partition(CSVSEP)
                if FOUND and DEFLINE:
                    DICT[DEFLINE] = ID
        return DICT

    try:
        TIN = open(TEXTIN, 'r')
    except OSError:
        print("uniqid.py: Cannot open file: " + TEXTIN)
        raise

    with TIN:
        # Read in key-value pairs of definition lines and unique IDs.
        # This happens before TEXTOUT is opened, so a problem with the
        # csv file does not leave an empty output file behind.
        IDDICT = ReadDefLines(CSVIN)

        # One pattern matching any definition line. Longest first, so a
        # definition line that is contained in a longer one (seq1 / seq10)
        # cannot clobber the longer one; a single pass, so text that has
        # already been substituted is never substituted again.
        PATTERN = None
        if IDDICT:
            KEYLIST = sorted(IDDICT, key=len, reverse=True)
            PATTERN = re.compile("|".join(re.escape(K) for K in KEYLIST))

        with open(TEXTOUT, 'w') as TOUT:
            # for each line in the file, replace each definition line
            # with its unique ID
            for LINE in TIN:
                OUTPUTLINE = LINE.strip()
                if DEBUG:
                    print('Original: ' + OUTPUTLINE)
                if PATTERN is not None:
                    OUTPUTLINE = PATTERN.sub(lambda M: IDDICT[M.group(0)], OUTPUTLINE)
                if DEBUG:
                    print('Modified: ' + OUTPUTLINE)
                TOUT.write(OUTPUTLINE + '\n')

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def DecodeNames(SEP, NF, FIELDS, TEXTIN, TEXTOUT, CSVIN, CSVSEP="\t"):
    """
    Copy textin to textout, replacing every unique ID listed in csvin
    with the selected fields of its original definition line.

    Each input line is stripped of surrounding whitespace. Only lines
    that contain NF are searched for IDs.

    @param SEP: The separator used to split a definition line into fields
    @type SEP: str
    @param NF: The string with which every unique ID begins
    @type NF: str
    @param FIELDS: 1-based numbers of the fields to write (ints or
                   numeric strings)
    @type FIELDS: list
    @param TEXTIN: The name of the file to decode
    @type TEXTIN: str
    @param TEXTOUT: The name of the decoded file to write
    @type TEXTOUT: str
    @param CSVIN: The name of the file holding ID/definition-line pairs
    @type CSVIN: str
    @param CSVSEP: The separator between ID and definition line in CSVIN
    @type CSVSEP: str
    @raise OSError: if a file cannot be opened. When it is TEXTIN, a
                    message is printed first and TEXTOUT is not created.
    """

    def ReadDefLines(CSVIN):
        """
        Read unique ids and corresponding definition lines
        from CSV into a dictionary keyed by unique id.
        Lines without a separator or without an id are skipped.
        @param CSVIN: The name of the file containing the CSV data
        @type CSVIN: str
        """
        DICT = {}
        with open(CSVIN, 'r') as CIN:
            for LINE in CIN:
                # Split on the first separator only: the definition line
                # itself may contain the separator.
                ID, FOUND, DEFLINE = LINE.rstrip('\r\n').partition(CSVSEP)
                if FOUND and ID:
                    DICT[ID] = DEFLINE
        return DICT

    def GetFields(DEF, FIELDS, SEP):
        """
        Get one or more fields from a definition line, using
        SEP as the field separator. Field numbers that the definition
        line does not have are skipped.
        @param DEF: The definition line
        @type DEF: str
        @param FIELDS: A list of fields
        @type FIELDS: list
        @param SEP: The string to split on
        @type SEP: str
        """
        TOKENS = DEF.split(SEP)
        WANTED = []
        # Walk the requested fields, not the tokens: the number of fields
        # asked for is unrelated to the number of tokens in the line.
        for F in FIELDS:
            I = int(F)
            if 1 <= I <= len(TOKENS):
                WANTED.append(TOKENS[I - 1])
        return SEP.join(WANTED)

    try:
        TIN = open(TEXTIN, 'r')
    except OSError:
        print("uniqid.py: Cannot open file: " + TEXTIN)
        raise

    with TIN:
        # Read in key-value pairs of unique IDs and definition lines.
        # This happens before TEXTOUT is opened, so a problem with the
        # csv file does not leave an empty output file behind.
        IDDICT = ReadDefLines(CSVIN)

        # The text that replaces an ID does not depend on the input line,
        # so work it out once per ID rather than once per ID per line.
        NEWTEXT = {K: GetFields(IDDICT[K], FIELDS, SEP) for K in IDDICT}

        # One pattern matching any ID. Longest first, so an ID that is the
        # beginning of a longer ID cannot clobber the longer one; a single
        # pass, so text that has already been substituted is never
        # substituted again.
        PATTERN = None
        if NEWTEXT:
            KEYLIST = sorted(NEWTEXT, key=len, reverse=True)
            PATTERN = re.compile("|".join(re.escape(K) for K in KEYLIST))

        with open(TEXTOUT, 'w') as TOUT:
            # for each line in the file, replace the ID with the original
            # definition line
            for LINE in TIN:
                OUTPUTLINE = LINE.strip()
                if NF in OUTPUTLINE:
                    if DEBUG:
                        print('Original: ' + OUTPUTLINE)
                    if PATTERN is not None:
                        OUTPUTLINE = PATTERN.sub(lambda M: NEWTEXT[M.group(0)], OUTPUTLINE)
                    if DEBUG:
                        print('Modified: ' + OUTPUTLINE)
                TOUT.write(OUTPUTLINE + '\n')


#======================== MAIN PROCEDURE ==========================
def main():
    """
    Parse the command line and run the task that was selected;
    print the usage message when the task is not a known one.
    """
    PARAMS = Parameters ()
    SELECTED = PARAMS.TASK

    if SELECTED == "encode":
        EncodeNames(PARAMS.NAMEFLAG, PARAMS.SOURCEIN, PARAMS.SOURCEOUT,
                    PARAMS.CSVOUT, PARAMS.CSVSEP)
        return

    if SELECTED == "decode":
        DecodeNames(PARAMS.SEP, PARAMS.NAMEFLAG, PARAMS.FIELDS,
                    PARAMS.TEXTIN, PARAMS.TEXTOUT, PARAMS.CSVIN)
        return

    if SELECTED == "encodesame":
        EncodeSame(PARAMS.NAMEFLAG, PARAMS.TEXTIN, PARAMS.TEXTOUT,
                   PARAMS.CSVIN, PARAMS.CSVSEP)
        return

    # No recognised task
    print(USAGE)

# Run the program unless "-test" appears anywhere on the command line.
if "-test" not in sys.argv:
    main()