#!/usr/bin/env python
"""
Dr. Brian Fristensky, University of Manitoba

 Description: Convert Phylip file into other file formats

 Synopsis: phylcnv.py [-inf format] [-outf format] [-inv] [infile] [outfile]

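 Files: infile      input file, in the format named by -inf:
                    pint (Phylip interleaved, the default), pseq (Phylip
                    sequential), csv or tsv

        outfile     output file, written in the format named by -outf:
                    tsv (the default), csv, pint, pseq, fasta, flatdna,
                    flatpro or flattext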


@modified: Feb. 25, 2020
@author: Brian Fristensky
@contact: frist@cc.umanitoba.ca
"""

import os.path
import sys

# Location of the BIRCH Python library, taken from the BIRCHPYLIB environment
# variable. os.environ.get() returns None when the variable is unset, in which
# case None is what gets appended to sys.path below.
blib = os.environ.get("BIRCHPYLIB")
sys.path.append(blib)

# These imports must come after the sys.path change above, since birchlib is
# expected to be found through BIRCHPYLIB.
from birchlib import Birchmod
from birchlib import Argument

# Program name prefix and usage text handed to Birchmod.
PROGRAM = "phylcnv.py: "
USAGE = "\n\t USAGE: phylcnv.py [options] infile outfile"
# Shared Birchmod instance used throughout the file for argument handling,
# file-error reporting, usage output and program exit.
BM = Birchmod(PROGRAM, USAGE)

class Options:
    """
    Command line settings for phylcnv.py.

    Attributes set by __init__ (and possibly overwritten by read_args):
        Ifn        input file name (default "")
        Ofn        output file name (default "")
        InFormat   input format name (default "pint")
        OutFormat  output format name (default "tsv")
        Invert     whether -inv was given (default False)
    """

    def __init__(self):
        """
        Set every option to its default value, then call read_args() to
        fill in values from the command line.
        """
        self.Ifn = ""
        self.Ofn = ""
        self.InFormat = "pint"
        self.OutFormat = "tsv"
        self.Invert = False
        self.read_args()

    def read_args(self):
        """
        Declare the command line arguments as birchlib Argument objects and
        copy the values found into this object's attributes.

        If fetching a value raises ValueError, BM.printusage() is called.
        """
        # -inf <format>: optional string argument naming the input format.
        self.AInf = Argument("-inf", str, BM)
        self.AInf.set_optional()

        # -outf <format>: optional string argument naming the output format.
        self.AOutf = Argument("-outf", str, BM)
        self.AOutf.set_optional()

        # -inv: optional switch (takes no value).
        self.AInvert = Argument("-inv", str, BM)
        self.AInvert.set_is_switch()
        self.AInvert.set_optional()

        # Positional arguments; positions -2 and -1 presumably count from the
        # end of the command line (infile, then outfile) -- TODO confirm
        # against birchlib.
        Ainfile = Argument("", str, BM)
        Ainfile.set_position(-2)

        Aoutfile = Argument("", str, BM)
        Aoutfile.set_position(-1)


        try:
            # The defaults from __init__ are kept unless the flag was given.
            if (BM.arg_given("-inf")):
                self.InFormat = self.AInf.fetch()
            if (BM.arg_given("-outf")):
                self.OutFormat = self.AOutf.fetch()
            self.Invert = BM.arg_given("-inv")
            self.Ifn = Ainfile.fetch()
            self.Ofn = Aoutfile.fetch()
        except ValueError:
            BM.printusage()


class Sequence:
    """
    A single named sequence record.
    """

    def __init__(self):
        """
        Create an empty record: both the name and the sequence data
        start out as empty strings.
        """
        self.Name, self.Seq = "", ""

class SeqData:
    """
    Holds a list of sequences and the data describing them.
    """

    def __init__(self):
        """
        Start with no sequences and all counters at zero.
        """
        self.SeqLst = []  # list of sequence objects (each has .Name and .Seq)
        self.NumSeq = 0   # number of sequences
        self.SeqLen = 0   # length of sequences (all must be the same)
        self.NumEnz = 0   # number of rest. enzymes, primarily used by RESTML

    def AllSeqsSameLength(self):
        """
        Return True if every sequence in SeqLst has the same length,
        False otherwise. An empty list counts as "all the same length"
        rather than raising an IndexError.
        """
        lengths = set(len(entry.Seq) for entry in self.SeqLst)
        return len(lengths) <= 1

    def InvertSeq(self):
        """
        Change the sense of a molecular marker set by changing characters
        as follows: 0 --> 1, 1 --> 0, + --> -, and - --> +
        All other characters are left untouched.
        """
        # A per-character lookup swaps each pair in one pass. This needs no
        # placeholder characters, so data that already contains '!' or '_'
        # is not corrupted.
        swap = {"0": "1", "1": "0", "+": "-", "-": "+"}
        # Every sequence is processed, including the last one.
        for entry in self.SeqLst:
            entry.Seq = "".join([swap.get(ch, ch) for ch in entry.Seq])
        return


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def ReadPhylipInterleaved(O,S):
    """
    Read a Phylip interleaved file named by O.Ifn into S.

    The first line holds the number of sequences, the sequence length and,
    optionally, the number of enzymes (as used by the Phylip restml program).
    These are stored in S.NumSeq, S.SeqLen and S.NumEnz. One Sequence object
    per sequence is appended to S.SeqLst and filled in from the data lines.

    If the file cannot be opened, BM.file_error(O.Ifn) is called.
    """
    try:
        in_file = open(O.Ifn, 'r')
    except (IOError, OSError):
        BM.file_error(O.Ifn)
        # BM.file_error presumably does not return -- TODO confirm. If it
        # does, stop here instead of failing on an undefined file handle.
        return

    # The with-block guarantees the file is closed even if parsing fails.
    with in_file:
        # Header line: number of sequences, sequence length, and optionally
        # the number of enzymes.
        values = in_file.readline().split()
        S.NumSeq = int(values[0])
        S.SeqLen = int(values[1])
        if len(values) > 2 :
            S.NumEnz = int(values[2])

        # The header tells us precisely how many sequences there are, so the
        # blank sequence objects can be created up front.
        for _ in range(0, S.NumSeq) :
            S.SeqLst.append(Sequence())

        # If there are x sequences, the first group of x lines after the
        # header have the sequence names in the first 10 characters, followed
        # by sequence data. If the sequences are long, additional groups of x
        # lines may follow, each holding continuations of the x sequences.
        # FirstGroup is true while we are still in the first x lines.
        FirstGroup = True
        j = 0
        for line in in_file :
            # Blank lines (commonly used to separate groups) carry no data
            # and must not be mistaken for the end of the file.
            if not line.strip() :
                continue
            if FirstGroup :
                # Names occupy columns 1-10; the line is NOT stripped first,
                # so the fixed columns stay where they are.
                S.SeqLst[j].Name = line[0:10].rstrip()
                data = line[10:]
            else:
                data = line
            # Blanks inside sequence data are not part of the sequence.
            S.SeqLst[j].Seq = S.SeqLst[j].Seq + "".join(data.split())
            j = j + 1
            if j == S.NumSeq :
                FirstGroup = False
                j = 0
    return

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def ReadPhylipSequential(O,S):
    """
    Read a Phylip sequential file (as written by writePhylipSequential)
    named by O.Ifn into S.

    The first line holds the number of sequences, the sequence length and,
    optionally, the number of enzymes; these go into S.NumSeq, S.SeqLen and
    S.NumEnz. Each sequence is then a name line followed by one or more
    lines of sequence data.

    If the file cannot be opened, BM.file_error(O.Ifn) is called.
    """
    try:
        in_file = open(O.Ifn, 'r')
    except (IOError, OSError):
        BM.file_error(O.Ifn)
        # BM.file_error presumably does not return -- TODO confirm. If it
        # does, stop here instead of failing on an undefined file handle.
        return

    # The with-block guarantees the file is closed even if parsing fails.
    with in_file:
        # Header line: number of sequences, sequence length, and optionally
        # the number of enzymes.
        values = in_file.readline().split()
        S.NumSeq = int(values[0])
        S.SeqLen = int(values[1])
        if len(values) > 2 :
            S.NumEnz = int(values[2])

        # The header tells us precisely how many sequences there are, so the
        # blank sequence objects can be created up front.
        for _ in range(0, S.NumSeq) :
            S.SeqLst.append(Sequence())

        line = in_file.readline()
        for j in range(0, S.NumSeq) :
            # Skip blank lines in front of a name line so that an empty
            # separator line is not taken as a name.
            while line != "" and not line.strip() :
                line = in_file.readline()
            S.SeqLst[j].Name = line.strip()
            line = in_file.readline()

            # The only way to know that the end of a sequence has been
            # reached is that SeqLen characters of sequence have been read.
            # Pieces are collected in a list and joined once at the end.
            pieces = [S.SeqLst[j].Seq]
            SeqRead = len(S.SeqLst[j].Seq)
            while (line != "") and (SeqRead < S.SeqLen):
                # Blanks inside sequence data are not part of the sequence
                # and must not count toward SeqLen.
                chunk = "".join(line.split())
                pieces.append(chunk)
                SeqRead = SeqRead + len(chunk)
                line = in_file.readline()
            S.SeqLst[j].Seq = "".join(pieces)
    return

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def ReadCSV(O,S,Sep):
    """
    Read a comma-separated value or tab-separated value file named by
    O.Ifn into S. Sep is the separator string ("," or a tab).

    Each non-blank line becomes one Sequence: the text before the first
    separator is the name, and the remaining fields, with separators
    removed, are joined to form the sequence. S.NumSeq counts the
    sequences read and S.SeqLen is the length of the last one read.

    If the file cannot be opened, BM.file_error(O.Ifn) is called.
    """
    try:
        in_file = open(O.Ifn, 'r')
    except (IOError, OSError):
        BM.file_error(O.Ifn)
        # BM.file_error presumably does not return -- TODO confirm. If it
        # does, stop here instead of failing on an undefined file handle.
        return

    # The with-block guarantees the file is closed even if parsing fails.
    with in_file:
        S.NumSeq = 0
        for line in in_file :
            # Blank lines (e.g. a trailing empty line from a spreadsheet
            # export) hold no record. Reading one as a zero-length sequence
            # would make the equal-length check fail.
            if not line.strip() :
                continue
            # delete double quotes, which are sometimes added when
            # spreadsheets export a csv file
            line = line.replace('"','')
            name, _, fields = line.partition(Sep)
            entry = Sequence()
            # strip() also removes the newline that is left in the name
            # when the line contains no separator at all.
            entry.Name = name.strip()
            # delete separator characters
            entry.Seq = fields.replace(Sep,"").strip()
            S.SeqLst.append(entry)
            S.SeqLen = len(entry.Seq)
            S.NumSeq = S.NumSeq + 1
    return

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def writecsvfile(O,S,Sep):
    """
    Write the data to the file named by O.Ofn as separated values, one line
    per sequence: the name, then every character of the sequence as its own
    field, with Sep between fields.

    If the file cannot be opened, BM.file_error(O.Ofn) is called.
    """
    try:
        outfile = open(O.Ofn, 'w')
    except (IOError, OSError):
        BM.file_error(O.Ofn)
        # BM.file_error presumably does not return -- TODO confirm. If it
        # does, stop here instead of failing on an undefined file handle.
        return

    # The with-block closes (and so flushes) the file when writing is done.
    with outfile:
        for entry in S.SeqLst:
            outfile.write(Sep.join([entry.Name] + list(entry.Seq)) + '\n')

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def writePhylipInterleaved(O,S):
    """
    Write the data to the file named by O.Ofn in Phylip interleaved format.

    The first line holds S.NumSeq and S.SeqLen, plus S.NumEnz when it is not
    zero. The sequences follow in groups of S.NumSeq lines with up to 50
    characters of sequence per line; lines in the first group start with the
    name padded with blanks to 10 characters.

    If the file cannot be opened, BM.file_error(O.Ofn) is called.
    """
    try:
        outfile = open(O.Ofn, 'w')
    except (IOError, OSError):
        BM.file_error(O.Ofn)
        # BM.file_error presumably does not return -- TODO confirm. If it
        # does, stop here instead of failing on an undefined file handle.
        return

    LineLen = 50

    # The with-block closes (and so flushes) the file when writing is done.
    with outfile:
        # Write the number of sequences, length, and optional number of
        # enzymes, as specified for the Phylip restml program.
        header = str(S.NumSeq) + " " + str(S.SeqLen)
        if S.NumEnz != 0 :
            header = header + " " + str(S.NumEnz)
        outfile.write(header + '\n')

        # One pass of the outer loop per group of lines. Stepping with
        # range() keeps the loop well defined even when there are no
        # sequences to write.
        for Start in range(0, S.SeqLen, LineLen) :
            Finish = min(Start + LineLen, S.SeqLen)
            for i in range(0, S.NumSeq) :
                # Write out names on the first set of lines
                if Start == 0 :
                    outfile.write(S.SeqLst[i].Name.ljust(10," "))
                # Write a line of sequence data
                outfile.write(S.SeqLst[i].Seq[Start:Finish] + '\n')

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def writePhylipSequential(O,S):
    """
    Write the data to the file named by O.Ofn in Phylip sequential format,
    as this program reads it: for each sequence a name line, followed by the
    sequence in lines of up to 50 characters.

    The first line holds S.NumSeq and S.SeqLen, plus S.NumEnz when it is not
    zero.

    If the file cannot be opened, BM.file_error(O.Ofn) is called.
    """
    try:
        outfile = open(O.Ofn, 'w')
    except (IOError, OSError):
        BM.file_error(O.Ofn)
        # BM.file_error presumably does not return -- TODO confirm. If it
        # does, stop here instead of failing on an undefined file handle.
        return

    LineLen = 50

    # The with-block closes (and so flushes) the file when writing is done.
    with outfile:
        # Write the number of sequences, length, and optional number of
        # enzymes, as specified for the Phylip restml program.
        header = str(S.NumSeq) + " " + str(S.SeqLen)
        if S.NumEnz != 0 :
            header = header + " " + str(S.NumEnz)
        outfile.write(header + '\n')

        for i in range(0, S.NumSeq) :
            entry = S.SeqLst[i]

            # Write name, padded with blanks to 10 characters as Phylip
            # requires.
            outfile.write(entry.Name.ljust(10," ") + '\n')

            # Write the sequence, LineLen characters per line, never going
            # past S.SeqLen.
            for Start in range(0, S.SeqLen, LineLen) :
                Finish = min(Start + LineLen, S.SeqLen)
                outfile.write(entry.Seq[Start:Finish] + '\n')

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def writeFlat(O,S,FlagChar):
    """
    Write the data to the file named by O.Ofn in one of the flat file
    formats. Each sequence is a name line starting with FlagChar, followed
    by the sequence in lines of up to 50 characters.

    fasta - FlagChar = '>'
    flatdna - FlagChar = '#'
    flatpro - FlagChar = '%'
    flattext - FlagChar = '"'

    If the file cannot be opened, BM.file_error(O.Ofn) is called.
    """
    try:
        outfile = open(O.Ofn, 'w')
    except (IOError, OSError):
        BM.file_error(O.Ofn)
        # BM.file_error presumably does not return -- TODO confirm. If it
        # does, stop here instead of failing on an undefined file handle.
        return

    LineLen = 50

    # The with-block closes (and so flushes) the file when writing is done.
    with outfile:
        for i in range(0, S.NumSeq) :
            entry = S.SeqLst[i]

            # Write name
            outfile.write(FlagChar + entry.Name + '\n')

            # Write the sequence, LineLen characters per line, never going
            # past S.SeqLen.
            for Start in range(0, S.SeqLen, LineLen) :
                Finish = min(Start + LineLen, S.SeqLen)
                outfile.write(entry.Seq[Start:Finish] + '\n')


#======================== MAIN PROCEDURE ==========================

#---------- Program entry point
def main():
    """
    Called when not in documentation mode.

    Reads the command line options, reads the input file in the format
    given by -inf, optionally inverts the markers (-inv), and writes the
    result in the format given by -outf. Problems (missing input file,
    unsupported format, sequences of unequal length) are reported on
    standard output.

    The output file is only opened by the writer functions, after the input
    has been read completely. It is therefore left untouched when nothing
    can be written, and the input is not destroyed when infile and outfile
    are the same file.
    """
    O = Options()

    # Format name -> (function, extra arguments after O and S)
    Readers = {
        "pint": (ReadPhylipInterleaved, ()),
        "pseq": (ReadPhylipSequential, ()),
        "csv": (ReadCSV, (",",)),
        "tsv": (ReadCSV, ("\t",)),
    }
    Writers = {
        "csv": (writecsvfile, (',',)),
        "tsv": (writecsvfile, ('\t',)),
        "pint": (writePhylipInterleaved, ()),
        "pseq": (writePhylipSequential, ()),
        "fasta": (writeFlat, ('>',)),
        "flatdna": (writeFlat, ('#',)),
        "flatpro": (writeFlat, ('%',)),
        "flattext": (writeFlat, ('"',)),
    }

    if not os.path.exists(O.Ifn):
        print(">>> phylcnv.py: Input file " + O.Ifn + " does not exist.")
    elif O.InFormat not in Readers:
        print(">>> phylcnv.py: Input format " + O.InFormat + " is not supported.")
    else:
        S = SeqData()
        reader, extra = Readers[O.InFormat]
        reader(O, S, *extra)

        # Write the sequences to the output file
        if not S.AllSeqsSameLength():
            print('>>> phylcnv.py: Phylip files require that all sequences')
            print('>>> must be the same length.')
        else:
            if O.Invert:
                S.InvertSeq()
            if O.OutFormat in Writers:
                writer, extra = Writers[O.OutFormat]
                writer(O, S, *extra)
            else:
                print(">>> phylcnv.py: Output format " + O.OutFormat + " is not supported.")

    BM.exit_success()

# Run the conversion unless BM.documentor() returns a true value or "-test"
# appears among the command line arguments.
if not (BM.documentor() or "-test" in sys.argv):
    main()