#!/usr/local/bin/python
# January 23, 2005, Dr. Brian Fristensky, University of Manitoba

# Description: Given a file of GI numbers, return sequence lengths 

# Synopsis: gi2size.py infile outfile

# Files: infile      GDE flat file, with name on line 1, followed by
#                    comma-separated list of GI numbers
#        outfile     GDE flat file, with name on line 1, followed by
#                    comma-separated list of integers 

import sys
import string
import os

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class FILE :
    """Wrapper class for files.

    The file is opened with open(FILENAME, MODE) when the object is
    created; closing it is left to the caller (via self.F.close()).

    Attributes:
        FN   -- the file name given to the constructor
        F    -- the open file object
        LINE -- most recent line read; starts as "" and is only ever
                updated by the code using the wrapper, not by this class
    """

    def __init__(self,FILENAME,MODE) :
        # Indentation uses spaces only: mixing tabs and spaces is rejected
        # by Python 3 and by "python -tt" under Python 2.
        self.FN = FILENAME
        self.F = open(FILENAME,MODE)
        self.LINE = "" # most recent line read

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class IDLST :
    """Wrapper class for ID lists.

    Attributes:
        NAME -- name of the list; "" until a name has been assigned
        LST  -- list of items; a fresh empty list for every instance
    """

    def __init__(self) :
        # Indentation uses spaces only: mixing tabs and spaces is rejected
        # by Python 3 and by "python -tt" under Python 2.
        self.NAME = ""
        self.LST = []
	  	  
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
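# Read the next named GI list from a GDE flat file: find the name line,
# then join the wrapped data lines (stripped of leading and trailing
# whitespace, including newline characters) and split them on commas.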
def GETGDELIST(INFILE,NAMEFLAG,GILST) :
    """Read the next named list from a GDE flat file into GILST.

    INFILE   -- object with an open file in INFILE.F and the most recently
                read, not yet consumed line in INFILE.LINE
    NAMEFLAG -- single character that marks the start of a name line
    GILST    -- object whose NAME and LST attributes are filled in

    Lines are skipped until one whose first non-blank character is
    NAMEFLAG; the rest of that line becomes GILST.NAME.  The following
    lines, up to the next name line or end of file, are stripped,
    concatenated and split on commas into GILST.LST.  Blank lines are
    ignored.  On return INFILE.LINE holds the next name line (left there
    for the following call) or "" at end of file.  If no name is
    available, GILST.LST is set to [].
    """

    # Read name line
    while INFILE.LINE != "" and GILST.NAME == "" :
        STRIPPED = INFILE.LINE.strip()
        if len(STRIPPED) > 0 and STRIPPED[0] == NAMEFLAG :
            GILST.NAME = STRIPPED[1:]
        INFILE.LINE = INFILE.F.readline()

    # Read GI list
    GILST.LST = []
    if GILST.NAME != "" :
        # GDE wraps the flat file with newlines every 60 characters, so
        # the wrapped lines are glued back into one long string before
        # the string is parsed.
        PIECES = []
        while INFILE.LINE != "" :
            TMPLINE = INFILE.LINE.strip()
            if len(TMPLINE) > 0 and TMPLINE[0] == NAMEFLAG :
                # Start of the next list: stop WITHOUT consuming the line,
                # so the next call can pick the name up from INFILE.LINE.
                break
            if len(TMPLINE) > 0 :
                PIECES.append(TMPLINE)
            # Advance on every non-name line, blank ones included;
            # otherwise a blank line would make this loop spin forever.
            INFILE.LINE = INFILE.F.readline()
        BIGLINE = "".join(PIECES)

        # Parse the string as a comma separated list.  An empty string is
        # an empty list, not a list holding one empty token.
        if BIGLINE != "" :
            GILST.LST = BIGLINE.split(',')

    return

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Convert GILST (a list of GI numbers) into a list of sequence lengths
def CONVERT(GILST) :
    """Return an IDLST holding one result string per entry of GILST.LST.

    GILST -- object with a NAME string and a LST of GI tokens

    The result's NAME is GILST.NAME truncated at the first '.gi' (if any)
    with '.len' appended; the name is also printed to standard output.
    For every token the external command

        leash -mn SHoundSequenceLength -mpi <token> -of <tempfile>

    is run, and the stripped first line of the temporary file is appended
    to the result's LST.  A token that is empty, or for which leash could
    not be run or wrote nothing, contributes "" so that positions in the
    output stay aligned with the input.  The temporary file
    'gi2size.<pid>' lives in the current directory and is removed before
    returning, also on error.
    """
    # Local import so this function does not depend on the file header.
    import subprocess

    # Create a temporary filename
    TFN = 'gi2size.' + str(os.getpid())

    SIZELST = IDLST()
    # If name has .gi extension, replace with .len extension.
    # Actually, we truncate at the first occurrence of .gi, because
    # the linux-intel version of GDE mysteriously appends a parenthetical
    # expression to the name, e.g. if the name is temp.gi, the name is
    # written to the flat files as something like temp.gi(27).
    EXT = GILST.NAME.find('.gi')
    if EXT > -1 :
        SIZELST.NAME = GILST.NAME[:EXT] + '.len'
    else :
        SIZELST.NAME = GILST.NAME + '.len'

    # Parenthesised single argument: same output under Python 2 and 3.
    print(SIZELST.NAME)

    try :
        for GI in GILST.LST :
            # Stripping mirrors what shell word-splitting used to do.
            MPI = str(GI).strip()
            if MPI == "" :
                # Nothing to look up; keep the slot so output stays aligned.
                SIZELST.LST.append("")
                continue

            # Start every query from an empty file, so a failed leash run
            # yields "" instead of silently repeating the previous answer.
            open(TFN,'w').close()

            # Argument list instead of a shell string: tokens read from the
            # input file can no longer be interpreted by a shell.
            try :
                subprocess.call(['leash','-mn','SHoundSequenceLength',
                                 '-mpi',MPI,'-of',TFN])
            except OSError :
                # leash missing or not executable: report and carry on,
                # as the shell-based version did.
                sys.stderr.write('gi2size.py: could not run leash for '
                                 + MPI + '\n')

            # Read in temp file, which is a single line
            ID = ""
            if os.path.exists(TFN) :
                TEMPFILE = open(TFN,'r')
                try :
                    ID = TEMPFILE.readline().strip()
                finally :
                    TEMPFILE.close()
            SIZELST.LST.append(ID)
    finally :
        # Always clean up, whether or not the loop completed.
        if os.path.exists(TFN) :
            os.remove(TFN)

    return SIZELST

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Write the list as a single line of comma-separated values
def WRITEFLATFILE(F,NAMEFLAG,SIZELST) :
    """Write SIZELST to F as one GDE flat-file record.

    F        -- object with a write() method (an open output file)
    NAMEFLAG -- character written in front of the name
    SIZELST  -- object with a NAME string and a LST of strings

    The name line is NAMEFLAG + name + newline.  If SIZELST.NAME is
    empty, the module-level output file name OFN is used as the name
    instead.  A non-empty list is written as a single line of
    comma-separated values; for an empty list no list line is written.
    """
    # Write name
    if SIZELST.NAME != "" :
        F.write(NAMEFLAG + SIZELST.NAME + '\n')
    else :
        # OFN is a global set by the main procedure of this script.
        F.write(NAMEFLAG + OFN + '\n')

    # Write list
    if len(SIZELST.LST) > 0 :
        F.write(",".join(SIZELST.LST) + '\n')
	      
  
#======================== MAIN PROCEDURE ==========================

#---------- Set global variables
# Both file names are required; fail with a usage message rather than an
# IndexError traceback when they are missing.
if len(sys.argv) < 3 :
    sys.stderr.write("Usage: gi2size.py infile outfile\n")
    sys.exit(1)

IFN = sys.argv[1]   # input GDE flat file of GI numbers
OFN = sys.argv[2]   # output GDE flat file; also read by WRITEFLATFILE
NAMEFLAG = '"'  # 1st character on the name line, indicating
                # the beginning of the next data list

INFILE = FILE(IFN,'r')
try :
    OUTFILE = FILE(OFN,'w')
    try :
        # LINE contains the most recently-read line
        INFILE.LINE = INFILE.F.readline()

        while INFILE.LINE != "" :

            # Read in the next list from the GDE flat file
            GILST = IDLST()
            GETGDELIST(INFILE,NAMEFLAG,GILST)

            # No name means end of file was reached without finding
            # another list, so there is nothing left to convert.
            if GILST.NAME == "" :
                break

            # For each GI number, find the length of the
            # corresponding sequence.
            SIZELST = CONVERT(GILST)

            # Write the list
            WRITEFLATFILE(OUTFILE.F,NAMEFLAG,SIZELST)
    finally :
        # Close the output even if a conversion fails part-way through.
        OUTFILE.F.close()
finally :
    INFILE.F.close()



