#!/usr/local/bin/python
# September 20, 2004, Dr. Brian Fristensky, University of Manitoba

# Description: Given a file of numbers, return basic statistics
#     including count, sum, mean, median, and standard deviation 

# Synopsis: rawstats.py infile outfile [-csv]

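# Files: infile      GDE flat file: for each dataset, a name line beginning
#                    with '"', followed by a comma-separated list of integers
#        outfile     for each dataset, a name line followed by a line of
#                    comma-separated statistics: count,sum,mean,median,stdev
#                    (with -csv as the third argument, the name and the
#                    statistics are written on a single line)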

import sys
import string
import os
import math

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class FILE:
    """Wrapper class for files.

    Bundles an open file object with the bookkeeping the rest of the
    script keeps alongside it.

    Attributes:
        FN    -- the file name passed to the constructor
        F     -- the file object returned by open(FILENAME, MODE)
        LINE  -- the most recent line read; starts as "" and is updated
                 by the code that reads from F
        TYPE  -- output format tag; starts as "flat"
    """

    def __init__(self, FILENAME, MODE):
        """Open FILENAME in the given MODE and initialise the bookkeeping.

        Raises whatever open() raises if the file cannot be opened.
        """
        self.FN = FILENAME
        self.F = open(FILENAME, MODE)
        self.LINE = ""  # most recent line read
        self.TYPE = "flat"

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class IDLST:
    """Wrapper class for ID lists.

    Holds one named dataset together with the statistics computed
    from it.  Every instance starts out empty, with all statistics 0.
    """

    def __init__(self):
        self.NAME = ""    # dataset name
        self.LST = []     # the values of the dataset (a fresh list per instance)
        self.COUNT = 0    # number of values
        self.SUM = 0      # sum of the values
        self.MEAN = 0     # arithmetic mean
        self.MEDIAN = 0   # median
        self.SD = 0       # standard deviation
	  	  
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
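# Read the next dataset from a GDE flat file: the name line (which
# begins with NAMEFLAG) and the comma-separated list that follows it.
# Leading and trailing whitespace, including newline characters,
# is stripped from every line.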
def GETGDELIST(INFILE, NAMEFLAG, DATASET):
    """Read the next named list from INFILE into DATASET.

    INFILE   -- object with attributes F (an open file positioned just
                after the line held in LINE) and LINE (the most recently
                read line, "" at end of file)
    NAMEFLAG -- the character that begins a name line
    DATASET  -- object whose NAME and LST attributes are filled in;
                NAME must be "" on entry for a name line to be looked for

    On return DATASET.NAME holds the text after NAMEFLAG on the name
    line (or is still "" if end of file was reached first) and
    DATASET.LST holds the non-empty comma-separated tokens, as strings.
    INFILE.LINE is left on the name line of the following dataset, or
    is "" at end of file, so the function can be called repeatedly.
    """

    # Read name line: skip ahead until a line starting with NAMEFLAG
    # carries a non-empty name, or the file runs out.
    while INFILE.LINE != "" and DATASET.NAME == "":
        INFILE.LINE = INFILE.LINE.strip()
        if len(INFILE.LINE) > 0 and INFILE.LINE[0] == NAMEFLAG:
            DATASET.NAME = INFILE.LINE[1:]
        INFILE.LINE = INFILE.F.readline()

    # Read the list
    DATASET.LST = []
    if DATASET.NAME != "":
        # GDE wraps the flat file with newlines every 60 characters,
        # so the pieces are glued back together, without a separator,
        # into a single long string called BIGLINE.
        PIECES = []
        while INFILE.LINE != "":
            TMPLINE = INFILE.LINE.strip()
            if len(TMPLINE) > 0:
                if TMPLINE[0] == NAMEFLAG:
                    # Start of the next dataset: leave its name line in
                    # INFILE.LINE for the next call.
                    break
                PIECES.append(TMPLINE)
            # Always advance, blank lines included; otherwise a blank
            # line inside the data would be re-examined forever.
            INFILE.LINE = INFILE.F.readline()
        BIGLINE = "".join(PIECES)

        # Parse the string as a comma separated list.  Empty tokens
        # (no data at all, doubled or trailing commas) are dropped.
        for TOKEN in BIGLINE.split(','):
            TOKEN = TOKEN.strip()
            if TOKEN != "":
                DATASET.LST.append(TOKEN)

    return

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Calculate basic statistics on the list
def DOSTATS(DATASET):
    """Compute count, sum, mean, median and standard deviation.

    DATASET -- object with NAME and LST attributes; LST holds the values
               as integers or as strings that int() accepts.  Empty
               tokens are ignored.

    On return:
      DATASET.NAME   has '.stat' appended
      DATASET.LST    is the sorted list of the values as integers
      DATASET.COUNT  number of values
      DATASET.SUM    sum of the values
      DATASET.MEAN   arithmetic mean (float)
      DATASET.MEDIAN middle value, or the mean of the two middle values
      DATASET.SD     population standard deviation (divides by COUNT)
    All statistics are 0 when there are no values.

    Raises ValueError if a non-empty token is not an integer.
    """

    # The '.stat' extension is appended to the name as read; an
    # existing extension (such as .gi) is not removed.
    DATASET.NAME = DATASET.NAME + '.stat'

    # Convert the tokens to integers, skipping empty ones
    VALUES = []
    for TOKEN in DATASET.LST:
        TEXT = str(TOKEN).strip()
        if TEXT != "":
            VALUES.append(int(TEXT))
    VALUES.sort()

    LEN = len(VALUES)
    TOTAL = 0
    SUMSQ = 0
    for V in VALUES:
        TOTAL = TOTAL + V
        SUMSQ = SUMSQ + V * V

    DATASET.LST = VALUES
    DATASET.COUNT = LEN
    DATASET.SUM = TOTAL
    DATASET.MEAN = 0
    DATASET.MEDIAN = 0
    DATASET.SD = 0

    if LEN > 0:
        DATASET.MEAN = float(TOTAL) / float(LEN)

        # Variance = (SUMSQ - TOTAL^2/LEN)/LEN = (LEN*SUMSQ - TOTAL^2)/LEN^2.
        # The numerator is computed in exact integer arithmetic, so it
        # is never truncated by integer division and never negative.
        DATASET.SD = math.sqrt(float(LEN * SUMSQ - TOTAL * TOTAL)) / LEN

        # Note: // is floor division in every Python version,
        # e.g. 9//2 = 4, whereas 9.0/2.0 = 4.5
        M1 = LEN // 2  # remember, numbering begins with 0
        if LEN % 2 == 0:  # even number; div by 2.0 to force real math
            DATASET.MEDIAN = (VALUES[M1 - 1] + VALUES[M1]) / 2.0
        else:  # odd number (this also covers a single value)
            DATASET.MEDIAN = VALUES[M1]

    return

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Write the dataset name and its statistics as comma-separated values
def WRITEFLATFILE(OUTFILE, NAMEFLAG, DATASET):
    """Write the name and statistics of DATASET to OUTFILE.

    OUTFILE  -- object with attributes F (a file open for writing) and
                TYPE ("csv" or anything else for the flat format)
    NAMEFLAG -- string written in front of the name in the flat format
    DATASET  -- object with attributes NAME, COUNT, SUM, MEAN, MEDIAN, SD

    csv format:   name,count,sum,mean,median,stdev   on one line
    flat format:  NAMEFLAG + name on one line, then
                  count,sum,mean,median,stdev        on the next line
    """

    def WRITEFIELD(N, TERMINATOR):
        # Write one value, converted with str(), followed by TERMINATOR
        OUTFILE.F.write(str(N) + TERMINATOR)

    # Write name
    if OUTFILE.TYPE == "csv":
        OUTFILE.F.write(DATASET.NAME + ',')
    else:
        OUTFILE.F.write(NAMEFLAG + DATASET.NAME + '\n')

    # Write statistics
    WRITEFIELD(DATASET.COUNT, ",")
    WRITEFIELD(DATASET.SUM, ",")
    WRITEFIELD(DATASET.MEAN, ",")
    WRITEFIELD(DATASET.MEDIAN, ",")
    WRITEFIELD(DATASET.SD, "\n")
  
#======================== MAIN PROCEDURE ==========================

#---------- Set global variables
NAMEFLAG = '"'  # 1st character on the name line, indicating
                # the beginning of the next data list


def MAIN(ARGV):
    """Run the script: rawstats.py infile outfile [-csv]

    ARGV -- the argument list, laid out like sys.argv: program name,
            input file name, output file name, and optionally "-csv".

    Writes a header to the output file and then one record of
    statistics for every named dataset found in the input file.
    Returns 0 on success, or 1 (after printing a usage message on
    stderr) when the file names are missing.
    """
    if len(ARGV) < 3:
        sys.stderr.write('Usage: rawstats.py infile outfile [-csv]\n')
        return 1

    INFILE = FILE(ARGV[1], 'r')
    try:
        OUTFILE = FILE(ARGV[2], 'w')
        try:
            # Header lines identifying the fields.  The flat header is
            # written whenever csv output was not requested, including
            # when a third argument other than -csv is given.
            if len(ARGV) > 3 and ARGV[3] == "-csv":
                OUTFILE.TYPE = "csv"
                OUTFILE.F.write('name,count,sum,mean,median,stdev' + '\n')
            else:
                OUTFILE.F.write(NAMEFLAG + 'name' + '\n')
                OUTFILE.F.write('count,sum,mean,median,stdev' + '\n')

            # LINE contains the most recently-read line
            INFILE.LINE = INFILE.F.readline()

            while INFILE.LINE != "":
                # Read in the next dataset from the GDE flat file
                DATASET = IDLST()
                GETGDELIST(INFILE, NAMEFLAG, DATASET)

                # No name found: only blank or unnamed lines were left,
                # so there is nothing to report.
                if DATASET.NAME == "":
                    break

                # Calculate the statistics for the dataset
                DOSTATS(DATASET)

                # Write the name and statistics
                WRITEFLATFILE(OUTFILE, NAMEFLAG, DATASET)
        finally:
            OUTFILE.F.close()
    finally:
        INFILE.F.close()
    return 0


if __name__ == "__main__":
    sys.exit(MAIN(sys.argv))



