#!/usr/bin/env python3

# Graham's protein statistic generator
# ------------------------------------
# SYNOPSIS
#   Calculates the following statistics from FASTA protein sequences
#
#           1. the molecular weight of each FASTA sequence.
#           2. the theoretical pI value of the FASTA sequence.
#           3. the composition of each amino acid.
#           4. the total number of amino acids in each protein
#
# USAGE
#   gstat.py <files>
#
#   Examples:
#      gstat.py ecoli-k12.fsa
#      gstat.py -kda ecoli-k12.fsa > data_save.csv
#      gstat.py -img ecoli-k12.fsa ecoli-plasmid1.fsa
#      gstat.py -kda -img ecoli-k12.fsa ecoli-plasmid1.fsa > data_save.csv
#
# INPUT FILES
#    This program ASSUMES that the files are already translated to protein.
#    This program does NOT translate DNA sequences to protein sequences at all.
# 
# PREREQUISITES
#    This program requires the following modules (version numbers in parenthesis
#    are the version numbers I used, NOT the minimum or maximum version numbers):
#
#       Python (2.6.1)
#
# OUTPUT
#    The output of the program is CSV (tab delimited) via stdout.
#    The columns outputted are as follows:
#
#       NAME	Mol. Wt.	pI	COMPOSITION	SEQUENCE
#
#    IF you are using multiple files, this program will print all
#    of the sequences in succession to standard output.
#
###################################
# THIS IS OPEN SOURCE CODE (YAY!) #
###################################
# LICENSE
#    This code is licensed under the Creative Commons 3.0
#    Attribution + ShareAlike license - for details see:
#       http://creativecommons.org/licenses/by-sa/3.0/
#
#    NOTE: this work, itself, is in part derrived from other
#          works.  Please see the ACKNOWLEDGEMENTS/CO-AUTHORS
#          section for additional attributions.
#
# AUTHOR
#    Graham Alvare - home.cc.umanitoba.ca/~alvare
#
# ACKNOWLEDGEMENTS/CO-AUTHORS
#    Dr. Brian Fristensky - my work supervisor, and the man who introduced
#                           me to the wonderful field of bioinformatics.
#    Lukasz Kozlowski - bisection Henderson-Hasselbach algorithm
#                       for determining the theoretical pI of proteins
#                           Kozlowski L. 2007-2012 Isoelectric Point Calculator.
#                           http://isoelectric.ovh.org
#    Dr. Abby Perrill - amino acid pKa table
#                           http://www.cem.msu.edu/~cem252/sp97/ch24/ch24aa.html
#
# QUESTIONS & COMMENTS
#    If you have any questions, please contact me: alvare@cc.umanitoba.ca
#    I usually get back to people within 1-2 weekdays (weekends, I am slower)
#
#    P.S. please also let me know of any bugs, or if you have any suggestions
#         I am generally happy to help create new tools, or modify my existing
#         tools to make them more useful.
#
# Happy usage!
#

import re
import sys

# a list of the basic molecular weights of the basic amino acids
# determined by periodic table and AA composition
aaweights = {
	'A' : 71.09,  # alanine
	'R' : 156.19, # arginine
	'D' : 114.11, # aspartic acid
	'N' : 115.09, # asparagine
	'C' : 103.15, # cysteine
	'E' : 129.12, # glutamic acid
	'Q' : 128.14, # glutamine
	'G' : 57.05,  # glycine
	'H' : 137.14, # histidine
	'I' : 113.16, # isoleucine
	'L' : 113.16, # leucine
	'K' : 128.17, # lysine
	'M' : 131.19, # methionine
	'F' : 147.18, # phenylalanine
	'P' : 97.12,  # proline
	'S' : 87.08,  # serine
	'T' : 101.11, # threonine
	'W' : 186.12, # tryptophan
	'Y' : 163.18, # tyrosine
	'V' : 99.14   # valine
}

# a list of pKa's for amino acids
# references from: http://www.cem.msu.edu/~cem252/sp97/ch24/ch24aa.html
# on Tuesday October 2, 2012
# pka's are listed as such [c-terminus, n-terminus, side chain]
aa_pkas = {
	'A' : [2.35,  9.87       ], # alanine
	'R' : [2.01,  9.04, 12.48], # arginine
	'D' : [2.10,  9.82,  3.86], # aspartic acid
	'N' : [2.02,  8.80       ], # asparagine
	'C' : [2.05, 10.25,  8.00], # cysteine
	'E' : [2.10,  9.47,  4.07], # glutamic acid
	'Q' : [2.17,  9.13       ], # glutamine
	'G' : [2.35,  9.78       ], # glycine
	'H' : [1.77,  9.18,  6.10], # histidine
	'I' : [2.32,  9.76       ], # isoleucine
	'L' : [2.33,  9.74       ], # leucine
	'K' : [2.18,  8.95, 10.53], # lysine
	'M' : [2.28,  9.21       ], # methionine
	'F' : [2.58,  9.24       ], # phenylalanine
	'P' : [2.00, 10.60       ], # proline
	'S' : [2.21,  9.15       ], # serine
	'T' : [2.09,  9.10       ], # threonine
	'W' : [2.38,  9.39       ], # tryptophan
	'Y' : [2.20,  9.11, 10.07], # tyrosine
	'V' : [2.29,  9.72       ]  # valine
}

# the number of C, H, O, and N atoms in each amino acid
# determined from AA structure
aa_comp = {
	'A' : [3,  5,  1, 1], # alanine - OK
	'R' : [6,  12, 1, 4], # arginine
	'D' : [4,  4,  3, 1], # aspartic acid - OK
	'N' : [4,  7,  2, 2], # asparagine
	'C' : [3,  5,  1, 1], # cysteine - OK
	'E' : [5,  7,  3, 1], # glutamic acid
	'Q' : [5,  8,  2, 2], # glutamine
	'G' : [2,  3,  1, 1], # glycine
	'H' : [6,  7,  1, 3], # histidine
	'I' : [6,  11, 1, 1], # isoleucine
	'L' : [6,  11, 1, 1], # leucine
	'K' : [6,  12, 1, 2], # lysine
	'M' : [5,  9,  1, 1], # methionine - OK
	'F' : [9,  9,  1, 1], # phenylalanine
	'P' : [5,  7,  1, 1], # proline **
	'S' : [3,  5,  2, 1], # serine
	'T' : [4,  7,  2, 1], # threonine - OK
	'W' : [11, 10, 1, 2], # tryptophan **
	'Y' : [9,  9,  2, 1], # tyrosine
	'V' : [5,  9,  1, 1]  # valine
}


########
# Theoretical pI calculator
########
# This method esimates the theoretical pI of a protein.
#
# The algorithm used in this method is a modified version of:
#    http://isoelectric.ovh.org/files/isoelectric-point-theory.html
#
# If you use this algorithm in subsequent works, please cite:
#    Kozlowski L. 2007-2012 Isoelectric Point Calculator. http://isoelectric.ovh.org
#
# If you use this code, please include me in addition to Kozlowski:
#    Kozlowski L. 2007-2012 Isoelectric Point Calculator. http://isoelectric.ovh.org
#    Graham Alvare - Python translation of the Isoelectric Point Calculator algoirthm
def calculate_pi(first_aa, last_aa, numarg, numasp, numglu, numcys, numtyr, numhis, numlys, THRESHOLD = 0.01):
	first  = aa_pkas [ first_aa ][0]
	last   = aa_pkas [ last_aa  ][1]
	pH     = 6.5
	pHprev = 0.0
	pHnext = 14.0

	# bisection algorithm from: http://isoelectric.ovh.org/files/isoelectric-point-theory.html
	# C-terminal charge + D charge + E charge + C charge + Y charge + H charge
	# + NH2charge + K charge + R charge
	while not ((pH - pHprev < THRESHOLD) and (pHnext - pH < THRESHOLD)):
		charge =-1/(1+pow(10,(first-pH)))	\
			- numasp/(1+pow(10,(3.9-pH)))	\
			- numglu/(1+pow(10,(4.07-pH)))	\
			- numcys/(1+pow(10,(8.18-pH)))	\
			- numtyr/(1+pow(10,(10.46-pH)))	\
			+ numhis/(1+pow(10,(pH-6.04)))	\
			+ 1/(1+pow(10,(last-8.2)))	\
			+ numlys/(1+pow(10,(pH-10.54)))	\
			+ numarg/(1+pow(10,(pH-12.48)))

		if charge < 0:
			# the charge is negative, we need to decrease the pH
			pHnext = pH
			pH = pH - ((pH - pHprev) / 2)
		else:
			# the charge is positive, we need to increase the pH
			pHprev = pH
			pH = pH + ((pHnext - pH)/2)

	return pH


########
# Protein statistic output
########
# This method prints all of the statistics for a given protein
def printstats(name, seq, kDa = False, aaCount = False, TSV = True):
	tag = ""

	weight = 0
	numala = 0
	numarg = 0
	numasp = 0
	numasn = 0
	numcys = 0
	numglu = 0
	numgln = 0
	numgly = 0
	numhis = 0
	numile = 0
	numleu = 0
	numlys = 0
	nummet = 0
	numphe = 0
	numpro = 0
	numser = 0
	numthr = 0
	numtrp = 0
	numtyr = 0
	numval = 0

	first_aa = ""
	last_aa = ""

	numc = 0
	numh = 0
	numo = 0
	numn = 0

	# counts B, Z, and X
	numasx = 0	# B
	numunk = 0	# X
	numglx = 0	# Z

	numaa = 0

	# loop through every amino acid in the protein sequence
	for aa in seq:
		if   aa == "B":
			numasx  += 1
			numaa += 1
		elif aa == "X":
			numunk  += 1
			numaa += 1
		elif aa == "Z":
			numglx += 1
			numaa += 1
		elif aa in aaweights:
			# calculate the molecular weight
			weight += aaweights[aa]

			# calculate the instance of each charged amino acid (and charge)
			if first_aa == "":
				first_aa = aa

			if aa == "A":
				numala += 1
			elif aa == "R":
				numarg += 1
			elif aa == "D":
				numasp += 1
			elif aa == "N":
				numasn += 1
			elif aa == "C":
				numcys += 1
			elif aa == "E":
				numglu += 1
			elif aa == "Q":
				numgln += 1
			elif aa == "G":
				numgly += 1
			elif aa == "H":
				numhis += 1
			elif aa == "I":
				numile += 1
			elif aa == "L":
				numleu += 1
			elif aa == "K":
				numlys += 1
			elif aa == "M":
				nummet += 1
			elif aa == "F":
				numphe += 1
			elif aa == "P":
				numpro += 1
			elif aa == "S":
				numser += 1
			elif aa == "T":
				numthr += 1
			elif aa == "W":
				numtrp += 1
			elif aa == "Y":
				numtyr += 1
			elif aa == "V":
				numval += 1

			numc += aa_comp[aa][0]
			numh += aa_comp[aa][1]
			numo += aa_comp[aa][2]
			numn += aa_comp[aa][3]

			last_aa = aa
			numaa += 1

	formula = "C" + str(numc) + "H" + str(numh + 2) + "O" + str(numo + 1) + "N" + str(numn)
	if nummet + numcys > 0:
		formula += "S" + str(nummet + numcys)

	if kDa:
		weight = str(round((weight / 1000), 2)) + " kDa"
	else:
		weight = str(round((weight), 2)) + " Da"
	pI = round(calculate_pi(first_aa, last_aa, numarg, numasp, numglu, numcys, numtyr, numhis, numlys), 2)


	if TSV:
		if aaCount:
			print(name + "	" + str(numaa) + "	" + weight + "	" + str(pI) + "	" + formula + "	" \
				+ str(numala) + "	" + str(numasx) + "	" + str(numcys) + "	" \
				+ str(numasp) + "	" + str(numglu) + "	" + str(numphe) + "	" \
				+ str(numgly) + "	" + str(numhis) + "	" + str(numile) + "	" \
				+ str(numlys) + "	" + str(numleu) + "	" + str(nummet) + "	" \
				+ str(numasn) + "	" + str(numpro) + "	" + str(numgln) + "	" \
				+ str(numarg) + "	" + str(numser) + "	" + str(numthr) + "	" \
				+ str(numval) + "	" + str(numtrp) + "	" + str(numunk) + "	" \
				+ str(numtyr) + "	" + str(numglx) + "	" + seq)
		else:
			print(name + "	" + str(numaa) + "	" + weight + "	" + str(pI) + "	" + formula + "	" + seq)

	else:
		print("Name: " + name)
		print("Molecular Weight: " + weight)
		print("Theoretical pI: " + str(pI))
		print("Molecular Formula: " + formula)

		if aaCount:
			# required, so division will work properly
			numaa = float(numaa)

			numnonpol = numgly + numala + numval + numleu + numile + nummet + numphe + numpro
			numpolar  = numser + numthr + numcys + numasn + numgln + numtyr + numtrp
			numcharge = numasp + numglu + numhis + numlys + numarg
			numplus   = numhis + numlys + numarg
			numminus  = numasp + numglu
			numother  = numasx + numglx + numunk

			# print the relevant protein statistics
			print("""Number of amino acids: """ + str(numaa) + """

Nonpolar side chains:      """ + str(numnonpol) + " (" + str(round(numnonpol / numaa, 3)) + """)
   Gly(G)           """ + str(numgly) + " (" + str(round(numgly / numaa, 3)) + """)
   Ala(A)           """ + str(numala) + " (" + str(round(numala / numaa, 3)) + """)
   Val(V)           """ + str(numval) + " (" + str(round(numval / numaa, 3)) + """)
   Leu(L)           """ + str(numleu) + " (" + str(round(numleu / numaa, 3)) + """)
   Ile(I)           """ + str(numile) + " (" + str(round(numile / numaa, 3)) + """)
   Met(M)           """ + str(nummet) + " (" + str(round(nummet / numaa, 3)) + """)
   Phe(F)           """ + str(numphe) + " (" + str(round(numphe / numaa, 3)) + """)
   Pro(P)           """ + str(numpro) + " (" + str(round(numpro / numaa, 3)) + """)

Neutral polar side chains: """ + str(numpolar) + " (" + str(round(numpolar / numaa, 3)) + """)
   Ser(S)           """ + str(numser) + " (" + str(round(numser / numaa, 3)) + """)
   Thr(T)           """ + str(numthr) + " (" + str(round(numthr / numaa, 3)) + """)
   Cys(C)           """ + str(numcys) + " (" + str(round(numcys / numaa, 3)) + """)
   Asn(N)           """ + str(numasn) + " (" + str(round(numasn / numaa, 3)) + """)
   Gln(Q)           """ + str(numgln) + " (" + str(round(numgln / numaa, 3)) + """)
   Tyr(Y)           """ + str(numtyr) + " (" + str(round(numtyr / numaa, 3)) + """)
   Trp(W)           """ + str(numtrp) + " (" + str(round(numtrp / numaa, 3)) + """)

Charged polar side chains: """ + str(numcharge) + " (" + str(round(numcharge / numaa, 3)) + """)
   Positive:        """ + str(numplus) + " (" + str(round(numplus / numaa, 3)) + """)
     His(H)         """ + str(numhis) + " (" + str(round(numhis / numaa, 3)) + """)
     Lys(K)         """ + str(numlys) + " (" + str(round(numlys / numaa, 3)) + """)
     Arg(R)         """ + str(numarg) + " (" + str(round(numarg / numaa, 3)) + """)

   Negative:        """ + str(numminus) + " (" + str(round(numminus / numaa, 3)) + """)
     Asp(D)         """ + str(numasp) + " (" + str(round(numasp / numaa, 3)) + """)
     Glu(E)         """ + str(numglu) + " (" + str(round(numglu / numaa, 3)) + """)

Other:              """ + str(numother) + " (" + str(round(numother / numaa, 3)) + """)
   Asx(B)           """ + str(numasx) + " (" + str(round(numasx / numaa, 3)) + """)
   Glx(Z)           """ + str(numglx) + " (" + str(round(numglx / numaa, 3)) + """)
   Unk(X)           """ + str(numunk) + " (" + str(round(numunk / numaa, 3)) + """)
""")
		print("Sequence: " + seq)
		print()
		print()

########
# Main method body
########
if __name__ == "__main__":
	# stores the start position of the file portion of the command line arguments
	start = 1

	# stores whether you are using IMG/ER FASTA sequences
	IMG = False

	# stores whether you want the molecular weights
	# displayed in daltons (Da) or kilodaltons (kDa)
	kDa = False

	# stores whether an amino acid count should be
	# performed for each sequence.
	aaCount = False

	# whether to present the output in a TSV format
	TSV = False

	# check the presence of command line arguments
	while len(sys.argv) > start:
		if sys.argv[start].lower() in ["-i", "-img"]:
			IMG = True
		elif sys.argv[start].lower() in ["-k", "-kda"]:
			kDa = True
		elif sys.argv[start].lower() in ["-aacount"]:
			aaCount = True
		elif sys.argv[start].lower() in ["-tsv"]:
			TSV = True
		else:
			break
		start += 1

	# ensure that the command was executed with proper arguments
	if len(sys.argv) < 2:
		print("""Graham's protein statistic generator
------------------------------------
calculates the following statistics from FASTA protein sequences

   1. the molecular weight of each FASTA sequence
   2. the theoretical pI value of the FASTA sequence
         using the alogirthm of:
           Kozlowski L. 2007-2012 Isoelectric Point Calculator.
           http://isoelectric.ovh.org
         using pKa tables from:
   3. the atomic composition (CHONS) of the protein
   4. the total number of amino acids in each protein

Usage: gmstat.py [ options ] <files>

Examples:
     gstat.py ecoli-k12.fsa
     gstat.py -kda ecoli-k12.fsa > data_save.csv
     gstat.py -img ecoli-k12.fsa ecoli-plasmid1.fsa
     gstat.py -kda -img ecoli-k12.fsa ecoli-plasmid1.fsa > data_save.csv

Options:
  -kda     --- display molecular weights in kDa
  -img     --- display locus tags from IMG/ER FASTA files
  -aacount --- perform an amino acid count for each sequence
  -tsv     --- presents the output in a TSV format
""")
		sys.exit(1)

	# if we are presenting the output in a TSV format
	# we must print a header for the TSV output
	if TSV:
		if aaCount:
			if IMG:
				# print the header (IMG/ER)
				print("NAME	TAG	NUMAA	Mol. Wt.	pI	COMPOSITION	" \
					+ "A	B	C	D	E	F	G	H	" \
					+ "I	K	L	M	N	P	Q	R	" \
					+ "S	T	V	W	X	Y	Z	SEQUENCE")
			else:
				# print the regular header
				print("NAME	NUMAA	Mol. Wt.	pI	COMPOSITION	" \
					+ "A	B	C	D	E	F	G	H	" \
					+ "I	K	L	M	N	P	Q	R	" \
					+ "S	T	V	W	X	Y	Z	SEQUENCE")
		else:
			if IMG:
				# print the header (IMG/ER)
				print("NAME	TAG	NUMAA	Mol. Wt.	pI	COMPOSITION	SEQUENCE")
			else:
				# print the regular header
				print("NAME	NUMAA	Mol. Wt.	pI	COMPOSITION	SEQUENCE")

	name = ""
	sequence = ""

	for filename in sys.argv[start:]:
		# open the current FASTA file
		FILE_HANDLE = open(filename, 'rU')

		# iterate through the lines in the current FASTA file
		for line in FILE_HANDLE:
			# obtain the name of the current sequence in the file
			if line.startswith(">"):
				# if there is already a sequence in the queue,
				# print the sequence's statistics before going
				# onto the next sequence
				if name != "":
					printstats(name, sequence, kDa, aaCount, TSV)

				# clear the queued sequence, and read in the name
				# of the sequence contained on the current line
				name = line[1:].strip('\n\r')
				if IMG:
					# here is where you can also do any modifications to the name field
					# such as extracting the locus tags from an IMG/ER FASTA file
					# I have done just this below (because at the time of this writing
					# I am using IMG/ER FASTA files); however, I have made this conditional
					# code because not everyone using this will be using IMG/ER
					img  = name.split(" ")[0]
					tag  = name.split(" ")[1]
					name = str.join(" ", name.split(" ")[2:]) + "	" + tag
				sequence = ""
			else:
				# add the sequence to the current sequence queue
				# be sure to delete any whitespace, and convert
				# the nucleotides to upper case
				sequence += re.sub(r'[\s\n\r]', '', line).upper()

		# print the stats for the last protein in the FASTA file.
		printstats(name, sequence, kDa, aaCount, TSV)
			
		# close the current FASTA file
		FILE_HANDLE.close()
