#!/usr/bin/env python

import birchenv
import birchscript
import os
import os.path
import subprocess
import sys
import re
import shutil
from Bio import Entrez

#optparse is deprecated in favor of argparse as of Python 2.7. However,
# since 2.7 is not always present on many systems, at this writing,
# it is safer to stick with optparse for now. It should be easy
# to change later, since the syntax is very similar between argparse and optparse.
from optparse import OptionParser

'''
seqfetch.py - Given a set of GI (UID) or ACCESSION numbers , create a file containing
the corresponding genbank entries. If the input file is a csv file, only the first
column is read. Lines beginning with '#' are ignored as comments

Synopsis: seqfetch.py --infile infile --outfile outfile  [--format sequence_format] [--query entrez_query_statement] [--db database] [--sep separator]

  --infile containing GI (UID) or ACCESSION numbers
  --outfile to contain all of the GenBank entries
  --query entrez_query_statement - limit sequences retrieved using an Entrez query
  --format sequence_format - sequence format as defined by NCBI EUtilities.
  --db - NCBI database as defined for EUtilities
  --sep - separator character to use when input is a csv file eg tab, comma
  
@modified: May 8, 2020
@author: Brian Fristensky
@contact: frist@cc.umanitoba.ca  
'''


blib = os.environ.get("BIRCHPYLIB")
sys.path.append(blib)

from birchlib import Birchmod
from birchlib import Argument

PROGRAM = "seqfetch.py : "
USAGE = "\n\tUSAGE: seqfetch.py --infile infile --outfile outfile [--format sequence_format] [--query entrez_query_statement] [--db database] [--sep separator]"

DEBUG = True
if DEBUG :
    print('Debugging mode on')

BM = Birchmod(PROGRAM, USAGE)

EMAILADDR = BM.getEmailAddr()
if (EMAILADDR != "") :
   Entrez.email = EMAILADDR

if DEBUG :
    print('EMAILADDR: ' + EMAILADDR)


class Parameters:
    """
      	Wrapper class for command line parameters
      	"""
    def __init__(self):
        """
     	  Initializes arguments:
     		IFN=""
     		EQUERY=""
                FORMAT="gb"
                DATABASE = "nuccore"
                SEPARATOR = ","
     		OFN=""
     		PID=""

     	  Then calls read_args() to fill in their values from command line
          """
        self.IFN = ""
        self.EQUERY = ""
        self.FORMAT = "gb"
        self.DATABASE = "nuccore"
        self.SEPARATOR = ","
        self.OFN = ""
        self.PID = str(os.getpid())
        self.read_args()

    def read_args(self):
        """
        	Read command line arguments into a Parameter object

        	"""

        parser = OptionParser()
        parser.add_option("--infile", dest="ifn", action="store", type="string", default="",
                          help="file of GI or Accession numbers")
        parser.add_option("--outfile", dest="ofn", action="store", type="string", default="",
                          help="output file")
        parser.add_option("--format", dest="format", action="store", type="string", default="gb",
                          help="format for output file")
        parser.add_option("--query", dest="query", action="store", type="string", default="",
                          help="ncbiquery query term")
        parser.add_option("--db", dest="database", action="store", type="string", default="nuccore",
                          help="database to search")
        parser.add_option("--sep", dest="seperator", action="store", type="string", default=",",
                          help="filter for query")

        (options, args) = parser.parse_args() 

        self.IFN = options.ifn
        self.OFN = options.ofn
        self.FORMAT = options.format
        self.EQUERY = options.query
        self.DATABASE = options.database
        self.SEPERATOR = options.seperator


        if DEBUG :
            print('------------ Parameters from command line ------' + '\n')
            print('    INFILE: ' + self.IFN)
            print('    OUTFILE: ' + self.OFN)  
            print('    FORMAT: ' + self.FORMAT) 
            print('    EQUERY: ' + self.EQUERY)
            print('    DATABASE: ' + self.DATABASE) 
            print('    SEPERATOR: ' + self.SEPERATOR)


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def Retrieve(IFN,OFN,JOBID,FORMAT,EQUERY,DATABASE,SEPARATOR):
    """ 
 """
 
    #If input file contains characters other than [0-9], we assume
    #that the input is ACCESSION numbers. By default, we assume
    #the input is GI numbers.

    # Preprocessing - strip out comment lines beginning with '#' and
    # write to a new file only column 1, assuming that the delimiter is a tab.				
    h_INFILE  = open(IFN, "r")
    # UIDstr is a string containing a comma-separated list of UIDs or Accessions.
    # We need to make this more sophisticated to break up lists greater than
    # 500 UIDs into smaller lists so that we can do mulitple queries, rather than
    # one really big query.
    UIDstr=""
    IDType='uid'
    SPATTERN=re.compile('[A-Z]',re.IGNORECASE)
    for line in h_INFILE :
        if line[0] != '#' :
            rawline = line.strip()
            #print(rawline)
            TOKENS = rawline.split(SEPARATOR)
            if (UIDstr == "") :
               UIDstr=TOKENS[0]
            elif not TOKENS[0] in UIDstr : # eliminate duplicate ID numbers
               UIDstr = UIDstr + ',' + TOKENS[0]
            if re.search(SPATTERN,TOKENS[0]) :
               IDType='accn'
    h_INFILE.close()
    if DEBUG :
        print('UIDstr: ' + UIDstr)
        print('IDType: ' + IDType)

    # If the ID list is GI/uid numbers, a query statement will be evaluated to apply to the 
    # ID list. This option is NOT available if the ID list is ACCESSION numbers.
    # Run epost to generate a history file at NCBI, containing links to the
    # specified entries.
    if IDType=='uid' :
        handle = Entrez.epost(db=DATABASE, id=UIDstr, usehistory="y")
        result = Entrez.read(handle)
        WEBENV = result["WebEnv"]
        QUERYKEY = result["QueryKey"]
        if DEBUG :
           print("WebEnv: " + WEBENV)
           print("query_key: " + QUERYKEY)

        if (EQUERY != "" ):
            # Run efilter apply an Entrez query term to the list
            handle = Entrez.esearch(db=DATABASE, term=EQUERY, usehistory="y", WebEnv=WEBENV, query_key=QUERYKEY) 
            result = Entrez.read(handle)       
            WEBENV = result["WebEnv"]
            QUERYKEY = result["QueryKey"]
            if DEBUG :
               print("WebEnv: " + WEBENV)
               print("query_key: " + QUERYKEY)

        # Run efetch to retrieve the entries
        h_OUTFILE = open(OFN, "w")
#        handle = Entrez.efetch(id=UIDstr, db=DATABASE,query_key=QUERYKEY, WebEnv=WEBENV,  usehistory="y", retmode="text", rettype=FORMAT)
        handle = Entrez.efetch(db=DATABASE,query_key=QUERYKEY, WebEnv=WEBENV,  usehistory="y", retmode="text", rettype=FORMAT)

    else:
        h_OUTFILE = open(OFN, "w")
        handle = Entrez.efetch(id=UIDstr, db=DATABASE, retmode="text", rettype=FORMAT)   

    for line in handle:
        h_OUTFILE.write(line)
        #print line
    handle.close()
    h_OUTFILE.close()


#======================== MAIN PROCEDURE ==========================
def main():
    """
        Called when not in documentation mode.
        """
    # Read parameters from command line
    P = Parameters()

    # Retrieve entries from NCBI to outfile
    Retrieve(P.IFN,P.OFN,P.PID,P.FORMAT,P.EQUERY,P.DATABASE,P.SEPARATOR)
	   
    BM.exit_success()

if (BM.documentor() or "-test" in sys.argv):
    pass
else:
    main()