#!/usr/bin/env python3
# June 3, 2023, Dr. Brian Fristensky, University of Manitoba

"""
Description: Extract a subset of sequences from a GDE flatfile."

Synopsis: BLExtractSubset.py namefile infile outfile"

 Files: namefile    list of sequence names to be written. If namefile is a TSV file,
                    the leftmost field will be read as a name.
        infile      GDE flat file containing all sequences
        outfile     GDE flat file containing subset of sequences listed in namefile
"""

import sys
import string
import os.path
import shutil

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Read in namefile
def GETNAMES(FN) :

    FILE = open(FN,'r')
    NAMES = []
    for LINE in FILE :
        if LINE.startswith("#") :
            pass
        else:
          LINE = LINE.strip()
          if len(LINE) > 0 :
             F0 = LINE.split("\t")[0]
             if not F0 in NAMES :
                 NAMES.append(F0)
    FILE.close()
    return NAMES


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Read each sequence in infile. If a sequence name is in the
# list, write the sequence to a separate file in the temporary
# directory
def GETSEQS(INFILE,TEMPDIR) :
    
    os.chdir(TEMPDIR)
    # create a dummy file just so that we have a file to close 
    # the first time the loop is executed. This also takes care
    # of files in which the first sequence begins after the first line
    OUTFILE = open('/dev/null','w') 
     
    for LINE in INFILE :
          LINE = LINE.strip()
          if len(LINE) > 0 :
             if LINE[0] in SEQFLAGS : #start a new output file
                TOKENS = LINE.split()
                SEQNAME = TOKENS[0][1:]
                OUTFILE.close()
                OUTFILE = open(SEQNAME + '.flat','w')
                OUTFILE.write(LINE + '\n')
             else : #copy the line to output file
                OUTFILE.write(LINE + '\n')
          else :
              pass
    INFILE.close()    


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Write only the sequences listed in namefile to the output file,
# in the order they appear in namefile
def WRITESEQS(NAMES,OFN) :
    OUTFILE = open(OFN,'w')
    if len(NAMES) > 0:
       for N in NAMES :
           FN = N + '.flat'
           if os.path.exists(FN) :
               INFILE = open(FN,'r')
               for LINE in INFILE :
                   OUTFILE.write(LINE)
               INFILE.close()
    OUTFILE.close()      
    os.chdir(CWD)
    
#======================== MAIN PROCEDURE ==========================

#---------- Set global variables
NFN = sys.argv[1]
IFN = sys.argv[2]
OFN = sys.argv[3]
OUTFILE = open(OFN,'w')
SEQFLAGS = (['>','#','%','"','@']) # indicate sequence name line
CWD = os.getcwd()

if os.path.exists(NFN) :

   # Read in namefile
   NAMES = GETNAMES(NFN)
   
   if os.path.exists(IFN) :
     #  Create a temporary working directory
     TEMPDIR = 'BLExtractSubset.' + str(os.getpid())
     os.mkdir(TEMPDIR,0o700)     
     
     INFILE = open(IFN,'r')
     GETSEQS(INFILE,TEMPDIR)
     INFILE.close()     
     
     #Write sequence to outfile
     WRITESEQS(NAMES,os.path.join(CWD,OFN))
     
     # Clean up
     shutil.rmtree(TEMPDIR)

     
   else:
     print('BLExtract Subset: File not found: ' + IFN)   
   
else :
   print('BLExtract Subset: File not found: ' + NFN)




