#!/usr/bin/env python3

# January 7, 2020, Dr. Brian Fristensky, University of Manitoba

# Description: Customize local copy of BIRCH documentation by 
# converting URLs and other strings in HTML files to correspond
# to local files and directory structures.

# Synopsis: customdoc.py oldstrings newstrings htmldirs

# Files: oldstrings      old strings to be replaced
#        newstrings      new strings to replace old strings
#        htmldirs        directories in which to change HTML files

# Automatically converted to Python3 using 2to3. Requires Python 3: open() is
# called with an encoding argument, which Python 2's built-in open() does not accept.

import sys
import os
import re

###########
# GLOBALS #
###########
# Name of the scratch file that receives the rewritten copy of a page.
# It is built from the process ID and is a relative name, so the file is
# created in whatever directory is current when it is opened.
TEMPFN = '{}.TEMP'.format(os.getpid())

# Marker for a single line that is meant to be left unmodified.
DONTCHANGE = "<!-- DON'T CHANGE -->"

# Paired markers (opening, closing) delimiting regions of an HTML file
# that are to be removed or left unmodified.
BEGIN_DELETE, END_DELETE = "<!-- BEGIN DELETE -->", "<!-- END DELETE -->"
BEGIN_PROTECT, END_PROTECT = "<!-- BEGIN PROTECT -->", "<!-- END PROTECT -->"

# The opening REPLACE tag has the form <!-- BEGIN REPLACE name="FILE" -->.
# Only its fixed prefix is stored here; the file name is taken from
# between the double quotes of the line on which the prefix is found.
BEGIN_REPLACE, END_REPLACE = '<!-- BEGIN REPLACE name="', "<!-- END REPLACE -->"

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Read in old and new strings, stripping
# leading and trailing whitespace, including
# newline characters.
def GETLIST(FN) :
    """Return the lines of text file FN as a list of strings.

    Leading and trailing whitespace, including the newline, is stripped
    from every line.  Blank lines are kept as empty strings so that the
    position of each entry still matches its line number in the file.

    FN -- path of the file to read

    Raises OSError if the file cannot be opened.
    """
    # 'with' guarantees the file is closed even if reading fails part-way.
    with open(FN,'r') as FILE :
        return [LINE.strip() for LINE in FILE]

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def CHANGEFILE(HTMLFN, OLDLIST, NEWLIST, DIRLIST) :
    """Rewrite one HTML file, applying string substitutions and tag directives.

    The file is copied line by line to the temporary file TEMPFN:

      * a line containing BEGIN_DELETE switches deletion on and a line
        containing END_DELETE switches it off; neither marker line is
        copied, and ordinary lines in between are dropped;
      * ordinary lines between BEGIN_PROTECT and END_PROTECT, and single
        lines containing DONTCHANGE, are copied unmodified;
      * ordinary lines between a BEGIN_REPLACE tag and END_REPLACE are
        dropped, and the contents of local/public_html/<name> (if that
        file exists) are inserted right after the opening tag;
      * in every other line each string of OLDLIST is replaced by the
        string at the same position in NEWLIST.  The replacements are
        consecutive, so a later pair sees the result of earlier ones.

    If anything changed, the temporary file replaces HTMLFN and its mode
    is set to 0644; otherwise the temporary file is removed.

    HTMLFN   -- path of the HTML file to process
    OLDLIST  -- strings to search for
    NEWLIST  -- replacement strings, parallel to OLDLIST
    DIRLIST  -- not used here; kept so existing callers keep working
    """
    print("customdoc.py: -------- " + HTMLFN)

    # Pair each old string with its replacement once, up front.  Empty old
    # strings (blank lines in the oldstrings file) are skipped: every line
    # "contains" '' and str.replace('', NEW) would insert NEW between all
    # characters of the line.
    PAIRS = [(OLD, NEW) for OLD, NEW in zip(OLDLIST, NEWLIST) if OLD != '']

    CHANGED = 0
    DELETE = 0
    PROTECT = 0
    REPLACE = 0

    try :
        # latin-1 maps every byte to exactly one character, so any file can
        # be decoded without error.  The output is written with the SAME
        # encoding so that bytes which are not touched by a substitution
        # come out exactly as they went in, whatever the real encoding of
        # the page is.  Characters from NEWLIST that latin-1 cannot
        # represent are emitted as numeric character references rather
        # than aborting the run.
        with open(HTMLFN,'r',encoding="latin-1") as HTMLFILE, \
             open(TEMPFN,'w',encoding="latin-1",
                  errors="xmlcharrefreplace") as TEMPFILE :

            # The state variables DELETE, PROTECT and REPLACE, together
            # with the markers found on the line itself, decide what
            # happens to each line.
            for LINE in HTMLFILE :
                # Turn DELETE on and off - - - - - - - - - - - - - - - -
                if LINE.find(BEGIN_DELETE) >= 0 :
                    DELETE = 1
                    CHANGED = 1
                elif LINE.find(END_DELETE) >= 0 :
                    DELETE = 0

                # Turn PROTECT on and off - - - - - - - - - - - - - - - -
                elif LINE.find(BEGIN_PROTECT) >= 0 :
                    TEMPFILE.write(LINE)
                    PROTECT = 1
                elif LINE.find(END_PROTECT) >= 0 :
                    PROTECT = 0
                    TEMPFILE.write("\n")
                    TEMPFILE.write(LINE)

                # Turn REPLACE on and off, and copy in the replacement text
                elif LINE.find(BEGIN_REPLACE) >= 0 :
                    TEMPFILE.write(LINE)
                    REPLACE = 1
                    # The file name is the text between the first pair of
                    # double quotes on the tag line.
                    REPLACEFN = "local/public_html/" + LINE.split('"')[1]
                    # The path is relative to the current working directory.
                    if os.path.exists(REPLACEFN) :
                        with open(REPLACEFN,'r',encoding="latin-1") as REPLACEFILE :
                            for REPLINE in REPLACEFILE :
                                TEMPFILE.write(REPLINE)
                                # Echo the line that was just inserted.
                                print(REPLINE)
                    # The file counts as changed whether or not a
                    # replacement file was found.
                    CHANGED = 1
                elif LINE.find(END_REPLACE) >= 0 :
                    REPLACE = 0
                    # Make sure the closing tag starts on a line of its own.
                    TEMPFILE.write("\n")
                    TEMPFILE.write(LINE)

                # Ordinary line - - - - - - - - - - - - - - - - - - - - -
                elif REPLACE == 1 :
                    # Old text between the REPLACE tags is discarded.
                    pass
                elif DELETE == 0 :
                    # Substitute only when the line is neither inside a
                    # PROTECT block nor tagged with DONTCHANGE.
                    if PROTECT == 0 and LINE.find(DONTCHANGE) < 0 :
                        for OLD, NEW in PAIRS :
                            if LINE.find(OLD) >= 0 :
                                LINE = LINE.replace(OLD, NEW)
                                CHANGED = 1
                                print(LINE)
                    TEMPFILE.write(LINE)

        # If the file has changed, put the new copy in place of the
        # original; otherwise discard the temporary file.
        if CHANGED == 0 :
            os.remove(TEMPFN)
        else :
            # os.replace overwrites the destination in one step, so there
            # is no moment at which HTMLFN does not exist.
            os.replace(TEMPFN,HTMLFN)
            os.chmod(HTMLFN,0o644)
    except Exception :
        # Do not leave a stray temporary file behind when something fails.
        if os.path.exists(TEMPFN) :
            os.remove(TEMPFN)
        raise
    return
      
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def TRAVERSE(P, OLDLIST, NEWLIST, DIRLIST) :
    """Process the HTML files in directory P and in all directories below it.

    The function changes into P, calls CHANGEFILE on every entry whose
    name ends in '.html', then calls itself on every subdirectory that
    is not a symbolic link.  On the way out the working directory is set
    back to what it was when the function was entered.

    P        -- directory to process; relative paths are resolved against
                the current working directory
    OLDLIST  -- strings to search for (passed through to CHANGEFILE)
    NEWLIST  -- replacement strings (passed through to CHANGEFILE)
    DIRLIST  -- passed through to CHANGEFILE unchanged

    Raises OSError if P cannot be entered or listed.
    """
    # Remember where we came from.  Going back with os.pardir is only
    # correct when P is a single path component; it lands in the wrong
    # place for 'a/b', for absolute paths and for symbolic links.
    STARTDIR = os.getcwd()
    os.chdir(P)
    try :
        print (P)

        # Sort the entries of the current directory into HTML files and
        # real (non-symlink) subdirectories.
        HTMLFILES = []
        DIRECTORIES = []
        for NAME in os.listdir(os.curdir) :
            if os.path.isdir(NAME) and not os.path.islink(NAME) :
                DIRECTORIES.append(NAME)
            elif NAME.endswith('.html') :
                HTMLFILES.append(NAME)

        # Files first, while this directory is still the current one.
        for FILE in HTMLFILES :
            CHANGEFILE(FILE, OLDLIST, NEWLIST, DIRLIST)

        # Visit all directories recursively
        for D in DIRECTORIES :
            TRAVERSE(D, OLDLIST, NEWLIST, DIRLIST)
    finally :
        # Restore the caller's directory even if something above failed.
        os.chdir(STARTDIR)
    return

#======================== MAIN PROCEDURE ==========================
def customdoc(OLDSTRFN, NEWSTRFN, DIRFN):
    """Replace strings in all HTML files below the listed directories.

    OLDSTRFN -- file listing the strings to be replaced, one per line
    NEWSTRFN -- file listing the replacement strings, one per line;
                line N replaces line N of OLDSTRFN
    DIRFN    -- file listing the directories to process, one per line

    Nothing is changed, and a message is printed instead, when either
    string list is empty, when the two string lists differ in length, or
    when no directory is listed.  The two string lists are printed at
    the end in every case.
    """
    # Read in list of strings to change
    OLDLIST = GETLIST(OLDSTRFN)
    NEWLIST = GETLIST(NEWSTRFN)
    # A blank line in the directory file is not a directory; passing ''
    # on would make os.chdir fail.  The string lists are NOT filtered,
    # because their entries are matched up by position.
    DIRLIST = [D for D in GETLIST(DIRFN) if D != '']

    OLDLEN = len(OLDLIST)
    NEWLEN = len(NEWLIST)
    DIRLEN = len(DIRLIST)

    if OLDLEN == 0 :
        print(OLDSTRFN + " has 0 elements. Doing nothing.")
    elif NEWLEN == 0 :
        print(NEWSTRFN + " has 0 elements. Doing nothing.")
    elif OLDLEN != NEWLEN :
        print(OLDSTRFN + ' and ' + NEWSTRFN + ' must have the same number of elements')
        print("Doing nothing.")
    elif DIRLEN == 0:
        print(DIRFN + " has 0 elements. Doing nothing.")
    else:
        # Traverse each directory tree recursively, changing all HTML
        # files to use new strings.
        for PNAME in DIRLIST :
            TRAVERSE(PNAME, OLDLIST, NEWLIST, DIRLIST)

    # debugging
    print("OLDLIST:")
    print(OLDLIST)
    print("NEWLIST:")
    print(NEWLIST)

if __name__=="__main__":
    # Three file names are required; say so instead of failing with an
    # IndexError traceback.  Extra arguments are ignored.
    if len(sys.argv) < 4 :
        sys.exit("Usage: customdoc.py oldstrings newstrings htmldirs")

    #---------- Set global variables
    OLDSTRFN = sys.argv[1]
    NEWSTRFN = sys.argv[2]
    DIRFN = sys.argv[3]

    customdoc(OLDSTRFN, NEWSTRFN, DIRFN)