# portSentencer - sentenciador de texto puro para o Portugues
#
# Este programa recebe diversos arquivos de entrada em formato
#    textual e gera um arquivo textual com uma sentença por linha.
#
# Opções:
#
# -h help
# -o output file
# -r replace non standart characters
# -l limit the number of characters per sentence
#
# Exemplo de utilização:
#
# portSent -o sents.txt -r -l 2048 text1.txt text2.txt
#
# Busca o texto nos arquivos 'text1.txt' e 'text2.txt',
#   substitui caracteres não usuais,
#   gera sentenças com limite máximo de 2048 carateres e
#   salva as sentenças no arquivo 'sents.txt'
#
# last edit: 01/21/2024
# created by Lucelene Lopes - lucelene@gmail.com

import sys, os

#################################################
### Captura de argumentos da linha de comando
#################################################
def parseOptions(arguments):
    # default options
    output_file, input_files, replace, limit = "", [], False, 0
    i = 1
    while i < len(arguments):
        if (arguments[i][0] == "-"):
            # ajuda (help) - mostra ajuda, nada é executado
            if ((arguments[i][1] == "h") and (len(arguments[i])==2)) or \
               (arguments[i] == "-help"):
                print("Opções:\n-h ajuda\n-o arquivo de saída", \
                      "-r substitui caracteres não padrão", \
                      "-l limite de caracteres por sentença", \
                      " -demais opções ignoradas, por favor execute novamente sem opção de ajuda",
                      "Exemplo de utilização:", \
                      "portSent -o sents.txt -r -l 2048 text1.txt text2.txt", \
                      "Busca o texto nos arquivos 'text1.txt' e 'text2.txt'", \
                      "  substitui caracteres não usuais,", \
                      "  gera sentenças com limite máximo de 2048 carateres e", \
                      "  salva as sentenças no arquivo 'sents.txt'", \
                      sep="\n")
                return None
            # opção de substituição (replace) de caracteres não usuais
            elif ((arguments[i][1] == "r") and (len(arguments[i])==2)) or \
                 (arguments[i] == "-replace"):
                replace = True
                i += 1
            # opção de limite de tamanho de sentença (limit) - 0 para sem limite
            elif ((arguments[i][1] == "l") and (len(arguments[i])==2)) or \
                 (arguments[i] == "-limit"):
                try:
                    limit = eval(arguments[i+1])
                    i += 2
                except:
                    print("limite de caracteres por sentença não informado - assumindo sem limite")
                    i += 1
            # opção de arquivo de saída (um nome de arquivo)
            elif ((arguments[i][1] == "o") and (len(arguments[i])==2)) or \
                 (arguments[i] == "-output"):
                output_file = arguments[i+1]
                i += 2
            # opções inválidas - nada é executado
            else:
                print("Opção {} inválida, demais opções ignoradas, por favor execute novamente".format(arguments[i]))
                return None
        # arquivos de entrada (qualquer número) - só são incluídos se existirem
        else:
            if (os.path.isfile(arguments[i])):
                input_files.append(arguments[i])
                i += 1
            else:
                print("O arquivo {} não foi encontrado (ignorado)".format(arguments[i]))
                i += 1
    return [output_file, input_files, limit, replace]

#################################################
### função stripSents - faz de fato o sentenciamento
#################################################
def stripSents(inputText, outfile, limit, replace):
    def cleanPrint(sent, outfile):
        # do not print empty sentences
        if (sent == "") or (sent == ".") or (sent == ".."):
            return 0
        # remove second . in sentences ending by ..
        elif (len(sent) > 2) and (sent[-3:] != "...") and (sent[-2:] == ".."):
            print(sent[:-1], file=outfile)
            return 1
        # insert . in sentences not ending by punctuation
        elif (sent[-1] not in [".", "!", "?", ":", ";"]) and \
            not ((sent[-1] in ["'", '"']) and (sent[-2] in [".", "!", "?"])):
            print(sent+".", file=outfile)
            return 1
        # remove encompassing quotations " or ' if the quotations do not appear inside the sentence
        elif (sent[0] == sent[1]) and ((sent[0] == "'") or (sent[0] == '"')) and (sent.count(sent[0]) == 2):
            print(sent[1:-1], file=outfile)
            return 1
        # otherwise print it as it is
        else:
            print(sent, file=outfile)
            return 1
    def isAbbrev(chunk, abbrev):
        abbr = False
        for a in abbrev:
            if (chunk == a):
                abbr = True
                break
            else:
                lasts = -len(a)
                if (chunk[lasts:] == a) and (not chunk[lasts-1].isalpha()):
                    abbr = True
                    break
        return abbr
    # the function stripSents main body
    abbrev = []
    infile = open("./src/portSentencer/abbrev.txt", "r")
    for line in infile:
        abbrev.append(line[:-1])
    infile.close()
    if (replace):
        replaceables = [[" ", " "], \
                        ["—", "-"], ["–", "-"], \
                        ['＂', '"'], \
                        ['“', '"'], ['”', '"'], \
                        ['‟', '"'], ['″', '"'], \
                        ['‶', '"'], ['〃', '"'], \
                        ['״', '"'], ['˝', '"'], \
                        ['ʺ', '"'], ['˶', '"'], \
                        ['ˮ', '"'], ['ײ', '"'], \
                        [" ‣", "."], [" >>", "."], [" ○", "."], [" *", "."], \
                        [" | ", ". "], [" .", "."], \
                        ["\n", " "], ["\t", " "]]
    else:
        replaceables = [["\n", " "], ["\t", " "]]
    tmp = inputText.replace("  "," ")
    for r in replaceables:
        tmp = tmp.replace(r[0], r[1])
    while (tmp.find("  ") != -1):
        tmp = tmp.replace("  "," ")
    if (tmp[0] == " "):
        tmp = tmp[1:]
    bagOfChunks = tmp.split(" ")
    s, sent = 0, ""
    if (bagOfChunks[-1] == ""):
        bagOfChunks.pop()
    for i in range(len(bagOfChunks)):
        # if it is the last chunk, it is the end of sentence
        if (i == len(bagOfChunks)-1):
            sent += " " + bagOfChunks[i]
            s += cleanPrint(sent[1:], outfile)
            break
        chunk = bagOfChunks[i]
        # if there is a limit and the chunk is greater than the limit, discard it
        if (limit != 0) and (len(chunk) > limit):
            continue
        # if there is a limit and it is reached, ends the sentence arbitrarily
        elif (limit != 0) and (len(sent) + len(chunk) > limit):
            s += cleanPrint(sent[1:], outfile)
            sent = chunk
        # if the chunk is too short
        elif (len(chunk) < 3) and (len(chunk) != 0):
            sent += " " + chunk
        # if the chunk is empty
        elif (len(chunk) == 0):
            continue
        # ! ? or ... always mark an end of sentence
        elif (chunk[-3:] == "...") or (chunk[-1] == "!") or (chunk[-1] == "?"):
            sent += " " + chunk
            s += cleanPrint(sent[1:], outfile)
            sent = ""
        # a . : or ; followed by a lowercase chunk is not an end of sentence
        elif ((chunk[-1] == ".") or (chunk[-1] == ":") or (chunk[-1] == ";")) and (bagOfChunks[i+1][0].islower()):
            sent += " " + chunk
        # a : or ; not followed by a lowercase chunk is an end of sentence
        elif ((chunk[-1] == ":") or (chunk[-1] == ";")) and (not bagOfChunks[i+1][0].islower()):
            sent += " " + chunk
            s += cleanPrint(sent[1:], outfile)
            sent = ""
        # chunk ends with ! or ? followed by quotations that had appear before an odd number is an end of sentence
        elif (chunk[-2:] in ["!'", '!"', "?'", '?"']):
            sent += " " + chunk
            s += cleanPrint(sent[1:], outfile)
            sent = ""
        elif (chunk[-2:] in [".'", '."']):
            sent += " " + chunk
            abbr = isAbbrev(chunk[:-1], abbrev)
            if not abbr:
                s += cleanPrint(sent[1:], outfile)
                sent = ""
        # a chunk not ending with ! ? ... ; : or . is not an end of sentence
        elif (chunk[-1] != "."):
            sent += " " + chunk
        # chunk ending by . is either a know abbreviation (not an end of sentence), or an end of sentence
        elif (chunk[-1] == "."):
            abbr = isAbbrev(chunk, abbrev)
            if (abbr):
                sent += " " + chunk
            else:
                sent += " " + chunk
                s += cleanPrint(sent[1:], outfile)
                sent = ""
    # return the number of generated sentences
    return s

#################################################
### função principal do programa - busca argumentos e chama 'stripSents' que faz de fato o sentenciamento
#################################################
def portSent():
    if (len(sys.argv) == 1):
        arguments = ["sents2.txt", ["relic.txt"], 0, True]
        print("Assumindo default: 'sents.txt' como arquivo de saída, 'text1.txt' como arquivo de entrada, sem limite e substituições.")
    else:
        arguments = parseOptions(sys.argv)
    if (arguments != None):
        if (arguments[0] == ""):
            print("Assumindo 'sents.txt' como arquivo de saída")
            arguments[0] = 'sents.txt'
        if (arguments[1] == []):
            print("Nenhum arquivo de entrada válido - por favor corrija e tente novamente")
        else:
            outfile = open(arguments[0], "w")
            inputText = ""
            for oneInput in arguments[1]:
                infile = open(oneInput, "r")
                inputText += infile.read()
                infile.close()
                s = stripSents(inputText, outfile, arguments[2], arguments[3])
            outfile.close()
            print("Sentenciamento terminado com {} sentenças extraídas e salvas em {}".format(s, arguments[0]))
    else:
        print("Problemas com parâmetros - por favor corrija e tente novamente")

portSent()