Spaces:
Running
Running
| # portSentencer - sentenciador de texto puro para o Portugues | |
| # | |
| # Este programa recebe diversos arquivos de entrada em formato | |
| # textual e gera um arquivo textual com uma sentença por linha. | |
| # | |
| # Opções: | |
| # | |
| # -h help | |
| # -o output file | |
| # -r replace non standart characters | |
| # -l limit the number of characters per sentence | |
| # | |
| # Exemplo de utilização: | |
| # | |
| # portSent -o sents.txt -r -l 2048 text1.txt text2.txt | |
| # | |
| # Busca o texto nos arquivos 'text1.txt' e 'text2.txt', | |
| # substitui caracteres não usuais, | |
| # gera sentenças com limite máximo de 2048 carateres e | |
| # salva as sentenças no arquivo 'sents.txt' | |
| # | |
| # last edit: 01/21/2024 | |
| # created by Lucelene Lopes - [email protected] | |
| import sys, os | |
| ################################################# | |
| ### Captura de argumentos da linha de comando | |
| ################################################# | |
| def parseOptions(arguments): | |
| # default options | |
| output_file, input_files, replace, limit = "", [], False, 0 | |
| i = 1 | |
| while i < len(arguments): | |
| if (arguments[i][0] == "-"): | |
| # ajuda (help) - mostra ajuda, nada é executado | |
| if ((arguments[i][1] == "h") and (len(arguments[i])==2)) or \ | |
| (arguments[i] == "-help"): | |
| print("Opções:\n-h ajuda\n-o arquivo de saída", \ | |
| "-r substitui caracteres não padrão", \ | |
| "-l limite de caracteres por sentença", \ | |
| " -demais opções ignoradas, por favor execute novamente sem opção de ajuda", | |
| "Exemplo de utilização:", \ | |
| "portSent -o sents.txt -r -l 2048 text1.txt text2.txt", \ | |
| "Busca o texto nos arquivos 'text1.txt' e 'text2.txt'", \ | |
| " substitui caracteres não usuais,", \ | |
| " gera sentenças com limite máximo de 2048 carateres e", \ | |
| " salva as sentenças no arquivo 'sents.txt'", \ | |
| sep="\n") | |
| return None | |
| # opção de substituição (replace) de caracteres não usuais | |
| elif ((arguments[i][1] == "r") and (len(arguments[i])==2)) or \ | |
| (arguments[i] == "-replace"): | |
| replace = True | |
| i += 1 | |
| # opção de limite de tamanho de sentença (limit) - 0 para sem limite | |
| elif ((arguments[i][1] == "l") and (len(arguments[i])==2)) or \ | |
| (arguments[i] == "-limit"): | |
| try: | |
| limit = eval(arguments[i+1]) | |
| i += 2 | |
| except: | |
| print("limite de caracteres por sentença não informado - assumindo sem limite") | |
| i += 1 | |
| # opção de arquivo de saída (um nome de arquivo) | |
| elif ((arguments[i][1] == "o") and (len(arguments[i])==2)) or \ | |
| (arguments[i] == "-output"): | |
| output_file = arguments[i+1] | |
| i += 2 | |
| # opções inválidas - nada é executado | |
| else: | |
| print("Opção {} inválida, demais opções ignoradas, por favor execute novamente".format(arguments[i])) | |
| return None | |
| # arquivos de entrada (qualquer número) - só são incluídos se existirem | |
| else: | |
| if (os.path.isfile(arguments[i])): | |
| input_files.append(arguments[i]) | |
| i += 1 | |
| else: | |
| print("O arquivo {} não foi encontrado (ignorado)".format(arguments[i])) | |
| i += 1 | |
| return [output_file, input_files, limit, replace] | |
| ################################################# | |
| ### função stripSents - faz de fato o sentenciamento | |
| ################################################# | |
| def stripSents(inputText, outfile, limit, replace): | |
| def cleanPrint(sent, outfile): | |
| # do not print empty sentences | |
| if (sent == "") or (sent == ".") or (sent == ".."): | |
| return 0 | |
| # remove second . in sentences ending by .. | |
| elif (len(sent) > 2) and (sent[-3:] != "...") and (sent[-2:] == ".."): | |
| print(sent[:-1], file=outfile) | |
| return 1 | |
| # insert . in sentences not ending by punctuation | |
| elif (sent[-1] not in [".", "!", "?", ":", ";"]) and \ | |
| not ((sent[-1] in ["'", '"']) and (sent[-2] in [".", "!", "?"])): | |
| print(sent+".", file=outfile) | |
| return 1 | |
| # remove encompassing quotations " or ' if the quotations do not appear inside the sentence | |
| elif (sent[0] == sent[1]) and ((sent[0] == "'") or (sent[0] == '"')) and (sent.count(sent[0]) == 2): | |
| print(sent[1:-1], file=outfile) | |
| return 1 | |
| # otherwise print it as it is | |
| else: | |
| print(sent, file=outfile) | |
| return 1 | |
| def isAbbrev(chunk, abbrev): | |
| abbr = False | |
| for a in abbrev: | |
| if (chunk == a): | |
| abbr = True | |
| break | |
| else: | |
| lasts = -len(a) | |
| if (chunk[lasts:] == a) and (not chunk[lasts-1].isalpha()): | |
| abbr = True | |
| break | |
| return abbr | |
| # the function stripSents main body | |
| abbrev = [] | |
| infile = open("./src/portSentencer/abbrev.txt", "r") | |
| for line in infile: | |
| abbrev.append(line[:-1]) | |
| infile.close() | |
| if (replace): | |
| replaceables = [[" ", " "], \ | |
| ["—", "-"], ["–", "-"], \ | |
| ['"', '"'], \ | |
| ['“', '"'], ['”', '"'], \ | |
| ['‟', '"'], ['″', '"'], \ | |
| ['‶', '"'], ['〃', '"'], \ | |
| ['״', '"'], ['˝', '"'], \ | |
| ['ʺ', '"'], ['˶', '"'], \ | |
| ['ˮ', '"'], ['ײ', '"'], \ | |
| [" ‣", "."], [" >>", "."], [" ○", "."], [" *", "."], \ | |
| [" | ", ". "], [" .", "."], \ | |
| ["\n", " "], ["\t", " "]] | |
| else: | |
| replaceables = [["\n", " "], ["\t", " "]] | |
| tmp = inputText.replace(" "," ") | |
| for r in replaceables: | |
| tmp = tmp.replace(r[0], r[1]) | |
| while (tmp.find(" ") != -1): | |
| tmp = tmp.replace(" "," ") | |
| if (tmp[0] == " "): | |
| tmp = tmp[1:] | |
| bagOfChunks = tmp.split(" ") | |
| s, sent = 0, "" | |
| if (bagOfChunks[-1] == ""): | |
| bagOfChunks.pop() | |
| for i in range(len(bagOfChunks)): | |
| # if it is the last chunk, it is the end of sentence | |
| if (i == len(bagOfChunks)-1): | |
| sent += " " + bagOfChunks[i] | |
| s += cleanPrint(sent[1:], outfile) | |
| break | |
| chunk = bagOfChunks[i] | |
| # if there is a limit and the chunk is greater than the limit, discard it | |
| if (limit != 0) and (len(chunk) > limit): | |
| continue | |
| # if there is a limit and it is reached, ends the sentence arbitrarily | |
| elif (limit != 0) and (len(sent) + len(chunk) > limit): | |
| s += cleanPrint(sent[1:], outfile) | |
| sent = chunk | |
| # if the chunk is too short | |
| elif (len(chunk) < 3) and (len(chunk) != 0): | |
| sent += " " + chunk | |
| # if the chunk is empty | |
| elif (len(chunk) == 0): | |
| continue | |
| # ! ? or ... always mark an end of sentence | |
| elif (chunk[-3:] == "...") or (chunk[-1] == "!") or (chunk[-1] == "?"): | |
| sent += " " + chunk | |
| s += cleanPrint(sent[1:], outfile) | |
| sent = "" | |
| # a . : or ; followed by a lowercase chunk is not an end of sentence | |
| elif ((chunk[-1] == ".") or (chunk[-1] == ":") or (chunk[-1] == ";")) and (bagOfChunks[i+1][0].islower()): | |
| sent += " " + chunk | |
| # a : or ; not followed by a lowercase chunk is an end of sentence | |
| elif ((chunk[-1] == ":") or (chunk[-1] == ";")) and (not bagOfChunks[i+1][0].islower()): | |
| sent += " " + chunk | |
| s += cleanPrint(sent[1:], outfile) | |
| sent = "" | |
| # chunk ends with ! or ? followed by quotations that had appear before an odd number is an end of sentence | |
| elif (chunk[-2:] in ["!'", '!"', "?'", '?"']): | |
| sent += " " + chunk | |
| s += cleanPrint(sent[1:], outfile) | |
| sent = "" | |
| elif (chunk[-2:] in [".'", '."']): | |
| sent += " " + chunk | |
| abbr = isAbbrev(chunk[:-1], abbrev) | |
| if not abbr: | |
| s += cleanPrint(sent[1:], outfile) | |
| sent = "" | |
| # a chunk not ending with ! ? ... ; : or . is not an end of sentence | |
| elif (chunk[-1] != "."): | |
| sent += " " + chunk | |
| # chunk ending by . is either a know abbreviation (not an end of sentence), or an end of sentence | |
| elif (chunk[-1] == "."): | |
| abbr = isAbbrev(chunk, abbrev) | |
| if (abbr): | |
| sent += " " + chunk | |
| else: | |
| sent += " " + chunk | |
| s += cleanPrint(sent[1:], outfile) | |
| sent = "" | |
| # return the number of generated sentences | |
| return s | |
| ################################################# | |
| ### função principal do programa - busca argumentos e chama 'stripSents' que faz de fato o sentenciamento | |
| ################################################# | |
| def portSent(): | |
| if (len(sys.argv) == 1): | |
| arguments = ["sents2.txt", ["relic.txt"], 0, True] | |
| print("Assumindo default: 'sents.txt' como arquivo de saída, 'text1.txt' como arquivo de entrada, sem limite e substituições.") | |
| else: | |
| arguments = parseOptions(sys.argv) | |
| if (arguments != None): | |
| if (arguments[0] == ""): | |
| print("Assumindo 'sents.txt' como arquivo de saída") | |
| arguments[0] = 'sents.txt' | |
| if (arguments[1] == []): | |
| print("Nenhum arquivo de entrada válido - por favor corrija e tente novamente") | |
| else: | |
| outfile = open(arguments[0], "w") | |
| inputText = "" | |
| for oneInput in arguments[1]: | |
| infile = open(oneInput, "r") | |
| inputText += infile.read() | |
| infile.close() | |
| s = stripSents(inputText, outfile, arguments[2], arguments[3]) | |
| outfile.close() | |
| print("Sentenciamento terminado com {} sentenças extraídas e salvas em {}".format(s, arguments[0])) | |
| else: | |
| print("Problemas com parâmetros - por favor corrija e tente novamente") | |
| portSent() | |