# portTokenizer - Portuguese sentence tokenizer producing a CoNLL-U file
#
# This program takes a textual input file with one sentence per line
# and generates a properly tokenized CoNLL-U file.
#
# It uses a Portuguese lexicon, PortiLexicon-UD, through calls to the
# class UDlexPT included in the file "lexikon.py", distributed together
# with this file ("portTok.py") and with the lexicon's text files
# ("ADJ.tsv", "ADP.tsv", "ADV.tsv", "AUX.tsv", "CCONJ.tsv",
# "DET.tsv", "INTJ.tsv", "NOUN.tsv", "NUM.tsv", "PRON.tsv", "SCONJ.tsv",
# "VERB.tsv", "WORDmaster.txt").
#
# Options:
#
#    -h help
#    -o output file
#    -p preserve itemization markers such as a) b) and i) ii) as single tokens
#    -m match paired punctuation
#    -t trim headlines (heuristic)
#    -s sentence id (sid) model
#
# Usage example:
#
#    portTok -o sents.conllu -p -m -t -s S000000 sents.txt
#
#    Reads the sentences from the file 'sents.txt',
#    preserves itemization tokens such as a) b) i) ii),
#    matches paired punctuation (quotes, parentheses, etc.),
#    trims possible HEADLINES preceding the sentences,
#    uses S000000 as the sentence identifier model, and
#    saves the properly tokenized sentences to the file 'sents.conllu'
#
# last edit: 10/05/2025
# created by Lucelene Lopes - [email protected]
import sys, os
import lexikon
lex = lexikon.UDlexPT()
#################################################
### Command line argument parsing
#################################################
def parseOptions(arguments):
    # default options
    output_file, input_file, preserve, match, trim, model = "", [], False, False, False, "S000000"
    i = 1
    while i < len(arguments):
        if (arguments[i][0] == "-"):
            # help - show the usage options, nothing is executed
            if ((arguments[i][1] == "h") and (len(arguments[i]) == 2)) or \
               (arguments[i] == "-help"):
                print("Options:",
                      "-h help",
                      "-o output file",
                      "-p preserve itemization markers such as a) b) and i) ii) as single tokens",
                      "-m match paired punctuation (quotes, parentheses, etc.)",
                      "-t trim headlines preceding the sentences (heuristic)",
                      "-s sentence id (sid) model",
                      "Usage example:",
                      "portTok -o sents.conllu -p -m -t -s S000000 sents.txt",
                      "Reads the sentences from the file 'sents.txt',",
                      "  preserves itemization tokens such as a) b) i) ii),",
                      "  matches paired punctuation (quotes, parentheses, etc.),",
                      "  trims possible HEADLINES preceding the sentences,",
                      "  uses S000000 as the sentence identifier model, and",
                      "  saves the properly tokenized sentences to the file 'sents.conllu'",
                      sep="\n")
                return None
            # preserve option for itemization markers a) b) i) ii)
            elif ((arguments[i][1] == "p") and (len(arguments[i]) == 2)) or \
                 (arguments[i] == "-preserve"):
                preserve = True
                i += 1
            # matching option for paired punctuation
            elif ((arguments[i][1] == "m") and (len(arguments[i]) == 2)) or \
                 (arguments[i] == "-match"):
                match = True
                i += 1
            # trim option to remove headlines at the start of the sentence
            elif ((arguments[i][1] == "t") and (len(arguments[i]) == 2)) or \
                 (arguments[i] == "-trim"):
                trim = True
                i += 1
            # sentence identifier (sid) model option
            elif ((arguments[i][1] == "s") and (len(arguments[i]) == 2)) or \
                 (arguments[i] == "-sid"):
                try:
                    model = arguments[i+1]
                    i += 2
                except IndexError:
                    print("sentence identifier model not given - assuming S000000")
                    i += 1
            # output file option (a file name)
            elif ((arguments[i][1] == "o") and (len(arguments[i]) == 2)) or \
                 (arguments[i] == "-output"):
                try:
                    output_file = arguments[i+1]
                    i += 2
                except IndexError:
                    print("output file name not given - assuming 'sents.conllu'")
                    i += 1
            # invalid options - nothing is executed
            else:
                print("Invalid option {}, remaining options ignored, please run again".format(arguments[i]))
                return None
        # input file - only taken if it exists
        else:
            if (os.path.isfile(arguments[i])):
                input_file = arguments[i]
                i += 1
            else:
                print("The file {} was not found, please run again".format(arguments[i]))
                return None
    return [output_file, input_file, preserve, match, trim, model]
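# For illustration, parsing a hypothetical command line such as
#   ["portTok", "-o", "out.conllu", "-p", "-m", "-t", "-s", "S000000", "in.txt"]
# returns ["out.conllu", "in.txt", True, True, True, "S000000"], provided a
# file named "in.txt" exists in the working directory; otherwise (or on any
# invalid option) parseOptions prints a message and returns None.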
#############################################################################
# Increment a name index
#############################################################################
def nextName(name):
    # increment the digits from right to left
    ans = ""
    while name != "":
        digit, name = name[-1], name[:-1]
        if digit == "9":
            # carry: a 9 rolls over to 0 and the next digit to the left is incremented
            ans = "0" + ans
        elif digit in "012345678":
            ans = str(int(digit) + 1) + ans
            return name + ans
        else:
            # a non-digit character is consumed and replaced by a leading 1
            ans = "1" + ans
            return name + ans
    return "overflow" + ans
#############################################################################
# Trim the unwanted bits at the start of the sentence - trimIt (step 1)
#############################################################################
def trimIt(s):
    # generate the bits separated by blanks, trimming leading, trailing, and repeated blanks
    bits = s.split()
    if (bits == []):
        return ""
    start = 0
    # remove itemization symbols
    if (bits[0] in ["*", "★", "-", "—", "–", ">", "."]):
        if (len(bits) == 1):
            return ""
        else:
            start = 1
    # remove datelines of the "(BELO HORIZONTE) ..." kind
    if (bits[start][0] == "(") and (bits[-1][-1] != ")"):
        for i in range(len(bits)):
            if (bits[i][-1] == ")"):
                start = i+1
                break
    # remove all-caps headlines of the "CRONOLOGIA ..." kind
    i = start
    while (i < len(bits)):
        if (bits[i].isupper()):
            start = i
            i += 1
        else:
            break
    if ((len(bits[start]) > 1) and (bits[start].isupper())) and \
       (start+1 < len(bits)):              # drop the last all-caps word only when the
        if (bits[start+1][0].isupper()):   # next word looks like a sentence beginning
            start += 1
    ans = bits[start]
    for i in range(start+1, len(bits)):
        ans += " "+bits[i]
    return ans
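# For example, with the heuristics above, a line such as
#   "- (BELO HORIZONTE) CRONOLOGIA Os fatos se sucederam rapidamente."
# loses the itemization dash, the parenthesized dateline, and the all-caps
# headline, and trimIt returns "Os fatos se sucederam rapidamente."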
#############################################################################
# Tag the itemization prompts and double-paragraph marks with //*||*\\ or //*|(|*\\ - tagIt (step 2)
#############################################################################
def tagIt(s):
    romans = ["i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", \
              "xi", "xii", "xiii", "xiv", "xv", "xvi", "xvii", "xviii", \
              "xix", "xx", "xxi", "xxii", "xxiii", "xxiv", "xxv", "xxvi", "xxvii", \
              "xxviii", "xxix", "xxx", "xxxi", "xxxii", "xxxiii", "xxxiv", "xxxv"]
    # limited up to 35
    letters = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", \
               "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"]
    # limited to single letters
    itemizePrompts = romans+letters
    # go over the sentence string looking for the itemization prompt pattern
    ans = ""
    bits = s.split(" ")
    for i in range(len(bits)):
        if (bits[i] == ""):     # guard against empty bits from repeated blanks
            ans += " "
        elif (bits[i][-1] == ")"):
            if (bits[i][0] == "("):
                if (bits[i][1:-1] in itemizePrompts):
                    ans += "//*||*\\\\"+bits[i]+"//*||*\\\\ "
                else:
                    ans += bits[i]+" "
            else:
                if (bits[i][0:-1] in itemizePrompts):
                    ans += "//*|(|*\\\\"+bits[i]+"//*||*\\\\ "
                else:
                    ans += bits[i]+" "
        elif (bits[i] == "§§"):
            ans += "//*||*\\\\"+bits[i]+"//*||*\\\\ "
        else:
            ans += bits[i]+" "
    return ans
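# For example, tagIt("a) primeiro item") yields
#   "//*|(|*\\a)//*||*\\ primeiro item "
# (note the trailing blank); step 4 later recognizes the tagged chunk,
# keeps "a)" as a single token, and removes the tags.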
#############################################################################
# Clear matching punctuations - punctIt (step 3)
#############################################################################
def punctIt(s):
    def notAlphaNum(sent):
        # True when the sentence has no alphabetic or numeric character at all
        ans = True
        for c in sent:
            if c.isalpha() or c.isdigit():
                ans = False
                break
        return ans
    doubleQuotes = s.count('"')
    singleQuotes = s.count("'")
    openParentes = s.count("(")
    closParentes = s.count(")")
    openBrackets = s.count("[")
    closBrackets = s.count("]")
    openCurBrace = s.count("{")
    closCurBrace = s.count("}")
    openAligator = s.count("<")
    closAligator = s.count(">")
    # a single pair enclosing the whole sentence is dropped
    if ((doubleQuotes == 2) and (s[0] == '"') and (s[-1] == '"')) or \
       ((singleQuotes == 2) and (s[0] == "'") and (s[-1] == "'")) or \
       ((openParentes == 1) and (closParentes == 1) and (s[0] == "(") and (s[-1] == ")")) or \
       ((openBrackets == 1) and (closBrackets == 1) and (s[0] == "[") and (s[-1] == "]")) or \
       ((openCurBrace == 1) and (closCurBrace == 1) and (s[0] == "{") and (s[-1] == "}")) or \
       ((openAligator == 1) and (closAligator == 1) and (s[0] == "<") and (s[-1] == ">")):
        S = s[1:-1].strip()
    else:
        S = s.strip()
    # unpaired punctuation is removed altogether
    if (doubleQuotes % 2 != 0):
        S = S.replace('"', '')
    if (singleQuotes % 2 != 0):
        S = S.replace("'", "")
    if (openParentes != closParentes):
        S = S.replace("(", "").replace(")", "")
    if (openBrackets != closBrackets):
        S = S.replace("[", "").replace("]", "")
    if (openCurBrace != closCurBrace):
        S = S.replace("{", "").replace("}", "")
    if (openAligator != closAligator):
        S = S.replace("<", "").replace(">", "")
    if (S == "") or (notAlphaNum(S)):
        return ""
    elif (S[-2:] == "..") and (S[-3:] != "..."):
        S = S[:-2]+"."
    elif (S[-2:] in [":.", ";."]):
        S = S[:-2]+"."
    elif (S[-1] not in [".", "!", "?", ":", ";"]):
        if (len(S) > 1) and (S[-1] in ["'", '"', ")", "]", "}", ">"]) and (S[-2] in [".", "!", "?", ":", ";"]):
            # swap the closing mark and the punctuation so the sentence ends with the punctuation
            S = S[:-2]+S[-1]+S[-2]
        else:
            S = S+"."
    return " ".join(S.split())
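# For example, punctIt('"Fim de jogo') removes the unpaired double quote and
# appends the missing final period, returning "Fim de jogo."; while
# punctIt("(tudo certo)") strips the enclosing pair and returns "tudo certo."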
#############################################################################
# Decide if ambiguous tokens are contracted or not - desambIt (within step 4)
#############################################################################
def desambIt(token, bits, i, lastField, s, SID, tokens):
    def stripWord(w):
        start, end = 0, len(w)
        for j in range(len(w)):
            if (not w[j].isalpha()):
                start = j+1
            else:
                break
        for j in range(start, len(w)):
            if (not w[j].isalpha()):
                end = j
                break
        return w[start:end].lower()
    # nos - em os - nos
    if (token.lower() == "nos"):
        if (i > 0):
            preVERB = lex.pexists(stripWord(bits[i-1]), "VERB") or lex.pexists(stripWord(bits[i-1]), "AUX")
        else:
            preVERB = False
        if (i < len(bits)-1):
            posVERB = lex.pexists(stripWord(bits[i+1]), "VERB") or lex.pexists(stripWord(bits[i+1]), "AUX")
            posNOUNDET = lex.pexists(stripWord(bits[i+1]), "NOUN") or lex.pexists(stripWord(bits[i+1]), "ADJ") or lex.pexists(stripWord(bits[i+1]), "DET")
            if (posNOUNDET):
                possible = lex.pget(stripWord(bits[i+1]), "NOUN")+lex.pget(stripWord(bits[i+1]), "ADJ")+lex.pget(stripWord(bits[i+1]), "DET")
                agree = False
                for feats in possible:
                    if ("Number=Sing" not in feats[2]) and ("Gender=Fem" not in feats[2]):
                        agree = True
                        break
                if (not agree):
                    posNOUNDET = False
        else:
            posVERB = False
            posNOUNDET = False
        if (posVERB and not posNOUNDET):
            tokens.append([token, lastField])        # don't break
        else:
            tokens.append([token, "c"+lastField])    # break
            if (token.isupper()):
                tokens.append(["EM", "_"])
                tokens.append(["OS", "_"])
            elif (token[0].isupper()):
                tokens.append(["Em", "_"])
                tokens.append(["os", "_"])
            else:
                tokens.append(["em", "_"])
                tokens.append(["os", "_"])
    # consigo - com si - consigo
    elif (token.lower() == "consigo"):
        if (i > 0):
            prePRONADV = lex.pexists(stripWord(bits[i-1]), "PRON") or lex.pexists(stripWord(bits[i-1]), "ADV")
        else:
            prePRONADV = False
        if (i < len(bits)-1):
            posVERB = lex.pexists(stripWord(bits[i+1]), "VERB") or lex.pexists(stripWord(bits[i+1]), "AUX")
        else:
            posVERB = False
        if (i < len(bits)-2):
            doQue = ((bits[i+1] == "do") and (bits[i+2] == "que")) or ((bits[i+1] == "sua"))
        else:
            doQue = False
        if ((prePRONADV) or (posVERB)) and (not doQue):
            tokens.append([token, lastField])        # don't break
        else:
            tokens.append([token, "c"+lastField])    # break
            if (token.isupper()):
                tokens.append(["COM", "_"])
                tokens.append(["SI", "_"])
            elif (token[0].isupper()):
                tokens.append(["Com", "_"])
                tokens.append(["si", "_"])
            else:
                tokens.append(["com", "_"])
                tokens.append(["si", "_"])
    # pra - para a - para
    elif (token.lower() == "pra"):
        if (i < len(bits)-1):
            posNOUNDET = lex.pexists(stripWord(bits[i+1]), "NOUN") or lex.pexists(stripWord(bits[i+1]), "ADJ") or lex.pexists(stripWord(bits[i+1]), "DET")
            if (posNOUNDET):
                possible = lex.pget(stripWord(bits[i+1]), "NOUN")+lex.pget(stripWord(bits[i+1]), "ADJ")+lex.pget(stripWord(bits[i+1]), "DET")
                agree = False
                for feats in possible:
                    if ("Number=Plur" not in feats[2]) and ("Gender=Masc" not in feats[2]):
                        agree = True
                        break
                if (not agree):
                    posNOUNDET = False
        else:
            posNOUNDET = False
        if (posNOUNDET):
            tokens.append([token, "c"+lastField])    # break
            if (token.isupper()):
                tokens.append(["PARA", "_"])
                tokens.append(["A", "_"])
            elif (token[0].isupper()):
                tokens.append(["Para", "_"])
                tokens.append(["a", "_"])
            else:
                tokens.append(["para", "_"])
                tokens.append(["a", "_"])
        else:
            tokens.append([token, lastField])        # don't break
    # pela - por a - pela
    elif (token.lower() == "pela"):
        if (i < len(bits)-1):
            posNOUNDET = lex.pexists(stripWord(bits[i+1]), "NOUN") or lex.pexists(stripWord(bits[i+1]), "ADJ") or lex.pexists(stripWord(bits[i+1]), "NUM") or lex.pexists(stripWord(bits[i+1]), "DET")
            properNOUNDIGIT = bits[i+1][0].isupper() or bits[i+1][0].isnumeric()
        else:
            posNOUNDET = False
            properNOUNDIGIT = False
        if (posNOUNDET) or (properNOUNDIGIT):
            tokens.append([token, "c"+lastField])    # break
            if (token.isupper()):
                tokens.append(["POR", "_"])
                tokens.append(["A", "_"])
            elif (token[0].isupper()):
                tokens.append(["Por", "_"])
                tokens.append(["a", "_"])
            else:
                tokens.append(["por", "_"])
                tokens.append(["a", "_"])
        else:
            tokens.append([token, lastField])        # don't break
    # pelas - por as - pelas
    elif (token.lower() == "pelas"):
        if (i < len(bits)-1):
            posNOUNDET = lex.pexists(stripWord(bits[i+1]), "NOUN") or lex.pexists(stripWord(bits[i+1]), "ADJ") or lex.pexists(stripWord(bits[i+1]), "NUM") or lex.pexists(stripWord(bits[i+1]), "DET")
            properNOUNDIGIT = bits[i+1][0].isupper() or bits[i+1][0].isnumeric()
        else:
            posNOUNDET = False
            properNOUNDIGIT = False
        if (posNOUNDET) or (properNOUNDIGIT):
            tokens.append([token, "c"+lastField])    # break
            if (token.isupper()):
                tokens.append(["POR", "_"])
                tokens.append(["AS", "_"])
            elif (token[0].isupper()):
                tokens.append(["Por", "_"])
                tokens.append(["as", "_"])
            else:
                tokens.append(["por", "_"])
                tokens.append(["as", "_"])
        else:
            tokens.append([token, lastField])        # don't break
    # pelo - por o - pelo
    elif (token.lower() == "pelo"):
        if (i > 0):
            preART = lex.pexists(stripWord(bits[i-1]), "DET")
            if (preART):
                possible = lex.pget(stripWord(bits[i-1]), "DET")
                agree = False
                for feats in possible:
                    if ("Number=Plur" not in feats[2]) and ("Gender=Fem" not in feats[2]):
                        agree = True
                        break
                if (not agree):
                    preART = False
            else:
                preART = (stripWord(bits[i-1]) != "que") and (stripWord(bits[i-1]) != "dado") and (stripWord(bits[i-1]) != "tanto") and (stripWord(bits[i-1]) != "quanto") and (stripWord(bits[i-1]) != "mais")
        else:
            preART = False
        if (i < len(bits)-1):
            posNOUNDET = lex.pexists(stripWord(bits[i+1]), "NOUN") or lex.pexists(stripWord(bits[i+1]), "ADJ") or lex.pexists(stripWord(bits[i+1]), "DET")
            posLower = not bits[i+1][0].isupper()
            if (posNOUNDET):
                possible = lex.pget(stripWord(bits[i+1]), "NOUN")+lex.pget(stripWord(bits[i+1]), "ADJ")+lex.pget(stripWord(bits[i+1]), "DET")
                agree = False
                for feats in possible:
                    if ("Number=Plur" not in feats[2]) and ("Gender=Fem" not in feats[2]):
                        agree = True
                        break
                if (not agree):
                    posNOUNDET = False
        else:
            posNOUNDET = False
            posLower = True
        if (preART) and (not posNOUNDET) and (posLower):
            tokens.append([token, lastField])        # don't break
        else:
            tokens.append([token, "c"+lastField])    # break
            if (token.isupper()):
                tokens.append(["POR", "_"])
                tokens.append(["O", "_"])
            elif (token[0].isupper()):
                tokens.append(["Por", "_"])
                tokens.append(["o", "_"])
            else:
                tokens.append(["por", "_"])
                tokens.append(["o", "_"])
    # pelos - por os - pelos
    elif (token.lower() == "pelos"):
        if (i > 0):
            preART = lex.pexists(stripWord(bits[i-1]), "DET")
            if (preART):
                possible = lex.pget(stripWord(bits[i-1]), "DET")
                agree = False
                for feats in possible:
                    if ("Number=Sing" not in feats[2]) and ("Gender=Fem" not in feats[2]) and ("PronType=Art" in feats[2]):
                        agree = True
                        break
                if (not agree):
                    preART = False
            else:
                preART = (stripWord(bits[i-1]) != "que") and (stripWord(bits[i-1]) != "dado") and (stripWord(bits[i-1]) != "tanto") and (stripWord(bits[i-1]) != "quanto") and (stripWord(bits[i-1]) != "mais")
        else:
            preART = False
        if (i < len(bits)-1):
            posNOUNDET = lex.pexists(stripWord(bits[i+1]), "NOUN") or lex.pexists(stripWord(bits[i+1]), "ADJ") or lex.pexists(stripWord(bits[i+1]), "DET")
            posLower = not bits[i+1][0].isupper()
            if (posNOUNDET):
                possible = lex.pget(stripWord(bits[i+1]), "NOUN")+lex.pget(stripWord(bits[i+1]), "ADJ")+lex.pget(stripWord(bits[i+1]), "DET")
                agree = False
                for feats in possible:
                    if ("Number=Sing" not in feats[2]) and ("Gender=Fem" not in feats[2]) and ("PronType=Art" in feats[2]):
                        agree = True
                        break
                if (not agree):
                    posNOUNDET = False
        else:
            posNOUNDET = False
            posLower = True
        if (preART) and (not posNOUNDET) and (posLower):
            tokens.append([token, lastField])        # don't break
        else:
            tokens.append([token, "c"+lastField])    # break
            if (token.isupper()):
                tokens.append(["POR", "_"])
                tokens.append(["OS", "_"])
            elif (token[0].isupper()):
                tokens.append(["Por", "_"])
                tokens.append(["os", "_"])
            else:
                tokens.append(["por", "_"])
                tokens.append(["os", "_"])
#############################################################################
# Tokenizing - tokenizeIt (step 4)
#############################################################################
def tokenizeIt(s, SID, outfile):
    removable = ["'", '"', "(", ")", "[", "]", "{", "}", "<", ">", \
                 "!", "?", ",", ";", ":", "=", "+", "*", "★", "|", "/", "\\", \
                 "&", "^", "_", "`", "'", "~", "%", "§"]
    ignored = ["@", "#"]
    digits = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
    contracts = {"à":["a", "a"],
                 "às":["a", "as"],
                 "ao":["a", "o"],
                 "aos":["a", "os"],
                 "àquela":["a", "aquela"],
                 "àquelas":["a", "aquelas"],
                 "àquele":["a", "aquele"],
                 "àqueles":["a", "aqueles"],
                 "comigo":["com", "mim"],
                 "contigo":["com", "ti"],
                 "consigo":["com", "si"],
                 "conosco":["com", "nós"],
                 "convosco":["com", "vós"],
                 "da":["de", "a"],
                 "das":["de", "as"],
                 "do":["de", "o"],
                 "dos":["de", "os"],
                 "dali":["de", "ali"],
                 "daqui":["de", "aqui"],
                 "daí":["de", "aí"],
                 "dentre":["de", "entre"],
                 "desta":["de", "esta"],
                 "destas":["de", "estas"],
                 "deste":["de", "este"],
                 "destes":["de", "estes"],
                 "dessa":["de", "essa"],
                 "dessas":["de", "essas"],
                 "desse":["de", "esse"],
                 "desses":["de", "esses"],
                 "daquela":["de", "aquela"],
                 "daquelas":["de", "aquelas"],
                 "daquele":["de", "aquele"],
                 "daqueles":["de", "aqueles"],
                 "disto":["de", "isto"],
                 "disso":["de", "isso"],
                 "daquilo":["de", "aquilo"],
                 "dela":["de", "ela"],
                 "delas":["de", "elas"],
                 "dele":["de", "ele"],
                 "deles":["de", "eles"],
                 "doutra":["de", "outra"],
                 "doutras":["de", "outras"],
                 "doutro":["de", "outro"],
                 "doutros":["de", "outros"],
                 "dum":["de", "um"],
                 "duns":["de", "uns"],
                 "duma":["de", "uma"],
                 "dumas":["de", "umas"],
                 "na":["em", "a"],
                 "nas":["em", "as"],
                 "no":["em", "o"],
                 "nos":["em", "os"],
                 "nesta":["em", "esta"],
                 "nestas":["em", "estas"],
                 "neste":["em", "este"],
                 "nestes":["em", "estes"],
                 "nessa":["em", "essa"],
                 "nessas":["em", "essas"],
                 "nesse":["em", "esse"],
                 "nesses":["em", "esses"],
                 "naquela":["em", "aquela"],
                 "naquelas":["em", "aquelas"],
                 "naquele":["em", "aquele"],
                 "naqueles":["em", "aqueles"],
                 "nisto":["em", "isto"],
                 "nisso":["em", "isso"],
                 "naquilo":["em", "aquilo"],
                 "nela":["em", "ela"],
                 "nelas":["em", "elas"],
                 "nele":["em", "ele"],
                 "neles":["em", "eles"],
                 "noutra":["em", "outra"],
                 "noutras":["em", "outras"],
                 "noutro":["em", "outro"],
                 "noutros":["em", "outros"],
                 "num":["em", "um"],
                 "nuns":["em", "uns"],
                 "numa":["em", "uma"],
                 "numas":["em", "umas"],
                 "pela":["por", "a"],
                 "pelas":["por", "as"],
                 "pelo":["por", "o"],
                 "pelos":["por", "os"],
                 "pra":["para", "a"],
                 "pras":["para", "as"],
                 "pro":["para", "o"],
                 "pros":["para", "os"],
                 "prum":["para", "um"],
                 "pruns":["para", "uns"],
                 "pruma":["para", "uma"],
                 "prumas":["para", "umas"]}
    ambiguous = ["nos", "consigo", "pra", "pela", "pelas", "pelo", "pelos"]
    # ambiguous = ["nos", "consigo", "pra", "pelo", "pelos"]
    enclisis = ['me', 'te', 'se', 'lhe', 'o', 'a', 'nos', 'vos', 'lhes', 'os', 'as', 'lo', 'la', 'los', 'las']
    doubleEnclisis = ["mo", "to", "lho", "lhos", "ma", "ta", "lha", "lhas", "mos", "tos"]
    doubleEnclisisTri = ["no-lo", "vo-lo", "no-la", "vo-la", "no-los", "vo-los", "no-las", "vo-las"]
    terminations = ["ia", "ias", "as", "iamos", "ieis", "iam", "ei", "a", "emos", "eis", "ão", "á"]
    abbrev = [
        "dr.", "dra.", "sr.", "sra.", "prof.", "profa.", "Dr.", "Dra.", "Sr.", "Sra.", "Prof.", "Profa.", "DR.", "DRA.", "SR.", "SRA.", "PROF.", "PROFA.",
        "ilmo.", "Ilmo.", "ILMO.", "bel.", "Bel.", "BEL.", "eng.", "Eng.", "ENG.", "reg.", "Reg.", "REG.", "visc.", "Visc.", "VISC.", "bar.", "Bar.", "BAR.",
        "cond.", "Cond.", "COND.", "séc.", "Séc.", "SÉC.", "jr.", "Jr.", "JR.", "ir.", "Ir.", "IR.", "st.", "St.", "ST.", "app.", "App.", "APP.",
        "gov.", "Gov.", "GOV.", "des.", "Des.", "DES.", "gen.", "Gen.", "GEN.", "gal.", "Gal.", "GAL.", "cel.", "Cel.", "CEL.", "col.", "Col.", "COL.",
        "maj.", "Maj.", "MAJ.", "ten.", "Ten.", "TEN.", "cap.", "Cap.", "CAP.", "capt.", "Capt.", "CAPT.", "com.", "Com.", "COM.", "brig.", "Brig.", "BRIG.",
        "estac.", "Estac.", "ESTAC.", "tel.", "Tel.", "TEL.", "ave.", "Ave.", "AVE.", "av.", "Av.", "AV.", "trav.", "Trav.", "TRAV.", "con.", "Con.", "CON.",
        "jd.", "Jd.", "JD.", "ed.", "Ed.", "ED.", "lj.", "Lj.", "LJ.", "cj.", "Cj.", "CJ.", "apto.", "Apto.", "APTO.", "apt.", "Apt.", "APT.", "ingr.", "Ingr.", "INGR.",
        "ap.", "Ap.", "AP.", "dir.", "Dir.", "DIR.", "min.", "Min.", "MIN.", "sec.", "Sec.", "SEC.", "kg.", "Kg.", "KG.", "ml.", "Ml.", "ML.", "km.", "Km.", "KM.", "cm.", "Cm.", "CM.",
        "vol.", "Vol.", "VOL.", "PP.", "pp.", "Pp.", "pag.", "Pag.", "PAG.", "pág.", "Pág.", "PÁG.", "al.", "Al.", "AL.", "etc.", "i.e.", "e.g.", "cia.", "Cia.", "CIA.",
        "co.", "Co.", "CO.", "ltda.", "Ltda.", "LTDA.", "ex.", "Ex.", "EX.", "ac.", "Ac.", "AC.", "dc.", "Dc.", "DC.", "bros.", "Bros.", "BROS.", "pq.", "Pq.", "PQ.",
        "br.", "Br.", "BR.", "cent.", "Cent.", "CENT.", "ft.", "Ft.", "FT.", "net.", "Net.", "NET.", "no.", "No.", "NO.", "nr.", "Nr.", "NR.", "tr.", "Tr.", "TR.",
        "mi.", "Mi.", "MI.", "sta.", "Sta.", "STA.", "sto.", "Sto.", "STO.", "int.", "Int.", "INT.", "inf.", "Inf.", "INF.", "cult.", "Cult.", "CULT.", "op.", "Op.", "OP.",
        "aprox.", "Aprox.", "APROX.", "it.", "It.", "IT.", "ex.", "Ex.", "EX.", "flex.", "Flex.", "FLEX.", "ass.", "Ass.", "ASS.", "pç.", "Pç.", "PÇ.", "ind.", "Ind.", "IND.",
        "vl.", "Vl.", "VL.", "imp.", "Imp.", "IMP.", "emp.", "Emp.", "EMP.", "esq.", "Esq.", "ESQ.", "dir.", "Dir.", "DIR.", "ingr.", "Ingr.", "INGR.", "pça.", "Pça.", "PÇA.",
        "art.", "Art.", "ART.", "sec.", "Sec.", "SEC.", "inc.", "Inc.", "INC.", "a.", "A.", "b.", "B.", "c.", "C.", "d.", "D.", "e.", "E.", "f.", "F.", "g.", "G.", "h.", "H.", "i.", "I.",
        "j.", "J.", "k.", "K.", "l.", "L.", "m.", "M.", "n.", "N.", "o.", "O.", "p.", "P.", "q.", "Q.", "r.", "R.", "s.", "S.", "t.", "T.", "u.", "U.", "v.", "V.", "w.", "W.", "x.", "X.", "y.", "Y.", "z.", "Z.",
        "seg.", "ter.", "qua.", "qui.", "sex.", "sab.", "sáb.", "dom.", "Seg.", "Ter.", "Qua.", "Qui.", "Sex.", "Sab.", "Sáb.", "Dom.", "SEG.", "TER.", "QUA.", "QUI.", "SEX.", "SAB.", "SÁB.", "DOM.",
        "jan.", "fev.", "mar.", "abr.", "mai.", "jun.", "jul.", "ago.", "set.", "out.", "nov.", "dez.", "Jan.", "Fev.", "Mar.", "Abr.", "Mai.", "Jun.", "Jul.", "Ago.", "Set.", "Out.", "Nov.", "Dez.",
        "JAN.", "FEV.", "MAR.", "ABR.", "MAI.", "JUN.", "JUL.", "AGO.", "SET.", "OUT.", "NOV.", "DEZ."
    ]
    #abbrev = []
    #infile = open("abbrev.txt", "r")
    #for line in infile:
    #    abbrev.append(line[:-1])
    #infile.close()
    def isAbbrev(chunk, abbrev):
        abbr = False
        for a in abbrev:
            if (chunk == a):
                abbr = True
                break
            #else:
            #    lasts = -len(a)
            #    if (chunk[lasts:] == a) and (not chunk[lasts-1].isalpha()):
            #        abbr = True
            #        break
        return abbr
    tokens = []
    bits = s.split(" ")
    k = 0
    for b in bits:
        pretagged = False
        if (len(b) > 16):
            if (b[:8] == "//*||*\\\\") or (b[:9] == "//*|(|*\\\\"):
                pretagged = True
        if (pretagged):
            # keep the bit as a single token and clean the //*||*\\ tags around it
            tokens.append([b.replace("//*||*\\\\", "").replace("//*|(|*\\\\", ""), "_"])
        else:
            # deal with the punctuation before the middle part
            pre = []
            changed = True
            while (changed) and (len(b) > 1):
                changed = False
                if (b[0] in removable) or ((b[0] == "$") and (b[1] in digits)) or ((b[0] == "-") and (b[1] not in digits)):
                    pre.append(b[0])
                    b = b[1:]
                    changed = True
            # deal with the punctuation after the middle part
            tmp = []
            changed = True
            while (changed) and (len(b) > 1):
                if (isAbbrev(b, abbrev)):
                    break
                changed = False
                if (b[-1] in removable+["-", "."]):
                    tmp.append(b[-1])
                    b = b[:-1]
                    changed = True
            pos = []
            reticent = ""
            # rebuild ellipses ("...") from the individually stripped dots
            for i in range(len(tmp)-1, -1, -1):
                if (tmp[i] == "."):
                    if (reticent == ""):
                        reticent = "."
                    elif (reticent == "."):
                        reticent = ".."
                    elif (reticent == ".."):
                        pos.append("...")
                        reticent = ""
                else:
                    if (reticent != ""):
                        pos.append(reticent)
                        reticent = ""
                    pos.append(tmp[i])
            if (reticent != ""):
                pos.append(reticent)
            # deal with the middle
            buf = b.split("-")
            if (len(buf) == 1):
                parts = pre+[b]+pos
            # enclisis (type I - infinitive, e.g. cumprí-lo, and type II - altered verb stem, e.g. satisfê-lo)
            elif (len(buf) == 2) and (buf[1] in enclisis):
                if (buf[0][-1] == "á"):
                    if (lex.pexists(buf[0][:-1]+"ar", "VERB")):
                        parts = pre+["*^*"+b, buf[0][:-1]+"ar", buf[1]]+pos
                    else:
                        if (lex.pexists(buf[0][:-1]+"as", "VERB")):
                            parts = pre+["*^*"+b, buf[0][:-1]+"as", buf[1]]+pos
                        else:
                            parts = pre+["*^*"+b, buf[0][:-1]+"az", buf[1]]+pos
                elif (buf[0][-1] == "ê"):
                    if (lex.pexists(buf[0][:-1]+"er", "VERB")):
                        parts = pre+["*^*"+b, buf[0][:-1]+"er", buf[1]]+pos
                    else:
                        if (lex.pexists(buf[0][:-1]+"es", "VERB")):
                            parts = pre+["*^*"+b, buf[0][:-1]+"es", buf[1]]+pos
                        else:
                            parts = pre+["*^*"+b, buf[0][:-1]+"ez", buf[1]]+pos
                elif (buf[0][-1] == "í"):
                    if (lex.pexists(buf[0][:-1]+"ir", "VERB")):
                        parts = pre+["*^*"+b, buf[0][:-1]+"ir", buf[1]]+pos
                    else:
                        if (lex.pexists(buf[0][:-1]+"is", "VERB")):
                            parts = pre+["*^*"+b, buf[0][:-1]+"is", buf[1]]+pos
                        else:
                            parts = pre+["*^*"+b, buf[0][:-1]+"iz", buf[1]]+pos
                elif (buf[0][-1] == "ô"):
                    if (lex.pexists(buf[0][:-1]+"or", "VERB")):
                        parts = pre+["*^*"+b, buf[0][:-1]+"or", buf[1]]+pos
                    else:
                        if (lex.pexists(buf[0][:-1]+"os", "VERB")):
                            parts = pre+["*^*"+b, buf[0][:-1]+"os", buf[1]]+pos
                        else:
                            parts = pre+["*^*"+b, buf[0][:-1]+"oz", buf[1]]+pos
                else:
                    parts = pre+["*^*"+b, buf[0], buf[1]]+pos
            # double enclisis - type II (e.g. disse-lhos, dei-ta)
            elif (len(buf) == 2) and (buf[1] in doubleEnclisis):
                if (buf[1][-1] == "a"):
                    parts = pre+["*^^*"+b, buf[0], buf[1][:-1]+"e", buf[1][-1]]+pos
                elif (buf[1][-1] == "o"):
                    parts = pre+["*^^*"+b, buf[0], buf[1][:-1]+"e", buf[1][-1]]+pos
                elif (buf[1][-2:] == "as"):
                    parts = pre+["*^^*"+b, buf[0], buf[1][:-2]+"e", buf[1][-2:]]+pos
                elif (buf[1][-2:] == "os"):
                    parts = pre+["*^^*"+b, buf[0], buf[1][:-2]+"e", buf[1][-2:]]+pos
                else:
                    parts = pre+["*^*"+b, buf[0], buf[1]]+pos
            # double enclisis - type I (e.g. dá-se-lhes)
            elif (len(buf) == 3) and (buf[1] in enclisis) and (buf[2] in enclisis):
                if (buf[0][-1] == "á"):
                    parts = pre+["*^^*"+b, buf[0][:-1]+"ar", buf[1], buf[2]]+pos
                elif (buf[0][-1] == "ê"):
                    parts = pre+["*^^*"+b, buf[0][:-1]+"er", buf[1], buf[2]]+pos
                elif (buf[0][-1] == "í"):
                    parts = pre+["*^^*"+b, buf[0][:-1]+"ir", buf[1], buf[2]]+pos
                elif (buf[0][-1] == "ô"):
                    parts = pre+["*^^*"+b, buf[0][:-1]+"or", buf[1], buf[2]]+pos
                else:
                    parts = pre+["*^^*"+b, buf[0], buf[1], buf[2]]+pos
            # mesoclisis - type I (e.g. dar-lo-ia)
            elif (len(buf) == 3) and (buf[1] in enclisis) \
                 and (buf[0][-1] == "r") and (buf[2] in terminations):
                parts = pre+["*^*"+b, buf[0]+buf[2], buf[1]]+pos
            # mesoclisis - type II (e.g. dá-lo-ia)
            elif (len(buf) == 3) and (buf[1] in enclisis) \
                 and (buf[0][-1] in ["á", "ê", "í", "ô"]) and (buf[2] in terminations):
                if (buf[0][-1] == "á"):
                    parts = pre+["*^*"+b, buf[0][:-1]+"ar"+buf[2], buf[1]]+pos
                elif (buf[0][-1] == "ê"):
                    parts = pre+["*^*"+b, buf[0][:-1]+"er"+buf[2], buf[1]]+pos
                elif (buf[0][-1] == "í"):
                    parts = pre+["*^*"+b, buf[0][:-1]+"ir"+buf[2], buf[1]]+pos
                elif (buf[0][-1] == "ô"):
                    parts = pre+["*^*"+b, buf[0][:-1]+"or"+buf[2], buf[1]]+pos
            else:
                parts = pre+[b]+pos
            # transform the parts into tokens to be added
            i = 0
            while (i < len(parts)):
                if (i == len(parts)-1):
                    lastField = "_"
                else:
                    lastField = "SpaceAfter=No"
                if (parts[i][:3] == "*^*"):
                    # contraction split into two parts (e.g. verb + enclitic pronoun)
                    if (i+3 == len(parts)):
                        tokens.append([parts[i][3:], "c_"])
                    else:
                        tokens.append([parts[i][3:], "cSpaceAfter=No"])
                    i += 1
                    tokens.append([parts[i], "_"])
                    i += 1
                    tokens.append([parts[i], "_"])
                elif (parts[i][:4] == "*^^*"):
                    # contraction split into three parts (e.g. double enclisis)
                    if (i+4 == len(parts)):
                        tokens.append([parts[i][4:], "C_"])
                    else:
                        tokens.append([parts[i][4:], "CSpaceAfter=No"])
                    i += 1
                    tokens.append([parts[i], "_"])
                    i += 1
                    tokens.append([parts[i], "_"])
                    i += 1
                    tokens.append([parts[i], "_"])
                elif (parts[i] not in ambiguous):
                    ans = contracts.get(parts[i].lower())
                    if (ans == None):
                        tokens.append([parts[i], lastField])
                    else:
                        tokens.append([parts[i], "c"+lastField])
                        if (parts[i].isupper()):
                            tokens.append([ans[0].upper(), "_"])
                            tokens.append([ans[1].upper(), "_"])
                        elif (parts[i][0].isupper()):
                            tokens.append([ans[0][0].upper()+ans[0][1:], "_"])
                            tokens.append([ans[1], "_"])
                        else:
                            tokens.append([ans[0], "_"])
                            tokens.append([ans[1], "_"])
                else:
                    desambIt(parts[i], bits, k, lastField, s, SID, tokens)
                i += 1
        k += 1
    # output the sentence with all its tokens
    print("# sent_id =", SID, file=outfile)
    print("# text =", s.replace("//*||*\\\\", "").replace("//*|(|*\\\\", ""), file=outfile)
    # print out the tokens
    toks = 0
    for i in range(len(tokens)):
        if (tokens[i][1][0] == "c"):
            # contracted word (two parts)
            print(str(toks+1)+"-"+str(toks+2), tokens[i][0], "_", "_", "_", "_", "_", "_", "_", tokens[i][1][1:], sep="\t", file=outfile)
        elif (tokens[i][1][0] == "C"):
            # contracted word (three parts)
            print(str(toks+1)+"-"+str(toks+3), tokens[i][0], "_", "_", "_", "_", "_", "_", "_", tokens[i][1][1:], sep="\t", file=outfile)
        elif (tokens[i][0].strip() != ""):
            # non-contracted word
            toks += 1
            print(str(toks), tokens[i][0].strip(), "_", "_", "_", "_", "_", "_", "_", tokens[i][1], sep="\t", file=outfile)
    print(file=outfile)
    return toks
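# Sketch of the expected CoNLL-U output for the sentence "Vi o céu." with
# sent_id S000001 (10 tab-separated fields per token; MISC carries SpaceAfter):
#   # sent_id = S000001
#   # text = Vi o céu.
#   1   Vi    _   _   _   _   _   _   _   _
#   2   o     _   _   _   _   _   _   _   _
#   3   céu   _   _   _   _   _   _   _   SpaceAfter=No
#   4   .     _   _   _   _   _   _   _   _
# A contraction such as "no" would instead produce a range line "1-2  no"
# followed by the rows for the split tokens "em" and "o".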
#################################################
### Deal with a sentence, clean it, if required, then tokenize it
#################################################
def dealWith(outfile, sent, SID, preserve, match, trim):
    if (trim):        # step 1
        sent = trimIt(sent)
    if (preserve):    # step 2
        sent = tagIt(sent)
    if (match):       # step 3
        sent = punctIt(sent)
    if (sent != ""):  # step 4
        return 1, tokenizeIt(sent, SID, outfile)
    else:
        return 0, 0
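# The cleaning pipeline is thus: trimIt (step 1, optional) -> tagIt (step 2,
# optional) -> punctIt (step 3, optional) -> tokenizeIt (step 4); dealWith
# returns the pair (sentences written: 0 or 1, tokens written), which the
# main function accumulates into its totals.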
#################################################
### Main program function - parses the arguments and calls 'dealWith' for each input sentence
#################################################
def portTok():
    if (len(sys.argv) == 1):
| #arguments = ["/Users/pf64/Desktop/alienista/alienista_empty.conllu", "/Users/pf64/Desktop/alienista/alienista.txt", False, True, False, "ATENISTA_SENT0000"] | |
| #arguments = ["/Users/pf64/Desktop/tst.conllu", "/Users/pf64/Desktop/tst.txt", True, True, True, "S000000"] | |
| arguments = ["sents.conllu", "sents.txt", True, True, True, "S000000"] | |
| print("Assumindo default: 'sents.conllu' como arquivo de saída, 'sents.txt' como arquivo de entrada, correções, remoções e S0000 como sid.") | |
| else: | |
| arguments = parseOptions(sys.argv) | |
| if (arguments != None): | |
| if (arguments[0] == ""): | |
| print("Assumindo 'sents.conllu' como arquivo de saída") | |
| arguments[0] = 'sents.conllu' | |
| if (arguments[1] == []): | |
| print("Arquivo de entrada inválido - por favor corrija e tente novamente") | |
| else: | |
| outfile = open(arguments[0], "w") | |
| #print("# newdoc id = {}\n# newpar".format(arguments[0]), file=outfile) | |
| infile = open(arguments[1], "r") | |
| SID = arguments[5] | |
| sTOTAL, tTOTAL = 0, 0 | |
| for line in infile: | |
| SID = nextName(SID) | |
| s, t = dealWith(outfile, line[:-1], SID, arguments[2], arguments[3], arguments[4]) | |
| if (s == 1): | |
| sTOTAL += 1 | |
| tTOTAL += t | |
| outfile.close() | |
| infile.close() | |
| print("Tokenização terminada com {} sentenças extraídas ({} tokens) e salvas em {}".format(sTOTAL, tTOTAL, arguments[0])) | |
| else: | |
| print("Problemas com parâmetros - por favor corrija e tente novamente") | |
| portTok() | |