# portTokenizer - tokenizer of Portuguese sentences into a CoNLL-U file
#
# This program takes a text input file with one sentence per line and
# generates a properly tokenized CoNLL-U file.
#
# It relies on a lexicon of Portuguese, PortiLexicon-UD, accessed through
# the class UDlexPT included in the file "lexikon.py", which is distributed
# together with this file ("portTok.py") and with the lexicon text files
# ("ADJ.tsv", "ADP.tsv", "ADV.tsv", "AUX.tsv", "CCONJ.tsv",
# "DET.tsv", "INTJ.tsv", "NOUN.tsv", "NUM.tsv", "PRON.tsv", "SCONJ.tsv",
# "VERB.tsv", "WORDmaster.txt").
#
# Options:
#
# -h help
# -o output file
# -p preserve itemization marks such as a) b) and i) ii) as single tokens
# -m match paired punctuation marks
# -t trim headlines (heuristic)
# -s sentence id (sid) model
#
# Usage example:
#
# portTok -o sents.conllu -p -m -t -s S000000 sents.txt
#
# Reads the sentences from the file 'sents.txt',
# preserves tokens in items such as a) b) i) ii),
# fixes paired punctuation (quotes, parentheses, etc.),
# removes possible HEADLINES preceding the sentences,
# uses S000000 as the sentence identifier model, and
# saves the properly tokenized sentences to the file 'sents.conllu'.
#
# last edit: 10/05/2025
# created by Lucelene Lopes - [email protected]
import sys, os
import lexikon
lex = lexikon.UDlexPT()
#################################################
### Parse the command line arguments
#################################################
def parseOptions(arguments):
# default options
output_file, input_file, preserve, match, trim, model = "", [], False, False, False, "S000000"
i = 1
while i < len(arguments):
if (arguments[i][0] == "-"):
            # help option - prints this help, nothing is executed
            if ((arguments[i][1] == "h") and (len(arguments[i])==2)) or \
               (arguments[i] == "-help"):
                print("Options:\n-h help\n-o output file", \
                      "-p preserve itemization marks such as a) b) i) ii) as single tokens", \
                      "-m match paired punctuation marks (quotes, parentheses, etc.)", \
                      "-t trim possible HEADLINES preceding the sentences (heuristic)", \
                      "-s sentence id (sid) model", \
                      "Usage example:", \
                      "portTok -o sents.conllu -p -m -t -s S000000 sents.txt", \
                      "Reads the sentences from the file 'sents.txt',", \
                      " preserves tokens in items such as a) b) i) ii),", \
                      " fixes paired punctuation (quotes, parentheses, etc.),", \
                      " removes possible HEADLINES preceding the sentences,", \
                      " uses S000000 as the sentence identifier model, and", \
                      " saves the properly tokenized sentences to the file 'sents.conllu'", \
                      sep="\n")
return None
            # preserve option for itemization marks a) b) i) ii)
elif ((arguments[i][1] == "p") and (len(arguments[i])==2)) or \
(arguments[i] == "-preserve"):
preserve = True
i += 1
            # match option for paired punctuation marks
elif ((arguments[i][1] == "m") and (len(arguments[i])==2)) or \
(arguments[i] == "-match"):
match = True
i += 1
            # trim option for headlines at the beginning of the sentence
elif ((arguments[i][1] == "t") and (len(arguments[i])==2)) or \
(arguments[i] == "-trim"):
trim = True
i += 1
            # sentence identifier (sid) model option - 0 for no limit
elif ((arguments[i][1] == "s") and (len(arguments[i])==2)) or \
(arguments[i] == "-sid"):
try:
model = arguments[i+1]
i += 2
                except IndexError:
                    print("sentence identifier model not informed - assuming S000000")
i += 1
            # output file option (a file name)
elif ((arguments[i][1] == "o") and (len(arguments[i])==2)) or \
(arguments[i] == "-output"):
                try:
                    output_file = arguments[i+1]
                    i += 2
                except IndexError:
                    print("output file name not informed - assuming 'sents.conllu'")
                    i += 1
            # invalid options - nothing is executed
else:
print("Opção {} inválida, demais opções ignoradas, por favor execute novamente".format(arguments[i]))
return None
        # input file - only included if it exists
else:
if (os.path.isfile(arguments[i])):
input_file = arguments[i]
i += 1
else:
print("O arquivo {} não foi encontrado, por favor execute novamente".format(arguments[i]))
return None
return [output_file, input_file, preserve, match, trim, model]
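# A quick illustration of the expected parse (hypothetical call; it assumes a
# file 'sents.txt' exists in the current directory):
#   parseOptions(["portTok", "-o", "out.conllu", "-m", "-t", "sents.txt"])
#   -> ["out.conllu", "sents.txt", False, True, True, "S000000"]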
#############################################################################
# Increment a name index
#############################################################################
def nextName(name):
    # increment the digits from right to left, carrying over each 9
    ans = ""
    while name != "":
        digit, name = name[-1], name[:-1]
        if digit == "9":
            ans = "0" + ans            # carry over to the digit at the left
        elif digit in "012345678":
            return name + str(int(digit) + 1) + ans
        else:
            # a non-digit character reached by the carry becomes a leading "1"
            return name + "1" + ans
    return "overflow" + ans
#############################################################################
# Trim the unwanted bits at the sentence - trimIt (step 1)
#############################################################################
def trimIt(s):
    # split the sentence into bits, trimming leading, trailing, and repeated blanks
    bits = s.split()
    # a blank line yields no bits and nothing to trim
    if (bits == []):
        return ""
    start = 0
# remove itemize symbols
if (bits[0] in ["*", "★", "-", "—", "–", ">", "."]):
if (len(bits) == 1):
return ""
else:
start = 1
# remove (BELO HORIZONTE) ... kind
if (bits[start][0] == "(") and (bits[-1][-1] != ")"):
for i in range(len(bits)):
if (bits[i][-1] == ")"):
start = i+1
break
# remove CRONOLOGIA .... kind
i = start
while (i<len(bits)):
if (bits[i].isupper()):
start = i
i += 1
else:
break
if ((len(bits[start]) > 1) and (bits[start].isupper())) and \
(start+1 < len(bits)): # make sure the next after all upper
if (bits[start+1][0].isupper()): # is not a beginning of sentence
start += 1
ans = bits[start]
for i in range(start+1,len(bits)):
ans += " "+bits[i]
return ans
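# Illustrative behavior of the headline heuristic (hypothetical inputs, not executed):
#   trimIt("- SÃO PAULO A bolsa caiu")             -> "A bolsa caiu"
#   trimIt("(BELO HORIZONTE) O prefeito anunciou") -> "O prefeito anunciou"
#   trimIt("CRONOLOGIA Em 1964 houve o golpe")     -> "Em 1964 houve o golpe"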
#############################################################################
# Tag the itemize prompts and double paragraph with //*||*\\ or //*|(|*\\ - tagIt (step 2)
#############################################################################
def tagIt(s):
romans = ["i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", \
"xi", "xii", "xiii", "xiv", "xv", "xvi", "xvii", "xviii", \
"xix", "xx", "xxi", "xxii", "xxiii", "xxiv", "xxvi", "xxvii", \
"xxviii", "xxix", "xxx", "xxxi", "xxxii", "xxxiii", "xxxiv", "xxxv"]
# limited up to 35
letters = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", \
"o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"]
# limited to single letters
itemizePrompts = romans+letters
    # go over the sentence string looking for the itemize prompt pattern
ans = ""
bits = s.split(" ")
for i in range(len(bits)):
        if (bits[i].endswith(")")):   # endswith is safe for empty bits left by repeated blanks
if (bits[i][0] == "("):
if (bits[i][1:-1] in itemizePrompts):
ans += "//*||*\\\\"+bits[i]+"//*||*\\\\ "
else:
ans += bits[i]+" "
else:
if (bits[i][0:-1] in itemizePrompts):
ans += "//*|(|*\\\\"+bits[i]+"//*||*\\\\ "
else:
ans += bits[i]+" "
elif (bits[i] == "§§"):
ans += "//*||*\\\\"+bits[i]+"//*||*\\\\ "
else:
ans += bits[i]+" "
    return ans.rstrip()   # drop the trailing blank appended after the last bit
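# Illustrative behavior (hypothetical input, not executed): an itemize prompt
# such as "b)" is wrapped in the internal tags so step 4 keeps it whole:
#   tagIt("b) segunda opção") -> "//*|(|*\\b)//*||*\\ segunda opção"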
#############################################################################
# Clear matching punctuations - punctIt (step 3)
#############################################################################
def punctIt(s):
def notAlphaNum(sent):
ans = True
for c in sent:
if c.isalpha() or c.isdigit():
ans = False
break
return ans
doubleQuotes = s.count('"')
singleQuotes = s.count("'")
openParentes = s.count("(")
closParentes = s.count(")")
openBrackets = s.count("[")
closBrackets = s.count("]")
openCurBrace = s.count("{")
closCurBrace = s.count("}")
openAligator = s.count("<")
closAligator = s.count(">")
if ((doubleQuotes == 2 ) and (s[0] == '"') and (s[-1] == '"')) or \
((singleQuotes == 2 ) and (s[0] == "'") and (s[-1] == "'")) or \
((openParentes == 1 ) and (closParentes == 1 ) and (s[0] == "(") and (s[-1] == ")")) or \
((openBrackets == 1 ) and (closBrackets == 1 ) and (s[0] == "[") and (s[-1] == "]")) or \
((openCurBrace == 1 ) and (closCurBrace == 1 ) and (s[0] == "{") and (s[-1] == "}")) or \
((openAligator == 1 ) and (closAligator == 1 ) and (s[0] == "<") and (s[-1] == ">")):
S = s[1:-1].strip()
else:
S = s.strip()
if (doubleQuotes % 2 != 0):
S = S.replace('"', '')
if (singleQuotes % 2 != 0):
S = S.replace("'", "")
if (openParentes != closParentes):
S = S.replace("(", "").replace(")", "")
if (openBrackets != closBrackets):
S = S.replace("[", "").replace("]", "")
if (openCurBrace != closCurBrace):
S = S.replace("{", "").replace("}", "")
if (openAligator != closAligator):
S = S.replace("<", "").replace(">", "")
if (S == "") or (notAlphaNum(S) and ()):
return ""
elif (S[-2:] == "..") and S[-3:] != "...":
S = S[:-2]+"."
elif (S[-2:] in [":.", ";."]):
S = S[:-2]+"."
elif (S[-1] not in [".", "!", "?", ":", ";"]):
if (S[-1] in ["'", '"', ")", "]", "}", ">"]) and (S[-2] in [".", "!", "?", ":", ";"]):
S = S[:-2]+S[-1]+S[-2]
else:
S = S+"."
return S.replace(" ", " ").replace(" ", " ")
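# Illustrative behavior (hypothetical inputs, not executed):
#   punctIt('(Tudo certo.)')  -> 'Tudo certo.'     (a single pair wrapping the sentence is dropped)
#   punctIt('Ele disse "sim') -> 'Ele disse sim.'  (unmatched quote removed, final period added)
#   punctIt('Saiu ontem:.')   -> 'Saiu ontem.'     (':.' collapsed into '.')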
#############################################################################
# Decide if ambiguous tokens are contracted or not - desambIt (within step 4)
#############################################################################
def desambIt(token, bits, i, lastField, s, SID, tokens):
def stripWord(w):
start, end = 0, len(w)
for j in range(len(w)):
if (not w[j].isalpha()):
start = j+1
else:
break
for j in range(start,len(w)):
if (not w[j].isalpha()):
end = j
break
return w[start:end].lower()
# nos - em os - nos
if (token.lower() == "nos"):
if (i > 0):
preVERB = lex.pexists(stripWord(bits[i-1]), "VERB") or lex.pexists(stripWord(bits[i-1]), "AUX")
else:
preVERB = False
if (i < len(bits)-1):
posVERB = lex.pexists(stripWord(bits[i+1]), "VERB") or lex.pexists(stripWord(bits[i+1]), "AUX")
posNOUNDET = lex.pexists(stripWord(bits[i+1]), "NOUN") or lex.pexists(stripWord(bits[i+1]), "ADJ") or lex.pexists(stripWord(bits[i+1]), "DET")
if (posNOUNDET):
possible = lex.pget(stripWord(bits[i+1]), "NOUN")+lex.pget(stripWord(bits[i+1]), "ADJ")+lex.pget(stripWord(bits[i+1]), "DET")
agree = False
for feats in possible:
if ("Number=Sing" not in feats[2]) and ("Gender=Fem" not in feats[2]):
agree = True
break
if (not agree):
posNOUNDET = False
else:
posVERB = False
posNOUNDET = False
if (posVERB and not posNOUNDET):
tokens.append([token, lastField]) # don't break
else:
tokens.append([token, "c"+lastField]) # break
if (token.isupper()):
tokens.append(["EM","_"])
tokens.append(["OS","_"])
elif (token[0].isupper()):
tokens.append(["Em","_"])
tokens.append(["os","_"])
else:
tokens.append(["em","_"])
tokens.append(["os","_"])
# consigo - com si - consigo
elif (token.lower() == "consigo"):
if (i > 0):
prePRONADV = lex.pexists(stripWord(bits[i-1]), "PRON") or lex.pexists(stripWord(bits[i-1]), "ADV")
else:
prePRONADV = False
if (i < len(bits)-1):
posVERB = lex.pexists(stripWord(bits[i+1]), "VERB") or lex.pexists(stripWord(bits[i+1]), "AUX")
else:
posVERB = False
if (i < len(bits)-2):
doQue = ((bits[i+1] == "do") and (bits[i+2] == "que")) or ((bits[i+1] == "sua"))
else:
doQue = False
if ((prePRONADV) or (posVERB)) and (not doQue):
tokens.append([token, lastField]) # don't break
else:
tokens.append([token, "c"+lastField]) # break
if (token.isupper()):
tokens.append(["COM","_"])
tokens.append(["SI","_"])
elif (token[0].isupper()):
tokens.append(["Com","_"])
tokens.append(["si","_"])
else:
tokens.append(["com","_"])
tokens.append(["si","_"])
# pra - para a - para
elif (token.lower() == "pra"):
if (i < len(bits)-1):
posNOUNDET = lex.pexists(stripWord(bits[i+1]), "NOUN") or lex.pexists(stripWord(bits[i+1]), "ADJ") or lex.pexists(stripWord(bits[i+1]), "DET")
if (posNOUNDET):
possible = lex.pget(stripWord(bits[i+1]), "NOUN")+lex.pget(stripWord(bits[i+1]), "ADJ")+lex.pget(stripWord(bits[i+1]), "DET")
agree = False
for feats in possible:
if ("Number=Plur" not in feats[2]) and ("Gender=Masc" not in feats[2]):
agree = True
break
if (not agree):
posNOUNDET = False
else:
posNOUNDET = False
if (posNOUNDET):
tokens.append([token, "c"+lastField]) # break
if (token.isupper()):
tokens.append(["PARA","_"])
tokens.append(["A","_"])
elif (token[0].isupper()):
tokens.append(["Para","_"])
tokens.append(["a","_"])
else:
tokens.append(["para","_"])
tokens.append(["a","_"])
else:
tokens.append([token, lastField]) # don't break
# pela - por a - pela
elif (token.lower() == "pela"):
if (i < len(bits)-1):
posNOUNDET = lex.pexists(stripWord(bits[i+1]), "NOUN") or lex.pexists(stripWord(bits[i+1]), "ADJ") or lex.pexists(stripWord(bits[i+1]), "NUM") or lex.pexists(stripWord(bits[i+1]), "DET")
properNOUNDIGIT = bits[i+1][0].isupper() or bits[i+1][0].isnumeric()
else:
posNOUNDET = False
properNOUNDIGIT = False
if (posNOUNDET) or (properNOUNDIGIT):
tokens.append([token, "c"+lastField]) # break
if (token.isupper()):
tokens.append(["POR","_"])
tokens.append(["A","_"])
elif (token[0].isupper()):
tokens.append(["Por","_"])
tokens.append(["a","_"])
else:
tokens.append(["por","_"])
tokens.append(["a","_"])
else:
tokens.append([token, lastField]) # don't break
# pelas - por as - pelas
elif (token.lower() == "pelas"):
if (i < len(bits)-1):
posNOUNDET = lex.pexists(stripWord(bits[i+1]), "NOUN") or lex.pexists(stripWord(bits[i+1]), "ADJ") or lex.pexists(stripWord(bits[i+1]), "NUM") or lex.pexists(stripWord(bits[i+1]), "DET")
properNOUNDIGIT = bits[i+1][0].isupper() or bits[i+1][0].isnumeric()
else:
posNOUNDET = False
properNOUNDIGIT = False
if (posNOUNDET) or (properNOUNDIGIT):
tokens.append([token, "c"+lastField]) # break
if (token.isupper()):
tokens.append(["POR","_"])
tokens.append(["AS","_"])
elif (token[0].isupper()):
tokens.append(["Por","_"])
tokens.append(["as","_"])
else:
tokens.append(["por","_"])
tokens.append(["as","_"])
else:
tokens.append([token, lastField]) # don't break
# pelo - por o - pelo
elif (token.lower() == "pelo"):
if (i > 0):
preART = lex.pexists(stripWord(bits[i-1]), "DET")
if (preART):
possible = lex.pget(stripWord(bits[i-1]), "DET")
agree = False
for feats in possible:
if ("Number=Plur" not in feats[2]) and ("Gender=Fem" not in feats[2]):
agree = True
break
if (not agree):
preART = False
else:
preART = (stripWord(bits[i-1]) != "que") and (stripWord(bits[i-1]) != "dado") and (stripWord(bits[i-1]) != "tanto") and (stripWord(bits[i-1]) != "quanto") and (stripWord(bits[i-1]) != "mais")
else:
preART = False
if (i < len(bits)-1):
posNOUNDET = lex.pexists(stripWord(bits[i+1]), "NOUN") or lex.pexists(stripWord(bits[i+1]), "ADJ") or lex.pexists(stripWord(bits[i+1]), "DET")
posLower = not bits[i+1][0].isupper()
if (posNOUNDET):
possible = lex.pget(stripWord(bits[i+1]), "NOUN")+lex.pget(stripWord(bits[i+1]), "ADJ")+lex.pget(stripWord(bits[i+1]), "DET")
agree = False
for feats in possible:
if ("Number=Plur" not in feats[2]) and ("Gender=Fem" not in feats[2]):
agree = True
break
if (not agree):
posNOUNDET = False
else:
posNOUNDET = False
posLower = True
if (preART) and (not posNOUNDET) and (posLower):
tokens.append([token, lastField]) # don't break
else:
tokens.append([token, "c"+lastField]) # break
if (token.isupper()):
tokens.append(["POR","_"])
tokens.append(["O","_"])
elif (token[0].isupper()):
tokens.append(["Por","_"])
tokens.append(["o","_"])
else:
tokens.append(["por","_"])
tokens.append(["o","_"])
# pelos - por os - pelos
elif (token.lower() == "pelos"):
if (i > 0):
preART = lex.pexists(stripWord(bits[i-1]), "DET")
if (preART):
possible = lex.pget(stripWord(bits[i-1]), "DET")
agree = False
for feats in possible:
if ("Number=Sing" not in feats[2]) and ("Gender=Fem" not in feats[2]) and ("PronType=Art" in feats[2]):
agree = True
break
if (not agree):
preART = False
else:
preART = (stripWord(bits[i-1]) != "que") and (stripWord(bits[i-1]) != "dado") and (stripWord(bits[i-1]) != "tanto") and (stripWord(bits[i-1]) != "quanto") and (stripWord(bits[i-1]) != "mais")
else:
preART = False
if (i < len(bits)-1):
posNOUNDET = lex.pexists(stripWord(bits[i+1]), "NOUN") or lex.pexists(stripWord(bits[i+1]), "ADJ") or lex.pexists(stripWord(bits[i+1]), "DET")
posLower = not bits[i+1][0].isupper()
if (posNOUNDET):
possible = lex.pget(stripWord(bits[i+1]), "NOUN")+lex.pget(stripWord(bits[i+1]), "ADJ")+lex.pget(stripWord(bits[i+1]), "DET")
agree = False
for feats in possible:
if ("Number=Sing" not in feats[2]) and ("Gender=Fem" not in feats[2]) and ("PronType=Art" in feats[2]):
agree = True
break
if (not agree):
posNOUNDET = False
else:
posNOUNDET = False
posLower = True
if (preART) and (not posNOUNDET) and (posLower):
tokens.append([token, lastField]) # don't break
else:
tokens.append([token, "c"+lastField]) # break
if (token.isupper()):
tokens.append(["POR","_"])
tokens.append(["OS","_"])
elif (token[0].isupper()):
tokens.append(["Por","_"])
tokens.append(["os","_"])
else:
tokens.append(["por","_"])
tokens.append(["os","_"])
#############################################################################
# Tokenizing - tokenizeIt (step 4)
#############################################################################
def tokenizeIt(s, SID, outfile):
removable = ["'", '"', "(", ")", "[", "]", "{", "}", "<", ">", \
"!", "?", ",", ";", ":", "=", "+", "*", "★", "|", "/", "\\", \
"&", "^", "_", "`", "'", "~", "%", "§"]
ignored = ["@", "#"]
digits = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
contracts = {"à":["a","a"],
"às":["a","as"],
"ao":["a", "o"],
"aos":["a", "os"],
"àquela":["a", "aquela"],
"àquelas":["a", "aquelas"],
"àquele":["a", "aquele"],
"àqueles":["a", "aqueles"],
"comigo":["com", "mim"],
"contigo":["com", "ti"],
"consigo":["com", "si"],
"conosco":["com", "nós"],
"convosco":["com", "vós"],
"da":["de", "a"],
"das":["de", "as"],
"do":["de", "o"],
"dos":["de", "os"],
"dali":["de", "ali"],
"daqui":["de", "aqui"],
"daí":["de", "aí"],
"dentre":["de", "entre"],
"desta":["de", "esta"],
"destas":["de", "estas"],
"deste":["de", "este"],
"destes":["de", "estes"],
"dessa":["de", "essa"],
"dessas":["de", "essas"],
"desse":["de", "esse"],
"desses":["de", "esses"],
"daquela":["de", "aquela"],
"daquelas":["de", "aquelas"],
"daquele":["de", "aquele"],
"daqueles":["de", "aqueles"],
"disto":["de", "isto"],
"disso":["de", "isso"],
"daquilo":["de", "aquilo"],
"dela":["de", "ela"],
"delas":["de", "elas"],
"dele":["de", "ele"],
"deles":["de", "eles"],
"doutra":["de", "outra"],
"doutras":["de", "outras"],
"doutro":["de", "outro"],
"doutros":["de", "outros"],
"dum":["de", "um"],
"duns":["de", "uns"],
"duma":["de", "uma"],
"dumas":["de", "umas"],
"na":["em", "a"],
"nas":["em", "as"],
"no":["em", "o"],
"nos":["em", "os"],
"nesta":["em", "esta"],
"nestas":["em", "estas"],
"neste":["em", "este"],
"nestes":["em", "estes"],
"nessa":["em", "essa"],
"nessas":["em", "essas"],
"nesse":["em", "esse"],
"nesses":["em", "esses"],
"naquela":["em", "aquela"],
"naquelas":["em", "aquelas"],
"naquele":["em", "aquele"],
"naqueles":["em", "aqueles"],
"nisto":["em", "isto"],
"nisso":["em", "isso"],
"naquilo":["em", "aquilo"],
"nela":["em", "ela"],
"nelas":["em", "elas"],
"nele":["em", "ele"],
"neles":["em", "eles"],
"noutra":["em", "outra"],
"noutras":["em", "outras"],
"noutro":["em", "outro"],
"noutros":["em", "outros"],
"num":["em", "um"],
"nuns":["em", "uns"],
"numa":["em", "uma"],
"numas":["em", "umas"],
"pela":["por", "a"],
"pelas":["por", "as"],
"pelo":["por", "o"],
"pelos":["por", "os"],
"pra":["para", "a"],
"pras":["para", "as"],
"pro":["para", "o"],
"pros":["para", "os"],
"prum":["para", "um"],
"pruns":["para", "uns"],
"pruma":["para", "uma"],
"prumas":["para", "umas"]}
ambigous = ["nos", "consigo", "pra", "pela", "pelas", "pelo", "pelos"]
# ambigous = ["nos", "consigo", "pra", "pelo", "pelos"]
enclisis = ['me', 'te', 'se', 'lhe', 'o', 'a', 'nos', 'vos', 'lhes', 'os', 'as', 'lo', 'la', 'los', 'las']
doubleEnclisis = ["mo", "to", "lho", "lhos", "ma", "ta", "lha", "lhas", "mos", "tos"]
doubleEnclisisTri = ["no-lo", "vo-lo", "no-la", "vo-la", "no-los", "vo-los", "no-las", "vo-las"]
    terminations = ["ia", "ias", "as", "iamos", "ieis", "iam", "ei", "a", "emos", "eis", "ão", "á"]
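    # These lists drive the clitic handling below; illustrative splits (not
    # executed; the accented stems assume the corresponding verb is in the lexicon):
    #   "cumprí-lo"  -> multiword token "cumprí-lo"  = "cumprir" + "lo"       (enclisis, type I)
    #   "dá-lo-ia"   -> multiword token "dá-lo-ia"   = "daria" + "lo"         (mesoclisis, type II)
    #   "dá-se-lhes" -> multiword token "dá-se-lhes" = "dar" + "se" + "lhes"  (double enclisis, type I)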
abbrev = [
"dr.", "dra.", "sr.", "sra.", "prof.", "profa.", "Dr.", "Dra.", "Sr.", "Sra.", "Prof.", "Profa.", "DR.", "DRA.", "SR.", "SRA.", "PROF.", "PROFA.",
"ilmo.", "Ilmo.", "ILMO.", "bel.", "Bel.", "BEL.", "eng.", "Eng.", "ENG.", "reg.", "Reg.", "REG.", "visc.", "Visc.", "VISC.", "bar.", "Bar.", "BAR.",
"cond.", "Cond.", "COND.", "séc.", "Séc.", "SÉC.", "jr.", "Jr.", "JR.", "ir.", "Ir.", "IR.", "st.", "St.", "ST.", "app.", "App.", "APP.",
"gov.", "Gov.", "GOV.", "des.", "Des.", "DES.", "gen.", "Gen.", "GEN.", "gal.", "Gal.", "GAL.", "cel.", "Cel.", "CEL.", "col.", "Col.", "COL.",
"maj.", "Maj.", "MAJ.", "ten.", "Ten.", "TEN.", "cap.", "Cap.", "CAP.", "capt.", "Capt.", "CAPT.", "com.", "Com.", "COM.", "brig.", "Brig.", "BRIG.",
"estac.", "Estac.", "ESTAC.", "tel.", "Tel.", "TEL.", "ave.", "Ave.", "AVE.", "av.", "Av.", "AV.", "trav.", "Trav.", "TRAV.", "con.", "Con.", "CON.",
"jd.", "Jd.", "JD.", "ed.", "Ed.", "ED.", "lj.", "Lj.", "LJ.", "cj.", "Cj.", "CJ.", "apto.", "Apto.", "APTO.", "apt.", "Apt.", "APT.", "ingr.", "Ingr.", "INGR.",
"ap.", "Ap.", "AP.", "dir.", "Dir.", "DIR.", "min.", "Min.", "MIN.", "sec.", "Sec.", "SEC.", "kg.", "Kg.", "KG.", "ml.", "Ml.", "ML.", "km.", "Km.", "KM.", "cm.", "Cm.", "CM.",
"vol.", "Vol.", "VOL.", "PP.", "pp.", "Pp", "pag.", "Pag", "PAG.", "pág.", "Pág", "PÁG.", "al.", "Al.", "AL.", "etc.", "i.e.", "e.g.", "cia.", "Cia.", "CIA.",
"co.", "Co.", "CO.", "ltda.", "Ltda.", "LTDA.", "ex.", "Ex.", "EX.", "ac.", "Ac.", "AC.", "dc.", "Dc.", "DC.", "bros.", "Bros.", "BROS.", "pq.", "Pq.", "PQ.",
"br.", "Br.", "BR.", "cent.", "Cent.", "CENT.", "ft.", "Ft.", "FT.", "net.", "Net.", "NET.", "no.", "No.", "NO.", "nr.", "Nr.", "NR.", "tr.", "Tr.", "TR.",
"mi.", "Mi.", "MI.", "sta.", "Sta.", "STA.", "sto.", "Sto.", "STO.", "int.", "Int.", "INT.", "inf.", "Inf.", "INF.", "cult.", "Cult.", "CULT.", "op.", "Op.", "OP.",
"aprox.", "Aprox.", "APROX.", "it.", "It.", "IT.", "ex.", "Ex.", "EX.", "flex.", "Flex.", "FLEX.", "ass.", "Ass.", "ASS.", "pç.", "Pç.", "PÇ.", "ind.", "Ind.", "IND.",
"vl.", "Vl.", "VL.", "imp.", "Imp.", "IMP.", "emp.", "Emp.", "EMP.", "esq.", "Esq.", "ESQ.", "dir.", "Dir.", "DIR.", "ingr.", "Ingr.", "INGR.", "pça.", "Pça.", "PÇA.",
"art.", "Art.", "ART.", "sec.", "Sec.", "SEC.", "inc.", "Inc.", "INC.", "a.", "A.", "b.", "B.", "c.", "C.", "d.", "D.", "e.", "E.", "f.", "F.", "g.", "G.", "h.", "H.", "i.", "I.",
"j.", "J.", "k.", "K.", "l.", "L.", "m.", "M.", "n.", "N.", "o.", "O.", "p.", "P.", "q.", "Q.", "r.", "R.", "s.", "S.", "t.", "T.", "u.", "U.", "v.", "V.", "w.", "W.", "x.", "X.", "y.", "Y.", "z.", "Z.",
"seg.", "ter.", "qua.", "qui.", "sex.", "sab.", "sáb.", "dom.", "Seg.", "Ter.", "Qua.", "Qui.", "Sex.", "Sab.", "Sáb.", "Dom.", "SEG.", "TER.", "QUA.", "QUI.", "SEX.", "SAB.", "SÁB.", "DOM.",
"jan.", "fev.", "mar.", "abr.", "mai.", "jun.", "jul.", "ago.", "sep.", "out.", "nov.", "dez.", "Jan.", "Fev.", "Mar.", "Abr.", "Mai.", "Jun.", "Jul.", "Ago.", "Sep.", "Out.", "Nov.", "Dez.",
"JAN.", "FEV.", "MAR.", "ABR.", "MAI.", "JUN.", "JUL.", "AGO.", "SET.", "OUT.", "NOV.", "DEZ."
]
#abbrev = []
#infile = open("abbrev.txt", "r")
#for line in infile:
# abbrev.append(line[:-1])
#infile.close()
def isAbbrev(chunk, abbrev):
abbr = False
for a in abbrev:
if (chunk == a):
abbr = True
break
#else:
# lasts = -len(a)
# if (chunk[lasts:] == a) and (not chunk[lasts-1].isalpha()):
# abbr = True
# break
return abbr
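    # Illustrative: with "Dr." in abbrev, the trailing-punctuation loop below
    # leaves it intact, so "o Dr. Silva" tokenizes as "o", "Dr.", "Silva"
    # rather than stripping the period off "Dr." (comment only, not executed).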
tokens = []
bits = s.split(" ")
k = 0
for b in bits:
pretagged = False
if (len(b) > 16):
if (b[:8] == "//*||*\\\\") or (b[:9] == "//*|(|*\\\\"):
pretagged = True
if (pretagged):
# keep the bit as token and clean the tags //*||*\\ before and after
            tokens.append([b.replace("//*||*\\\\", "").replace("//*|(|*\\\\", ""), "_"])
else:
# deal with the pre (before) middle
pre = []
changed = True
while (changed) and (len(b) > 1):
changed = False
if (b[0] in removable) or ((b[0] == "$") and (b[1] in digits)) or ((b[0] == "-") and (b[1] not in digits)):
pre.append(b[0])
b = b[1:]
changed = True
# deal with the pos (after) middle
tmp = []
changed = True
while (changed) and (len(b) > 1):
if (isAbbrev(b, abbrev)):
break
changed = False
if (b[-1] in removable+["-", "."]):
tmp.append(b[-1])
b = b[:-1]
changed = True
pos = []
reticent = ""
for i in range(len(tmp)-1, -1, -1):
if (tmp[i] == "."):
if (reticent == ""):
reticent = "."
elif (reticent == "."):
reticent = ".."
elif (reticent == ".."):
pos.append("...")
reticent = ""
else:
if (reticent != ""):
pos.append(reticent)
reticent = ""
pos.append(tmp[i])
if (reticent != ""):
pos.append(reticent)
# deal with the middle
buf = b.split("-")
if (len(buf) == 1):
parts = pre+[b]+pos
            # enclisis (type I - infinitive, e.g. cumprí-lo, and type II - irregular stems, e.g. satisfê-lo)
elif (len(buf) == 2) and (buf[1] in enclisis):
if (buf[0][-1] == "á"):
if (lex.pexists(buf[0][:-1]+"ar", "VERB")):
parts = pre+["*^*"+b, buf[0][:-1]+"ar", buf[1]]+pos
else:
if (lex.pexists(buf[0][:-1]+"as", "VERB")):
parts = pre+["*^*"+b, buf[0][:-1]+"as", buf[1]]+pos
else:
parts = pre+["*^*"+b, buf[0][:-1]+"az", buf[1]]+pos
elif (buf[0][-1] == "ê"):
if (lex.pexists(buf[0][:-1]+"er", "VERB")):
parts = pre+["*^*"+b, buf[0][:-1]+"er", buf[1]]+pos
else:
if (lex.pexists(buf[0][:-1]+"es", "VERB")):
parts = pre+["*^*"+b, buf[0][:-1]+"es", buf[1]]+pos
else:
parts = pre+["*^*"+b, buf[0][:-1]+"ez", buf[1]]+pos
elif (buf[0][-1] == "í"):
if (lex.pexists(buf[0][:-1]+"ir", "VERB")):
parts = pre+["*^*"+b, buf[0][:-1]+"ir", buf[1]]+pos
else:
if (lex.pexists(buf[0][:-1]+"is", "VERB")):
parts = pre+["*^*"+b, buf[0][:-1]+"is", buf[1]]+pos
else:
parts = pre+["*^*"+b, buf[0][:-1]+"iz", buf[1]]+pos
elif (buf[0][-1] == "ô"):
if (lex.pexists(buf[0][:-1]+"or", "VERB")):
parts = pre+["*^*"+b, buf[0][:-1]+"or", buf[1]]+pos
else:
if (lex.pexists(buf[0][:-1]+"os", "VERB")):
parts = pre+["*^*"+b, buf[0][:-1]+"os", buf[1]]+pos
else:
parts = pre+["*^*"+b, buf[0][:-1]+"oz", buf[1]]+pos
else:
parts = pre+["*^*"+b, buf[0], buf[1]]+pos
# double enclisis - type II (e.g. disse-lhos, dei-ta)
elif (len(buf) == 2) and (buf[1] in doubleEnclisis):
if (buf[1][-1] == "a"):
parts = pre+["*^^*"+b, buf[0], buf[1][:-1]+"e", buf[1][-1]]+pos
elif (buf[1][-1] == "o"):
parts = pre+["*^^*"+b, buf[0], buf[1][:-1]+"e", buf[1][-1]]+pos
elif (buf[1][-2:] == "as"):
parts = pre+["*^^*"+b, buf[0], buf[1][:-2]+"e", buf[1][-2:]]+pos
elif (buf[1][-2:] == "os"):
parts = pre+["*^^*"+b, buf[0], buf[1][:-2]+"e", buf[1][-2:]]+pos
else:
parts = pre+["*^*"+b, buf[0], buf[1]]+pos
# double enclisis - type I (e.g. dá-se-lhes)
elif (len(buf) == 3) and (buf[1] in enclisis) and (buf[2] in enclisis):
if (buf[0][-1] == "á"):
parts = pre+["*^^*"+b, buf[0][:-1]+"ar", buf[1], buf[2]]+pos
elif (buf[0][-1] == "ê"):
parts = pre+["*^^*"+b, buf[0][:-1]+"er", buf[1], buf[2]]+pos
elif (buf[0][-1] == "í"):
parts = pre+["*^^*"+b, buf[0][:-1]+"ir", buf[1], buf[2]]+pos
elif (buf[0][-1] == "ô"):
parts = pre+["*^^*"+b, buf[0][:-1]+"or", buf[1], buf[2]]+pos
else:
parts = pre+["*^^*"+b, buf[0], buf[1], buf[2]]+pos
# mesoclisis - type I (e.g. dar-lo-ia)
elif (len(buf) == 3) and (buf[1] in enclisis) \
and (buf[0][-1] == "r") and (buf[2] in terminations):
parts = pre+["*^*"+b, buf[0]+buf[2], buf[1]]+pos
# mesoclisis - type II (e.g. dá-lo-ia)
elif (len(buf) == 3) and (buf[1] in enclisis) \
and (buf[0][-1] in ["á", "ê", "í", "ô"]) and (buf[2] in terminations):
if (buf[0][-1] == "á"):
parts = pre+["*^*"+b, buf[0][:-1]+"ar"+buf[2], buf[1]]+pos
elif (buf[0][-1] == "ê"):
parts = pre+["*^*"+b, buf[0][:-1]+"er"+buf[2], buf[1]]+pos
elif (buf[0][-1] == "í"):
parts = pre+["*^*"+b, buf[0][:-1]+"ir"+buf[2], buf[1]]+pos
elif (buf[0][-1] == "ô"):
parts = pre+["*^*"+b, buf[0][:-1]+"or"+buf[2], buf[1]]+pos
else:
parts = pre+[b]+pos
# transform parts into tokens to be added
i = 0
while (i < len(parts)):
if (i == len(parts)-1):
lastField = "_"
else:
lastField = "SpaceAfter=No"
if (parts[i][:3] == "*^*"):
if (i+3 == len(parts)):
tokens.append([parts[i][3:], "c_"])
else:
tokens.append([parts[i][3:], "cSpaceAfter=No"])
i += 1
tokens.append([parts[i], "_"])
i += 1
tokens.append([parts[i], "_"])
elif (parts[i][:4] == "*^^*"):
if (i+4 == len(parts)):
tokens.append([parts[i][4:], "C_"])
else:
tokens.append([parts[i][4:], "CSpaceAfter=No"])
i += 1
tokens.append([parts[i], "_"])
i += 1
tokens.append([parts[i], "_"])
i += 1
tokens.append([parts[i], "_"])
                elif (parts[i].lower() not in ambigous):
ans = contracts.get(parts[i].lower())
if (ans == None):
tokens.append([parts[i], lastField])
else:
tokens.append([parts[i], "c"+lastField])
if (parts[i].isupper()):
tokens.append([ans[0].upper(),"_"])
tokens.append([ans[1].upper(),"_"])
elif (parts[i][0].isupper()):
tokens.append([ans[0][0].upper()+ans[0][1:],"_"])
tokens.append([ans[1],"_"])
else:
tokens.append([ans[0],"_"])
tokens.append([ans[1],"_"])
else:
desambIt(parts[i], bits, k, lastField, s, SID, tokens)
i += 1
k += 1
# output the sentence with all the tokens
print("# sent_id =", SID, file=outfile)
print("# text =", s.replace("//*||*\\\\", "").replace("//*||*\\\\", "").replace("//*|(|*\\\\", ""), file=outfile)
## printout tokens
toks = 0
for i in range(len(tokens)):
if (tokens[i][1][0] == "c"):
# contracted word (two parts)
print(str(toks+1)+"-"+str(toks+2), tokens[i][0], "_", "_", "_", "_", "_", "_", "_", tokens[i][1][1:], sep="\t", file=outfile)
elif (tokens[i][1][0] == "C"):
# contracted word (three parts)
print(str(toks+1)+"-"+str(toks+3), tokens[i][0], "_", "_", "_", "_", "_", "_", "_", tokens[i][1][1:], sep="\t", file=outfile)
elif (tokens[i][0].strip() != ""):
# non contracted word
toks += 1
print(str(toks), tokens[i][0].strip(), "_", "_", "_", "_", "_", "_", "_", tokens[i][1], sep="\t", file=outfile)
print(file=outfile)
return(toks)
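# Expected output shape for one sentence (illustrative; fields are tab-separated
# and the sent_id assumes the sid model S000000 on the first sentence):
#   the input line "O menino viu-a na escola." produces
#     # sent_id = S000001
#     # text = O menino viu-a na escola.
#     1    O      _ _ _ _ _ _ _ _
#     2    menino _ _ _ _ _ _ _ _
#     3-4  viu-a  _ _ _ _ _ _ _ _
#     3    viu    _ _ _ _ _ _ _ _
#     4    a      _ _ _ _ _ _ _ _
#     5-6  na     _ _ _ _ _ _ _ _
#     5    em     _ _ _ _ _ _ _ _
#     6    a      _ _ _ _ _ _ _ _
#     7    escola _ _ _ _ _ _ _ SpaceAfter=No
#     8    .      _ _ _ _ _ _ _ _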
#################################################
### Deal with a sentence, clean it, if required, then tokenize it
#################################################
def dealWith(outfile, sent, SID, preserve, match, trim):
if (trim): # step 1
sent = trimIt(sent)
if (preserve): # step 2
sent = tagIt(sent)
if (match): # step 3
sent = punctIt(sent)
if (sent != ""): # step 4
return 1, tokenizeIt(sent, SID, outfile)
else:
return 0, 0
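# Pipeline sketch (illustrative, all options on; not executed):
#   input line: 'BRASÍLIA O governo anunciou (ontem'
#   trimIt  -> 'O governo anunciou (ontem'   (headline dropped)
#   tagIt   -> unchanged (no itemize prompts)
#   punctIt -> 'O governo anunciou ontem.'   (orphan parenthesis removed, period added)
#   tokenizeIt then writes the CoNLL-U block and returns the token count.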
#################################################
### main function of the program - parses arguments and calls the tokenizer for each input sentence
#################################################
def portTok():
if (len(sys.argv) == 1):
#arguments = ["/Users/pf64/Desktop/alienista/alienista_empty.conllu", "/Users/pf64/Desktop/alienista/alienista.txt", False, True, False, "ATENISTA_SENT0000"]
#arguments = ["/Users/pf64/Desktop/tst.conllu", "/Users/pf64/Desktop/tst.txt", True, True, True, "S000000"]
arguments = ["sents.conllu", "sents.txt", True, True, True, "S000000"]
print("Assumindo default: 'sents.conllu' como arquivo de saída, 'sents.txt' como arquivo de entrada, correções, remoções e S0000 como sid.")
else:
arguments = parseOptions(sys.argv)
if (arguments != None):
if (arguments[0] == ""):
print("Assumindo 'sents.conllu' como arquivo de saída")
arguments[0] = 'sents.conllu'
if (arguments[1] == []):
print("Arquivo de entrada inválido - por favor corrija e tente novamente")
else:
            outfile = open(arguments[0], "w", encoding="utf-8")
#print("# newdoc id = {}\n# newpar".format(arguments[0]), file=outfile)
            infile = open(arguments[1], "r", encoding="utf-8")
SID = arguments[5]
sTOTAL, tTOTAL = 0, 0
for line in infile:
SID = nextName(SID)
                s, t = dealWith(outfile, line.rstrip("\n"), SID, arguments[2], arguments[3], arguments[4])
if (s == 1):
sTOTAL += 1
tTOTAL += t
outfile.close()
infile.close()
print("Tokenização terminada com {} sentenças extraídas ({} tokens) e salvas em {}".format(sTOTAL, tTOTAL, arguments[0]))
else:
print("Problemas com parâmetros - por favor corrija e tente novamente")
portTok()