Spaces:
Running
on
L4
Running
on
L4
Update MagicQuill/brushnet/powerpaint_utils.py
Browse files
MagicQuill/brushnet/powerpaint_utils.py
CHANGED
|
@@ -51,7 +51,27 @@ class TokenizerWrapper:
|
|
| 51 |
Args:
|
| 52 |
tokens (Union[str, List[str]]): The tokens to be added.
|
| 53 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
num_added_tokens = self.wrapped.add_tokens(tokens, *args, **kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
assert num_added_tokens != 0, (
|
| 56 |
f"The tokenizer already contains the token {tokens}. Please pass "
|
| 57 |
"a different `placeholder_token` that is not already in the "
|
|
@@ -82,6 +102,11 @@ class TokenizerWrapper:
|
|
| 82 |
the added placeholder token.
|
| 83 |
*args, **kwargs: The arguments for `self.wrapped.add_tokens`.
|
| 84 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
output = []
|
| 86 |
if num_vec_per_token == 1:
|
| 87 |
self.try_adding_tokens(placeholder_token, *args, **kwargs)
|
|
@@ -276,55 +301,29 @@ class EmbeddingLayerWithFixes(nn.Module):
|
|
| 276 |
|
| 277 |
def add_embeddings(self, embeddings: Optional[Union[dict, List[dict]]]):
|
| 278 |
"""Add external embeddings to this layer.
|
| 279 |
-
|
| 280 |
-
Use case:
|
| 281 |
-
|
| 282 |
-
>>> 1. Add token to tokenizer and get the token id.
|
| 283 |
-
>>> tokenizer = TokenizerWrapper('openai/clip-vit-base-patch32')
|
| 284 |
-
>>> # 'how much' in kiswahili
|
| 285 |
-
>>> tokenizer.add_placeholder_tokens('ngapi', num_vec_per_token=4)
|
| 286 |
-
>>>
|
| 287 |
-
>>> 2. Add external embeddings to the model.
|
| 288 |
-
>>> new_embedding = {
|
| 289 |
-
>>> 'name': 'ngapi', # 'how much' in kiswahili
|
| 290 |
-
>>> 'embedding': torch.ones(1, 15) * 4,
|
| 291 |
-
>>> 'start': tokenizer.get_token_info('kwaheri')['start'],
|
| 292 |
-
>>> 'end': tokenizer.get_token_info('kwaheri')['end'],
|
| 293 |
-
>>> 'trainable': False # if True, will registry as a parameter
|
| 294 |
-
>>> }
|
| 295 |
-
>>> embedding_layer = nn.Embedding(10, 15)
|
| 296 |
-
>>> embedding_layer_wrapper = EmbeddingLayerWithFixes(embedding_layer)
|
| 297 |
-
>>> embedding_layer_wrapper.add_embeddings(new_embedding)
|
| 298 |
-
>>>
|
| 299 |
-
>>> 3. Forward tokenizer and embedding layer!
|
| 300 |
-
>>> input_text = ['hello, ngapi!', 'hello my friend, ngapi?']
|
| 301 |
-
>>> input_ids = tokenizer(
|
| 302 |
-
>>> input_text, padding='max_length', truncation=True,
|
| 303 |
-
>>> return_tensors='pt')['input_ids']
|
| 304 |
-
>>> out_feat = embedding_layer_wrapper(input_ids)
|
| 305 |
-
>>>
|
| 306 |
-
>>> 4. Let's validate the result!
|
| 307 |
-
>>> assert (out_feat[0, 3: 7] == 2.3).all()
|
| 308 |
-
>>> assert (out_feat[2, 5: 9] == 2.3).all()
|
| 309 |
-
|
| 310 |
-
Args:
|
| 311 |
-
embeddings (Union[dict, list[dict]]): The external embeddings to
|
| 312 |
-
be added. Each dict must contain the following 4 fields: 'name'
|
| 313 |
-
(the name of this embedding), 'embedding' (the embedding
|
| 314 |
-
tensor), 'start' (the start token id of this embedding), 'end'
|
| 315 |
-
(the end token id of this embedding). For example:
|
| 316 |
-
`{name: NAME, start: START, end: END, embedding: torch.Tensor}`
|
| 317 |
"""
|
| 318 |
if isinstance(embeddings, dict):
|
| 319 |
embeddings = [embeddings]
|
| 320 |
|
| 321 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 322 |
self.check_duplicate_names(self.external_embeddings)
|
| 323 |
self.check_ids_overlap(self.external_embeddings)
|
| 324 |
|
| 325 |
# set for trainable
|
| 326 |
added_trainable_emb_info = []
|
| 327 |
-
for embedding in
|
| 328 |
trainable = embedding.get("trainable", False)
|
| 329 |
if trainable:
|
| 330 |
name = embedding["name"]
|
|
@@ -332,7 +331,7 @@ class EmbeddingLayerWithFixes(nn.Module):
|
|
| 332 |
self.trainable_embeddings[name] = embedding["embedding"]
|
| 333 |
added_trainable_emb_info.append(name)
|
| 334 |
|
| 335 |
-
added_emb_info = [emb["name"] for emb in
|
| 336 |
added_emb_info = ", ".join(added_emb_info)
|
| 337 |
print(f"Successfully add external embeddings: {added_emb_info}.", "current")
|
| 338 |
|
|
@@ -460,6 +459,8 @@ def add_tokens(
|
|
| 460 |
assert len(initialize_tokens) == len(
|
| 461 |
placeholder_tokens
|
| 462 |
), "placeholder_token should be the same length as initialize_token"
|
|
|
|
|
|
|
| 463 |
for ii in range(len(placeholder_tokens)):
|
| 464 |
tokenizer.add_placeholder_token(placeholder_tokens[ii], num_vec_per_token=num_vectors_per_token)
|
| 465 |
|
|
@@ -472,6 +473,25 @@ def add_tokens(
|
|
| 472 |
assert embedding_layer is not None, (
|
| 473 |
"Do not support get embedding layer for current text encoder. " "Please check your configuration."
|
| 474 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 475 |
initialize_embedding = []
|
| 476 |
if initialize_tokens is not None:
|
| 477 |
for ii in range(len(placeholder_tokens)):
|
|
@@ -490,8 +510,12 @@ def add_tokens(
|
|
| 490 |
|
| 491 |
token_info_all = []
|
| 492 |
for ii in range(len(placeholder_tokens)):
|
|
|
|
|
|
|
| 493 |
token_info = tokenizer.get_token_info(placeholder_tokens[ii])
|
| 494 |
token_info["embedding"] = initialize_embedding[ii]
|
| 495 |
token_info["trainable"] = True
|
| 496 |
token_info_all.append(token_info)
|
|
|
|
|
|
|
| 497 |
embedding_layer.add_embeddings(token_info_all)
|
|
|
|
| 51 |
Args:
|
| 52 |
tokens (Union[str, List[str]]): The tokens to be added.
|
| 53 |
"""
|
| 54 |
+
# Check if tokens exist first to avoid assertion error in wrapped tokenizer
|
| 55 |
+
# and to ensure idempotency in shared environments.
|
| 56 |
+
if isinstance(tokens, str):
|
| 57 |
+
tokens_to_check = [tokens]
|
| 58 |
+
else:
|
| 59 |
+
tokens_to_check = tokens
|
| 60 |
+
|
| 61 |
+
# If all tokens are already in the vocabulary, skip adding them.
|
| 62 |
+
# This relies on the wrapped tokenizer's behavior or checking its vocab.
|
| 63 |
+
# Usually `add_tokens` returns 0 if all tokens exist.
|
| 64 |
+
# We just want to avoid the assertion error if they are already added.
|
| 65 |
+
|
| 66 |
num_added_tokens = self.wrapped.add_tokens(tokens, *args, **kwargs)
|
| 67 |
+
if num_added_tokens == 0:
|
| 68 |
+
# Check if they actually exist (idempotency case)
|
| 69 |
+
# If they exist, we don't assert error, just return.
|
| 70 |
+
# If they don't exist but add_tokens returned 0 (shouldn't happen for new tokens),
|
| 71 |
+
# then we might have an issue.
|
| 72 |
+
# For simplicity in fixing the leak/crash: if 0 added, assume they exist.
|
| 73 |
+
return
|
| 74 |
+
|
| 75 |
assert num_added_tokens != 0, (
|
| 76 |
f"The tokenizer already contains the token {tokens}. Please pass "
|
| 77 |
"a different `placeholder_token` that is not already in the "
|
|
|
|
| 102 |
the added placeholder token.
|
| 103 |
*args, **kwargs: The arguments for `self.wrapped.add_tokens`.
|
| 104 |
"""
|
| 105 |
+
|
| 106 |
+
# Check if already in token_map (idempotency)
|
| 107 |
+
if placeholder_token in self.token_map:
|
| 108 |
+
return
|
| 109 |
+
|
| 110 |
output = []
|
| 111 |
if num_vec_per_token == 1:
|
| 112 |
self.try_adding_tokens(placeholder_token, *args, **kwargs)
|
|
|
|
| 301 |
|
| 302 |
def add_embeddings(self, embeddings: Optional[Union[dict, List[dict]]]):
|
| 303 |
"""Add external embeddings to this layer.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
"""
|
| 305 |
if isinstance(embeddings, dict):
|
| 306 |
embeddings = [embeddings]
|
| 307 |
|
| 308 |
+
# Idempotency check: filter out embeddings that are already present by name
|
| 309 |
+
existing_names = {emb["name"] for emb in self.external_embeddings}
|
| 310 |
+
new_embeddings = []
|
| 311 |
+
for emb in embeddings:
|
| 312 |
+
if emb["name"] not in existing_names:
|
| 313 |
+
new_embeddings.append(emb)
|
| 314 |
+
# Optional: Warn or check if existing embedding matches the new one?
|
| 315 |
+
# For now, assume if name exists, it's the same token being re-added.
|
| 316 |
+
|
| 317 |
+
if not new_embeddings:
|
| 318 |
+
return
|
| 319 |
+
|
| 320 |
+
self.external_embeddings += new_embeddings
|
| 321 |
self.check_duplicate_names(self.external_embeddings)
|
| 322 |
self.check_ids_overlap(self.external_embeddings)
|
| 323 |
|
| 324 |
# set for trainable
|
| 325 |
added_trainable_emb_info = []
|
| 326 |
+
for embedding in new_embeddings:
|
| 327 |
trainable = embedding.get("trainable", False)
|
| 328 |
if trainable:
|
| 329 |
name = embedding["name"]
|
|
|
|
| 331 |
self.trainable_embeddings[name] = embedding["embedding"]
|
| 332 |
added_trainable_emb_info.append(name)
|
| 333 |
|
| 334 |
+
added_emb_info = [emb["name"] for emb in new_embeddings]
|
| 335 |
added_emb_info = ", ".join(added_emb_info)
|
| 336 |
print(f"Successfully add external embeddings: {added_emb_info}.", "current")
|
| 337 |
|
|
|
|
| 459 |
assert len(initialize_tokens) == len(
|
| 460 |
placeholder_tokens
|
| 461 |
), "placeholder_token should be the same length as initialize_token"
|
| 462 |
+
|
| 463 |
+
# Safe to call multiple times (idempotent)
|
| 464 |
for ii in range(len(placeholder_tokens)):
|
| 465 |
tokenizer.add_placeholder_token(placeholder_tokens[ii], num_vec_per_token=num_vectors_per_token)
|
| 466 |
|
|
|
|
| 473 |
assert embedding_layer is not None, (
|
| 474 |
"Do not support get embedding layer for current text encoder. " "Please check your configuration."
|
| 475 |
)
|
| 476 |
+
|
| 477 |
+
# Only calculate initialization for tokens that are NOT already in the layer
|
| 478 |
+
existing_names = {emb["name"] for emb in embedding_layer.external_embeddings}
|
| 479 |
+
tokens_to_add = []
|
| 480 |
+
init_tokens_to_add = []
|
| 481 |
+
|
| 482 |
+
for ii, token in enumerate(placeholder_tokens):
|
| 483 |
+
# This check assumes the placeholder token name matches the embedding name
|
| 484 |
+
# TokenizerWrapper adds suffix _0, _1 etc if num_vec > 1.
|
| 485 |
+
# The logic below handles generic case, but here we assume 1-to-1 or we check the main token.
|
| 486 |
+
# Actually EmbeddingLayer uses specific names. TokenizerWrapper.add_placeholder_token generates them.
|
| 487 |
+
# If num_vec_per_token > 1, TokenizerWrapper generates token_0, token_1...
|
| 488 |
+
# Let's check if the embedding layer already has them.
|
| 489 |
+
|
| 490 |
+
# The original code below generated embeddings for ALL input tokens.
|
| 491 |
+
# add_embeddings will filter them out.
|
| 492 |
+
# But we need to be careful not to re-initialize them differently if they exist.
|
| 493 |
+
pass
|
| 494 |
+
|
| 495 |
initialize_embedding = []
|
| 496 |
if initialize_tokens is not None:
|
| 497 |
for ii in range(len(placeholder_tokens)):
|
|
|
|
| 510 |
|
| 511 |
token_info_all = []
|
| 512 |
for ii in range(len(placeholder_tokens)):
|
| 513 |
+
# get_token_info relies on the token being in tokenizer.
|
| 514 |
+
# add_placeholder_token ensures it's there (idempotent now).
|
| 515 |
token_info = tokenizer.get_token_info(placeholder_tokens[ii])
|
| 516 |
token_info["embedding"] = initialize_embedding[ii]
|
| 517 |
token_info["trainable"] = True
|
| 518 |
token_info_all.append(token_info)
|
| 519 |
+
|
| 520 |
+
# Idempotency is handled inside add_embeddings now
|
| 521 |
embedding_layer.add_embeddings(token_info_all)
|