LiuZichen committed
Commit e009812 · verified · 1 Parent(s): a4ae08e

Update MagicQuill/brushnet/powerpaint_utils.py

MagicQuill/brushnet/powerpaint_utils.py CHANGED
@@ -51,7 +51,27 @@ class TokenizerWrapper:
         Args:
             tokens (Union[str, List[str]]): The tokens to be added.
         """
+        # Check if tokens exist first to avoid assertion error in wrapped tokenizer
+        # and to ensure idempotency in shared environments.
+        if isinstance(tokens, str):
+            tokens_to_check = [tokens]
+        else:
+            tokens_to_check = tokens
+
+        # If all tokens are already in the vocabulary, skip adding them.
+        # This relies on the wrapped tokenizer's behavior or checking its vocab.
+        # Usually `add_tokens` returns 0 if all tokens exist.
+        # We just want to avoid the assertion error if they are already added.
+
         num_added_tokens = self.wrapped.add_tokens(tokens, *args, **kwargs)
+        if num_added_tokens == 0:
+            # Check if they actually exist (idempotency case)
+            # If they exist, we don't assert error, just return.
+            # If they don't exist but add_tokens returned 0 (shouldn't happen for new tokens),
+            # then we might have an issue.
+            # For simplicity in fixing the leak/crash: if 0 added, assume they exist.
+            return
+
         assert num_added_tokens != 0, (
             f"The tokenizer already contains the token {tokens}. Please pass "
             "a different `placeholder_token` that is not already in the "
@@ -82,6 +102,11 @@ class TokenizerWrapper:
                 the added placeholder token.
             *args, **kwargs: The arguments for `self.wrapped.add_tokens`.
         """
+
+        # Check if already in token_map (idempotency)
+        if placeholder_token in self.token_map:
+            return
+
         output = []
         if num_vec_per_token == 1:
             self.try_adding_tokens(placeholder_token, *args, **kwargs)
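
Both TokenizerWrapper hunks above make token registration idempotent. The early return in `try_adding_tokens` leans on the Hugging Face tokenizer contract that `add_tokens` reports how many tokens were actually added and returns 0 when every requested token is already in the vocabulary. A minimal sketch of that behavior, assuming the `transformers` package and the `openai/clip-vit-base-patch32` checkpoint (neither is part of this diff):

from transformers import CLIPTokenizer

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

# First registration: the token is new, so one added token is reported.
assert tokenizer.add_tokens("my_new_token") == 1

# Re-registration: nothing is added and add_tokens returns 0. With the change above,
# try_adding_tokens now returns quietly here instead of tripping its assert.
assert tokenizer.add_tokens("my_new_token") == 0

The second hunk adds a cheaper guard one level up: `add_placeholder_token` consults `token_map`, the wrapper's record of placeholders it has already expanded into sub-tokens, and skips repeats before any vocabulary work happens. A standalone sketch of that pattern (the class below is illustrative, not the wrapper's full implementation):

class PlaceholderRegistry:
    def __init__(self):
        # placeholder -> list of generated sub-tokens, mirroring the wrapper's token_map
        self.token_map = {}

    def add_placeholder_token(self, placeholder_token: str, num_vec_per_token: int = 1):
        if placeholder_token in self.token_map:  # idempotency guard, as in the diff
            return
        if num_vec_per_token == 1:
            output = [placeholder_token]
        else:
            output = [f"{placeholder_token}_{i}" for i in range(num_vec_per_token)]
        self.token_map[placeholder_token] = output

registry = PlaceholderRegistry()
registry.add_placeholder_token("ngapi", num_vec_per_token=4)
registry.add_placeholder_token("ngapi", num_vec_per_token=4)  # second call is a no-op
assert registry.token_map["ngapi"] == ["ngapi_0", "ngapi_1", "ngapi_2", "ngapi_3"]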
@@ -276,55 +301,29 @@ class EmbeddingLayerWithFixes(nn.Module):
 
     def add_embeddings(self, embeddings: Optional[Union[dict, List[dict]]]):
         """Add external embeddings to this layer.
-
-        Use case:
-
-        >>> 1. Add token to tokenizer and get the token id.
-        >>> tokenizer = TokenizerWrapper('openai/clip-vit-base-patch32')
-        >>> # 'how much' in kiswahili
-        >>> tokenizer.add_placeholder_tokens('ngapi', num_vec_per_token=4)
-        >>>
-        >>> 2. Add external embeddings to the model.
-        >>> new_embedding = {
-        >>>     'name': 'ngapi',  # 'how much' in kiswahili
-        >>>     'embedding': torch.ones(1, 15) * 4,
-        >>>     'start': tokenizer.get_token_info('kwaheri')['start'],
-        >>>     'end': tokenizer.get_token_info('kwaheri')['end'],
-        >>>     'trainable': False  # if True, will registry as a parameter
-        >>> }
-        >>> embedding_layer = nn.Embedding(10, 15)
-        >>> embedding_layer_wrapper = EmbeddingLayerWithFixes(embedding_layer)
-        >>> embedding_layer_wrapper.add_embeddings(new_embedding)
-        >>>
-        >>> 3. Forward tokenizer and embedding layer!
-        >>> input_text = ['hello, ngapi!', 'hello my friend, ngapi?']
-        >>> input_ids = tokenizer(
-        >>>     input_text, padding='max_length', truncation=True,
-        >>>     return_tensors='pt')['input_ids']
-        >>> out_feat = embedding_layer_wrapper(input_ids)
-        >>>
-        >>> 4. Let's validate the result!
-        >>> assert (out_feat[0, 3: 7] == 2.3).all()
-        >>> assert (out_feat[2, 5: 9] == 2.3).all()
-
-        Args:
-            embeddings (Union[dict, list[dict]]): The external embeddings to
-                be added. Each dict must contain the following 4 fields: 'name'
-                (the name of this embedding), 'embedding' (the embedding
-                tensor), 'start' (the start token id of this embedding), 'end'
-                (the end token id of this embedding). For example:
-                `{name: NAME, start: START, end: END, embedding: torch.Tensor}`
         """
         if isinstance(embeddings, dict):
             embeddings = [embeddings]
 
-        self.external_embeddings += embeddings
+        # Idempotency check: filter out embeddings that are already present by name
+        existing_names = {emb["name"] for emb in self.external_embeddings}
+        new_embeddings = []
+        for emb in embeddings:
+            if emb["name"] not in existing_names:
+                new_embeddings.append(emb)
+            # Optional: Warn or check if existing embedding matches the new one?
+            # For now, assume if name exists, it's the same token being re-added.
+
+        if not new_embeddings:
+            return
+
+        self.external_embeddings += new_embeddings
         self.check_duplicate_names(self.external_embeddings)
         self.check_ids_overlap(self.external_embeddings)
 
         # set for trainable
         added_trainable_emb_info = []
-        for embedding in embeddings:
+        for embedding in new_embeddings:
             trainable = embedding.get("trainable", False)
             if trainable:
                 name = embedding["name"]
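
This hunk, together with the small one that follows, turns `add_embeddings` into a filtered append: incoming entries whose `name` is already registered are dropped, an empty remainder returns early, and the trainable bookkeeping and the log message iterate over the filtered list only. The dedup rule in isolation (the dicts and helper name are illustrative; only the "name" key is consulted):

def filter_new_embeddings(existing: list, incoming: list) -> list:
    # Keep only embeddings whose name has not been registered yet.
    existing_names = {emb["name"] for emb in existing}
    return [emb for emb in incoming if emb["name"] not in existing_names]

registered = [{"name": "ngapi_0"}, {"name": "ngapi_1"}]
incoming = [{"name": "ngapi_1"}, {"name": "kwaheri"}]
assert [e["name"] for e in filter_new_embeddings(registered, incoming)] == ["kwaheri"]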
@@ -332,7 +331,7 @@ class EmbeddingLayerWithFixes(nn.Module):
                 self.trainable_embeddings[name] = embedding["embedding"]
                 added_trainable_emb_info.append(name)
 
-        added_emb_info = [emb["name"] for emb in embeddings]
+        added_emb_info = [emb["name"] for emb in new_embeddings]
         added_emb_info = ", ".join(added_emb_info)
         print(f"Successfully add external embeddings: {added_emb_info}.", "current")
 
@@ -460,6 +459,8 @@ def add_tokens(
         assert len(initialize_tokens) == len(
             placeholder_tokens
         ), "placeholder_token should be the same length as initialize_token"
+
+    # Safe to call multiple times (idempotent)
     for ii in range(len(placeholder_tokens)):
         tokenizer.add_placeholder_token(placeholder_tokens[ii], num_vec_per_token=num_vectors_per_token)
 
@@ -472,6 +473,25 @@ def add_tokens(
     assert embedding_layer is not None, (
         "Do not support get embedding layer for current text encoder. " "Please check your configuration."
     )
+
+    # Only calculate initialization for tokens that are NOT already in the layer
+    existing_names = {emb["name"] for emb in embedding_layer.external_embeddings}
+    tokens_to_add = []
+    init_tokens_to_add = []
+
+    for ii, token in enumerate(placeholder_tokens):
+        # This check assumes the placeholder token name matches the embedding name
+        # TokenizerWrapper adds suffix _0, _1 etc if num_vec > 1.
+        # The logic below handles generic case, but here we assume 1-to-1 or we check the main token.
+        # Actually EmbeddingLayer uses specific names. TokenizerWrapper.add_placeholder_token generates them.
+        # If num_vec_per_token > 1, TokenizerWrapper generates token_0, token_1...
+        # Let's check if the embedding layer already has them.
+
+        # The original code below generated embeddings for ALL input tokens.
+        # add_embeddings will filter them out.
+        # But we need to be careful not to re-initialize them differently if they exist.
+        pass
+
     initialize_embedding = []
     if initialize_tokens is not None:
         for ii in range(len(placeholder_tokens)):
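
The block added here only collects `existing_names` and then falls through via `pass`; the actual filtering is delegated to `add_embeddings`, as the comments note. The check the comments describe, written out under the wrapper's sub-token naming convention, as a sketch only (the helper and the token names are illustrative, not part of the commit):

def already_registered(placeholder: str, num_vec: int, existing_names: set) -> bool:
    # With num_vec == 1 the placeholder itself is the token name; otherwise the
    # wrapper generates "<placeholder>_0" ... "<placeholder>_{num_vec - 1}".
    if num_vec == 1:
        generated = [placeholder]
    else:
        generated = [f"{placeholder}_{i}" for i in range(num_vec)]
    return all(name in existing_names for name in generated)

# Illustrative names only:
existing = {"P_ctxt_0", "P_ctxt_1"}
assert already_registered("P_ctxt", 2, existing)
assert not already_registered("P_obj", 2, existing)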
@@ -490,8 +510,12 @@ def add_tokens(
 
     token_info_all = []
     for ii in range(len(placeholder_tokens)):
+        # get_token_info relies on the token being in tokenizer.
+        # add_placeholder_token ensures it's there (idempotent now).
         token_info = tokenizer.get_token_info(placeholder_tokens[ii])
         token_info["embedding"] = initialize_embedding[ii]
         token_info["trainable"] = True
         token_info_all.append(token_info)
+
+    # Idempotency is handled inside add_embeddings now
     embedding_layer.add_embeddings(token_info_all)
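
Taken together, the changes make re-running the token setup against an already-patched tokenizer and text encoder safe: placeholders already in `token_map` are skipped, and embeddings already known to the layer are filtered out by name. A usage sketch adapted from the docstring removed above; it assumes only what the diff shows about `EmbeddingLayerWithFixes` (it wraps an `nn.Embedding`, accepts a single dict, and tracks `external_embeddings`), and the import path mirrors this repository's layout:

import torch
import torch.nn as nn

from MagicQuill.brushnet.powerpaint_utils import EmbeddingLayerWithFixes  # path as in this commit

embedding_layer = nn.Embedding(10, 15)
wrapper = EmbeddingLayerWithFixes(embedding_layer)

new_embedding = {
    "name": "ngapi",
    "embedding": torch.ones(1, 15) * 4,
    "start": 10,  # illustrative ids; real values come from tokenizer.get_token_info
    "end": 11,
    "trainable": False,
}
wrapper.add_embeddings(new_embedding)
wrapper.add_embeddings(new_embedding)  # now silently skipped: the name is already registered
assert len(wrapper.external_embeddings) == 1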
 