Cleanup binaries before space push

- MMaDA/inference/common.py +1 -0
- MMaDA/models/modeling_omada.py +195 -0
MMaDA/inference/common.py
@@ -51,6 +51,7 @@ def build_uni_prompting(cfg) -> Tuple[UniversalPrompting, AutoTokenizer]:
             "<|mmu|>", "<|t2v|>", "<|v2v|>", "<|lvg|>",
             "<|i2i|>", "<|v2s|>", "<|s2s|>",
             "<|v2t|>", "<|s2t|>", "<|t2s|>", "<|soa|>", "<|eoa|>",
+            "<|ti2ti|>", "<|t2ti|>",
         ),
         ignore_id=-100,
         cond_dropout_prob=cfg.training.cond_dropout_prob,
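
The new "<|ti2ti|>" and "<|t2ti|>" entries register task tokens for the text+image -> text+image and text -> text+image settings used by ti2ti_generate below. A minimal lookup sketch, assuming the tokens end up in uni_prompting.sptids_dict like the other special tokens (the same pattern the model code uses for <|soi|>/<|eoi|>):

    # Hypothetical usage: sptids_dict maps a special-token string to a one-element id tensor.
    ti2ti_id = int(uni_prompting.sptids_dict["<|ti2ti|>"][0].item())
    t2ti_id = int(uni_prompting.sptids_dict["<|t2ti|>"][0].item())
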
MMaDA/models/modeling_omada.py
@@ -1802,6 +1802,201 @@ class OMadaModelLM(LLaDAModelLM):
 
         return x
 
+    @torch.no_grad()
+    def ti2ti_generate(
+        self,
+        input_ids: torch.LongTensor = None,
+        uncond_input_ids: torch.LongTensor = None,
+        attention_mask=None,
+        uncond_attention_mask=None,
+        temperature=1.0,
+        timesteps=18,
+        timesteps_text: int | None = None,
+        timesteps_image: int | None = None,
+        guidance_scale=0,
+        noise_schedule=cosine_schedule,
+        generator: torch.Generator = None,
+        config=None,
+        seq_len=1024,
+        mask_token_id=126336,
+        resolution=512,
+        codebook_size=8192,
+        uni_prompting=None,
+        **kwargs,
+    ):
+        """
+        TI2TI generation that fills masked text and image tokens; allows separate timesteps.
+        Returns (filled_tokens, decoded_texts).
+        """
+        if input_ids is None or attention_mask is None:
+            raise ValueError("input_ids and attention_mask are required for ti2ti_generate.")
+        if uni_prompting is None:
+            raise ValueError("uni_prompting is required for ti2ti_generate.")
+
+        device = input_ids.device
+        text_vocab_size = len(uni_prompting.text_tokenizer)
+        image_vocab_start = text_vocab_size
+        image_vocab_end = image_vocab_start + codebook_size
+        timesteps_text = timesteps if timesteps_text is None else timesteps_text
+        timesteps_image = timesteps if timesteps_image is None else timesteps_image
+
+        seq = input_ids.clone()
+        if attention_mask is None:
+            attn = torch.ones_like(seq, dtype=torch.long)
+        else:
+            attn = attention_mask
+        use_guidance = uncond_input_ids is not None and guidance_scale > 0
+        if use_guidance:
+            seq_uncond = uncond_input_ids.clone()
+            if uncond_attention_mask is None:
+                attn_uncond = torch.ones_like(seq_uncond, dtype=torch.long)
+            else:
+                attn_uncond = uncond_attention_mask
+        else:
+            seq_uncond = None
+            attn_uncond = None
+        total_len = seq.shape[1]
+
+        def _uniform_transfer_plan(mask_bool: torch.Tensor, steps_count: int) -> Optional[torch.Tensor]:
+            """Evenly divide masked token updates across steps."""
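+            # e.g. 10 masked tokens over 4 steps -> per-step plan [3, 3, 2, 2]:
+            # base 10 // 4 = 2 per step, remainder 10 % 4 = 2 added to the first two steps.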
+            if steps_count is None or steps_count <= 0:
+                return None
+            mask_num = mask_bool.sum(dim=1, keepdim=True)
+            if mask_num.numel() == 0:
+                return None
+            base = mask_num // steps_count
+            remainder = mask_num % steps_count
+            plan = torch.zeros(mask_num.size(0), steps_count, device=mask_bool.device, dtype=torch.int64) + base
+            for idx in range(mask_num.size(0)):
+                rem_val = remainder[idx].item()
+                if rem_val > 0:
+                    plan[idx, :rem_val] += 1
+            return plan
+
+        prompt_block_len = uni_prompting.max_text_len
+        soi_id = int(uni_prompting.sptids_dict.get("<|soi|>", torch.tensor([-1]))[0].item())
+        eoi_id = int(uni_prompting.sptids_dict.get("<|eoi|>", torch.tensor([-1]))[0].item())
+        pad_id = int(getattr(uni_prompting, "pad_id", 0))
+
+        def _locate_blocks(sample_seq: torch.Tensor, sample_attn: Optional[torch.Tensor]):
+            # Find second (target) soi/eoi pair; fallback to template formula.
+            soi_positions = (sample_seq == soi_id).nonzero(as_tuple=True)[0]
+            eoi_positions = (sample_seq == eoi_id).nonzero(as_tuple=True)[0]
+            tgt_soi = None
+            tgt_eoi = None
+            if soi_positions.numel() >= 2:
+                tgt_soi = int(soi_positions[1].item())
+                tgt_eoi_candidates = [int(e.item()) for e in eoi_positions if int(e.item()) > tgt_soi]
+                if tgt_eoi_candidates:
+                    tgt_eoi = tgt_eoi_candidates[0]
+
+            if tgt_soi is None or tgt_eoi is None:
+                # fallback: compute with pad offset the old way
+                non_pad = (sample_seq != pad_id).nonzero(as_tuple=True)
+                pad_offset = int(non_pad[0][0].item()) if len(non_pad) > 0 and non_pad[0].numel() > 0 else 0
+                tgt_soi = pad_offset + 1 + 1 + seq_len + 1 + prompt_block_len + 1  # soi before target img
+                tgt_eoi = tgt_soi + seq_len + 1  # eoi after target img
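+                # Template components assumed by the offsets: task token, <|soi|>,
+                # source image (seq_len tokens), <|eoi|>, source text (prompt_block_len
+                # tokens), then the target <|soi|>/<|eoi|> image block and target text.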
+
+            img_start_local = tgt_soi + 1
+            img_end_local = min(tgt_eoi, sample_seq.size(0))
+
+            if sample_attn is not None:
+                text_attn = sample_attn[tgt_eoi + 1 :]
+                nonzero = (text_attn != 0).nonzero(as_tuple=True)
+                if len(nonzero) > 0 and nonzero[0].numel() > 0:
+                    last_idx = int(nonzero[0][-1].item())
+                    text_end_local = tgt_eoi + 1 + last_idx + 1
+                else:
+                    text_end_local = tgt_eoi + 1 + prompt_block_len
+            else:
+                text_end_local = tgt_eoi + 1 + prompt_block_len
+            text_start_local = tgt_eoi + 1
+            text_end_local = min(text_end_local, sample_seq.size(0))
+            return img_start_local, img_end_local, text_start_local, text_end_local
+
+        img_start, img_end, text_start, text_end = _locate_blocks(seq[0], attn[0] if attn is not None else None)
+        text_indices = torch.arange(total_len, device=device)
+        initial_text_mask = (seq == mask_token_id) & (text_indices >= text_start) & (text_indices < text_end)
+        text_transfer_plan = _uniform_transfer_plan(initial_text_mask, timesteps_text)
+        text_step_idx = 0
+
+        # Simultaneous fill: at each step, update image/text masks that still remain
+        max_steps = max(timesteps_image, timesteps_text)
+        for step in range(max_steps):
+            mask_map = seq == mask_token_id
+            img_mask = mask_map & (text_indices >= img_start) & (text_indices < img_end) if step < timesteps_image else None
+            text_mask = mask_map & (text_indices >= text_start) & (text_indices < text_end) if step < timesteps_text else None
+            if not ((img_mask is not None and img_mask.any()) or (text_mask is not None and text_mask.any())):
+                break
+
+            attn_bias = (attn[:, :, None] & attn[:, None, :]).bool().unsqueeze(1)
+            logits_cond = self(seq, attention_bias=attn_bias).logits
+            if use_guidance:
+                attn_bias_uncond = (attn_uncond[:, :, None] & attn_uncond[:, None, :]).bool().unsqueeze(1)
+                logits_uncond = self(seq_uncond, attention_bias=attn_bias_uncond).logits
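+                # Classifier-free guidance in the (w + 1) parameterization:
+                # logits = cond + guidance_scale * (cond - uncond), so a
+                # guidance_scale of 0 reproduces the conditional logits.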
+
logits = logits_uncond + (guidance_scale + 1.0) * (logits_cond - logits_uncond)
|
| 1938 |
+
else:
|
| 1939 |
+
logits = logits_cond
|
| 1940 |
+
|
| 1941 |
+
if text_mask is not None and text_mask.any():
|
| 1942 |
+
logits_text = logits[..., :text_vocab_size]
|
| 1943 |
+
probs_text = logits_text.softmax(dim=-1)
|
| 1944 |
+
sampled_text = torch.multinomial(
|
| 1945 |
+
probs_text.view(-1, text_vocab_size),
|
| 1946 |
+
1,
|
| 1947 |
+
replacement=False
|
| 1948 |
+
).view(*logits_text.shape[:2])
|
| 1949 |
+
sampled_probs = torch.gather(
|
| 1950 |
+
probs_text, dim=-1, index=sampled_text.unsqueeze(-1)
|
| 1951 |
+
).squeeze(-1)
|
| 1952 |
+
candidate_seq = torch.where(text_mask, sampled_text, seq)
|
| 1953 |
+
confidence = torch.full_like(sampled_probs, float("-inf"))
|
| 1954 |
+
confidence = torch.where(text_mask, sampled_probs, confidence)
|
| 1955 |
+
if text_transfer_plan is not None and text_step_idx < text_transfer_plan.shape[1]:
|
| 1956 |
+
transfer_counts = text_transfer_plan[:, text_step_idx]
|
| 1957 |
+
else:
|
| 1958 |
+
transfer_counts = text_mask.sum(dim=1)
|
| 1959 |
+
transfer_mask = torch.zeros_like(text_mask, dtype=torch.bool)
|
| 1960 |
+
for b_idx in range(seq.shape[0]):
|
| 1961 |
+
mask_count = int(text_mask[b_idx].sum().item())
|
| 1962 |
+
if mask_count == 0:
|
| 1963 |
+
continue
|
| 1964 |
+
k = int(min(max(transfer_counts[b_idx].item(), 0), mask_count))
|
| 1965 |
+
if k <= 0:
|
| 1966 |
+
continue
|
| 1967 |
+
_, top_idx = torch.topk(confidence[b_idx], k=k)
|
| 1968 |
+
transfer_mask[b_idx, top_idx] = True
|
| 1969 |
+
if transfer_mask.any():
|
| 1970 |
+
seq = torch.where(transfer_mask, candidate_seq, seq)
|
| 1971 |
+
text_step_idx += 1
|
| 1972 |
+
|
| 1973 |
+
if img_mask is not None and img_mask.any():
|
| 1974 |
+
logits_img = logits[..., image_vocab_start:image_vocab_end]
|
| 1975 |
+
probs_img = logits_img.softmax(dim=-1)
|
| 1976 |
+
sampled_img = torch.multinomial(
|
| 1977 |
+
probs_img.view(-1, codebook_size),
|
| 1978 |
+
1,
|
| 1979 |
+
replacement=False
|
| 1980 |
+
).view(*logits_img.shape[:2]) + image_vocab_start
|
| 1981 |
+
seq = torch.where(img_mask, sampled_img, seq)
|
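+                # Note: every still-masked image position is committed in this single
+                # pass; unlike the text block, there is no per-step confidence budget here.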
+
+            if use_guidance:
+                updated_mask = torch.zeros_like(seq, dtype=torch.bool)
+                if img_mask is not None:
+                    updated_mask |= img_mask
+                if text_mask is not None:
+                    updated_mask |= text_mask
+                seq_uncond = torch.where(updated_mask, seq, seq_uncond)
+
+        # Decode text tokens from filled sequence
+        pred_texts = []
+        for row in seq:
+            text_tokens = [int(t) for t in row.tolist() if 0 <= t < text_vocab_size]
+            pred_texts.append(uni_prompting.text_tokenizer.decode(text_tokens, skip_special_tokens=True))
+
+        return seq, pred_texts
+
+
     @torch.no_grad()
     def mmu_generate_fast(self, idx=None, input_embeddings=None, max_new_tokens=128, steps=128,block_length=128, temperature=0.0, top_k=None, eot_token=None, cfg_scale=0.0, remasking='low_confidence', mask_id=126336, attention_mask=None):
         """
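
A minimal invocation sketch, assuming `model` is an OMadaModelLM, `uni_prompting` is the UniversalPrompting instance from build_uni_prompting, and `input_ids`/`attention_mask` already encode the TI2TI template with mask_token_id placeholders in the target image and text blocks; variable names and the guidance value are illustrative, not from this commit:

    # Fill the masked target blocks; returns the completed sequence and decoded texts.
    tokens, texts = model.ti2ti_generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        uncond_input_ids=uncond_input_ids,            # optional: enables classifier-free guidance
        uncond_attention_mask=uncond_attention_mask,
        guidance_scale=3.0,                           # example value
        timesteps=18,
        mask_token_id=126336,
        codebook_size=8192,
        uni_prompting=uni_prompting,
    )
    # `tokens` holds the filled sequence; ids in the image range can be shifted back by
    # len(uni_prompting.text_tokenizer) before decoding with the image tokenizer.
    print(texts[0])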