# -*- coding: utf-8 -*-

"""# shared_subspace_encoder.py"""

from typing import Optional

import torch
from torch import nn

from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_utils import PreTrainedModel
from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa

from .mla import MultiheadLatentAttention, RotaryEmbedding
from .feedforward import SubspaceFeedForward
from .shared_space_config import SharedSpaceDecoderConfig

"""`RMSNorm`

From:
https://huggingface.co/deepseek-ai/DeepSeek-R1/blob/main/modeling_deepseek.py

TODO - May not need?
"""

class DeepseekV3RMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        DeepseekV3RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

def create_norm_layer(hidden_size: int, config: SharedSpaceDecoderConfig) -> nn.Module:
    """
    Create a normalization layer based on the config norm_type.
    
    Args:
        hidden_size: The dimension to normalize over
        config: Configuration containing norm_type and epsilon values
    
    Returns:
        Either a LayerNorm or RMSNorm layer
    """
    if config.norm_type == "layernorm":
        return nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
    elif config.norm_type == "rmsnorm":
        return DeepseekV3RMSNorm(hidden_size, eps=config.rms_norm_eps)
    else:
        # This should be caught by config validation, but being defensive
        raise ValueError(f"Unknown norm_type: {config.norm_type}")
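
# Usage sketch (illustrative only; not executed anywhere in this module).
# With ``config.norm_type == "rmsnorm"``:
#
#     norm = create_norm_layer(config.hidden_size, config)
#     # -> DeepseekV3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
#     y = norm(x)   # x: [batch, seq_len, hidden_size]; shape is preserved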

"""#### *PreTrainedModel"""

class SharedSpaceDecoderPreTrainedModel(PreTrainedModel):
    """
    The **PreTrainedModel object:
      - Is instantiated when TODO
      - Initializes:
        - TODO
      - Provides access to TODO
      - Executes TODO
    """

    config_class = SharedSpaceDecoderConfig
    base_model_prefix = "model"

    def _init_weights(self, module: nn.Module) -> None:
        """Weight initialization hook used by :class:`PreTrainedModel`.

        ``PreTrainedModel.post_init`` will recursively apply this function to
        every submodule right after construction.  HuggingFace models override
        it so that creating a model from scratch yields the same initialization
        as ``from_pretrained`` when no checkpoint is supplied.

        This initialization covers:
        - ``nn.Linear`` and ``nn.Embedding`` weights drawn from a normal
          distribution with std ``config.initializer_range`` (biases and
          padding rows zeroed)
        - The configurable normalization layers (LayerNorm or RMSNorm), reset
          to unit weight (and zero bias for LayerNorm)
        - Both dense and decomposed vocabulary embeddings, which are plain
          ``nn.Embedding`` / ``nn.Linear`` modules handled by the branches below
        """

        if isinstance(module, nn.Linear):
            # Standard linear layer initialization
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
                
        elif isinstance(module, nn.Embedding):
            # Initialize embeddings with normal distribution
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
                
        elif isinstance(module, DeepseekV3RMSNorm):
            # RMSNorm initialization: weight to 1.0, no bias term
            module.weight.data.fill_(1.0)
            
        elif isinstance(module, nn.LayerNorm):
            # LayerNorm initialization: bias to 0, weight to 1.0
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

"""# β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚β–‚

# Classes
"""

"""#### `*Layer`"""

class SharedSpaceDecoderLayer(nn.Module):
    """
    The **Layer object:
      - Is instantiated by :class:`SharedSpaceDecoderModel` for each
        Transformer block in the decoder.
      - Initializes:
        - ``self_attn`` – multi-head latent attention implementing either
          dense or latent projections depending on the configuration.
        - ``ffn`` – a :class:`SubspaceFeedForward` block.
        - Normalization layers (LayerNorm or RMSNorm, per ``config.norm_type``)
          for pre-attention and pre-FFN normalization.
      - Provides access to the attention and feed-forward submodules via the
        attributes ``self_attn`` and ``ffn``.
      - Executes a single decoder block in :meth:`forward`.
    """

    def __init__(self, config: SharedSpaceDecoderConfig, layer_idx: int) -> None:

        super().__init__()

        # Norm applied prior to attention.
        self.attn_input_norm = create_norm_layer(config.hidden_size, config)
        
        # Attention block
        self.self_attn = MultiheadLatentAttention(config, layer_idx)

        # Norm applied prior to FFN
        self.ffn_input_norm = create_norm_layer(config.hidden_size, config)

        # Feed-forward network used after attention
        self.ffn = SubspaceFeedForward(config, layer_idx)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor], # RoPE embeddings
        attention_mask: Optional[torch.Tensor],
    ) -> torch.Tensor:

        # ========================
        #     Self Attention
        # ========================
        residual_strm = hidden_states

        # Normalize the hidden states to create the input to attention.
        attn_input = self.attn_input_norm(hidden_states)

        # Evaluate
        attn_output = self.self_attn(
            attn_input,
            position_embeddings,
            attention_mask,
        )

        # Add the attention output back to the residual stream (the
        # un-normalized hidden_states).
        hidden_states = residual_strm + attn_output

        # ===========================
        #     Feed-Forward Network
        # ===========================
        residual_strm = hidden_states

        # Normalize the updated hidden states prior to the FFN
        ffn_input = self.ffn_input_norm(hidden_states)

        # Evaluate
        ffn_output = self.ffn(ffn_input)

        # Add the output to the un-normalized hidden states.
        hidden_states = residual_strm + ffn_output

        return hidden_states
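
# Illustrative call pattern for a single layer (a sketch, not part of the
# module; assumes ``config`` is a valid SharedSpaceDecoderConfig and ``rope``
# is the RotaryEmbedding constructed by the model below):
#
#     layer = SharedSpaceDecoderLayer(config, layer_idx=0)
#     cos, sin = rope.cos[:seq_len], rope.sin[:seq_len]
#     out = layer(hidden_states, (cos, sin), attention_mask=None)
#     # out: [batch, seq_len, hidden_size], same shape as hidden_states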

"""#### *Model"""

class SharedSpaceDecoderModel(SharedSpaceDecoderPreTrainedModel):
    """
    The **Model object:
      - Initializes:
        - The vocabulary embeddings (and optional decomposition)
        - Position embeddings (calculated in RotaryEmbedding)
        - All of the **Layer objects.
      - Provides interface to vocab embeddings.
      - Executes the whole decoder model in `forward` with causal attention.

      This is the base decoder without the language modeling head.
      Use SubspaceDecoderForCausalLM for language modeling tasks.
    """

    def __init__(self, config: SharedSpaceDecoderConfig) -> None:
        super().__init__(config)

        # ============================
        #    Vocabulary Embeddings
        # ============================
        # Decomposing the vocabulary (if enabled) defines a shared projection
        # which constrains the model to store semantic information (and
        # whatever other static token knowledge) into a limited set of
        # feature directions.

        # If we're decomposing the token embeddings,
        if config.vocab_subspace:

            # Create the embedding table. Vocabulary embeddings are learned
            # in a lower dimensional latent space.
            self.vocab_embed = nn.Embedding(
                config.vocab_size, # Number of tokens
                config.vocab_rank  # Subspace dimension
            )

            # Create the shared up-projection: selected token latents will be
            # projected up to model size.
            # vocab_proj maps [vocab_rank] -> [hidden_size]
            # (its weight tensor is [hidden_size x vocab_rank]).
            self.vocab_proj = nn.Linear(
                config.vocab_rank,  # Size of latents
                config.hidden_size, # Model size
                bias=False
            )

        # Otherwise, for a dense vocabulary,
        else:
            # Create the dense embedding table in model space.
            self.vocab_embed = nn.Embedding(
                config.vocab_size,  # Number of tokens
                config.hidden_size  # Model size
            )

            self.vocab_proj = None

        # =====================
        #   RoPE Embeddings
        # =====================

        # Pre-computes the table of RoPE embeddings, leaving them in
        # GPU memory.
        self.rope = RotaryEmbedding(config)

        # ===================
        #    Create Layers
        # ===================

        layers = []

        # For each layer,
        for i in range(config.num_hidden_layers):
            # Create a **Layer, providing the config and indicating its number.
            layers.append(
                SharedSpaceDecoderLayer(
                    config,
                    layer_idx = i
                )
            )

        # Wrap in torch ModuleList
        self.layers = nn.ModuleList(layers)

        # Run the HuggingFace post-init hook (weight initialization via
        # ``_init_weights``, etc.).
        self.post_init()

    # Agents: Do not define boilerplate helpers, e.g., get/set_input_embeddings


    def embed(self, input_ids: torch.LongTensor) -> torch.Tensor:
        """
        Return token embeddings for input ids.
        This will perform the up projection to model space if the vocabulary is
        decomposed.

        input_ids have shape [batch_size, seq_len]
        """

        # If the vocabulary is decomposed,
        if self.vocab_proj is not None:

            # Retrieve the latents
            #  input_ids: [batch_size, seq_len]
            #          x: [batch_size, seq_len, latent_dim]
            x = self.vocab_embed(input_ids)

            #  Project the latents back to model space and return.
            return self.vocab_proj(x)

        # If the vocabulary is dense,
        else:
            # Just return the embeddings.
            return self.vocab_embed(input_ids)
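
    # Note on the decomposed case: the effective dense embedding table is the
    # low-rank product
    #     E = vocab_embed.weight @ vocab_proj.weight.T
    #         [vocab, rank]      @ [rank, hidden]     -> [vocab, hidden]
    # so ``embed(input_ids)`` is equivalent to indexing into E, while only the
    # two smaller factors are stored and trained.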

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> torch.Tensor:
        """
        Run the full decoder stack with causal attention.

        Inputs:
            input_ids       [batch_size, seq_len]
            attention_mask  [batch_size, seq_len] - 1 for real tokens, 0 for padding

        Returns:
            Final decoder layer output   [batch_size, seq_len, model_size]
        """

        # Retrieve the token embeddings for this sequence.
        # These are model_size, regardless of whether the vocab is decomposed.
        hidden_states = self.embed(input_ids)

        # Retrieve the rotary position embeddings for all of the positions in
        # our current input sequence.

        seq_len = hidden_states.size(1)

        # Retrieves just the ones necessary for the sequence length of the
        # input. These are vectors, two per token. Their length is the
        # number of head dimensions we're applying RoPE to.
        #  Input
        #     cos: [max_seq_len, rope_dims]
        #     sin: [max_seq_len, rope_dims]
        #  Outputs:
        #     R_cos [seq_len, rope_dims]
        #     R_sin [seq_len, rope_dims]
        R_cos = self.rope.cos[:seq_len]
        R_sin = self.rope.sin[:seq_len]


        # ===============================
        #   Attention Mask Conversion
        # ===============================

        """
        use_sdpa_attention_masks = (
            self.attn_implementation == "sdpa"
            and self.position_embedding_type == "absolute"
            and head_mask is None
            and not output_attentions
        )
        """

        # Expand the attention mask
        #if use_sdpa_attention_masks and attention_mask.dim() == 2:
        if True:
            # Expand the attention mask for SDPA.
            # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
            extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                attention_mask,
                hidden_states.dtype,
                tgt_len = seq_len
            )
            attention_mask = extended_attention_mask


        # Run the model!

        # For each decoder layer,
        for layer_i, layer in enumerate(self.layers):

            # Evaluate the layer
            hidden_states = layer(
                hidden_states,       # Token embeddings
                (R_cos, R_sin),      # Rope embeddings, passed as a tuple.
                attention_mask,      # Attn mask
            )

        # Return the final output of the decoder stack.
        return hidden_states
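

# -----------------------------------------------------------------------------
# Minimal smoke test (illustrative sketch only). The constructor arguments of
# SharedSpaceDecoderConfig are defined in shared_space_config.py; the
# no-argument construction below assumes usable defaults exist and may need to
# be adjusted to the real config fields. Because this file uses relative
# imports, run it as a module, e.g. ``python -m <package>.<this_module>``.
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    config = SharedSpaceDecoderConfig()          # assumption: defaults exist

    model = SharedSpaceDecoderModel(config)
    model.eval()

    batch_size, seq_len = 2, 16
    input_ids = torch.randint(0, config.vocab_size, (batch_size, seq_len))
    attention_mask = torch.ones(batch_size, seq_len, dtype=torch.long)

    with torch.no_grad():
        hidden = model(input_ids, attention_mask=attention_mask)

    # Expected: [batch_size, seq_len, hidden_size]
    print(hidden.shape)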