from typing import Any, Optional

from transformers.configuration_utils import PretrainedConfig
| __all__ = ["VoRAConfig"] | |
class VoRAConfig(PretrainedConfig):
    """Configuration class for the VoRA model.

    Holds the identifiers of the backbone LLM and auxiliary vision model,
    LoRA settings, and the vision-embedding hyperparameters consumed by the
    model implementation. Registered with ``AutoConfig`` under the
    ``"vora"`` model type.

    Args:
        llm: Name or path of the language-model backbone.
        aux_vision: Name or path of the auxiliary vision model.
        lora: LoRA configuration options. Defaults to an empty dict
            (a fresh dict per instance; ``None`` is treated as "not given").
        image_size: Input image size in pixels.
        vision_embedding_type: Identifier of the vision-embedding variant.
            Empty string means unspecified.
        vision_embedding_intermediate_size: Hidden size of the
            vision-embedding projection.
        patch_size: Vision patch size in pixels.
        vision_attention_mask: Attention-mask mode applied to vision tokens
            (default ``"bidirectional"``).
        rms_norm_eps: Epsilon used by RMSNorm layers.
        **kwargs: Forwarded to :class:`PretrainedConfig`.
    """

    model_type = "vora"
    _auto_class = "AutoConfig"

    def __init__(
        self,
        llm: str = "",
        aux_vision: str = "",
        lora: Optional[dict] = None,
        image_size: int = 448,
        vision_embedding_type: str = "",
        vision_embedding_intermediate_size: int = 1536,
        patch_size: int = 14,
        vision_attention_mask: str = "bidirectional",
        rms_norm_eps: float = 1e-5,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)
        self.llm = llm
        self.aux_vision = aux_vision
        # BUG FIX: the original used `lora: dict = {}` — a mutable default
        # argument. That single dict object is shared by every instance
        # created without an explicit `lora`, so mutating one config's
        # `.lora` would silently affect all of them. Allocate per instance.
        self.lora = {} if lora is None else lora
        self.image_size = image_size
        self.vision_embedding_type = vision_embedding_type
        self.vision_embedding_intermediate_size = vision_embedding_intermediate_size
        self.patch_size = patch_size
        self.vision_attention_mask = vision_attention_mask
        self.rms_norm_eps = rms_norm_eps