PenPaperKeyCode committed
Commit f5e4236 · 0 parent(s)

Init history
.gitattributes ADDED
@@ -0,0 +1,39 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ eval_output_dir_if_think/IFBench_multi-turn_input_response_data_hf.jsonl filter=lfs diff=lfs merge=lfs -text
37
+ eval_output_dir_dev_think/IFBench_multi-turn_input_response_data_hf.jsonl filter=lfs diff=lfs merge=lfs -text
38
+ eval_output_dir_dev_think/humaneval_plus_prediction.jsonl filter=lfs diff=lfs merge=lfs -text
39
+ eval_output_dir_if/IFBench_multi-turn_input_response_data_hf.jsonl filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,121 @@
1
+ HyperCLOVA X SEED 32B Think Model License Agreement
2
+
3
+ Model Release Date: December 29, 2025
4
+
5
+ This HyperCLOVA X SEED 32B Think Model License Agreement (the “Agreement”) is a legal agreement between you and NAVER Corporation (“Naver Corp.”) and NAVER Cloud Corporation (“Naver Cloud Corp.”) (Naver Corp. and Naver Cloud Corp. are collectively referred to as “NAVER”) and governs your use of the Models that NAVER provides to You under this Agreement.
6
+
7
+ NAVER Corp., as the holder of the intellectual property of the Model, and its affiliate, NAVER Cloud Corp., as the exclusive business operator of HyperCLOVA X, enter into this Agreement with you. NAVER and you are each a “party” and collectively the “parties.”
8
+
9
+ By using, reproducing, modifying, distributing, performing or displaying any portion or element of the Model or Derivative Model, or otherwise accepting the terms of this Agreement, you agree to be bound by this Agreement. You represent to us that you are lawfully able to enter into contracts, and if you are entering into this Agreement for an entity, that you have legal authority to bind that entity.
10
+
11
+ 1. Definitions.
12
+
13
+ 1.1. "Affiliate” means any entity directly or indirectly controlling, controlled by or under common control with either party, where “control” means the possession, directly or indirectly, of the power to independently direct or cause the direction of the management and policies of an entity, whether through ownership of more than fifty percent (50%) of the stock or other equity interests entitled to vote for representation on its board of directors, or body performing similar functions, by contract or otherwise.
14
+
15
+ 1.2. “Derivative Model” means all (i) modifications to the Model, (ii) works based on the Model, or (iii) any other machine learning model which is created by transfer of patterns of the weights, parameters, operations, or Output of the Model, to that model in order to cause that model to perform similarly to the Model, including distillation methods that use intermediate data representations or methods based on the generation of synthetic data Outputs by the Model for training that Model. For clarity, Outputs are not deemed Derivative Model.
16
+
17
+ 1.3. “Licensee” or “you” means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.
18
+
19
+ 1.4. “Model” means the foundational large language models and software and algorithms, including machine-learning model code and trained model weights distributed by NAVER.
20
+
21
+
22
+ 1.5. “Output” means the information content output of the Model or a Derivative Model that results from operating or otherwise using the Model or Derivative Model.
23
+
24
+ 2. Conditions for Use, License Grant and Restrictions
25
+
26
+ 2.1. Conditions for Use. The Model and any Derivative Model are subject to the terms of this Agreement and govern your use. If You institute copyright or patent litigation against any entity (including a crossclaim or counterclaim in a lawsuit) alleging that the Model or Derivative Model constitutes direct or contributory copyright or patent infringement, then any license granted to you under this Agreement for that Model or Derivative Model will terminate as of the date such litigation is filed. NAVER may update this Agreement to comply with legal and regulatory requirements any time and You agree to either comply with any updated license or cease your copying, use, and distribution of the Model and any Derivative Model.
27
+
28
+ 2.2. License Grant. Subject to the terms and conditions of this Agreement, NAVER hereby grants to you a non-exclusive, worldwide, non-transferable, revocable and royalty-free limited license under NAVER’s intellectual property or other rights owned by NAVER embodied in the Model to access, download, install, copy, use, reproduce, distribute, create derivative works of, and make modifications to the Model.
29
+
30
+ 2.3. Prohibited Use Policy. NAVER is committed to ensuring safety, trust, and transparency in the development and use of AI technologies. Accordingly, your use of the Model and any Derivative Models is subject to the following conditions:
31
+ (i) You must ensure that any product or service you develop, use, offer as a service, or distribute complies with all applicable laws and regulations, and is operated appropriately for the relevant industry or use case.
32
+ (ii) You must comply with the Acceptable Use Policy applicable to the Model and any Derivative Models, which is attached hereto as Addendum A and incorporated by reference into this Agreement.
33
+ (iii) NAVER expressly prohibits the use of its products or services for any purpose in violation of applicable law and regulation, including but not limited to:
34
+ (a) illegal surveillance,
35
+ (b) illegal collection or processing of biometric information without the consent of the subject which is required under applicable law, or
36
+ (c) illegal harassment, abuse, threatening or bullying of individuals or groups of individuals or intentionally misleading or deceiving others.
37
+ (iv) You must take reasonable measures to address unintended bias and to mitigate harm to others, including underrepresented or vulnerable groups.
38
+
39
+
40
+ 3. Redistribution.
41
+
42
+ 3.1. You may reproduce, distribute or make available the Model or Derivative Models thereof, or a product or service (including another AI model) that contains any of them, if you meet all of the following conditions: you must (i) include the Prohibited Use Policy referenced in Section 2.3. as an enforceable provision in any agreement (e.g., license agreement, terms of use, etc.) governing the use and/or distribution of the Model or Derivative Model and you must provide notice to subsequent users to whom you distribute that the Model or Derivative Models are subject to the use restrictions in Section 2.3., (ii) provide all third party recipients of the Model or Derivative Models a copy of this Agreement, (iii) cause any modified files to carry prominent notices stating that you modified the files; (iv) include the following attribution notice within a “Notice” text file distributed as part of such copies: “HyperCLOVA X SEED 32B Think Model is licensed under the HyperCLOVA X SEED 32B Think Model License Agreement, Copyright © NAVER Corp. All Rights Reserved.”, and (v) prominently display “Powered by HyperCLOVA X” on a related website, user interface, blogpost, about page, or product documentation. If you use the Model or any Outputs of the Model to create, train, fine-tune, or otherwise improve an AI model, which is distributed or made available, you shall also include “HyperCLOVA X” at the beginning of any such AI model name.
43
+ 3.2. You may add your own copyright statement to your modifications and, except as set forth in this Section, may provide additional or different license terms and conditions for use, reproduction, or distribution of your modifications, or for any such Derivative Models as a whole, provided your use, reproduction, and distribution of the Model or Derivative Models otherwise comply with the terms and conditions stated in this Agreement. Any additional or different terms and conditions you impose must not conflict with the terms of this Agreement.
44
+
45
+ 4. Additional Commercial Terms. If (i) as of the Model Release Date, the monthly active users of the products or services made available by or for Licensee, or Licensee’s Affiliates, is greater than 10 million monthly active users in the preceding calendar month, or (ii) the Licensee or its Affiliate distributes or makes available any product or service, which is substantially similar to or directly competes with any product and service provided by NAVER, then the Licensee must request a license from NAVER. Such a license may be granted by NAVER at its sole discretion, and the Licensee is not authorized to exercise any rights under this Agreement unless and until NAVER expressly grants you such rights.
46
+
47
+ 5. Generated Output. NAVER claims no rights in Outputs you generate using the Model. You and your use are solely responsible for Outputs and their subsequent uses.
48
+
49
+ 6. DISCLAIMER OF WARRANTY. UNLESS REQUIRED BY APPLICABLE LAW, THE MODEL AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, AND NAVER DISCLAIMS ALL WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE MODEL, DERIVATIVE MODELS, OUTPUTS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE MODEL AND ANY OUTPUTS AND RESULTS AND YOUR EXERCISE OF PERMISSION UNDER THIS AGREEMENT.
50
+
51
+ 7. LIMITATION OF LIABILITY. IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, UNLESS REQUIRED BY APPLICABLE LAW (SUCH AS IN CASES OF DELIBERATE AND GROSSLY NEGLIGENT ACTS), WILL NAVER BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY, OR PUNITIVE DAMAGES, OR LOST PROFITS OF ANY KIND, ARISING FROM OR RELATED TO THIS AGREEMENT, OR RESULTING FROM THE USE OR INABILITY TO USE THE MODEL, DERIVATIVE MODELS OR, OUTPUTS (INCLUDING, BUT NOT LIMITED TO, DAMAGES FOR LOSS OF GOODWILL, WORK STOPPAGES, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR LOSSES), EVEN IF NAVER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
52
+
53
+ 8. Indemnity. You will indemnify and hold harmless NAVER from and against any claim by any third party arising out of or related to your use or distribution of the Model, Derivative Model or Outputs.
54
+
55
+ 9. Intellectual Property.
56
+
57
+ 9.1. This Agreement does not grant permission to use the trade names, trademarks, service marks, or product names of NAVER, except as required for reasonable and customary use in describing the origin of the Model and reproducing the content of the “Notice” text file.
58
+
59
+ 9.2. NAVER Corp. owns the Model and any Derivative Model created by NAVER Corp. Except as expressly granted in this Agreement, NAVER Corp. reserves all rights, interests and remedies in connection with the Model and Derivative Model created by NAVER Corp. and no other license or right is granted to you by implication, estoppel or otherwise. Subject to NAVER Corp.’s ownership of the Model and any Derivative Model made by or for NAVER Corp., with respect to any derivative works and modifications of the Model that are made by you, as between you and NAVER Corp., you are and will be the owner of such derivative works and modifications.
60
+
61
+ 10. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Model and will continue in full force and effect until terminated in accordance with the terms and conditions of this Agreement. NAVER may terminate this Agreement if you breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the Model and Derivative Model. Section 5, 6, 7 and 10 shall survive the termination of this Agreement.
62
+
63
+ 11. Governing Law and Jurisdiction.
64
+
65
+ 11.1. This Agreement will be governed by and construed in accordance with the laws of the Republic of Korea, without regard to its conflicts of laws principles.
66
+
67
+ 11.2. Any disputes, controversies, or claims arising out of or relating to this Agreement, including its existence, validity, interpretation, performance, breach, or termination, shall be referred to and finally resolved by arbitration administered by the Korean Commercial Arbitration Board (KCAB) in accordance with the International Arbitration Rules of the Korean Commercial Arbitration Board in force at the time of the commencement of the arbitration. The seat of arbitration shall be Seoul, Republic of Korea. The tribunal shall consist of one arbitrator. The language of the arbitration shall be English. Either party may seek interim or provisional relief from a court of competent jurisdiction and doing so shall not be considered a waiver of any provision in this section. The arbitral tribunal also has the authority to issue orders for interim or provisional relief.
68
+
69
+ 12. Modifications. NAVER reserves the right to modify or amend this Agreement at any time, in its sole discretion. Any modifications will be effective upon posting the updated Agreement on our website or through other means of communication. You are responsible for reviewing the Agreement periodically for changes.
70
+
71
+ 13. No Waiver. NAVER will not be treated as having waived any rights by not exercising (or delaying the exercise of) any rights under this Agreement.
72
+
73
+
74
+
75
+ Addendum A – Acceptable Use Policy
76
+
77
+ NAVER is committed to promoting safe and responsible use of its AI technologies, including the HyperCLOVA X SEED 32B Think Model (the “Model”). By accessing or using the Model and Derivative Model (Defined in the Model License Agreement) (the Model and Derivative Model are collectively referred to as the “Models”), you agree to this Acceptable Use Policy (“Policy”).
78
+
79
+ We want everyone to use the Models safely, legally, and ethically. You agree that you will not use, or allow others to use, the Models to:
80
+
81
+ 1. Violate applicable laws or the rights of others, including by:
82
+ a. Engaging in, promoting, contributing to, encouraging, planning, inciting, or furthering illegal or unlawful activity or content, such as:
83
+  - Violence or terrorism
+  - Exploitation or harm to children, including the creation or dissemination of child exploitative content
+  - Human trafficking, exploitation, or sexual violence
+  - The unlawful distribution of obscene or harmful material to minors, or failure to apply legally required age restrictions
+  - Sexual solicitation or sexually exploitative behavior
+  - Any other criminal activity
89
+ b. Engaging in, promoting, inciting, or facilitating the harassment, abuse, threatening, or bullying of individuals or groups
90
+ c. Engaging in, promoting, inciting, or facilitating discrimination or other unlawful or harmful conduct in the provision of employment, credit, housing, or access to essential goods and services
91
+ d. Providing unauthorized or unlicensed professional services, including but not limited to financial, legal, medical/health, or related services
92
+ e. Collecting, processing, disclosing, generating, or inferring private or sensitive personal information, including identity, health, or demographic data, unless lawfully permitted under applicable laws
93
+ f. Infringing, misappropriating, or otherwise violating third-party rights, including through the generation or use of outputs derived from the Models
94
+ g. Creating, generating, or facilitating malicious code, malware, or computer viruses, or interfering with the functioning, security, or integrity of a website, application, or system
95
+ h. Intentionally bypassing or disabling usage restrictions, safety measures, or access controls imposed by NAVER
96
+
97
+ 2. Engage in or promote use cases that may pose a risk of death, bodily harm, or significant safety hazard to individuals, including use of the Models in connection with:
98
+ a. Military, warfare, nuclear technology or espionage
99
+ b. The development or distribution of firearms or illegal weapons
100
+ c. Illegal drugs or regulated controlled substances
101
+ d. Operation of critical infrastructure, transportation systems, or heavy machinery
102
+ e. Content promoting self-harm, including suicide, or eating disorders
103
+ f. Any other use intended to incite or cause physical harm
104
+
105
+ 3. Intentionally deceive or mislead others, including by:
106
+ a. Generating, promoting, or disseminating fraudulent or misleading content
107
+ b. Creating or sharing defamatory content
108
+ c. Generating or distributing spam
109
+ d. Impersonating another individual or entity without proper authorization
110
+ e. Representing Model output as human-generated
111
+ f. Generating or enabling fake online engagement, such as fake reviews or fake users
112
+
113
+ 4. Fail to disclose to end users any known risks or limitations of an AI system that incorporates the Models.
114
+
115
+ 5. Use the Models in conjunction with third-party tools, models, or software designed to generate unlawful content or conduct, or falsely represent outputs from such tools as associated with NAVER or HyperCLOVA X.
116
+
117
+ If you become aware of a violation of this Policy, a bug, or any behavior that could result in a breach of this Policy, please report it to us:
118
+
119
+ Reporting risky outputs: [email protected]
120
+ Reporting policy violations or unauthorized use: [email protected]
121
+
README.md ADDED
@@ -0,0 +1,289 @@
1
+ ---
2
+ license: other
3
+ license_name: hyperclovax
4
+ license_link: LICENSE
5
+ library_name: transformers
6
+ ---
7
+
8
+ ![image](https://cdn-uploads.huggingface.co/production/uploads/64383d54c5a91b84ece18d62/2wkHd-bv3M9Zsma_ykIf8.png)
9
+
10
+ # Overview
11
+ HyperCLOVA X SEED 32B Think is an updated vision-language thinking model that advances the [SEED Think 14B](https://huggingface.co/naver-hyperclovax/HyperCLOVAX-SEED-Think-14B) line beyond simple scaling, pairing a unified vision-language Transformer backbone with a reasoning-centric training recipe. SEED 32B Think processes text tokens and visual patches within a shared embedding space, supports long-context multimodal understanding up to 128K tokens, and provides an optional “thinking mode” for deep, controllable reasoning. Building on the earlier 14B model, SEED 32B Think further strengthens Korean-centric reasoning and agentic capabilities, improving practical reasoning quality and reliability in real-world use.
12
+
13
+ # Basic Information
14
+
15
+ - **Architecture**: Transformer-based vision-language model (VLM) architecture (Dense Model)
16
+ - **Parameters**: 32B
17
+ - **Input Format**: Text/Image/Video
18
+ - **Output Format**: Text
19
+ - **Context Length**: 128K
20
+
21
+ # Benchmarks
22
+
23
+ ![Technical report benchmark chart](https://cdn-uploads.huggingface.co/production/uploads/646acf46086023e36edce4c4/qfIKiKlFVJWyCx3Dl1qN0.png)
24
+
25
+ - General Knowledge (Korean Text): KoBalt, CLIcK, HAERAE Bench 1.0
26
+ - Vision Understanding: ChartVQA, TextVQA, K-MMBench, K-DTCBench
27
+ - Agentic Tasks: Tau^2-Airline, Tau^2-Retail, Tau^2-Telecom
28
+
29
+
30
+ # Examples
31
+ - Solving 2026 Korean CSAT Math Problem
32
+ <img src="https://cdn-uploads.huggingface.co/production/uploads/67ff242cee08737feaf18cb2/LPU8kNbYQ8FN_piQ_p6Je.jpeg" style="width: 640px;">
33
+ - Understanding Text Layout
34
+ <img src="https://cdn-uploads.huggingface.co/production/uploads/67ff242cee08737feaf18cb2/Y8lHa7s1TmJcS6F82d41L.jpeg" style="width: 640px;">
35
+ <!-- - Understanding Charts
36
+ <img src="https://cdn-uploads.huggingface.co/production/uploads/67ff242cee08737feaf18cb2/zoH2Lh6CSkgdzvXz7JaHo.jpeg" style="width: 640px;"> -->
37
+
38
+ # Inference
39
+
40
+ We provide [OmniServe](https://github.com/NAVER-Cloud-HyperCLOVA-X/OmniServe), a production-ready multimodal inference system with an OpenAI-compatible API.
41
+
42
+ ## Capabilities
43
+
44
+ - **Inputs**: Text, Image
45
+ - **Outputs**: Text
46
+
47
+ ## Requirements
48
+
49
+ - 4x NVIDIA A100 80GB
50
+ - Docker & Docker Compose
51
+ - NVIDIA Driver 525+, CUDA 12.1+
52
+
53
+ ## Installation
54
+
55
+ ```bash
56
+ # Clone OmniServe
57
+ git clone https://github.com/NAVER-Cloud-HyperCLOVA-X/OmniServe.git
58
+ cd OmniServe
59
+
60
+ # Install dependencies
61
+ pip install huggingface_hub safetensors torch openai easydict
62
+
63
+ # Download model (~60GB)
64
+ huggingface-cli download naver-hyperclovax/HyperCLOVAX-SEED-Think-32B \
65
+ --local-dir ./models/HyperCLOVAX-SEED-Think-32B
66
+
67
+ # Convert model to component format
68
+ python convert_model.py \
69
+ --input ./models/HyperCLOVAX-SEED-Think-32B \
70
+ --output ./track_a \
71
+ --track a
72
+
73
+ # Configure environment
74
+ cp .env.example .env
75
+ # Edit .env:
76
+ # VLM_MODEL_PATH=./track_a/llm/HyperCLOVAX-SEED-Think-32B
77
+ # VLM_ENCODER_VISION_MODEL_PATH=./track_a/ve/HyperCLOVAX-SEED-Think-32B
78
+
79
+ # Build and run
80
+ docker compose --profile track-a build
81
+ docker compose --profile track-a up -d
82
+
83
+ # Wait for model loading (~5 minutes)
84
+ docker compose logs -f vlm
85
+ ```
86
+
87
+ ## Basic Usage
88
+
89
+ ```python
90
+ from openai import OpenAI
91
+
92
+ client = OpenAI(
93
+ base_url="http://localhost:8000/a/v1",
94
+ api_key="not-needed"
95
+ )
96
+
97
+ # Image understanding
98
+ response = client.chat.completions.create(
99
+ model="track_a_model",
100
+ messages=[
101
+ {
102
+ "role": "user",
103
+ "content": [
104
+ {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}},
105
+ {"type": "text", "text": "Describe this image."}
106
+ ]
107
+ }
108
+ ],
109
+ max_tokens=512,
110
+ extra_body={"chat_template_kwargs": {"thinking": False}}
111
+ )
112
+
113
+ print(response.choices[0].message.content)
114
+ ```
115
+
116
+ ## Reasoning Mode
117
+
118
+ Enable chain-of-thought reasoning for complex tasks:
119
+
120
+ ```python
121
+ response = client.chat.completions.create(
122
+ model="track_a_model",
123
+ messages=[
124
+ {"role": "user", "content": "Solve step by step: 3x + 7 = 22"}
125
+ ],
126
+ max_tokens=1024,
127
+ extra_body={
128
+ "thinking_token_budget": 500,
129
+ "chat_template_kwargs": {"thinking": True}
130
+ }
131
+ )
132
+
133
+ # Response includes <think>...</think> with reasoning process
134
+ print(response.choices[0].message.content)
135
+ ```
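+
+ Since the reasoning is returned inline in the message content, a small helper (a sketch, not part of OmniServe) can separate the `<think>` block from the final answer:
+
+ ```python
+ def split_thinking(text: str):
+     """Split '<think>...</think>answer' into (reasoning, answer), per the format above."""
+     if "</think>" in text:
+         reasoning, _, answer = text.partition("</think>")
+         return reasoning.replace("<think>", "").strip(), answer.strip()
+     return "", text.strip()
+
+ reasoning, answer = split_thinking(response.choices[0].message.content)
+ print("Reasoning:", reasoning)
+ print("Answer:", answer)
+ ```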
136
+
137
+ ## More Examples
138
+
139
+ <details>
140
+ <summary>Video Understanding</summary>
141
+
142
+ ```python
143
+ response = client.chat.completions.create(
144
+ model="track_a_model",
145
+ messages=[
146
+ {
147
+ "role": "user",
148
+ "content": [
149
+ {"type": "image_url", "image_url": {"url": "https://example.com/video.mp4"}},
150
+ {"type": "text", "text": "Describe this video."}
151
+ ]
152
+ }
153
+ ],
154
+ max_tokens=512,
155
+ extra_body={"chat_template_kwargs": {"thinking": False}}
156
+ )
157
+ ```
158
+
159
+ </details>
160
+
161
+ <details>
162
+ <summary>Base64 Image Input</summary>
163
+
164
+ ```python
165
+ import base64
166
+
167
+ with open("image.png", "rb") as f:
168
+ image_b64 = base64.b64encode(f.read()).decode()
169
+
170
+ response = client.chat.completions.create(
171
+ model="track_a_model",
172
+ messages=[
173
+ {
174
+ "role": "user",
175
+ "content": [
176
+ {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
177
+ {"type": "text", "text": "What is in this image?"}
178
+ ]
179
+ }
180
+ ],
181
+ max_tokens=512,
182
+ extra_body={"chat_template_kwargs": {"thinking": False}}
183
+ )
184
+ ```
185
+
186
+ </details>
187
+
188
+ <details>
189
+ <summary>Using curl</summary>
190
+
191
+ ```bash
192
+ curl -X POST http://localhost:8000/a/v1/chat/completions \
193
+ -H "Content-Type: application/json" \
194
+ -d '{
195
+ "model": "track_a_model",
196
+ "messages": [
197
+ {
198
+ "role": "user",
199
+ "content": [
200
+ {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}},
201
+ {"type": "text", "text": "Describe this image."}
202
+ ]
203
+ }
204
+ ],
205
+ "max_tokens": 512,
206
+ "extra_body": {"chat_template_kwargs": {"thinking": false}}
207
+ }'
208
+ ```
209
+
210
+ </details>
211
+
212
+ ## Model Capabilities
213
+
214
+ | Input | Output |
215
+ |-------|--------|
216
+ | Text | Text |
217
+ | Image | Text |
218
+ | Video | Text |
219
+ | Image + Text | Text |
220
+ | Video + Text | Text |
221
+
222
+ **Features:**
223
+ - Reasoning mode with `<think>...</think>` output
224
+ - Multi-turn conversation support (see the sketch below)
225
+ - Image/Video understanding
226
+
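+ Multi-turn conversations use the standard OpenAI chat format: append the assistant's previous reply to `messages` before sending a follow-up. A minimal sketch, reusing the `client` from Basic Usage (the questions are illustrative):
+
+ ```python
+ # First turn
+ messages = [{"role": "user", "content": "What is the capital of Korea?"}]
+ first = client.chat.completions.create(
+     model="track_a_model",
+     messages=messages,
+     max_tokens=256,
+     extra_body={"chat_template_kwargs": {"thinking": False}},
+ )
+
+ # Append the assistant reply, then ask a follow-up in the same conversation
+ messages.append({"role": "assistant", "content": first.choices[0].message.content})
+ messages.append({"role": "user", "content": "What is its approximate population?"})
+
+ second = client.chat.completions.create(
+     model="track_a_model",
+     messages=messages,
+     max_tokens=256,
+     extra_body={"chat_template_kwargs": {"thinking": False}},
+ )
+ print(second.choices[0].message.content)
+ ```
+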
227
+ ## Architecture
228
+
229
+ ```
230
+ User Request
231
+ (Image/Video/Text)
232
+
233
+
234
+ ┌─────────────────────────────────────────────────────────────────────────┐
235
+ │ OmniServe │
236
+ │ POST /a/v1/chat/completions │
237
+ │ │
238
+ │ ┌──────────────────────────────────────────────────────────────────┐ │
239
+ │ │ [1] INPUT ENCODING │ │
240
+ │ │ │ │
241
+ │ │ ┌─────────────────┐ │ │
242
+ │ │ │ Vision Encoder │ │ │
243
+ │ │ └────────┬────────┘ │ │
244
+ │ │ │ embeddings │ │
245
+ │ └────────────────────────────┼─────────────────────────────────────┘ │
246
+ │ ▼ │
247
+ │ ┌──────────────┐ │
248
+ │ │ LLM (32B) │◀──── text │
249
+ │ └──────┬───────┘ │
250
+ │ │ │
251
+ │ ▼ │
252
+ │ Text Response │
253
+ │ │
254
+ └─────────────────────────────────────────────────────────────────────────┘
255
+
256
+
257
+ Response
258
+ (Text)
259
+ ```
260
+
261
+ ## Hardware Requirements
262
+
263
+ | Component | GPU | VRAM |
264
+ |-----------|-----|------|
265
+ | Vision Encoder | 1x | ~8GB |
266
+ | LLM (32B) | 2x | ~60GB |
267
+ | **Total** | **3x** | **~68GB** |
268
+
269
+ ## Key Parameters
270
+
271
+ | Parameter | Description | Default |
272
+ |-----------|-------------|---------|
273
+ | `chat_template_kwargs.thinking` | Enable reasoning | `false` |
274
+ | `thinking_token_budget` | Max reasoning tokens | 500 |
275
+ | `max_tokens` | Max output tokens | - |
276
+ | `temperature` | Sampling temperature | 0.7 |
277
+
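+ As a quick reference, here is a sketch combining the parameters above in a single request (values are illustrative; `client` is the one from Basic Usage):
+
+ ```python
+ response = client.chat.completions.create(
+     model="track_a_model",
+     messages=[{"role": "user", "content": "Compare quicksort and mergesort."}],
+     max_tokens=1024,      # max output tokens
+     temperature=0.7,      # sampling temperature
+     extra_body={
+         "thinking_token_budget": 500,                # cap on reasoning tokens
+         "chat_template_kwargs": {"thinking": True},  # enable the <think> block
+     },
+ )
+ print(response.choices[0].message.content)
+ ```
+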
278
+ For more details, see [OmniServe documentation](https://github.com/NAVER-Cloud-HyperCLOVA-X/OmniServe).
279
+
280
+
281
+ # Citation
282
+ TBU (Technical Report)
283
+
284
+ # Questions
285
+ For any other questions, please feel free to contact us at [email protected].
286
+
287
+
288
+ # License
289
+ The model is licensed under the [HyperCLOVA X SEED 32B Think Model License Agreement](./LICENSE).
added_tokens.json ADDED
@@ -0,0 +1,258 @@
1
+ {
2
+ "</arg_key>": 128045,
3
+ "</arg_value>": 128047,
4
+ "</think>": 128041,
5
+ "</tool_call>": 128043,
6
+ "</tool_response>": 128049,
7
+ "</tools>": 128051,
8
+ "<EMAIL>": 128037,
9
+ "<KEY>": 128038,
10
+ "<NAME>": 128036,
11
+ "<PASSWORD>": 128039,
12
+ "<arg_key>": 128044,
13
+ "<arg_value>": 128046,
14
+ "<code_to_intermediate>": 128018,
15
+ "<empty_output>": 128017,
16
+ "<file_sep>": 128008,
17
+ "<intermediate_to_code>": 128019,
18
+ "<issue_closed>": 128011,
19
+ "<issue_comment>": 128010,
20
+ "<issue_start>": 128009,
21
+ "<jupyter_code>": 128014,
22
+ "<jupyter_output>": 128015,
23
+ "<jupyter_script>": 128016,
24
+ "<jupyter_start>": 128012,
25
+ "<jupyter_text>": 128013,
26
+ "<pr>": 128020,
27
+ "<pr_base>": 128023,
28
+ "<pr_base_code>": 128025,
29
+ "<pr_comment>": 128028,
30
+ "<pr_diff>": 128026,
31
+ "<pr_diff_hunk>": 128027,
32
+ "<pr_diff_hunk_comment_line>": 128035,
33
+ "<pr_event_id>": 128029,
34
+ "<pr_file>": 128024,
35
+ "<pr_in_reply_to_comment_id>": 128034,
36
+ "<pr_in_reply_to_review_id>": 128033,
37
+ "<pr_is_merged>": 128022,
38
+ "<pr_review>": 128030,
39
+ "<pr_review_comment>": 128032,
40
+ "<pr_review_state>": 128031,
41
+ "<pr_status>": 128021,
42
+ "<repo_name>": 128007,
43
+ "<think>": 128040,
44
+ "<tool_call>": 128042,
45
+ "<tool_response>": 128048,
46
+ "<tools>": 128050,
47
+ "<|IMAGE_PAD|>": 128060,
48
+ "<|VIDEO_PAD|>": 128061,
49
+ "<|_placeholder_067|>": 128067,
50
+ "<|_placeholder_068|>": 128068,
51
+ "<|_placeholder_069|>": 128069,
52
+ "<|_placeholder_070|>": 128070,
53
+ "<|_placeholder_071|>": 128071,
54
+ "<|_placeholder_072|>": 128072,
55
+ "<|_placeholder_073|>": 128073,
56
+ "<|_placeholder_074|>": 128074,
57
+ "<|_placeholder_075|>": 128075,
58
+ "<|_placeholder_076|>": 128076,
59
+ "<|_placeholder_077|>": 128077,
60
+ "<|_placeholder_078|>": 128078,
61
+ "<|_placeholder_079|>": 128079,
62
+ "<|_placeholder_080|>": 128080,
63
+ "<|_placeholder_081|>": 128081,
64
+ "<|_placeholder_082|>": 128082,
65
+ "<|_placeholder_083|>": 128083,
66
+ "<|_placeholder_084|>": 128084,
67
+ "<|_placeholder_085|>": 128085,
68
+ "<|_placeholder_086|>": 128086,
69
+ "<|_placeholder_087|>": 128087,
70
+ "<|_placeholder_088|>": 128088,
71
+ "<|_placeholder_089|>": 128089,
72
+ "<|_placeholder_090|>": 128090,
73
+ "<|_placeholder_091|>": 128091,
74
+ "<|_placeholder_092|>": 128092,
75
+ "<|_placeholder_093|>": 128093,
76
+ "<|_placeholder_094|>": 128094,
77
+ "<|_placeholder_095|>": 128095,
78
+ "<|_placeholder_096|>": 128096,
79
+ "<|_placeholder_097|>": 128097,
80
+ "<|_placeholder_098|>": 128098,
81
+ "<|_placeholder_099|>": 128099,
82
+ "<|_placeholder_100|>": 128100,
83
+ "<|_placeholder_101|>": 128101,
84
+ "<|_placeholder_102|>": 128102,
85
+ "<|_placeholder_103|>": 128103,
86
+ "<|_placeholder_104|>": 128104,
87
+ "<|_placeholder_105|>": 128105,
88
+ "<|_placeholder_106|>": 128106,
89
+ "<|_placeholder_107|>": 128107,
90
+ "<|_placeholder_108|>": 128108,
91
+ "<|_placeholder_109|>": 128109,
92
+ "<|_placeholder_110|>": 128110,
93
+ "<|_placeholder_111|>": 128111,
94
+ "<|_placeholder_112|>": 128112,
95
+ "<|_placeholder_113|>": 128113,
96
+ "<|_placeholder_114|>": 128114,
97
+ "<|_placeholder_115|>": 128115,
98
+ "<|_placeholder_116|>": 128116,
99
+ "<|_placeholder_117|>": 128117,
100
+ "<|_placeholder_118|>": 128118,
101
+ "<|_placeholder_119|>": 128119,
102
+ "<|_placeholder_120|>": 128120,
103
+ "<|_placeholder_121|>": 128121,
104
+ "<|_placeholder_122|>": 128122,
105
+ "<|_placeholder_123|>": 128123,
106
+ "<|_placeholder_124|>": 128124,
107
+ "<|_placeholder_125|>": 128125,
108
+ "<|_placeholder_126|>": 128126,
109
+ "<|_placeholder_127|>": 128127,
110
+ "<|_placeholder_128|>": 128128,
111
+ "<|_placeholder_129|>": 128129,
112
+ "<|_placeholder_130|>": 128130,
113
+ "<|_placeholder_131|>": 128131,
114
+ "<|_placeholder_132|>": 128132,
115
+ "<|_placeholder_133|>": 128133,
116
+ "<|_placeholder_134|>": 128134,
117
+ "<|_placeholder_135|>": 128135,
118
+ "<|_placeholder_136|>": 128136,
119
+ "<|_placeholder_137|>": 128137,
120
+ "<|_placeholder_138|>": 128138,
121
+ "<|_placeholder_139|>": 128139,
122
+ "<|_placeholder_140|>": 128140,
123
+ "<|_placeholder_141|>": 128141,
124
+ "<|_placeholder_142|>": 128142,
125
+ "<|_placeholder_143|>": 128143,
126
+ "<|_placeholder_144|>": 128144,
127
+ "<|_placeholder_145|>": 128145,
128
+ "<|_placeholder_146|>": 128146,
129
+ "<|_placeholder_147|>": 128147,
130
+ "<|_placeholder_148|>": 128148,
131
+ "<|_placeholder_149|>": 128149,
132
+ "<|_placeholder_150|>": 128150,
133
+ "<|_placeholder_151|>": 128151,
134
+ "<|_placeholder_152|>": 128152,
135
+ "<|_placeholder_153|>": 128153,
136
+ "<|_placeholder_154|>": 128154,
137
+ "<|_placeholder_155|>": 128155,
138
+ "<|_placeholder_156|>": 128156,
139
+ "<|_placeholder_157|>": 128157,
140
+ "<|_placeholder_158|>": 128158,
141
+ "<|_placeholder_159|>": 128159,
142
+ "<|_placeholder_160|>": 128160,
143
+ "<|_placeholder_161|>": 128161,
144
+ "<|_placeholder_162|>": 128162,
145
+ "<|_placeholder_163|>": 128163,
146
+ "<|_placeholder_164|>": 128164,
147
+ "<|_placeholder_165|>": 128165,
148
+ "<|_placeholder_166|>": 128166,
149
+ "<|_placeholder_167|>": 128167,
150
+ "<|_placeholder_168|>": 128168,
151
+ "<|_placeholder_169|>": 128169,
152
+ "<|_placeholder_170|>": 128170,
153
+ "<|_placeholder_171|>": 128171,
154
+ "<|_placeholder_172|>": 128172,
155
+ "<|_placeholder_173|>": 128173,
156
+ "<|_placeholder_174|>": 128174,
157
+ "<|_placeholder_175|>": 128175,
158
+ "<|_placeholder_176|>": 128176,
159
+ "<|_placeholder_177|>": 128177,
160
+ "<|_placeholder_178|>": 128178,
161
+ "<|_placeholder_179|>": 128179,
162
+ "<|_placeholder_180|>": 128180,
163
+ "<|_placeholder_181|>": 128181,
164
+ "<|_placeholder_182|>": 128182,
165
+ "<|_placeholder_183|>": 128183,
166
+ "<|_placeholder_184|>": 128184,
167
+ "<|_placeholder_185|>": 128185,
168
+ "<|_placeholder_186|>": 128186,
169
+ "<|_placeholder_187|>": 128187,
170
+ "<|_placeholder_188|>": 128188,
171
+ "<|_placeholder_189|>": 128189,
172
+ "<|_placeholder_190|>": 128190,
173
+ "<|_placeholder_191|>": 128191,
174
+ "<|_placeholder_192|>": 128192,
175
+ "<|_placeholder_193|>": 128193,
176
+ "<|_placeholder_194|>": 128194,
177
+ "<|_placeholder_195|>": 128195,
178
+ "<|_placeholder_196|>": 128196,
179
+ "<|_placeholder_197|>": 128197,
180
+ "<|_placeholder_198|>": 128198,
181
+ "<|_placeholder_199|>": 128199,
182
+ "<|_placeholder_200|>": 128200,
183
+ "<|_placeholder_201|>": 128201,
184
+ "<|_placeholder_202|>": 128202,
185
+ "<|_placeholder_203|>": 128203,
186
+ "<|_placeholder_204|>": 128204,
187
+ "<|_placeholder_205|>": 128205,
188
+ "<|_placeholder_206|>": 128206,
189
+ "<|_placeholder_207|>": 128207,
190
+ "<|_placeholder_208|>": 128208,
191
+ "<|_placeholder_209|>": 128209,
192
+ "<|_placeholder_210|>": 128210,
193
+ "<|_placeholder_211|>": 128211,
194
+ "<|_placeholder_212|>": 128212,
195
+ "<|_placeholder_213|>": 128213,
196
+ "<|_placeholder_214|>": 128214,
197
+ "<|_placeholder_215|>": 128215,
198
+ "<|_placeholder_216|>": 128216,
199
+ "<|_placeholder_217|>": 128217,
200
+ "<|_placeholder_218|>": 128218,
201
+ "<|_placeholder_219|>": 128219,
202
+ "<|_placeholder_220|>": 128220,
203
+ "<|_placeholder_221|>": 128221,
204
+ "<|_placeholder_222|>": 128222,
205
+ "<|_placeholder_223|>": 128223,
206
+ "<|_placeholder_224|>": 128224,
207
+ "<|_placeholder_225|>": 128225,
208
+ "<|_placeholder_226|>": 128226,
209
+ "<|_placeholder_227|>": 128227,
210
+ "<|_placeholder_228|>": 128228,
211
+ "<|_placeholder_229|>": 128229,
212
+ "<|_placeholder_230|>": 128230,
213
+ "<|_placeholder_231|>": 128231,
214
+ "<|_placeholder_232|>": 128232,
215
+ "<|_placeholder_233|>": 128233,
216
+ "<|_placeholder_234|>": 128234,
217
+ "<|_placeholder_235|>": 128235,
218
+ "<|_placeholder_236|>": 128236,
219
+ "<|_placeholder_237|>": 128237,
220
+ "<|_placeholder_238|>": 128238,
221
+ "<|_placeholder_239|>": 128239,
222
+ "<|_placeholder_240|>": 128240,
223
+ "<|_placeholder_241|>": 128241,
224
+ "<|_placeholder_242|>": 128242,
225
+ "<|_placeholder_243|>": 128243,
226
+ "<|_placeholder_244|>": 128244,
227
+ "<|_placeholder_245|>": 128245,
228
+ "<|_placeholder_246|>": 128246,
229
+ "<|_placeholder_247|>": 128247,
230
+ "<|_placeholder_248|>": 128248,
231
+ "<|_placeholder_249|>": 128249,
232
+ "<|_placeholder_250|>": 128250,
233
+ "<|_placeholder_251|>": 128251,
234
+ "<|_placeholder_252|>": 128252,
235
+ "<|_placeholder_253|>": 128253,
236
+ "<|_placeholder_254|>": 128254,
237
+ "<|_placeholder_255|>": 128255,
238
+ "<|back_translation|>": 128065,
239
+ "<|code_switching|>": 128064,
240
+ "<|document_end|>": 128055,
241
+ "<|document_start|>": 128054,
242
+ "<|endofturn|>": 128003,
243
+ "<|fim_middle|>": 128005,
244
+ "<|fim_prefix|>": 128004,
245
+ "<|fim_suffix|>": 128006,
246
+ "<|im_end|>": 128001,
247
+ "<|im_start|>": 128000,
248
+ "<|image_end|>": 128057,
249
+ "<|image_start|>": 128056,
250
+ "<|instruction_pretraining|>": 128066,
251
+ "<|mime_end|>": 128053,
252
+ "<|mime_start|>": 128052,
253
+ "<|stop|>": 128002,
254
+ "<|video_end|>": 128059,
255
+ "<|video_start|>": 128058,
256
+ "<|vision_aux_end|>": 128063,
257
+ "<|vision_aux_start|>": 128062
258
+ }
chat_template.jinja ADDED
@@ -0,0 +1,147 @@
1
+ {%- set ns_img = namespace(count=0) %}
2
+ {%- set ns_vid = namespace(count=0) %}
3
+ {%- if tools %}
4
+ {{- '<|im_start|>system\n' }}
5
+ {%- if messages[0].role == 'system' %}
6
+ {%- if messages[0].content is string %}
7
+ {{- messages[0].content + '\n\n' }}
8
+ {%- elif messages[0].content is sequence %}
9
+ {%- for content_part in messages[0].content %}
10
+ {%- if content_part.type == 'text' %}
11
+ {{- content_part.text + '\n\n' }}
12
+ {%- endif %}
13
+ {%- endfor %}
14
+ {%- endif %}
15
+ {%- endif %}
16
+ {{- '# Tools\n\n' }}
17
+ {{- 'You may call one or more functions to assist with the user query.\n\n' }}
18
+ {{- 'You are provided with function signatures within <tools></tools> XML tags:\n' }}
19
+ {{- '<tools>\n' }}
20
+ {%- for tool in tools %}
21
+ {{- tool | tojson(ensure_ascii=False) }}
22
+ {%- endfor %}
23
+ {{- '\n</tools>\n\n' }}
24
+ {{- 'For each function call, output the function name and arguments within the following XML format:\n' }}
25
+ {{- '<tool_call>{function-name}\n' }}
26
+ {{- '<arg_key>{arg-key-1}</arg_key>\n' }}
27
+ {{- '<arg_value>{arg-value-1}</arg_value>\n' }}
28
+ {{- '<arg_key>{arg-key-2}</arg_key>\n' }}
29
+ {{- '<arg_value>{arg-value-2}</arg_value>\n' }}
30
+ {{- '...\n' }}
31
+ {{- '</tool_call><|im_end|>\n' }}
32
+ {%- else %}
33
+ {%- if messages[0].role == 'system' %}
34
+ {{- '<|im_start|>system\n' }}
35
+ {%- if messages[0].content is string %}
36
+ {{- messages[0].content }}
37
+ {%- elif messages[0].content is sequence %}
38
+ {%- for content_part in messages[0].content %}
39
+ {%- if content_part.type == 'text' %}
40
+ {{- content_part.text }}
41
+ {%- endif %}
42
+ {%- endfor %}
43
+ {%- endif %}
44
+ {{- '<|im_end|>\n' }}
45
+ {%- endif %}
46
+ {%- endif %}
47
+ {%- set ns = namespace(last_user_index=-1) %}
48
+ {%- for m in messages %}
49
+ {%- if m.role == 'user' %}
50
+ {%- set ns.last_user_index = loop.index0 %}
51
+ {%- endif %}
52
+ {%- endfor %}
53
+ {%- for message in messages %}
54
+ {%- set content = message.content %}
55
+ {%- if (message.role == 'system' and not loop.first) %}
56
+ {{- '<|im_start|>' + message.role + '\n' }}
57
+ {%- if content is string %}
58
+ {{- content }}
59
+ {%- elif content is sequence %}
60
+ {%- for content_part in content %}
61
+ {%- if content_part.type == 'text' %}
62
+ {{- content_part.text }}
63
+ {%- endif %}
64
+ {%- endfor %}
65
+ {%- endif %}
66
+ {{- '<|im_end|>' + '\n' }}
67
+ {%- elif message.role == 'user' %}
68
+ {{- '<|im_start|>user\n' }}
69
+ {%- if message['content'] is string %}
70
+ {{- message['content'] + '<|im_end|>\n' }}
71
+ {%- elif message['content'] is sequence %}
72
+ {%- for content in message['content'] %}
73
+ {%- if not loop.first %}
74
+ {{- '\n' }}
75
+ {%- endif %}
76
+ {%- if content['type'] == 'image' %}
77
+ {%- set image_id = 'image_%02d' % ns_img.count %}
78
+ {%- set ns_img.count = ns_img.count + 1 %}
79
+ {{- '<|mime_start|>{"id": "' + image_id + '", "type": "image/jpeg", "filename": "' + content.get('filename', "a.jpg") + '"}<|mime_end|>\n' }}
80
+ {{- '<|image_start|><|IMAGE_PAD|><|image_end|>' }}
81
+ {%- elif content['type'] == 'video' %}
82
+ {%- set video_id = 'video_%02d' % ns_vid.count %}
83
+ {%- set ns_vid.count = ns_vid.count + 1 %}
84
+ {{- '<|mime_start|>{"id": "' + video_id + '", "type": "video/mp4", "filename": "' + content.get('filename', "a.mp4") + '"}<|mime_end|>\n' }}
85
+ {{- '<|video_aux_start|>다음 중 video_duration은 비디오 길이 정보입니다. 참고하여 답변하세요. {"video_duration": ' + (content.get('video_duration') | tojson if content.get('video_duration') else '<|video_duration|>') + '}<|video_aux_end|>\n'}}
86
+ {{- '<|video_start|><|VIDEO_PAD|><|video_end|>\n'}}
87
+ {%- elif content['type'] == 'text' %}
88
+ {{- content['text'] }}
89
+ {%- endif %}
90
+ {%- endfor %}
91
+ {{- '<|im_end|>\n'}}
92
+ {%- endif %}
93
+ {%- elif message.role == 'assistant' %}
94
+ {%- set reasoning_content = '' %}
95
+ {%- if message.reasoning_content is string %}
96
+ {%- set reasoning_content = message.reasoning_content %}
97
+ {%- else %}
98
+ {%- if '</think>' in content %}
99
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
100
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
101
+ {%- endif %}
102
+ {%- endif %}
103
+ {%- if loop.index0 > ns.last_user_index %}
104
+ {%- if loop.last or reasoning_content %}
105
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
106
+ {%- else %}
107
+ {{- '<|im_start|>' + message.role + '\n' + content }}
108
+ {%- endif %}
109
+ {%- else %}
110
+ {{- '<|im_start|>' + message.role + '\n' + content }}
111
+ {%- endif %}
112
+ {%- if message.tool_calls %}
113
+ {%- for tool_call in message.tool_calls %}
114
+ {%- if not loop.first or content %}
115
+ {{- '\n' }}
116
+ {%- endif %}
117
+ {%- if tool_call.function %}
118
+ {%- set tool_call = tool_call.function %}
119
+ {%- endif %}
120
+ {{- '<tool_call>' + tool_call.name + '\n' }}
121
+ {%- set _args = tool_call.arguments %}
122
+ {%- for k, v in _args.items() %}
123
+ {{- '<arg_key>' + k + '</arg_key>\n' }}
124
+ {{- '<arg_value>' + (v | tojson(ensure_ascii=False) if v is not string else v) + '</arg_value>\n' }}
125
+ {%- endfor %}
126
+ {{- '</tool_call>' }}
127
+ {%- endfor %}
128
+ {%- endif %}
129
+ {{- '<|im_end|>\n' }}
130
+ {%- elif message.role == 'tool' %}
131
+ {%- if loop.first or (messages[loop.index0 - 1].role != 'tool') %}
132
+ {{- '<|im_start|>tool' }}
133
+ {%- endif %}
134
+ {{- '\n<tool_response>' + message.get('name', '') + '\n' }}
135
+ {{- content }}
136
+ {{- '\n</tool_response>' }}
137
+ {%- if loop.last or (messages[loop.index0 + 1].role != 'tool') %}
138
+ {{- '<|im_end|>\n' }}
139
+ {%- endif %}
140
+ {%- endif %}
141
+ {%- endfor %}
142
+ {%- if add_generation_prompt %}
143
+ {{- '<|im_start|>assistant\n<think>\n' }}
144
+ {%- if skip_reasoning is defined and skip_reasoning is true %}
145
+ {{- '\n</think>\n\n' }}
146
+ {%- endif %}
147
+ {%- endif %}
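
The template above drives prompt construction: with `add_generation_prompt` it opens an assistant turn as `<|im_start|>assistant` followed by an open `<think>` block, and it closes that block immediately when `skip_reasoning` is true (the serving-layer `thinking` flag in the README presumably maps onto this). The Korean instruction string in the video branch roughly translates to "the video_duration below gives the video length; refer to it when answering." A minimal local-rendering sketch with Hugging Face `transformers`, assuming the tokenizer in this repo loads as published and that extra keyword arguments are forwarded to the template (as in recent `transformers` releases):

```python
from transformers import AutoTokenizer

# trust_remote_code may be needed for the custom code shipped in this repo.
tok = AutoTokenizer.from_pretrained(
    "naver-hyperclovax/HyperCLOVAX-SEED-Think-32B", trust_remote_code=True
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Solve step by step: 3x + 7 = 22"},
]

# Leave the <think> block open so the model generates its reasoning first.
prompt_thinking = tok.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# Close the <think> block up front to skip reasoning (the skip_reasoning branch above).
prompt_direct = tok.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True, skip_reasoning=True
)

print(prompt_thinking)
print(prompt_direct)
```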
config.json ADDED
@@ -0,0 +1,229 @@
1
+ {
2
+ "anyres": false,
3
+ "architectures": [
4
+ "HCXVisionV2ForCausalLM"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_vlm.HCXVisionConfig",
8
+ "AutoModelForCausalLM": "modeling_vlm.HCXVisionForCausalLM",
9
+ "AutoModelForSequenceClassification": "modeling_vlm.HCXVisionForSequenceClassification",
10
+ "AutoModelForTokenClassification": "modeling_vlm.HCXVisionForTokenClassification"
11
+ },
12
+ "bos_token_id": 0,
13
+ "eos_token_id": 128001,
14
+ "freeze_before_sampler": false,
15
+ "freeze_decoder": false,
16
+ "freeze_encoder": true,
17
+ "freeze_mm_projector": false,
18
+ "hidden_size": 5120,
19
+ "ignore_index": -100,
20
+ "image_token_id": 128060,
21
+ "img_start_id": 128060,
22
+ "is_safetensor_save": true,
23
+ "max_num_grids": -1,
24
+ "mm_projector_type": "linear",
25
+ "model_type": "vlm",
26
+ "num_queries_vis_abstractor": -1,
27
+ "pad_token_id": 0,
28
+ "possible_resolutions": [],
29
+ "proj_pos_emb": true,
30
+ "proj_prenorm": false,
31
+ "q_former_model_name_or_path": null,
32
+ "text_config": {
33
+ "add_cross_attention": false,
34
+ "architectures": [
35
+ "HyperCLOVAXForCausalLM"
36
+ ],
37
+ "attention_bias": false,
38
+ "attention_dropout": 0.0,
39
+ "attention_multiplier": 0.08838834764831845,
40
+ "auto_map": {
41
+ "AutoConfig": "configuration_hyperclovax.HyperCLOVAXConfig",
42
+ "AutoModel": "modeling_hyperclovax.HyperCLOVAXModel",
43
+ "AutoModelForCausalLM": "modeling_hyperclovax.HyperCLOVAXForCausalLM"
44
+ },
45
+ "bad_words_ids": null,
46
+ "begin_suppress_tokens": null,
47
+ "bos_token_id": 128000,
48
+ "chunk_size_feed_forward": 0,
49
+ "cross_attention_hidden_size": null,
50
+ "decoder_start_token_id": null,
51
+ "diversity_penalty": 0.0,
52
+ "do_sample": false,
53
+ "dtype": "bfloat16",
54
+ "early_stopping": false,
55
+ "embedding_multiplier": 1.0,
56
+ "encoder_no_repeat_ngram_size": 0,
57
+ "end_token_id": 128001,
58
+ "eos_token_id": 128001,
59
+ "exponential_decay_length_penalty": null,
60
+ "finetuning_task": null,
61
+ "forced_bos_token_id": null,
62
+ "forced_eos_token_id": null,
63
+ "head_dim": 128,
64
+ "hidden_act": "silu",
65
+ "hidden_size": 5120,
66
+ "id2label": {
67
+ "0": "LABEL_0",
68
+ "1": "LABEL_1"
69
+ },
70
+ "initializer_range": 0.006,
71
+ "intermediate_size": 24192,
72
+ "is_decoder": false,
73
+ "is_encoder_decoder": false,
74
+ "label2id": {
75
+ "LABEL_0": 0,
76
+ "LABEL_1": 1
77
+ },
78
+ "length_penalty": 1.0,
79
+ "logits_scaling": 1.0,
80
+ "max_length": 20,
81
+ "max_position_embeddings": 131072,
82
+ "min_length": 0,
83
+ "mlp_bias": false,
84
+ "model_type": "hyperclovax",
85
+ "no_repeat_ngram_size": 0,
86
+ "num_attention_heads": 40,
87
+ "num_beam_groups": 1,
88
+ "num_beams": 1,
89
+ "num_hidden_layers": 72,
90
+ "num_key_value_heads": 8,
91
+ "num_return_sequences": 1,
92
+ "output_attentions": false,
93
+ "output_hidden_states": false,
94
+ "output_scores": false,
95
+ "pad_token_id": 0,
96
+ "prefix": null,
97
+ "pretraining_tp": 1,
98
+ "problem_type": null,
99
+ "pruned_heads": {},
100
+ "remove_invalid_values": false,
101
+ "repetition_penalty": 1.0,
102
+ "resid_pdrop": 0.2,
103
+ "residual_multiplier": 1.0,
104
+ "return_dict": true,
105
+ "return_dict_in_generate": false,
106
+ "rms_norm_eps": 1e-05,
107
+ "rope_scaling": null,
108
+ "rope_theta": 50000000,
109
+ "sep_token_id": null,
110
+ "suppress_tokens": null,
111
+ "task_specific_params": null,
112
+ "temperature": 1.0,
113
+ "tf_legacy_loss": false,
114
+ "tie_encoder_decoder": false,
115
+ "tie_word_embeddings": false,
116
+ "tokenizer_class": null,
117
+ "top_k": 50,
118
+ "top_p": 1.0,
119
+ "torch_dtype": "float32",
120
+ "torchscript": false,
121
+ "typical_p": 1.0,
122
+ "use_bfloat16": false,
123
+ "use_cache": false,
124
+ "use_post_norm": false,
125
+ "vocab_size": 128256,
126
+ "_name_or_path": "naver-hyperclovax/HyperCLOVAX-SEED-Think-32B"
127
+ },
128
+ "text_model_name_or_path": null,
129
+ "torch_dtype": "float32",
130
+ "transformers_version": "4.52.4",
131
+ "unpad": false,
132
+ "use_1x1_grid": false,
133
+ "use_nth_layer": -2,
134
+ "video_first_last_frames_slows": null,
135
+ "video_max_num_frames": null,
136
+ "video_num_queries_fast": null,
137
+ "video_num_queries_slow": null,
138
+ "video_start_id": 128061,
139
+ "video_token_id": 128061,
140
+ "vision_config": {
141
+ "add_cross_attention": false,
142
+ "anyres": false,
143
+ "architectures": [
144
+ "Qwen2_5_VisionTransformerPretrainedModel"
145
+ ],
146
+ "bad_words_ids": null,
147
+ "begin_suppress_tokens": null,
148
+ "bos_token_id": null,
149
+ "chunk_size_feed_forward": 0,
150
+ "cross_attention_hidden_size": null,
151
+ "decoder_start_token_id": null,
152
+ "depth": 32,
153
+ "diversity_penalty": 0.0,
154
+ "do_sample": false,
155
+ "early_stopping": false,
156
+ "encoder_no_repeat_ngram_size": 0,
157
+ "eos_token_id": null,
158
+ "exponential_decay_length_penalty": null,
159
+ "finetuning_task": null,
160
+ "forced_bos_token_id": null,
161
+ "forced_eos_token_id": null,
162
+ "fullatt_block_indexes": [
163
+ 7,
164
+ 15,
165
+ 23,
166
+ 31
167
+ ],
168
+ "hidden_act": "silu",
169
+ "hidden_size": 1280,
170
+ "id2label": {
171
+ "0": "LABEL_0",
172
+ "1": "LABEL_1"
173
+ },
174
+ "in_channels": 3,
175
+ "in_chans": 3,
176
+ "initializer_range": 0.02,
177
+ "intermediate_size": 3456,
178
+ "is_decoder": false,
179
+ "is_encoder_decoder": false,
180
+ "label2id": {
181
+ "LABEL_0": 0,
182
+ "LABEL_1": 1
183
+ },
184
+ "length_penalty": 1.0,
185
+ "max_length": 20,
186
+ "max_num_grids": -1,
187
+ "min_length": 0,
188
+ "model_type": "qwen2_5_vl",
189
+ "no_repeat_ngram_size": 0,
190
+ "num_beam_groups": 1,
191
+ "num_beams": 1,
192
+ "num_heads": 16,
193
+ "num_return_sequences": 1,
194
+ "out_hidden_size": 5120,
195
+ "output_attentions": false,
196
+ "output_hidden_states": false,
197
+ "output_scores": false,
198
+ "pad_token_id": null,
199
+ "patch_size": 14,
200
+ "prefix": null,
201
+ "problem_type": null,
202
+ "pruned_heads": {},
203
+ "remove_invalid_values": false,
204
+ "repetition_penalty": 1.0,
205
+ "return_dict": true,
206
+ "return_dict_in_generate": false,
207
+ "sep_token_id": null,
208
+ "spatial_merge_size": 2,
209
+ "spatial_patch_size": 14,
210
+ "suppress_tokens": null,
211
+ "task_specific_params": null,
212
+ "temperature": 1.0,
213
+ "temporal_patch_size": 2,
214
+ "tf_legacy_loss": false,
215
+ "tie_encoder_decoder": false,
216
+ "tie_word_embeddings": true,
217
+ "tokenizer_class": null,
218
+ "tokens_per_second": 2,
219
+ "top_k": 50,
220
+ "top_p": 1.0,
221
+ "torch_dtype": "float32",
222
+ "torchscript": false,
223
+ "typical_p": 1.0,
224
+ "use_bfloat16": false,
225
+ "window_size": 112
226
+ },
227
+ "vision_input_chunk_size": null,
228
+ "vision_model_name_or_path": null
229
+ }
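
For reference, a quick sketch of pulling a few architecture facts out of this file (it is plain JSON, so no custom code is needed just to inspect it; the printed values restate fields shown above):

```python
import json

# Load the config.json from this repository.
with open("config.json") as f:
    cfg = json.load(f)

print(cfg["model_type"])                               # "vlm"
print(cfg["text_config"]["num_hidden_layers"])         # 72 decoder layers
print(cfg["text_config"]["num_key_value_heads"])       # 8 (grouped-query attention)
print(cfg["text_config"]["max_position_embeddings"])   # 131072 (~128K context)
print(cfg["vision_config"]["model_type"])              # "qwen2_5_vl" vision tower
```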
configuration_hyperclovax.py ADDED
@@ -0,0 +1,228 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """LLaMA model configuration"""
21
+
22
+ from transformers.configuration_utils import PretrainedConfig
23
+
24
+ # from transformers.modeling_rope_utils import rope_config_validation
25
+ # from transformers import PretrainedConfig, rope_config_validation
26
+
27
+
28
+ class HyperCLOVAXConfig(PretrainedConfig):
29
+ r"""
30
+ This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
31
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
32
+ defaults will yield a similar configuration to that of the LLaMA-7B.
33
+
34
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
35
+ documentation from [`PretrainedConfig`] for more information.
36
+
37
+
38
+ Args:
39
+ vocab_size (`int`, *optional*, defaults to 32000):
40
+ Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
41
+ `inputs_ids` passed when calling [`LlamaModel`]
42
+ hidden_size (`int`, *optional*, defaults to 4096):
43
+ Dimension of the hidden representations.
44
+ intermediate_size (`int`, *optional*, defaults to 11008):
45
+ Dimension of the MLP representations.
46
+ num_hidden_layers (`int`, *optional*, defaults to 32):
47
+ Number of hidden layers in the Transformer decoder.
48
+ num_attention_heads (`int`, *optional*, defaults to 32):
49
+ Number of attention heads for each attention layer in the Transformer decoder.
50
+ num_key_value_heads (`int`, *optional*):
51
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
52
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
53
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
54
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
55
+ by meanpooling all the original heads within that group. For more details checkout [this
56
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
57
+ `num_attention_heads`.
58
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
59
+ The non-linear activation function (function or string) in the decoder.
60
+ max_position_embeddings (`int`, *optional*, defaults to 2048):
61
+ The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens,
62
+ Llama 2 up to 4096, CodeLlama up to 16384.
63
+ initializer_range (`float`, *optional*, defaults to 0.02):
64
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
65
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
66
+ The epsilon used by the rms normalization layers.
67
+ use_cache (`bool`, *optional*, defaults to `True`):
68
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
69
+ relevant if `config.is_decoder=True`.
70
+ pad_token_id (`int`, *optional*):
71
+ Padding token id.
72
+ bos_token_id (`int`, *optional*, defaults to 1):
73
+ Beginning of stream token id.
74
+ eos_token_id (`int`, *optional*, defaults to 2):
75
+ End of stream token id.
76
+ pretraining_tp (`int`, *optional*, defaults to 1):
77
+ Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
78
+ document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
79
+ understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
80
+ results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
81
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
82
+ Whether to tie weight embeddings
83
+ rope_theta (`float`, *optional*, defaults to 10000.0):
84
+ The base period of the RoPE embeddings.
85
+ rope_scaling (`Dict`, *optional*):
86
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope type
87
+ and expect the model to work with a longer `max_position_embeddings`, we recommend updating this value
88
+ accordingly.
89
+ Expected contents:
90
+ `rope_type` (`str`):
91
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
92
+ 'llama3'], with 'default' being the original RoPE implementation.
93
+ `factor` (`float`, *optional*):
94
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
95
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
96
+ original maximum pre-trained length.
97
+ `original_max_position_embeddings` (`int`, *optional*):
98
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
99
+ pretraining.
100
+ `attention_factor` (`float`, *optional*):
101
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
102
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
103
+ `factor` field to infer the suggested value.
104
+ `beta_fast` (`float`, *optional*):
105
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
106
+ ramp function. If unspecified, it defaults to 32.
107
+ `beta_slow` (`float`, *optional*):
108
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
109
+ ramp function. If unspecified, it defaults to 1.
110
+ `short_factor` (`List[float]`, *optional*):
111
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
112
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
113
+ size divided by the number of attention heads divided by 2
114
+ `long_factor` (`List[float]`, *optional*):
115
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (>
116
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
117
+ size divided by the number of attention heads divided by 2
118
+ `low_freq_factor` (`float`, *optional*):
119
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
120
+ `high_freq_factor` (`float`, *optional*):
121
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
122
+ attention_bias (`bool`, *optional*, defaults to `False`):
123
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
124
+ attention_dropout (`float`, *optional*, defaults to 0.0):
125
+ The dropout ratio for the attention probabilities.
126
+ mlp_bias (`bool`, *optional*, defaults to `False`):
127
+ Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
128
+ head_dim (`int`, *optional*):
129
+ The attention head dimension. If None, it will default to hidden_size // num_heads
130
+
131
+ ```python
132
+ >>> from transformers import LlamaModel, LlamaConfig
133
+
134
+ >>> # Initializing a LLaMA llama-7b style configuration
135
+ >>> configuration = LlamaConfig()
136
+
137
+ >>> # Initializing a model from the llama-7b style configuration
138
+ >>> model = LlamaModel(configuration)
139
+
140
+ >>> # Accessing the model configuration
141
+ >>> configuration = model.config
142
+ ```"""
143
+
144
+ model_type = "hyperclovax"
145
+ keys_to_ignore_at_inference = ["past_key_values"]
146
+
147
+ def __init__(
148
+ self,
149
+ vocab_size=32000,
150
+ hidden_size=4096,
151
+ intermediate_size=11008,
152
+ num_hidden_layers=32,
153
+ num_attention_heads=32,
154
+ num_key_value_heads=None,
155
+ hidden_act="silu",
156
+ max_position_embeddings=2048,
157
+ initializer_range=0.02,
158
+ rms_norm_eps=1e-6,
159
+ use_cache=True,
160
+ pad_token_id=None,
161
+ bos_token_id=1,
162
+ eos_token_id=2,
163
+ pretraining_tp=1,
164
+ tie_word_embeddings=False,
165
+ rope_theta=10000.0,
166
+ rope_scaling=None,
167
+ attention_bias=False,
168
+ attention_dropout=0.0,
169
+ mlp_bias=False,
170
+ head_dim=None,
171
+ embedding_multiplier=1.0, # mup
172
+ logits_scaling=1.0, # mup
173
+ attention_multiplier=1.0, # mup
174
+ residual_multiplier=1.0, # mup
175
+ use_post_norm=False, # post-norm
176
+ auto_map={
177
+ "AutoConfig": "configuration_hyperclovax.HyperCLOVAXConfig",
178
+ "AutoModel": "modeling_hyperclovax.HyperCLOVAXModel",
179
+ "AutoModelForCausalLM": "modeling_hyperclovax.HyperCLOVAXForCausalLM",
180
+ },
181
+ **kwargs,
182
+ ):
183
+ self.vocab_size = vocab_size
184
+ self.max_position_embeddings = max_position_embeddings
185
+ self.hidden_size = hidden_size
186
+ self.intermediate_size = intermediate_size
187
+ self.num_hidden_layers = num_hidden_layers
188
+ self.num_attention_heads = num_attention_heads
189
+
190
+ # for backward compatibility
191
+ if num_key_value_heads is None:
192
+ num_key_value_heads = num_attention_heads
193
+
194
+ self.num_key_value_heads = num_key_value_heads
195
+ self.hidden_act = hidden_act
196
+ self.initializer_range = initializer_range
197
+ self.rms_norm_eps = rms_norm_eps
198
+ self.pretraining_tp = pretraining_tp
199
+ self.use_cache = use_cache
200
+ self.rope_theta = rope_theta
201
+ self.rope_scaling = rope_scaling
202
+ self.attention_bias = attention_bias
203
+ self.attention_dropout = attention_dropout
204
+ self.mlp_bias = mlp_bias
205
+ self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
206
+ # Validate the correctness of rotary position embeddings parameters
207
+ # BC: if there is a 'type' field, copy it to 'rope_type'.
208
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
209
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
210
+ # rope_config_validation(self)
211
+
212
+ # mup
213
+ self.embedding_multiplier = embedding_multiplier
214
+ self.logits_scaling = logits_scaling
215
+ self.attention_multiplier = attention_multiplier
216
+ self.residual_multiplier = residual_multiplier
217
+
218
+ # post-norm (dual-norm)
219
+ self.use_post_norm = use_post_norm
220
+
221
+ super().__init__(
222
+ pad_token_id=pad_token_id,
223
+ bos_token_id=bos_token_id,
224
+ eos_token_id=eos_token_id,
225
+ tie_word_embeddings=tie_word_embeddings,
226
+ auto_map=auto_map,
227
+ **kwargs,
228
+ )
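
A minimal usage sketch for the configuration class above, assuming `configuration_hyperclovax.py` is importable from the working directory; every value below is an illustrative placeholder rather than the released checkpoint's setting.

from configuration_hyperclovax import HyperCLOVAXConfig

# Toy configuration; the released checkpoint ships its own config.json.
config = HyperCLOVAXConfig(
    vocab_size=32000,
    hidden_size=1024,
    intermediate_size=2816,
    num_hidden_layers=4,
    num_attention_heads=8,
    num_key_value_heads=2,            # GQA: 8 query heads share 2 KV heads
    rope_scaling={"rope_type": "dynamic", "factor": 2.0},
    use_post_norm=True,               # enable the dual-norm residual path
)
print(config.head_dim)                # 128 = hidden_size // num_attention_heads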
configuration_vlm.py ADDED
@@ -0,0 +1,111 @@
1
+ import transformers
2
+ from transformers import AutoConfig, PretrainedConfig
3
+
4
+
5
+ class HCXVisionConfig(PretrainedConfig):
6
+ model_type = "vlm"
7
+ keys_to_ignore_at_inference = ["past_key_values"]
8
+
9
+ def __init__(
10
+ self,
11
+ text_config=None,
12
+ vision_config=None,
13
+ text_model_name_or_path=None,
14
+ vision_model_name_or_path=None,
15
+ q_former_model_name_or_path=None,
16
+ mm_projector_type="mlp",
17
+ use_nth_layer=-2,
18
+ img_start_id=100271, # <|IMAGE_PAD|>
19
+ video_start_id=100270, # <|VIDEO_PAD|>
20
+ freeze_encoder=False,
21
+ freeze_decoder=False,
22
+ freeze_mm_projector=False,
23
+ anyres=False,
24
+ unpad=False,
25
+ max_num_grids=-1,
26
+ num_queries_vis_abstractor=-1,
27
+ video_num_queries_fast=None,
28
+ video_num_queries_slow=None,
29
+ video_first_last_frames_slows=None,
30
+ video_max_num_frames=None,
31
+ ignore_index=-100,
32
+ proj_pos_emb=True,
33
+ proj_prenorm=False,
34
+ use_1x1_grid=False,
35
+ possible_resolutions=[],
36
+ **kwargs,
37
+ ):
38
+ from transformers import CONFIG_MAPPING
39
+
40
+ if kwargs.get("language_config", None) is not None: # for bc
41
+ text_config = CONFIG_MAPPING[kwargs["language_config"]["model_type"]](**kwargs["language_config"])
42
+ elif text_config is None and text_model_name_or_path is not None:
43
+ text_config = AutoConfig.from_pretrained(text_model_name_or_path, trust_remote_code=True)
44
+ if vision_config is None and vision_model_name_or_path is not None:
45
+ vision_config = AutoConfig.from_pretrained(vision_model_name_or_path, trust_remote_code=True)
46
+
47
+ if isinstance(text_config, dict):
48
+ text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
49
+
50
+ if isinstance(vision_config, dict):
51
+ if vision_config["model_type"] == "qwen2_5_vl":
52
+ vision_config["model_type"] = "qwen2_5_vl_visual"
53
+ assert transformers.__version__ >= "4.52.4", "please upgrade transformers to 4.52.4 or higher"
54
+ vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
55
+
56
+ self.text_config = text_config
57
+ self.vision_config = vision_config
58
+
59
+ if text_config is not None:
60
+ # DeepSpeed ZeRO-3 determines memory allocation automatically from the config's hidden_size.
61
+ self.hidden_size = text_config.hidden_size if hasattr(text_config, "hidden_size") else text_config.n_embd
62
+ # add VLM configs
63
+ self.text_model_name_or_path = text_model_name_or_path
64
+ self.vision_model_name_or_path = vision_model_name_or_path
65
+ self.q_former_model_name_or_path = q_former_model_name_or_path
66
+ self.mm_projector_type = mm_projector_type
67
+ self.use_nth_layer = use_nth_layer
68
+ self.freeze_encoder = freeze_encoder
69
+ self.freeze_decoder = freeze_decoder
70
+ self.freeze_mm_projector = freeze_mm_projector
71
+ self.anyres = anyres
72
+ self.unpad = unpad
73
+ self.max_num_grids = max_num_grids
74
+ self.num_queries_vis_abstractor = num_queries_vis_abstractor
75
+ self.video_num_queries_fast = video_num_queries_fast
76
+ self.video_num_queries_slow = video_num_queries_slow
77
+ self.video_first_last_frames_slows = video_first_last_frames_slows
78
+ self.video_max_num_frames = video_max_num_frames
79
+ self.img_start_id = img_start_id
80
+ self.image_token_id = img_start_id
81
+ self.video_start_id = video_start_id
82
+ self.video_token_id = video_start_id
83
+ self.ignore_index = ignore_index
84
+ self.proj_pos_emb = proj_pos_emb
85
+ self.proj_prenorm = proj_prenorm
86
+ self.use_1x1_grid = use_1x1_grid
87
+ self.possible_resolutions = possible_resolutions
88
+ super().__init__(**kwargs)
89
+ if self.text_config is not None: # needed for HCXVisionForSequenceClassification
90
+ self.pad_token_id = self.text_config.pad_token_id
91
+
92
+
93
+ AutoConfig.register("vlm", HCXVisionConfig)
94
+ try:
95
+ from .configuration_hyperclovax import HyperCLOVAXConfig
96
+
97
+ AutoConfig.register("hyperclovax", HyperCLOVAXConfig)
98
+ except Exception:  # best-effort registration; the relative import may fail outside a package
99
+ pass
100
+ try:
101
+ from transformers import CONFIG_MAPPING, MODEL_MAPPING
102
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
103
+ Qwen2_5_VisionTransformerPretrainedModel,
104
+ Qwen2_5_VLPatchMerger,
105
+ Qwen2_5_VLVisionConfig,
106
+ )
107
+
108
+ MODEL_MAPPING.register(Qwen2_5_VLVisionConfig, Qwen2_5_VisionTransformerPretrainedModel)
109
+ CONFIG_MAPPING.register("qwen2_5_vl_visual", Qwen2_5_VLVisionConfig)
110
+ except Exception:  # qwen2_5_vl visual classes are optional; skip if this transformers version lacks them
111
+ pass
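
A sketch of composing the vision-language config above from a plain text sub-config dict, assuming `configuration_vlm.py` is importable locally; `"llama"` stands in for any model_type already present in `CONFIG_MAPPING`, and the numbers are placeholders.

from configuration_vlm import HCXVisionConfig

vlm_config = HCXVisionConfig(
    text_config={"model_type": "llama", "hidden_size": 1024,
                 "num_hidden_layers": 4, "num_attention_heads": 8},
    mm_projector_type="mlp",
    use_nth_layer=-2,                 # take the second-to-last vision layer
    img_start_id=100271,              # <|IMAGE_PAD|>
    video_start_id=100270,            # <|VIDEO_PAD|>
)
print(vlm_config.hidden_size)         # 1024, copied from the text sub-config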
generation_config.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 128000,
4
+ "eos_token_id": 128001,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.52.4",
7
+ "use_cache": false
8
+ }
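
The file above only stores generation defaults; they are read back through `GenerationConfig` at load time. A short sketch, with `path/to/checkpoint` as a placeholder for the local snapshot directory:

from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("path/to/checkpoint")
print(gen_cfg.bos_token_id, gen_cfg.eos_token_id)   # 128000 128001
gen_cfg.max_new_tokens = 512    # per-call overrides can still be passed to generate()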
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ce534529a94e7ec564597d53844faa017ecc8df3976a019dfa199326850245d
3
+ size 4950209576
model-00002-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17d38e682e956b69951fe2ac43aaeaf7e71041de1b6d63d2b5c2202493feacdc
3
+ size 4770540848
model-00003-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bf73569b760f3c1c7d75b911f26c8d18997052907d5f90e76b59d1630f6804b
3
+ size 4819716376
model-00004-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8358e640759c2b251d38d3f0b39275d26ece4500d090e7f94e781f835479643
3
+ size 4867797112
model-00005-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22983b512ee1d72c8b656a9c40f901b623ebfa8340cfd7e5e977fb64277339ed
3
+ size 4983871936
model-00006-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8da16a0bc916243c672232ee00e51f0dd928a8f9c4586622896075b1ccab3ce7
3
+ size 4782422992
model-00007-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3618232ec5fd2dac853df8d3e692e077b573f20ae62a5545856831c45e0928a3
3
+ size 4977005552
model-00008-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a5eb08aba7d9623e01c92a142927524270ae28c699d44f5eb19936ee49d8039
3
+ size 4909855088
model-00009-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7643f4d4dcbbe4915ab161ff09badafbc575c9e500ae6c7edc7a1e27d0d71793
3
+ size 4858416608
model-00010-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fcaa327125a328f34bb53f06cea5ab319f6fb9e9b02772a0eace5a3cf0580c7
3
+ size 4937737048
model-00011-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cddf84315c9c0d793223ce7643a670e920791d76ff7438145878bfa8a21a6a83
3
+ size 4817347272
model-00012-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2f015222bf0fef5b9f078b6290ad161b1cb450da7ac0cf481e59c29b4105f26
3
+ size 4824605160
model-00013-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2641a78bd72c036ed15f6e074b3e434a7f72c851854931bed431f3c06a9ac6fd
3
+ size 4976322456
model-00014-of-00014.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42af553c74487cd4b9c1344dc37b09f254bcc6ff7dc6b60ec1a414982460338e
3
+ size 3151109544
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_hyperclovax.py ADDED
@@ -0,0 +1,1866 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ import math
21
+ from typing import List, Optional, Tuple, Union
22
+
23
+ import torch
24
+ import torch.nn.functional as F
25
+ import torch.utils.checkpoint
26
+ from torch import nn
27
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
28
+ from transformers.activations import ACT2FN
29
+ from transformers.cache_utils import Cache, DynamicCache, StaticCache
30
+ from transformers.generation import GenerationMixin
31
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
32
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
33
+ from transformers.modeling_outputs import (
34
+ BaseModelOutputWithPast,
35
+ CausalLMOutputWithPast,
36
+ QuestionAnsweringModelOutput,
37
+ SequenceClassifierOutputWithPast,
38
+ TokenClassifierOutput,
39
+ )
40
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
41
+ from transformers.modeling_utils import PreTrainedModel
42
+ from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
43
+ from transformers.utils import (
44
+ add_start_docstrings,
45
+ add_start_docstrings_to_model_forward,
46
+ is_flash_attn_greater_or_equal_2_10,
47
+ is_torchdynamo_compiling,
48
+ logging,
49
+ replace_return_docstrings,
50
+ )
51
+
52
+ from .configuration_hyperclovax import HyperCLOVAXConfig
53
+
54
+ logger = logging.get_logger(__name__)
55
+
56
+ _CONFIG_FOR_DOC = "HyperCLOVAXConfig"
57
+
58
+
59
+ def _prepare_4d_causal_attention_mask_with_cache_position(
60
+ attention_mask: torch.Tensor,
61
+ sequence_length: int,
62
+ target_length: int,
63
+ dtype: torch.dtype,
64
+ device: torch.device,
65
+ min_dtype: float,
66
+ cache_position: torch.Tensor,
67
+ batch_size: int,
68
+ ):
69
+ """
70
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
71
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
72
+
73
+ Args:
74
+ attention_mask (`torch.Tensor`):
75
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
76
+ sequence_length (`int`):
77
+ The sequence length being processed.
78
+ target_length (`int`):
79
+ The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
80
+ dtype (`torch.dtype`):
81
+ The dtype to use for the 4D attention mask.
82
+ device (`torch.device`):
83
+ The device to place the 4D attention mask on.
84
+ min_dtype (`float`):
85
+ The minimum value representable with the dtype `dtype`.
86
+ cache_position (`torch.Tensor`):
87
+ Indices depicting the position of the input sequence tokens in the sequence.
88
+ batch_size (`torch.Tensor`):
89
+ Batch size.
90
+ """
91
+ if attention_mask is not None and attention_mask.dim() == 4:
92
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
93
+ causal_mask = attention_mask
94
+ else:
95
+ causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
96
+ if sequence_length != 1:
97
+ causal_mask = torch.triu(causal_mask, diagonal=1)
98
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
99
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
100
+ if attention_mask is not None:
101
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
102
+ mask_length = attention_mask.shape[-1]
103
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
104
+ padding_mask = padding_mask == 0
105
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(padding_mask, min_dtype)
106
+
107
+ return causal_mask
108
+
109
+
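
As a quick illustration of the helper above (an editorial sketch with toy tensors; the import assumes the repository files are importable, so adjust it to your setup): a 2D padding mask is expanded into an additive 4D causal mask whose blocked entries hold `min_dtype`.

import torch
from modeling_hyperclovax import _prepare_4d_causal_attention_mask_with_cache_position  # assumed import path

mask_2d = torch.tensor([[0, 1, 1], [1, 1, 1]])   # 0 marks a padded position
causal = _prepare_4d_causal_attention_mask_with_cache_position(
    attention_mask=mask_2d,
    sequence_length=3,
    target_length=3,
    dtype=torch.float32,
    device=torch.device("cpu"),
    min_dtype=torch.finfo(torch.float32).min,
    cache_position=torch.arange(3),
    batch_size=2,
)
print(causal.shape)   # torch.Size([2, 1, 3, 3])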
110
+ class HyperCLOVAXRMSNorm(nn.Module):
111
+ def __init__(self, hidden_size, eps=1e-6):
112
+ """
113
+ HyperCLOVAXRMSNorm is equivalent to T5LayerNorm
114
+ """
115
+ super().__init__()
116
+ self.weight = nn.Parameter(torch.ones(hidden_size))
117
+ self.variance_epsilon = eps
118
+
119
+ def forward(self, hidden_states):
120
+ input_dtype = hidden_states.dtype
121
+ hidden_states = hidden_states.to(torch.float32)
122
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
123
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
124
+ return self.weight * hidden_states.to(input_dtype)
125
+
126
+ def extra_repr(self):
127
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
128
+
129
+
130
+ ALL_LAYERNORM_LAYERS.append(HyperCLOVAXRMSNorm)
131
+
132
+
133
+ class HyperCLOVAXRotaryEmbedding(nn.Module):
134
+ def __init__(
135
+ self,
136
+ dim=None,
137
+ max_position_embeddings=2048,
138
+ base=10000,
139
+ device=None,
140
+ scaling_factor=1.0,
141
+ rope_type="default",
142
+ config: Optional[HyperCLOVAXConfig] = None,
143
+ ):
144
+ super().__init__()
145
+ # TODO (joao): remove the `if` below, only used for BC
146
+ self.rope_kwargs = {}
147
+ if config is None:
148
+ logger.warning_once(
149
+ "`HyperCLOVAXRotaryEmbedding` can now be fully parameterized by passing the model config through the "
150
+ "`config` argument. All other arguments will be removed in v4.46"
151
+ )
152
+ self.rope_kwargs = {
153
+ "rope_type": rope_type,
154
+ "factor": scaling_factor,
155
+ "dim": dim,
156
+ "base": base,
157
+ "max_position_embeddings": max_position_embeddings,
158
+ }
159
+ self.rope_type = rope_type
160
+ self.max_seq_len_cached = max_position_embeddings
161
+ self.original_max_seq_len = max_position_embeddings
162
+ else:
163
+ # BC: "rope_type" was originally "type"
164
+ if config.rope_scaling is not None:
165
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
166
+ else:
167
+ self.rope_type = "default"
168
+ self.max_seq_len_cached = config.max_position_embeddings
169
+ self.original_max_seq_len = config.max_position_embeddings
170
+
171
+ self.config = config
172
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
173
+
174
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
175
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
176
+ self.original_inv_freq = self.inv_freq
177
+
178
+ def _dynamic_frequency_update(self, position_ids, device):
179
+ """
180
+ dynamic RoPE layers should recompute `inv_freq` in the following situations:
181
+ 1 - growing beyond the cached sequence length (allow scaling)
182
+ 2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
183
+ """
184
+ seq_len = torch.max(position_ids) + 1
185
+ if seq_len > self.max_seq_len_cached: # growth
186
+ inv_freq, self.attention_scaling = self.rope_init_fn(
187
+ self.config, device, seq_len=seq_len, **self.rope_kwargs
188
+ )
189
+ self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
190
+ self.max_seq_len_cached = seq_len
191
+
192
+ if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
193
+ self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
194
+ self.max_seq_len_cached = self.original_max_seq_len
195
+
196
+ @torch.no_grad()
197
+ def forward(self, x, position_ids):
198
+ if "dynamic" in self.rope_type:
199
+ self._dynamic_frequency_update(position_ids, device=x.device)
200
+
201
+ # Core RoPE block
202
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
203
+ position_ids_expanded = position_ids[:, None, :].float()
204
+ # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
205
+ device_type = x.device.type
206
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
207
+ with torch.autocast(device_type=device_type, enabled=False):
208
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
209
+ emb = torch.cat((freqs, freqs), dim=-1)
210
+ cos = emb.cos()
211
+ sin = emb.sin()
212
+
213
+ # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
214
+ cos = cos * self.attention_scaling
215
+ sin = sin * self.attention_scaling
216
+
217
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
218
+
219
+
220
+ class HyperCLOVAXLinearScalingRotaryEmbedding(HyperCLOVAXRotaryEmbedding):
221
+ """HyperCLOVAXRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
222
+
223
+ def __init__(self, *args, **kwargs):
224
+ logger.warning_once(
225
+ "`HyperCLOVAXLinearScalingRotaryEmbedding` is deprecated and will be removed in v4.46. Please use "
226
+ "`HyperCLOVAXRotaryEmbedding`, which now also does linear scaling (simply pass the model config to __init__)."
227
+ )
228
+ kwargs["rope_type"] = "linear"
229
+ super().__init__(*args, **kwargs)
230
+
231
+
232
+ class HyperCLOVAXDynamicNTKScalingRotaryEmbedding(HyperCLOVAXRotaryEmbedding):
233
+ """HyperCLOVAXRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
234
+
235
+ def __init__(self, *args, **kwargs):
236
+ logger.warning_once(
237
+ "`HyperCLOVAXDynamicNTKScalingRotaryEmbedding` is deprecated and will be removed in v4.46. Please use "
238
+ "`HyperCLOVAXRotaryEmbedding`, which now also does dynamic ntk scaling (simply pass the model config to "
239
+ "__init__)."
240
+ )
241
+ kwargs["rope_type"] = "dynamic"
242
+ super().__init__(*args, **kwargs)
243
+
244
+
245
+ def rotate_half(x):
246
+ """Rotates half the hidden dims of the input."""
247
+ x1 = x[..., : x.shape[-1] // 2]
248
+ x2 = x[..., x.shape[-1] // 2 :]
249
+ return torch.cat((-x2, x1), dim=-1)
250
+
251
+
252
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
253
+ """Applies Rotary Position Embedding to the query and key tensors.
254
+
255
+ Args:
256
+ q (`torch.Tensor`): The query tensor.
257
+ k (`torch.Tensor`): The key tensor.
258
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
259
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
260
+ position_ids (`torch.Tensor`, *optional*):
261
+ Deprecated and unused.
262
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
263
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
264
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
265
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
266
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
267
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
268
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
269
+ Returns:
270
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
271
+ """
272
+ cos = cos.unsqueeze(unsqueeze_dim)
273
+ sin = sin.unsqueeze(unsqueeze_dim)
274
+ q_embed = (q * cos) + (rotate_half(q) * sin)
275
+ k_embed = (k * cos) + (rotate_half(k) * sin)
276
+ return q_embed, k_embed
277
+
278
+
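
A small shape-level sketch of `apply_rotary_pos_emb` with the default `unsqueeze_dim=1` layout described in its docstring; the frequencies mirror what `HyperCLOVAXRotaryEmbedding.forward` computes (toy sizes, editorial illustration only; adjust the import to however the file is importable in your setup).

import torch
from modeling_hyperclovax import apply_rotary_pos_emb  # assumed import path

batch, heads, seq_len, head_dim = 1, 2, 4, 8
q = torch.randn(batch, heads, seq_len, head_dim)
k = torch.randn(batch, heads, seq_len, head_dim)

inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))
freqs = torch.outer(torch.arange(seq_len).float(), inv_freq)   # (seq_len, head_dim // 2)
emb = torch.cat((freqs, freqs), dim=-1)                        # (seq_len, head_dim)
cos, sin = emb.cos()[None], emb.sin()[None]                    # (1, seq_len, head_dim)

q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin)            # cos/sin broadcast over the heads dim
assert q_rot.shape == q.shape and k_rot.shape == k.shape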
279
+ class HyperCLOVAXMLP(nn.Module):
280
+ def __init__(self, config):
281
+ super().__init__()
282
+ self.config = config
283
+ self.hidden_size = config.hidden_size
284
+ self.intermediate_size = config.intermediate_size
285
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
286
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
287
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
288
+ self.act_fn = ACT2FN[config.hidden_act]
289
+
290
+ def forward(self, x):
291
+ if self.config.pretraining_tp > 1:
292
+ slice = self.intermediate_size // self.config.pretraining_tp
293
+ gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
294
+ up_proj_slices = self.up_proj.weight.split(slice, dim=0)
295
+ down_proj_slices = self.down_proj.weight.split(slice, dim=1)
296
+
297
+ gate_proj = torch.cat([F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
298
+ up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
299
+
300
+ intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
301
+ down_proj = [
302
+ F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
303
+ ]
304
+ down_proj = sum(down_proj)
305
+ else:
306
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
307
+
308
+ return down_proj
309
+
310
+
311
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
312
+ """
313
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
314
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
315
+ """
316
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
317
+ if n_rep == 1:
318
+ return hidden_states
319
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
320
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
321
+
322
+
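
The equivalence claimed in the `repeat_kv` docstring can be checked directly (editorial sketch, toy sizes; adjust the import to your setup).

import torch
from modeling_hyperclovax import repeat_kv  # assumed import path

kv = torch.randn(2, 4, 16, 64)        # (batch, num_key_value_heads, seq_len, head_dim)
expanded = repeat_kv(kv, n_rep=3)     # -> (2, 12, 16, 64)
assert torch.equal(expanded, torch.repeat_interleave(kv, repeats=3, dim=1))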
323
+ class HyperCLOVAXAttention(nn.Module):
324
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
325
+
326
+ def __init__(self, config: HyperCLOVAXConfig, layer_idx: Optional[int] = None):
327
+ super().__init__()
328
+ self.config = config
329
+ self.layer_idx = layer_idx
330
+ if layer_idx is None:
331
+ logger.warning_once(
332
+ f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
333
+ "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
334
+ "when creating this class."
335
+ )
336
+
337
+ self.attention_dropout = config.attention_dropout
338
+ self.hidden_size = config.hidden_size
339
+ self.num_heads = config.num_attention_heads
340
+ self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
341
+ self.num_key_value_heads = config.num_key_value_heads
342
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
343
+ self.max_position_embeddings = config.max_position_embeddings
344
+ self.rope_theta = config.rope_theta
345
+ self.is_causal = True
346
+
347
+ self.scaling = config.attention_multiplier
348
+
349
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
350
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
351
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
352
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
353
+
354
+ # TODO (joao): remove in v4.46 (RoPE is computed in the model, not in the decoder layers)
355
+ self.rotary_emb = HyperCLOVAXRotaryEmbedding(config=self.config)
356
+
357
+ def forward(
358
+ self,
359
+ hidden_states: torch.Tensor,
360
+ attention_mask: Optional[torch.Tensor] = None,
361
+ position_ids: Optional[torch.LongTensor] = None,
362
+ past_key_value: Optional[Cache] = None,
363
+ output_attentions: bool = False,
364
+ use_cache: bool = False,
365
+ cache_position: Optional[torch.LongTensor] = None,
366
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
367
+ **kwargs,
368
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
369
+ bsz, q_len, _ = hidden_states.size()
370
+
371
+ if self.config.pretraining_tp > 1:
372
+ key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
373
+ query_slices = self.q_proj.weight.split(
374
+ (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
375
+ )
376
+ key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
377
+ value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
378
+
379
+ query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
380
+ query_states = torch.cat(query_states, dim=-1)
381
+
382
+ key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
383
+ key_states = torch.cat(key_states, dim=-1)
384
+
385
+ value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
386
+ value_states = torch.cat(value_states, dim=-1)
387
+
388
+ else:
389
+ query_states = self.q_proj(hidden_states)
390
+ key_states = self.k_proj(hidden_states)
391
+ value_states = self.v_proj(hidden_states)
392
+
393
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
394
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
395
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
396
+
397
+ if position_embeddings is None:
398
+ logger.warning_once(
399
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
400
+ "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
401
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
402
+ "removed and `position_embeddings` will be mandatory."
403
+ )
404
+ cos, sin = self.rotary_emb(value_states, position_ids)
405
+ else:
406
+ cos, sin = position_embeddings
407
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
408
+
409
+ if past_key_value is not None:
410
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
411
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
412
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
413
+
414
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
415
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
416
+ # attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling / math.sqrt(self.head_dim)
417
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling
418
+
419
+ if attention_mask is not None: # no matter the length, we just slice it
420
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
421
+ attn_weights = attn_weights + causal_mask
422
+
423
+ # upcast attention to fp32
424
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
425
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
426
+ attn_output = torch.matmul(attn_weights, value_states)
427
+
428
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
429
+ raise ValueError(
430
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
431
+ f" {attn_output.size()}"
432
+ )
433
+
434
+ attn_output = attn_output.transpose(1, 2).contiguous()
435
+
436
+ attn_output = attn_output.reshape(bsz, q_len, -1)
437
+
438
+ if self.config.pretraining_tp > 1:
439
+ attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
440
+ o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
441
+ attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
442
+ else:
443
+ attn_output = self.o_proj(attn_output)
444
+
445
+ if not output_attentions:
446
+ attn_weights = None
447
+
448
+ return attn_output, attn_weights, past_key_value
449
+
450
+
451
+ class HyperCLOVAXFlashAttention2(HyperCLOVAXAttention):
452
+ """
453
+ HyperCLOVAX flash attention module. This module inherits from `HyperCLOVAXAttention` as the weights of the module stay
454
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
455
+ flash attention and deal with padding tokens in case the input contains any of them.
456
+ """
457
+
458
+ def __init__(self, *args, **kwargs):
459
+ super().__init__(*args, **kwargs)
460
+
461
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
462
+ # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which became the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
463
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
464
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
465
+
466
+ def forward(
467
+ self,
468
+ hidden_states: torch.Tensor,
469
+ attention_mask: Optional[torch.LongTensor] = None,
470
+ position_ids: Optional[torch.LongTensor] = None,
471
+ past_key_value: Optional[Cache] = None,
472
+ output_attentions: bool = False,
473
+ use_cache: bool = False,
474
+ cache_position: Optional[torch.LongTensor] = None,
475
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
476
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
477
+ if isinstance(past_key_value, StaticCache):
478
+ raise ValueError(
479
+ "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
480
+ "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
481
+ )
482
+
483
+ output_attentions = False
484
+
485
+ bsz, q_len, _ = hidden_states.size()
486
+
487
+ query_states = self.q_proj(hidden_states)
488
+ key_states = self.k_proj(hidden_states)
489
+ value_states = self.v_proj(hidden_states)
490
+
491
+ # Flash attention requires the input to have the shape
492
+ # batch_size x seq_length x num_heads x head_dim
493
+ # therefore we just need to keep the original shape
494
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
495
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
496
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
497
+
498
+ if position_embeddings is None:
499
+ logger.warning_once(
500
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
501
+ "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
502
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
503
+ "removed and `position_embeddings` will be mandatory."
504
+ )
505
+ cos, sin = self.rotary_emb(value_states, position_ids)
506
+ else:
507
+ cos, sin = position_embeddings
508
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
509
+
510
+ if past_key_value is not None:
511
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
512
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
513
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
514
+
515
+ # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
516
+ # to be able to avoid many of these transpose/reshape/view.
517
+ query_states = query_states.transpose(1, 2)
518
+ key_states = key_states.transpose(1, 2)
519
+ value_states = value_states.transpose(1, 2)
520
+
521
+ dropout_rate = self.attention_dropout if self.training else 0.0
522
+
523
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
524
+ # therefore the input hidden states get silently cast to float32. Hence, we need to
525
+ # cast them back in the correct dtype just to be sure everything works as expected.
526
+ # This might slow down training & inference, so it is recommended not to cast the LayerNorms
527
+ # in fp32. (HyperCLOVAXRMSNorm handles it correctly)
528
+
529
+ input_dtype = query_states.dtype
530
+ if input_dtype == torch.float32:
531
+ if torch.is_autocast_enabled():
532
+ target_dtype = torch.get_autocast_gpu_dtype()
533
+ # Handle the case where the model is quantized
534
+ elif hasattr(self.config, "_pre_quantization_dtype"):
535
+ target_dtype = self.config._pre_quantization_dtype
536
+ else:
537
+ target_dtype = self.q_proj.weight.dtype
538
+
539
+ logger.warning_once(
540
+ f"The input hidden states seem to be silently cast to float32; this might be related to"
541
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
542
+ f" {target_dtype}."
543
+ )
544
+
545
+ query_states = query_states.to(target_dtype)
546
+ key_states = key_states.to(target_dtype)
547
+ value_states = value_states.to(target_dtype)
548
+
549
+ attn_output = _flash_attention_forward(
550
+ query_states,
551
+ key_states,
552
+ value_states,
553
+ attention_mask,
554
+ q_len,
555
+ position_ids=position_ids,
556
+ dropout=dropout_rate,
557
+ softmax_scale=self.scaling, # mup
558
+ sliding_window=getattr(self, "sliding_window", None),
559
+ use_top_left_mask=self._flash_attn_uses_top_left_mask,
560
+ is_causal=self.is_causal,
561
+ )
562
+
563
+ attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
564
+ attn_output = self.o_proj(attn_output)
565
+
566
+ if not output_attentions:
567
+ attn_weights = None
568
+
569
+ return attn_output, attn_weights, past_key_value
570
+
571
+
572
+ class HyperCLOVAXSdpaAttention(HyperCLOVAXAttention):
573
+ """
574
+ HyperCLOVAX attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
575
+ `HyperCLOVAXAttention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
576
+ SDPA API.
577
+ """
578
+
579
+ # Adapted from HyperCLOVAXAttention.forward
580
+ def forward(
581
+ self,
582
+ hidden_states: torch.Tensor,
583
+ attention_mask: Optional[torch.Tensor] = None,
584
+ position_ids: Optional[torch.LongTensor] = None,
585
+ past_key_value: Optional[Cache] = None,
586
+ output_attentions: bool = False,
587
+ use_cache: bool = False,
588
+ cache_position: Optional[torch.LongTensor] = None,
589
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
590
+ **kwargs,
591
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
592
+ if output_attentions:
593
+ # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
594
+ logger.warning_once(
595
+ "HyperCLOVAXModel is using HyperCLOVAXSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
596
+ 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
597
+ )
598
+ return super().forward(
599
+ hidden_states=hidden_states,
600
+ attention_mask=attention_mask,
601
+ position_ids=position_ids,
602
+ past_key_value=past_key_value,
603
+ output_attentions=output_attentions,
604
+ use_cache=use_cache,
605
+ cache_position=cache_position,
606
+ position_embeddings=position_embeddings,
607
+ )
608
+
609
+ bsz, q_len, _ = hidden_states.size()
610
+
611
+ query_states = self.q_proj(hidden_states)
612
+ key_states = self.k_proj(hidden_states)
613
+ value_states = self.v_proj(hidden_states)
614
+
615
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
616
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
617
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
618
+
619
+ if position_embeddings is None:
620
+ logger.warning_once(
621
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
622
+ "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
623
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
624
+ "removed and `position_embeddings` will be mandatory."
625
+ )
626
+ cos, sin = self.rotary_emb(value_states, position_ids)
627
+ else:
628
+ cos, sin = position_embeddings
629
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
630
+
631
+ if past_key_value is not None:
632
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
633
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
634
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
635
+
636
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
637
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
638
+
639
+ causal_mask = attention_mask
640
+ if attention_mask is not None:
641
+ causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
642
+
643
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
644
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
645
+ if query_states.device.type == "cuda" and causal_mask is not None:
646
+ query_states = query_states.contiguous()
647
+ key_states = key_states.contiguous()
648
+ value_states = value_states.contiguous()
649
+
650
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
651
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
652
+ is_causal = True if causal_mask is None and q_len > 1 else False
653
+
654
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
655
+ query_states,
656
+ key_states,
657
+ value_states,
658
+ attn_mask=causal_mask,
659
+ dropout_p=self.attention_dropout if self.training else 0.0,
660
+ is_causal=is_causal,
661
+ scale=self.scaling, # mup
662
+ )
663
+
664
+ attn_output = attn_output.transpose(1, 2).contiguous()
665
+ attn_output = attn_output.view(bsz, q_len, -1)
666
+
667
+ attn_output = self.o_proj(attn_output)
668
+
669
+ return attn_output, None, past_key_value
670
+
671
+
672
+ HyperCLOVAX_ATTENTION_CLASSES = {
673
+ "eager": HyperCLOVAXAttention,
674
+ "flash_attention_2": HyperCLOVAXFlashAttention2,
675
+ "sdpa": HyperCLOVAXSdpaAttention,
676
+ }
677
+
678
+
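
Which of the three classes above a decoder layer receives is driven by `config._attn_implementation`, which `from_pretrained` sets from its `attn_implementation` argument. A hedged sketch, assuming the usual Llama-style `model.layers` layout and using `path/to/checkpoint` as a placeholder:

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "path/to/checkpoint",
    attn_implementation="sdpa",       # or "eager" / "flash_attention_2"
    trust_remote_code=True,           # pulls in this repository's remote-code classes
)
print(type(model.model.layers[0].self_attn).__name__)   # HyperCLOVAXSdpaAttention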
679
+ class HyperCLOVAXDecoderLayer(nn.Module):
680
+ def __init__(self, config: HyperCLOVAXConfig, layer_idx: int):
681
+ super().__init__()
682
+ self.hidden_size = config.hidden_size
683
+
684
+ self.self_attn = HyperCLOVAX_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
685
+
686
+ self.mlp = HyperCLOVAXMLP(config)
687
+ self.input_layernorm = HyperCLOVAXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
688
+ self.post_attention_layernorm = HyperCLOVAXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
689
+
690
+ # post-norm (dual-norm)
691
+ self.use_post_norm = config.use_post_norm
692
+ if self.use_post_norm:
693
+ self.post_norm1 = HyperCLOVAXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
694
+ self.post_norm2 = HyperCLOVAXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
695
+
696
+ self.residual_multiplier = config.residual_multiplier # mup
697
+
698
+ def forward(
699
+ self,
700
+ hidden_states: torch.Tensor,
701
+ attention_mask: Optional[torch.Tensor] = None,
702
+ position_ids: Optional[torch.LongTensor] = None,
703
+ past_key_value: Optional[Cache] = None,
704
+ output_attentions: Optional[bool] = False,
705
+ use_cache: Optional[bool] = False,
706
+ cache_position: Optional[torch.LongTensor] = None,
707
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
708
+ **kwargs,
709
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
710
+ """
711
+ Args:
712
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
713
+ attention_mask (`torch.FloatTensor`, *optional*):
714
+ attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
715
+ query_sequence_length, key_sequence_length)` if default attention is used.
716
+ output_attentions (`bool`, *optional*):
717
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
718
+ returned tensors for more detail.
719
+ use_cache (`bool`, *optional*):
720
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
721
+ (see `past_key_values`).
722
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
723
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
724
+ Indices depicting the position of the input sequence tokens in the sequence
725
+ position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
726
+ Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
727
+ with `head_dim` being the embedding dimension of each attention head.
728
+ kwargs (`dict`, *optional*):
729
+ Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
730
+ into the model
731
+ """
732
+ residual = hidden_states
733
+
734
+ hidden_states = self.input_layernorm(hidden_states)
735
+
736
+ # Self Attention
737
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
738
+ hidden_states=hidden_states,
739
+ attention_mask=attention_mask,
740
+ position_ids=position_ids,
741
+ past_key_value=past_key_value,
742
+ output_attentions=output_attentions,
743
+ use_cache=use_cache,
744
+ cache_position=cache_position,
745
+ position_embeddings=position_embeddings,
746
+ **kwargs,
747
+ )
748
+
749
+ if self.use_post_norm:
750
+ hidden_states = self.post_norm1(hidden_states)
751
+
752
+ hidden_states = residual + hidden_states * self.residual_multiplier # mup
753
+
754
+ # Fully Connected
755
+ residual = hidden_states
756
+ hidden_states = self.post_attention_layernorm(hidden_states)
757
+ hidden_states = self.mlp(hidden_states)
758
+
759
+ if self.use_post_norm:
760
+ hidden_states = self.post_norm2(hidden_states)
761
+
762
+ hidden_states = residual + hidden_states * self.residual_multiplier # mup
763
+
764
+ outputs = (hidden_states,)
765
+
766
+ if output_attentions:
767
+ outputs += (self_attn_weights,)
768
+
769
+ if use_cache:
770
+ outputs += (present_key_value,)
771
+
772
+ return outputs
773
+
774
+
775
+ HyperCLOVAX_START_DOCSTRING = r"""
776
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
777
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
778
+ etc.)
779
+
780
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
781
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
782
+ and behavior.
783
+
784
+ Parameters:
785
+ config ([`HyperCLOVAXConfig`]):
786
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
787
+ load the weights associated with the model, only the configuration. Check out the
788
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
789
+ """
790
+
791
+
792
+ @add_start_docstrings(
793
+ "The bare HyperCLOVAX Model outputting raw hidden-states without any specific head on top.",
794
+ HyperCLOVAX_START_DOCSTRING,
795
+ )
796
+ class HyperCLOVAXPreTrainedModel(PreTrainedModel):
797
+ config_class = HyperCLOVAXConfig
798
+ base_model_prefix = "model"
799
+ supports_gradient_checkpointing = True
800
+ _no_split_modules = ["HyperCLOVAXDecoderLayer"]
801
+ _skip_keys_device_placement = ["past_key_values"]
802
+ _supports_flash_attn_2 = True
803
+ _supports_sdpa = True
804
+ _supports_cache_class = True
805
+ _supports_quantized_cache = True
806
+ _supports_static_cache = True
807
+
808
+ def _init_weights(self, module):
809
+ std = self.config.initializer_range
810
+ if isinstance(module, nn.Linear):
811
+ module.weight.data.normal_(mean=0.0, std=std)
812
+ if module.bias is not None:
813
+ module.bias.data.zero_()
814
+ elif isinstance(module, nn.Embedding):
815
+ module.weight.data.normal_(mean=0.0, std=std)
816
+ if module.padding_idx is not None:
817
+ module.weight.data[module.padding_idx].zero_()
818
+
819
+
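+ # Usage sketch (added for illustration, not part of the original file): given the
+ # backend flags declared on HyperCLOVAXPreTrainedModel (_supports_flash_attn_2,
+ # _supports_sdpa, _no_split_modules), a sharded load could look roughly like:
+ #
+ #   model = HyperCLOVAXForCausalLM.from_pretrained(
+ #       YOUR_DIR,                                 # placeholder path, as in the example docstring below
+ #       torch_dtype=torch.bfloat16,
+ #       device_map="auto",                        # keeps each HyperCLOVAXDecoderLayer on one device
+ #       attn_implementation="flash_attention_2",  # or "sdpa"
+ #   )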
820
+ HyperCLOVAX_INPUTS_DOCSTRING = r"""
821
+ Args:
822
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
823
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
824
+ it.
825
+
826
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
827
+ [`PreTrainedTokenizer.__call__`] for details.
828
+
829
+ [What are input IDs?](../glossary#input-ids)
830
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
831
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
832
+
833
+ - 1 for tokens that are **not masked**,
834
+ - 0 for tokens that are **masked**.
835
+
836
+ [What are attention masks?](../glossary#attention-mask)
837
+
838
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
839
+ [`PreTrainedTokenizer.__call__`] for details.
840
+
841
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
842
+ `past_key_values`).
843
+
844
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
845
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
846
+ information on the default strategy.
847
+
848
+ - 1 indicates the head is **not masked**,
849
+ - 0 indicates the head is **masked**.
850
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
851
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
852
+ config.n_positions - 1]`.
853
+
854
+ [What are position IDs?](../glossary#position-ids)
855
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
856
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
857
+ blocks) that can be used to speed up sequential decoding. This typically consists of the `past_key_values`
858
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
859
+
860
+ Two formats are allowed:
861
+ - a [`~cache_utils.Cache`] instance, see our
862
+ [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
863
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
864
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
865
+ cache format.
866
+
867
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
868
+ legacy cache format will be returned.
869
+
870
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
871
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
872
+ of shape `(batch_size, sequence_length)`.
873
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
874
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
875
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
876
+ model's internal embedding lookup matrix.
877
+ use_cache (`bool`, *optional*):
878
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
879
+ `past_key_values`).
880
+ output_attentions (`bool`, *optional*):
881
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
882
+ tensors for more detail.
883
+ output_hidden_states (`bool`, *optional*):
884
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
885
+ more detail.
886
+ return_dict (`bool`, *optional*):
887
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
888
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
889
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
890
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
891
+ the complete sequence length.
892
+ """
893
+
894
+
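+ # Usage sketch (added for illustration, not part of the original file): the
+ # `past_key_values` argument documented above accepts a `Cache` instance, e.g.
+ #
+ #   from transformers import DynamicCache
+ #   past = DynamicCache()
+ #   out = model(input_ids, use_cache=True, past_key_values=past)
+ #   out = model(next_token_ids, use_cache=True, past_key_values=out.past_key_values)
+ #
+ # (`next_token_ids` stands for the newly sampled ids.) Passing a legacy tuple of
+ # tuples still works but triggers the deprecation warning in the forward below.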
895
+ @add_start_docstrings(
896
+ "The bare HyperCLOVAX Model outputting raw hidden-states without any specific head on top.",
897
+ HyperCLOVAX_START_DOCSTRING,
898
+ )
899
+ class HyperCLOVAXModel(HyperCLOVAXPreTrainedModel):
900
+ """
901
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`HyperCLOVAXDecoderLayer`]
902
+
903
+ Args:
904
+ config: HyperCLOVAXConfig
905
+ """
906
+
907
+ def __init__(self, config: HyperCLOVAXConfig):
908
+ super().__init__(config)
909
+ self.padding_idx = config.pad_token_id
910
+ self.vocab_size = config.vocab_size
911
+
912
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
913
+ self.layers = nn.ModuleList(
914
+ [HyperCLOVAXDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
915
+ )
916
+ self.norm = HyperCLOVAXRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
917
+ self.rotary_emb = HyperCLOVAXRotaryEmbedding(config=config)
918
+ self.gradient_checkpointing = False
919
+
920
+ # Initialize weights and apply final processing
921
+ self.post_init()
922
+
923
+ # mup
924
+ self.embedding_multiplier = config.embedding_multiplier
925
+
926
+ def get_input_embeddings(self):
927
+ return self.embed_tokens
928
+
929
+ def set_input_embeddings(self, value):
930
+ self.embed_tokens = value
931
+
932
+ @add_start_docstrings_to_model_forward(HyperCLOVAX_INPUTS_DOCSTRING)
933
+ def forward(
934
+ self,
935
+ input_ids: torch.LongTensor = None,
936
+ attention_mask: Optional[torch.Tensor] = None,
937
+ position_ids: Optional[torch.LongTensor] = None,
938
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
939
+ inputs_embeds: Optional[torch.FloatTensor] = None,
940
+ use_cache: Optional[bool] = None,
941
+ output_attentions: Optional[bool] = None,
942
+ output_hidden_states: Optional[bool] = None,
943
+ return_dict: Optional[bool] = None,
944
+ cache_position: Optional[torch.LongTensor] = None,
945
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
946
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
947
+ output_hidden_states = (
948
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
949
+ )
950
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
951
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
952
+
953
+ if (input_ids is None) ^ (inputs_embeds is not None):
954
+ raise ValueError(
955
+ "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
956
+ )
957
+
958
+ if self.gradient_checkpointing and self.training and use_cache:
959
+ logger.warning_once(
960
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
961
+ )
962
+ use_cache = False
963
+
964
+ if inputs_embeds is None:
965
+ inputs_embeds = self.embed_tokens(input_ids)
966
+
967
+ inputs_embeds = inputs_embeds * self.embedding_multiplier # mup
968
+
969
+ # kept for BC (non `Cache` `past_key_values` inputs)
970
+ return_legacy_cache = False
971
+ if use_cache and not isinstance(past_key_values, Cache):
972
+ return_legacy_cache = True
973
+ if past_key_values is None:
974
+ past_key_values = DynamicCache()
975
+ else:
976
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
977
+ logger.warning_once(
978
+ "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
979
+ "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
980
+ "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
981
+ )
982
+
983
+ if cache_position is None:
984
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
985
+ cache_position = torch.arange(
986
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
987
+ )
988
+ if position_ids is None:
989
+ position_ids = cache_position.unsqueeze(0)
990
+
991
+ causal_mask = self._update_causal_mask(
992
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
993
+ )
994
+ hidden_states = inputs_embeds
995
+
996
+ # create position embeddings to be shared across the decoder layers
997
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
998
+
999
+ # decoder layers
1000
+ all_hidden_states = () if output_hidden_states else None
1001
+ all_self_attns = () if output_attentions else None
1002
+ next_decoder_cache = None
1003
+
1004
+ for decoder_layer in self.layers:
1005
+ if output_hidden_states:
1006
+ all_hidden_states += (hidden_states,)
1007
+
1008
+ if self.gradient_checkpointing and self.training:
1009
+ layer_outputs = self._gradient_checkpointing_func(
1010
+ decoder_layer.__call__,
1011
+ hidden_states,
1012
+ causal_mask,
1013
+ position_ids,
1014
+ past_key_values,
1015
+ output_attentions,
1016
+ use_cache,
1017
+ cache_position,
1018
+ position_embeddings,
1019
+ )
1020
+ else:
1021
+ layer_outputs = decoder_layer(
1022
+ hidden_states,
1023
+ attention_mask=causal_mask,
1024
+ position_ids=position_ids,
1025
+ past_key_value=past_key_values,
1026
+ output_attentions=output_attentions,
1027
+ use_cache=use_cache,
1028
+ cache_position=cache_position,
1029
+ position_embeddings=position_embeddings,
1030
+ )
1031
+
1032
+ hidden_states = layer_outputs[0]
1033
+
1034
+ if use_cache:
1035
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
1036
+
1037
+ if output_attentions:
1038
+ all_self_attns += (layer_outputs[1],)
1039
+
1040
+ hidden_states = self.norm(hidden_states)
1041
+
1042
+ # add hidden states from the last decoder layer
1043
+ if output_hidden_states:
1044
+ all_hidden_states += (hidden_states,)
1045
+
1046
+ next_cache = next_decoder_cache if use_cache else None
1047
+ if return_legacy_cache:
1048
+ next_cache = next_cache.to_legacy_cache()
1049
+
1050
+ if not return_dict:
1051
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
1052
+ return BaseModelOutputWithPast(
1053
+ last_hidden_state=hidden_states,
1054
+ past_key_values=next_cache,
1055
+ hidden_states=all_hidden_states,
1056
+ attentions=all_self_attns,
1057
+ )
1058
+
1059
+ def _update_causal_mask(
1060
+ self,
1061
+ attention_mask: torch.Tensor,
1062
+ input_tensor: torch.Tensor,
1063
+ cache_position: torch.Tensor,
1064
+ past_key_values: Cache,
1065
+ output_attentions: bool,
1066
+ ):
1067
+ if self.config._attn_implementation == "flash_attention_2":
1068
+ if attention_mask is not None and 0.0 in attention_mask:
1069
+ return attention_mask
1070
+ return None
1071
+
1072
+ # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
1073
+ # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
1074
+ # to infer the attention mask.
1075
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
1076
+ using_static_cache = isinstance(past_key_values, StaticCache)
1077
+
1078
+ # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
1079
+ if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
1080
+ if AttentionMaskConverter._ignore_causal_mask_sdpa(
1081
+ attention_mask,
1082
+ inputs_embeds=input_tensor,
1083
+ past_key_values_length=past_seen_tokens,
1084
+ is_training=self.training,
1085
+ ):
1086
+ return None
1087
+
1088
+ dtype, device = input_tensor.dtype, input_tensor.device
1089
+ min_dtype = torch.finfo(dtype).min
1090
+ sequence_length = input_tensor.shape[1]
1091
+ if using_static_cache:
1092
+ target_length = past_key_values.get_max_length()
1093
+ else:
1094
+ target_length = (
1095
+ attention_mask.shape[-1]
1096
+ if isinstance(attention_mask, torch.Tensor)
1097
+ else past_seen_tokens + sequence_length + 1
1098
+ )
1099
+
1100
+ # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
1101
+ causal_mask = _prepare_4d_causal_attention_mask_with_cache_position(
1102
+ attention_mask,
1103
+ sequence_length=sequence_length,
1104
+ target_length=target_length,
1105
+ dtype=dtype,
1106
+ device=device,
1107
+ min_dtype=min_dtype,
1108
+ cache_position=cache_position,
1109
+ batch_size=input_tensor.shape[0],
1110
+ )
1111
+
1112
+ if (
1113
+ self.config._attn_implementation == "sdpa"
1114
+ and attention_mask is not None
1115
+ and attention_mask.device.type == "cuda"
1116
+ and not output_attentions
1117
+ ):
1118
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
1119
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
1120
+ # Details: https://github.com/pytorch/pytorch/issues/110213
1121
+ causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
1122
+
1123
+ return causal_mask
1124
+
1125
+
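+ # Note (added for clarity, not part of the original file): `_update_causal_mask`
+ # above is backend-dependent. Under flash_attention_2 it passes the 2D padding mask
+ # through (or returns None when nothing is masked); under SDPA without a static
+ # cache it may return None so the kernel's `is_causal=True` path can be used; in
+ # every other case it expands the mask to a 4D additive mask whose masked positions
+ # are filled with `torch.finfo(dtype).min`.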
1126
+ class HyperCLOVAXForCausalLM(HyperCLOVAXPreTrainedModel, GenerationMixin):
1127
+ _tied_weights_keys = ["lm_head.weight"]
1128
+
1129
+ def __init__(self, config):
1130
+ super().__init__(config)
1131
+ self.model = HyperCLOVAXModel(config)
1132
+ self.vocab_size = config.vocab_size
1133
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1134
+
1135
+ # Initialize weights and apply final processing
1136
+ self.post_init()
1137
+
1138
+ def _get_apply_liger_kernel_converter(self):
1139
+ return _apply_liger_kernel_to_instance
1140
+
1141
+ def get_input_embeddings(self):
1142
+ return self.model.embed_tokens
1143
+
1144
+ def set_input_embeddings(self, value):
1145
+ self.model.embed_tokens = value
1146
+
1147
+ def get_output_embeddings(self):
1148
+ return self.lm_head
1149
+
1150
+ def set_output_embeddings(self, new_embeddings):
1151
+ self.lm_head = new_embeddings
1152
+
1153
+ def set_decoder(self, decoder):
1154
+ self.model = decoder
1155
+
1156
+ def get_decoder(self):
1157
+ return self.model
1158
+
1159
+ @add_start_docstrings_to_model_forward(HyperCLOVAX_INPUTS_DOCSTRING)
1160
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1161
+ def forward(
1162
+ self,
1163
+ input_ids: torch.LongTensor = None,
1164
+ attention_mask: Optional[torch.Tensor] = None,
1165
+ position_ids: Optional[torch.LongTensor] = None,
1166
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1167
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1168
+ labels: Optional[torch.LongTensor] = None,
1169
+ use_cache: Optional[bool] = None,
1170
+ output_attentions: Optional[bool] = None,
1171
+ output_hidden_states: Optional[bool] = None,
1172
+ return_dict: Optional[bool] = None,
1173
+ cache_position: Optional[torch.LongTensor] = None,
1174
+ num_logits_to_keep: int = 0,
1175
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1176
+ r"""
1177
+ Args:
1178
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1179
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1180
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1181
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1182
+
1183
+ num_logits_to_keep (`int`, *optional*):
1184
+ Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
1185
+ `input_ids` (special case). Only the last token's logits are needed for generation, and computing them only for
1186
+ that token saves memory, which becomes significant for long sequences or a large vocabulary size.
1187
+
1188
+ Returns:
1189
+
1190
+ Example:
1191
+
1192
+ ```python
1193
+ >>> from transformers import AutoTokenizer, HyperCLOVAXForCausalLM
1194
+
1195
+ >>> model = HyperCLOVAXForCausalLM.from_pretrained(YOUR_DIR)
1196
+ >>> tokenizer = AutoTokenizer.from_pretrained(YOUR_DIR)
1197
+
1198
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
1199
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
1200
+
1201
+ >>> # Generate
1202
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1203
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1204
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
1205
+ ```"""
1206
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1207
+ output_hidden_states = (
1208
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1209
+ )
1210
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1211
+
1212
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1213
+ outputs = self.model(
1214
+ input_ids=input_ids,
1215
+ attention_mask=attention_mask,
1216
+ position_ids=position_ids,
1217
+ past_key_values=past_key_values,
1218
+ inputs_embeds=inputs_embeds,
1219
+ use_cache=use_cache,
1220
+ output_attentions=output_attentions,
1221
+ output_hidden_states=output_hidden_states,
1222
+ return_dict=return_dict,
1223
+ cache_position=cache_position,
1224
+ )
1225
+
1226
+ hidden_states = outputs[0]
1227
+ if self.config.pretraining_tp > 1:
1228
+ lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
1229
+ logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
1230
+ logits = torch.cat(logits, dim=-1)
1231
+ else:
1232
+ if labels is None and not is_torchdynamo_compiling():
1233
+ logger.warning_once(
1234
+ "Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)"
1235
+ )
1236
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
1237
+ # TODO: remove the float() operation in v4.46
1238
+ logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
1239
+
1240
+ logits = logits * self.config.logits_scaling # mup
1241
+
1242
+ loss = None
1243
+ if labels is not None:
1244
+ # Upcast to float if we need to compute the loss to avoid potential precision issues
1245
+ logits = logits.float()
1246
+ # Shift so that tokens < n predict n
1247
+ shift_logits = logits[..., :-1, :].contiguous()
1248
+ shift_labels = labels[..., 1:].contiguous()
1249
+ # Flatten the tokens
1250
+ loss_fct = CrossEntropyLoss()
1251
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
1252
+ shift_labels = shift_labels.view(-1)
1253
+ # Enable model parallelism
1254
+ shift_labels = shift_labels.to(shift_logits.device)
1255
+ loss = loss_fct(shift_logits, shift_labels)
1256
+
1257
+ if not return_dict:
1258
+ output = (logits,) + outputs[1:]
1259
+ return (loss,) + output if loss is not None else output
1260
+
1261
+ return CausalLMOutputWithPast(
1262
+ loss=loss,
1263
+ logits=logits,
1264
+ past_key_values=outputs.past_key_values,
1265
+ hidden_states=outputs.hidden_states,
1266
+ attentions=outputs.attentions,
1267
+ )
1268
+
1269
+ def prepare_inputs_for_generation(
1270
+ self,
1271
+ input_ids,
1272
+ past_key_values=None,
1273
+ attention_mask=None,
1274
+ inputs_embeds=None,
1275
+ cache_position=None,
1276
+ position_ids=None,
1277
+ use_cache=True,
1278
+ num_logits_to_keep=None,
1279
+ **kwargs,
1280
+ ):
1281
+ # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
1282
+ # Exception 1: when passing input_embeds, input_ids may be missing entries
1283
+ # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
1284
+ if past_key_values is not None:
1285
+ if inputs_embeds is not None: # Exception 1
1286
+ input_ids = input_ids[:, -cache_position.shape[0] :]
1287
+ elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2)
1288
+ input_ids = input_ids[:, cache_position]
1289
+
1290
+ if attention_mask is not None and position_ids is None:
1291
+ # create position_ids on the fly for batch generation
1292
+ position_ids = attention_mask.long().cumsum(-1) - 1
1293
+ position_ids.masked_fill_(attention_mask == 0, 1)
1294
+ if past_key_values:
1295
+ position_ids = position_ids[:, -input_ids.shape[1] :]
1296
+
1297
+ # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture.
1298
+ position_ids = position_ids.clone(memory_format=torch.contiguous_format)
1299
+
1300
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1301
+ if inputs_embeds is not None and cache_position[0] == 0:
1302
+ model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
1303
+ else:
1304
+ # The clone here is for the same reason as for `position_ids`.
1305
+ model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
1306
+
1307
+ if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
1308
+ if model_inputs["inputs_embeds"] is not None:
1309
+ batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
1310
+ device = model_inputs["inputs_embeds"].device
1311
+ else:
1312
+ batch_size, sequence_length = model_inputs["input_ids"].shape
1313
+ device = model_inputs["input_ids"].device
1314
+
1315
+ dtype = self.lm_head.weight.dtype
1316
+ min_dtype = torch.finfo(dtype).min
1317
+
1318
+ attention_mask = _prepare_4d_causal_attention_mask_with_cache_position(
1319
+ attention_mask,
1320
+ sequence_length=sequence_length,
1321
+ target_length=past_key_values.get_max_length(),
1322
+ dtype=dtype,
1323
+ device=device,
1324
+ min_dtype=min_dtype,
1325
+ cache_position=cache_position,
1326
+ batch_size=batch_size,
1327
+ )
1328
+
1329
+ if num_logits_to_keep is not None:
1330
+ model_inputs["num_logits_to_keep"] = num_logits_to_keep
1331
+
1332
+ model_inputs.update(
1333
+ {
1334
+ "position_ids": position_ids,
1335
+ "cache_position": cache_position,
1336
+ "past_key_values": past_key_values,
1337
+ "use_cache": use_cache,
1338
+ "attention_mask": attention_mask,
1339
+ }
1340
+ )
1341
+ return model_inputs
1342
+
1343
+
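+ # Note (added for clarity, not part of the original file): during cached decoding,
+ # `prepare_inputs_for_generation` above keeps only the tokens the cache has not seen
+ # yet. For example, after a prefill of 10 prompt tokens, `cache_position` is `[10]`
+ # and `input_ids[:, cache_position]` selects just the newly generated token.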
1344
+ @add_start_docstrings(
1345
+ """
1346
+ The HyperCLOVAX Model transformer with a sequence classification head on top (linear layer).
1347
+
1348
+ [`HyperCLOVAXForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1349
+ (e.g. GPT-2) do.
1350
+
1351
+ Since it does classification on the last token, it requires to know the position of the last token. If a
1352
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1353
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1354
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1355
+ each row of the batch).
1356
+ """,
1357
+ HyperCLOVAX_START_DOCSTRING,
1358
+ )
1359
+ class HyperCLOVAXForSequenceClassification(HyperCLOVAXPreTrainedModel):
1360
+ def __init__(self, config):
1361
+ super().__init__(config)
1362
+ self.num_labels = config.num_labels
1363
+ self.model = HyperCLOVAXModel(config)
1364
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1365
+
1366
+ # Initialize weights and apply final processing
1367
+ self.post_init()
1368
+
1369
+ def get_input_embeddings(self):
1370
+ return self.model.embed_tokens
1371
+
1372
+ def set_input_embeddings(self, value):
1373
+ self.model.embed_tokens = value
1374
+
1375
+ @add_start_docstrings_to_model_forward(HyperCLOVAX_INPUTS_DOCSTRING)
1376
+ def forward(
1377
+ self,
1378
+ input_ids: Optional[torch.LongTensor] = None,
1379
+ attention_mask: Optional[torch.Tensor] = None,
1380
+ position_ids: Optional[torch.LongTensor] = None,
1381
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1382
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1383
+ labels: Optional[torch.LongTensor] = None,
1384
+ use_cache: Optional[bool] = None,
1385
+ output_attentions: Optional[bool] = None,
1386
+ output_hidden_states: Optional[bool] = None,
1387
+ return_dict: Optional[bool] = None,
1388
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1389
+ r"""
1390
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1391
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1392
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1393
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1394
+ """
1395
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1396
+
1397
+ transformer_outputs = self.model(
1398
+ input_ids,
1399
+ attention_mask=attention_mask,
1400
+ position_ids=position_ids,
1401
+ past_key_values=past_key_values,
1402
+ inputs_embeds=inputs_embeds,
1403
+ use_cache=use_cache,
1404
+ output_attentions=output_attentions,
1405
+ output_hidden_states=output_hidden_states,
1406
+ return_dict=return_dict,
1407
+ )
1408
+ hidden_states = transformer_outputs[0]
1409
+ logits = self.score(hidden_states)
1410
+
1411
+ if input_ids is not None:
1412
+ batch_size = input_ids.shape[0]
1413
+ else:
1414
+ batch_size = inputs_embeds.shape[0]
1415
+
1416
+ if self.config.pad_token_id is None and batch_size != 1:
1417
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
1418
+ if self.config.pad_token_id is None:
1419
+ sequence_lengths = -1
1420
+ else:
1421
+ if input_ids is not None:
1422
+ # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
1423
+ sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
1424
+ sequence_lengths = sequence_lengths % input_ids.shape[-1]
1425
+ sequence_lengths = sequence_lengths.to(logits.device)
1426
+ else:
1427
+ sequence_lengths = -1
1428
+
1429
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
1430
+
1431
+ loss = None
1432
+ if labels is not None:
1433
+ labels = labels.to(logits.device)
1434
+ if self.config.problem_type is None:
1435
+ if self.num_labels == 1:
1436
+ self.config.problem_type = "regression"
1437
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1438
+ self.config.problem_type = "single_label_classification"
1439
+ else:
1440
+ self.config.problem_type = "multi_label_classification"
1441
+
1442
+ if self.config.problem_type == "regression":
1443
+ loss_fct = MSELoss()
1444
+ if self.num_labels == 1:
1445
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
1446
+ else:
1447
+ loss = loss_fct(pooled_logits, labels)
1448
+ elif self.config.problem_type == "single_label_classification":
1449
+ loss_fct = CrossEntropyLoss()
1450
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
1451
+ elif self.config.problem_type == "multi_label_classification":
1452
+ loss_fct = BCEWithLogitsLoss()
1453
+ loss = loss_fct(pooled_logits, labels)
1454
+ if not return_dict:
1455
+ output = (pooled_logits,) + transformer_outputs[1:]
1456
+ return ((loss,) + output) if loss is not None else output
1457
+
1458
+ return SequenceClassifierOutputWithPast(
1459
+ loss=loss,
1460
+ logits=pooled_logits,
1461
+ past_key_values=transformer_outputs.past_key_values,
1462
+ hidden_states=transformer_outputs.hidden_states,
1463
+ attentions=transformer_outputs.attentions,
1464
+ )
1465
+
1466
+
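+ # Note (added for clarity, not part of the original file): the last-token pooling
+ # described in the class docstring above reduces to a simple index lookup. With
+ # `pad_token_id = 0` and `input_ids = [[5, 6, 7, 0, 0]]`,
+ # `torch.eq(input_ids, 0).int().argmax(-1) - 1` yields 2, so the logits at position
+ # 2 (the last non-padding token) are pooled for classification.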
1467
+ @add_start_docstrings(
1468
+ """
1469
+ The HyperCLOVAX Model transformer with a span classification head on top for extractive question-answering tasks like
1470
+ SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
1471
+ """,
1472
+ HyperCLOVAX_START_DOCSTRING,
1473
+ )
1474
+ class HyperCLOVAXForQuestionAnswering(HyperCLOVAXPreTrainedModel):
1475
+ base_model_prefix = "transformer"
1476
+
1477
+ # Copied from transformers.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->HyperCLOVAX
1478
+ def __init__(self, config):
1479
+ super().__init__(config)
1480
+ self.transformer = HyperCLOVAXModel(config)
1481
+ self.qa_outputs = nn.Linear(config.hidden_size, 2)
1482
+
1483
+ # Initialize weights and apply final processing
1484
+ self.post_init()
1485
+
1486
+ def get_input_embeddings(self):
1487
+ return self.transformer.embed_tokens
1488
+
1489
+ def set_input_embeddings(self, value):
1490
+ self.transformer.embed_tokens = value
1491
+
1492
+ @add_start_docstrings_to_model_forward(HyperCLOVAX_INPUTS_DOCSTRING)
1493
+ def forward(
1494
+ self,
1495
+ input_ids: Optional[torch.LongTensor] = None,
1496
+ attention_mask: Optional[torch.FloatTensor] = None,
1497
+ position_ids: Optional[torch.LongTensor] = None,
1498
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1499
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1500
+ start_positions: Optional[torch.LongTensor] = None,
1501
+ end_positions: Optional[torch.LongTensor] = None,
1502
+ output_attentions: Optional[bool] = None,
1503
+ output_hidden_states: Optional[bool] = None,
1504
+ return_dict: Optional[bool] = None,
1505
+ ) -> Union[Tuple, QuestionAnsweringModelOutput]:
1506
+ r"""
1507
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1508
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
1509
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
1510
+ are not taken into account for computing the loss.
1511
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1512
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
1513
+ Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
1514
+ are not taken into account for computing the loss.
1515
+ """
1516
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1517
+
1518
+ outputs = self.transformer(
1519
+ input_ids,
1520
+ attention_mask=attention_mask,
1521
+ position_ids=position_ids,
1522
+ past_key_values=past_key_values,
1523
+ inputs_embeds=inputs_embeds,
1524
+ output_attentions=output_attentions,
1525
+ output_hidden_states=output_hidden_states,
1526
+ return_dict=return_dict,
1527
+ )
1528
+
1529
+ sequence_output = outputs[0]
1530
+
1531
+ logits = self.qa_outputs(sequence_output)
1532
+ start_logits, end_logits = logits.split(1, dim=-1)
1533
+ start_logits = start_logits.squeeze(-1).contiguous()
1534
+ end_logits = end_logits.squeeze(-1).contiguous()
1535
+
1536
+ total_loss = None
1537
+ if start_positions is not None and end_positions is not None:
1538
+ # If we are on multi-GPU, splitting adds a dimension
1539
+ if len(start_positions.size()) > 1:
1540
+ start_positions = start_positions.squeeze(-1).to(start_logits.device)
1541
+ if len(end_positions.size()) > 1:
1542
+ end_positions = end_positions.squeeze(-1).to(end_logits.device)
1543
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
1544
+ ignored_index = start_logits.size(1)
1545
+ start_positions = start_positions.clamp(0, ignored_index)
1546
+ end_positions = end_positions.clamp(0, ignored_index)
1547
+
1548
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
1549
+ start_loss = loss_fct(start_logits, start_positions)
1550
+ end_loss = loss_fct(end_logits, end_positions)
1551
+ total_loss = (start_loss + end_loss) / 2
1552
+
1553
+ if not return_dict:
1554
+ output = (start_logits, end_logits) + outputs[2:]
1555
+ return ((total_loss,) + output) if total_loss is not None else output
1556
+
1557
+ return QuestionAnsweringModelOutput(
1558
+ loss=total_loss,
1559
+ start_logits=start_logits,
1560
+ end_logits=end_logits,
1561
+ hidden_states=outputs.hidden_states,
1562
+ attentions=outputs.attentions,
1563
+ )
1564
+
1565
+
1566
+ @add_start_docstrings(
1567
+ """
1568
+ The HyperCLOVAX Model transformer with a token classification head on top (a linear layer on top of the hidden-states
1569
+ output) e.g. for Named-Entity-Recognition (NER) tasks.
1570
+ """,
1571
+ HyperCLOVAX_START_DOCSTRING,
1572
+ )
1573
+ class HyperCLOVAXForTokenClassification(HyperCLOVAXPreTrainedModel):
1574
+ def __init__(self, config):
1575
+ super().__init__(config)
1576
+ self.num_labels = config.num_labels
1577
+ self.model = HyperCLOVAXModel(config)
1578
+ if getattr(config, "classifier_dropout", None) is not None:
1579
+ classifier_dropout = config.classifier_dropout
1580
+ elif getattr(config, "hidden_dropout", None) is not None:
1581
+ classifier_dropout = config.hidden_dropout
1582
+ else:
1583
+ classifier_dropout = 0.1
1584
+ self.dropout = nn.Dropout(classifier_dropout)
1585
+ self.score = nn.Linear(config.hidden_size, config.num_labels)
1586
+
1587
+ # Initialize weights and apply final processing
1588
+ self.post_init()
1589
+
1590
+ def get_input_embeddings(self):
1591
+ return self.model.embed_tokens
1592
+
1593
+ def set_input_embeddings(self, value):
1594
+ self.model.embed_tokens = value
1595
+
1596
+ @add_start_docstrings_to_model_forward(HyperCLOVAX_INPUTS_DOCSTRING)
1597
+ def forward(
1598
+ self,
1599
+ input_ids: Optional[torch.LongTensor] = None,
1600
+ attention_mask: Optional[torch.Tensor] = None,
1601
+ position_ids: Optional[torch.LongTensor] = None,
1602
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1603
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1604
+ labels: Optional[torch.LongTensor] = None,
1605
+ use_cache: Optional[bool] = None,
1606
+ output_attentions: Optional[bool] = None,
1607
+ output_hidden_states: Optional[bool] = None,
1608
+ return_dict: Optional[bool] = None,
1609
+ ) -> Union[Tuple, TokenClassifierOutput]:
1610
+ r"""
1611
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1612
+ Labels for computing the token classification loss. Indices should be in `[0, ...,
1613
+ config.num_labels - 1]`.
1615
+ """
1616
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1617
+
1618
+ outputs = self.model(
1619
+ input_ids,
1620
+ attention_mask=attention_mask,
1621
+ position_ids=position_ids,
1622
+ past_key_values=past_key_values,
1623
+ inputs_embeds=inputs_embeds,
1624
+ use_cache=use_cache,
1625
+ output_attentions=output_attentions,
1626
+ output_hidden_states=output_hidden_states,
1627
+ return_dict=return_dict,
1628
+ )
1629
+ sequence_output = outputs[0]
1630
+ sequence_output = self.dropout(sequence_output)
1631
+ logits = self.score(sequence_output)
1632
+
1633
+ loss = None
1634
+ if labels is not None:
1635
+ loss_fct = CrossEntropyLoss()
1636
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1637
+
1638
+ if not return_dict:
1639
+ output = (logits,) + outputs[2:]
1640
+ return ((loss,) + output) if loss is not None else output
1641
+
1642
+ return TokenClassifierOutput(
1643
+ loss=loss,
1644
+ logits=logits,
1645
+ hidden_states=outputs.hidden_states,
1646
+ attentions=outputs.attentions,
1647
+ )
1648
+
1649
+
1650
+ ################################################################################################
1651
+ ################################################################################################
1652
+ """
1653
+ liger kernel monkey patching
1654
+ https://github.com/linkedin/Liger-Kernel/blob/v0.5.2/src/liger_kernel/transformers/monkey_patch.py
1655
+ """
1656
+
1657
+ import inspect
1658
+ import logging
1659
+ from functools import partial
1660
+ from typing import TYPE_CHECKING, Callable, List, Optional, Tuple, Union
1661
+
1662
+ import torch
1663
+ import torch.nn.functional as F
1664
+ import transformers
1665
+ from packaging import version
1666
+ from torch.nn import CrossEntropyLoss
1667
+ from transformers import PreTrainedModel
1668
+
1669
+ if TYPE_CHECKING:
1670
+ from transformers.cache_utils import Cache
1671
+
1672
+ import sys
1673
+
1674
+ from packaging.version import parse
1675
+
1676
+ if sys.version_info < (3, 8):
1677
+ import importlib_metadata
1678
+ else:
1679
+ import importlib.metadata as importlib_metadata
1680
+
1681
+ try:
1682
+ from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss
1683
+ from liger_kernel.transformers.functional import liger_cross_entropy
1684
+ from liger_kernel.transformers.fused_linear_cross_entropy import (
1685
+ LigerFusedLinearCrossEntropyLoss,
1686
+ )
1687
+ from liger_kernel.transformers.rms_norm import LigerRMSNorm
1688
+ from liger_kernel.transformers.rope import liger_rotary_pos_emb
1689
+ from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
1690
+
1691
+ _is_liger_kernel_available = True
1692
+
1693
+ LIGER_KERNEL_MATCHING_VERSION = parse("0.5.2")
1694
+ liger_kernel_version = parse(importlib_metadata.version("liger_kernel"))
1695
+ _is_liger_kernel_version_matching = (
1696
+ liger_kernel_version.major,
1697
+ liger_kernel_version.minor,
1698
+ liger_kernel_version.release[-1],
1699
+ ) == (
1700
+ LIGER_KERNEL_MATCHING_VERSION.major,
1701
+ LIGER_KERNEL_MATCHING_VERSION.minor,
1702
+ LIGER_KERNEL_MATCHING_VERSION.release[-1],
1703
+ )
1704
+ except Exception:
1705
+ _is_liger_kernel_available = False
1706
+ _is_liger_kernel_version_matching = False
1707
+
1708
+
1709
+ def lce_forward_deprecated(
1710
+ self,
1711
+ input_ids: torch.LongTensor = None,
1712
+ attention_mask: Optional[torch.Tensor] = None,
1713
+ position_ids: Optional[torch.LongTensor] = None,
1714
+ past_key_values: Optional[Union["Cache", List[torch.FloatTensor]]] = None,
1715
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1716
+ labels: Optional[torch.LongTensor] = None,
1717
+ use_cache: Optional[bool] = None,
1718
+ output_attentions: Optional[bool] = None,
1719
+ output_hidden_states: Optional[bool] = None,
1720
+ return_dict: Optional[bool] = None,
1721
+ cache_position: Optional[torch.LongTensor] = None,
1722
+ num_logits_to_keep: int = 0,
1723
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1724
+
1725
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1726
+ output_hidden_states = (
1727
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1728
+ )
1729
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1730
+
1731
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1732
+ outputs = self.model(
1733
+ input_ids=input_ids,
1734
+ attention_mask=attention_mask,
1735
+ position_ids=position_ids,
1736
+ past_key_values=past_key_values,
1737
+ inputs_embeds=inputs_embeds,
1738
+ use_cache=use_cache,
1739
+ output_attentions=output_attentions,
1740
+ output_hidden_states=output_hidden_states,
1741
+ return_dict=return_dict,
1742
+ cache_position=cache_position,
1743
+ )
1744
+ hidden_states = outputs[0]
1745
+
1746
+ loss = None
1747
+ logits = None
1748
+
1749
+ if self.training and (labels is not None):
1750
+ if num_logits_to_keep != 0:
1751
+ hidden_states = hidden_states[:, -num_logits_to_keep:, :] # not sure whether this has a bug
1752
+ hidden_states = hidden_states * self.config.logits_scaling ## muP
1753
+
1754
+ shift_hidden_states = hidden_states[..., :-1, :].contiguous()
1755
+ shift_labels = labels[..., 1:].contiguous()
1756
+
1757
+ # flatten tokens
1758
+ shift_hidden_states = shift_hidden_states.view(-1, self.config.hidden_size)
1759
+ shift_labels = shift_labels.view(-1)
1760
+
1761
+ lce = LigerFusedLinearCrossEntropyLoss()
1762
+ loss = lce(self.lm_head.weight, shift_hidden_states, shift_labels)
1763
+
1764
+ else:
1765
+ assert self.config.pretraining_tp == 1, "not supported"
1766
+ logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
1767
+ logits = logits * self.config.logits_scaling ## muP
1768
+
1769
+ if labels is not None:
1770
+ # Upcast to float if we need to compute the loss to avoid potential precision issues
1771
+ logits = logits.float()
1772
+ # Shift so that tokens < n predict n
1773
+ shift_logits = logits[..., :-1, :].contiguous()
1774
+ shift_labels = labels[..., 1:].contiguous()
1775
+ # Flatten the tokens
1776
+ loss_fct = CrossEntropyLoss()
1777
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
1778
+ shift_labels = shift_labels.view(-1)
1779
+ # Enable model parallelism
1780
+ shift_labels = shift_labels.to(shift_logits.device)
1781
+ loss = loss_fct(shift_logits, shift_labels)
1782
+
1783
+ if not return_dict:
1784
+ output = (logits,) + outputs[1:]
1785
+ return (loss,) + output if loss is not None else output
1786
+
1787
+ return CausalLMOutputWithPast(
1788
+ loss=loss,
1789
+ logits=logits,
1790
+ past_key_values=outputs.past_key_values,
1791
+ hidden_states=outputs.hidden_states,
1792
+ attentions=outputs.attentions,
1793
+ )
1794
+
1795
+
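+ # Note (added for clarity, not part of the original file): in the training branch of
+ # `lce_forward_deprecated` above, the full `(batch, seq, vocab)` logits tensor is never
+ # materialized; LigerFusedLinearCrossEntropyLoss consumes `lm_head.weight` and the
+ # shifted hidden states directly, which is the main memory saving of the fused path,
+ # and the muP `logits_scaling` factor is applied to the hidden states instead.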
1796
+ def _bind_method_to_module(module, method_name: str, new_method: Callable):
1797
+ # Binds a new method to a module instance so that self is passed as the first argument
1798
+ module.__dict__[method_name] = new_method.__get__(module, module.__class__)
1799
+
1800
+
1801
+ def _patch_rms_norm_module(module, offset=0.0, eps=1e-6, casting_mode="llama", in_place=True):
1802
+ module.offset = offset
1803
+ module.casting_mode = casting_mode
1804
+ module.variance_epsilon = getattr(module, "variance_epsilon", None) or getattr(module, "eps", None) or eps
1805
+ module.in_place = in_place
1806
+ _bind_method_to_module(module, "forward", LigerRMSNorm.forward)
1807
+ _bind_method_to_module(module, "extra_repr", LigerRMSNorm.extra_repr)
1808
+
1809
+
1810
+ def apply_liger_kernel_to_hyperclovax(
1811
+ rope: bool = True,
1812
+ cross_entropy: bool = False,
1813
+ fused_linear_cross_entropy: bool = True,
1814
+ rms_norm: bool = True,
1815
+ swiglu: bool = True,
1816
+ model: PreTrainedModel = None,
1817
+ ) -> None:
1818
+
1819
+ assert not cross_entropy, "not supported"
1820
+ if rope:
1821
+ apply_rotary_pos_emb = liger_rotary_pos_emb
1822
+ if rms_norm:
1823
+ HyperCLOVAXRMSNorm = LigerRMSNorm
1824
+ if swiglu:
1825
+ HyperCLOVAXMLP = LigerSwiGLUMLP
1826
+ # to use VLM forward in VLM repo
1827
+ # if fused_linear_cross_entropy:
1828
+ # HyperCLOVAXForCausalLM.forward = lce_forward_deprecated
1829
+
1830
+ if model is not None:
1831
+ # The model instance already exists, so we need to additionally patch the
1832
+ # instance variables that reference already-instantiated modules (e.g. LlamaRMSNorm or LlamaMLP)
1833
+
1834
+ # get the base model from the model instance
1835
+ base_model: HyperCLOVAXModel = getattr(model, model.base_model_prefix, model)
1836
+
1837
+ if rms_norm:
1838
+ _patch_rms_norm_module(base_model.norm)
1839
+
1840
+ for decoder_layer in base_model.layers:
1841
+ if swiglu:
1842
+ _bind_method_to_module(decoder_layer.mlp, "forward", LigerSwiGLUMLP.forward)
1843
+ if rms_norm:
1844
+ _patch_rms_norm_module(decoder_layer.input_layernorm)
1845
+ _patch_rms_norm_module(decoder_layer.post_attention_layernorm)
1846
+ if decoder_layer.use_post_norm:
1847
+ _patch_rms_norm_module(decoder_layer.post_norm1)
1848
+ _patch_rms_norm_module(decoder_layer.post_norm2)
1849
+
1850
+
1851
+ def _apply_liger_kernel_to_instance(model: PreTrainedModel, **kwargs) -> None:
1852
+ model_type = getattr(model, "config", None) and getattr(model.config, "model_type", None)
1853
+ assert model_type == "hyperclovax"
1854
+ apply_fn = apply_liger_kernel_to_hyperclovax
1855
+ apply_fn_signature = inspect.signature(apply_fn)
1856
+
1857
+ # Filter out the keyword arguments that are not supported by the apply function
1858
+ applicable_kwargs = {key: value for key, value in kwargs.items() if key in apply_fn_signature.parameters}
1859
+ logger.info(
1860
+ f"Applying Liger kernels to model instance with model type: {model_type} with kwargs: {applicable_kwargs}"
1861
+ )
1862
+ apply_fn(model=model, **applicable_kwargs)
1863
+
1864
+
1865
+ ################################################################################################
1866
+ ################################################################################################
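+ # Usage sketch (added for illustration, not part of the original file): with
+ # liger_kernel==0.5.2 installed, the instance-level patching defined above could be
+ # applied to an already-loaded model roughly as follows:
+ #
+ #   model = HyperCLOVAXForCausalLM.from_pretrained(YOUR_DIR)
+ #   if _is_liger_kernel_available and _is_liger_kernel_version_matching:
+ #       _apply_liger_kernel_to_instance(model, rope=True, rms_norm=True, swiglu=True)
+ #
+ # which rebinds the LigerRMSNorm and LigerSwiGLUMLP forwards onto the existing
+ # submodules without re-instantiating the model.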
modeling_vlm.py ADDED
@@ -0,0 +1,1913 @@
1
+ import contextlib
2
+ import math
3
+ import os
4
+ from functools import partial
5
+ from itertools import chain
6
+ from typing import List, Optional, Tuple, Union
7
+
8
+ import torch
9
+ import torch.distributed as dist
10
+ import torch.nn as nn
11
+
12
+ try:
13
+ from einops import rearrange
14
+ from timm.layers import LayerNorm, LayerNorm2d
15
+ from timm.models.regnet import RegStage
16
+ except ImportError:
17
+ print("packages needed for anyres are not imported")
18
+ from transformers import (
19
+ AutoConfig,
20
+ AutoModel,
21
+ AutoModelForCausalLM,
22
+ AutoTokenizer,
23
+ PreTrainedModel,
24
+ )
25
+ from transformers.cache_utils import Cache
26
+ from transformers.generation import GenerationMixin
27
+ from transformers.modeling_outputs import (
28
+ BaseModelOutputWithPast,
29
+ CausalLMOutputWithPast,
30
+ SequenceClassifierOutputWithPast,
31
+ TokenClassifierOutput
32
+ )
33
+ from transformers.modeling_utils import no_init_weights
34
+
35
+ from .configuration_vlm import HCXVisionConfig
36
+
37
+
38
+ def get_rank():
39
+ if dist.is_initialized():
40
+ return dist.get_rank()
41
+ return 0
42
+
43
+
44
+ def is_ampere_or_newer():
45
+ if not torch.cuda.is_available():
46
+ return False
47
+
48
+ gpu_name = torch.cuda.get_device_name()
49
+
50
+ ampere_keywords = [
51
+ "RTX 30",
52
+ "RTX 40",
53
+ "A100",
54
+ "H100",
55
+ "A6000",
56
+ "A5000",
57
+ "A4000",
58
+ "A3000",
59
+ "A2000",
60
+ "A1000",
61
+ ]
62
+
63
+ return any(keyword in gpu_name for keyword in ampere_keywords)
64
+
65
+
66
+ EOT = "<|endofturn|>"
67
+ IMG_LOC = "<|IMAGE_PAD|>"
68
+
69
+
70
+ # https://github.com/huggingface/transformers/blob/42fe769928b505158bc6a0342f47b10693b81927/src/transformers/models/llama/modeling_llama.py#L315-L330
71
+ class HCXVisionPreTrainedModel(PreTrainedModel):
72
+ config_class = HCXVisionConfig
73
+ base_model_prefix = "model"
74
+ vision_model_name = "vision_model"
75
+ _no_split_modules = [
76
+ "CLIPAttention",
77
+ "SiglipVisionModel",
78
+ # "Qwen2_5_VLVisionBlock",
79
+ # "Qwen2_5_VLVisionModel",
80
+ # "Qwen2_5_VisionTransformerPretrainedModel",
81
+ ] # the vision attention modules are not split in LlavaNext either
82
+ supports_gradient_checkpointing = True
83
+ _skip_keys_device_placement = "past_key_values"
84
+ _supports_flash_attn_2 = True
85
+ _supports_sdpa = True
86
+ _supports_flex_attn = True
87
+ _supports_cache_class = True
88
+ _supports_quantized_cache = True
89
+ _supports_static_cache = True
90
+ _supports_attention_backend = True
91
+
92
+ def _init_weights(self, module):
93
+ # copied from https://github.com/kakaobrain/honeybee/blob/main/honeybee/common_layers.py#L55
94
+ if (
95
+ isinstance(module, nn.Conv2d) # noqa: SIM101
96
+ or isinstance(module, nn.Embedding)
97
+ or isinstance(module, nn.Linear)
98
+ ):
99
+ module.weight.data.normal_(mean=0.0, std=0.02)
100
+ if hasattr(module, "bias") and module.bias is not None:
101
+ module.bias.data.zero_()
102
+
103
+ elif isinstance(module, nn.LayerNorm):
104
+ module.bias.data.zero_()
105
+ module.weight.data.fill_(1.0)
106
+ elif isinstance(module, nn.Parameter):
107
+ embed_std = 1 / torch.sqrt(torch.tensor(module.size(0), dtype=torch.float)).to(module.dtype)
108
+ module.data.normal_(mean=0.0, std=embed_std)
109
+
110
+
111
+ class HCXVisionModel(HCXVisionPreTrainedModel):
112
+ def __init__(
113
+ self,
114
+ config: HCXVisionConfig,
115
+ without_llm=False,
116
+ **kwargs,
117
+ ):
118
+ super().__init__(config)
119
+
120
+ self.flag_changed_max_position_embeddings = False
121
+ self.without_llm = without_llm
122
+
123
+ vision_model_type = config.vision_config.model_type
124
+
125
+ self.is_qwen_visual = False
126
+ if vision_model_type == "qwen2_5_vl_visual":
127
+ self.is_qwen_visual = True
128
+
129
+ self.freeze_before_sampler = kwargs.pop("freeze_before_sampler", False)
130
+
131
+ vision_config = config.vision_config
132
+ vision_config.anyres = config.anyres
133
+ vision_config.max_num_grids = config.max_num_grids
134
+ vision_config.update({"torch_dtype": config.torch_dtype})
135
+ self.vision_config = vision_config
136
+ if config.anyres:
137
+ if not getattr(config, "possible_resolutions", []):
138
+ possible_resolutions = []
139
+ if config.anyres:
140
+ assert config.max_num_grids > 0
141
+ for i in range(1, config.max_num_grids + 1):
142
+ for j in range(1, config.max_num_grids + 1):
143
+ if i == 1 and j == 1 and not config.use_1x1_grid:
144
+ continue
145
+ if i * j <= config.max_num_grids:
146
+ possible_resolutions.append([i, j])
147
+
148
+ possible_resolutions = [
149
+ [ys * vision_config.image_size, xs * vision_config.image_size]
150
+ for ys, xs in possible_resolutions
151
+ ]
152
+ self.config.possible_resolutions = possible_resolutions
153
+ else:
154
+ self.config.possible_resolutions = config.possible_resolutions
155
+
156
+ if without_llm:
157
+ # if vision_config.vision_module_type not in ["officialllava", "cream2"]:
158
+ # In the serving setup, the path in "vision_model_name_or_path" must follow a custom path rather than the default path kept in vuclip_name2save_path.
159
+ vision_config.vison_pretrained_name_or_path = config.vision_model_name_or_path
160
+ with no_init_weights():
161
+ if self.is_qwen_visual and is_ampere_or_newer():
162
+ vision_config._attn_implementation = "flash_attention_2"
163
+ self.vision_model = AutoModel.from_config(
164
+ vision_config, trust_remote_code=True
165
+ ) # weight will be loaded in from_pretrained
166
+ self.vision_model.gradient_checkpointing_enable()
167
+ if config.mm_projector_type == "qwen_merger":
168
+
169
+ import torch.nn.functional as F
170
+
171
+ def new_forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
172
+ """
173
+ Args:
174
+ hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
175
+ The final hidden states of the model.
176
+ grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
177
+ The temporal, height and width of feature shape of each image in LLM.
178
+
179
+ Returns:
180
+ `torch.Tensor`: hidden_states.
181
+ """
182
+ hidden_states = self.patch_embed(hidden_states)
183
+ rotary_pos_emb = self.rot_pos_emb(grid_thw)
184
+ window_index, cu_window_seqlens = self.get_window_index(grid_thw)
185
+ cu_window_seqlens = torch.tensor(
186
+ cu_window_seqlens,
187
+ device=hidden_states.device,
188
+ dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
189
+ )
190
+ cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)
191
+
192
+ seq_len, _ = hidden_states.size()
193
+ hidden_states = hidden_states.reshape(
194
+ seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1
195
+ )
196
+ hidden_states = hidden_states[window_index, :, :]
197
+ hidden_states = hidden_states.reshape(seq_len, -1)
198
+ rotary_pos_emb = rotary_pos_emb.reshape(
199
+ seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1
200
+ )
201
+ rotary_pos_emb = rotary_pos_emb[window_index, :, :]
202
+ rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
203
+ emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
204
+ position_embeddings = (emb.cos(), emb.sin())
205
+
206
+ cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
207
+ dim=0,
208
+ # Select dtype based on the following factors:
209
+ # - FA2 requires that cu_seqlens_q must have dtype int32
210
+ # - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
211
+ # See https://github.com/huggingface/transformers/pull/34852 for more information
212
+ dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
213
+ )
214
+ cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
215
+
216
+ for layer_num, blk in enumerate(self.blocks):
217
+ if layer_num in self.fullatt_block_indexes:
218
+ cu_seqlens_now = cu_seqlens
219
+ else:
220
+ cu_seqlens_now = cu_window_seqlens
221
+ if self.gradient_checkpointing and self.training:
222
+ hidden_states = self._gradient_checkpointing_func(
223
+ blk.__call__, hidden_states, cu_seqlens_now, None, position_embeddings
224
+ )
225
+ else:
226
+ hidden_states = blk(
227
+ hidden_states, cu_seqlens=cu_seqlens_now, position_embeddings=position_embeddings
228
+ )
229
+
230
+ # hidden_states = self.merger(hidden_states)
231
+ # reverse_indices = torch.argsort(window_index)
232
+ # hidden_states = hidden_states[reverse_indices, :]
233
+
234
+ return hidden_states, window_index
235
+
236
+ import types
237
+
238
+ self.vision_model.forward = types.MethodType(new_forward, self.vision_model)
239
+ self.vision_model.merger = nn.Identity()
240
+
241
+ if hasattr(config, "text_config") and config.text_config is not None:
242
+ text_config = config.text_config
243
+ else:
244
+ raise ValueError("text_config is not defined")
245
+ text_config.update({"torch_dtype": config.torch_dtype})
246
+ if config.text_config.model_type in ["llama", "hyperclovax", "gpt2"]:
247
+ text_config._attn_implementation = config._attn_implementation
248
+ if text_config.model_type != "hyperclovax":
249
+ text_config.logits_scaling = 1.0
250
+
251
+ text_config.vocab_size = (
252
+ text_config.padded_vocab_size if hasattr(text_config, "padded_vocab_size") else text_config.vocab_size
253
+ )
254
+
255
+ if not without_llm:
256
+ with no_init_weights():
257
+ self.language_model = AutoModelForCausalLM.from_config(text_config, trust_remote_code=True)
258
+
259
+ if config.text_config.model_type in ["llama", "hyperclovax", "gpt2"]:
260
+ self.language_model.gradient_checkpointing_enable()
261
+ self.num_queries_vis_abstractor = config.num_queries_vis_abstractor
262
+
263
+ # mm_projector (== connector); vision_model_hidden_size -> LLM embedding size
264
+ input_hidden_size = vision_config.hidden_size
265
+ if vision_config.model_type == "qwen2_5_vl_visual":
266
+ input_hidden_size = vision_config.out_hidden_size
267
+ if config.mm_projector_type == "linear":
268
+ self.mm_projector = nn.Linear(input_hidden_size, text_config.hidden_size)
269
+
270
+ elif config.mm_projector_type == "cabstractor":
271
+ self.mm_projector = CAbstractor(
272
+ num_queries=self.num_queries_vis_abstractor,
273
+ num_input_tokens=(self.vision_config.image_size // self.vision_config.patch_size) ** 2,
274
+ encoder_hidden_size=input_hidden_size,
275
+ hidden_size=input_hidden_size,
276
+ output_hidden_size=text_config.hidden_size,
277
+ pos_emb=config.proj_pos_emb,
278
+ prenorm=config.proj_prenorm,
279
+ )
280
+ self.mm_projector.pos_emb.to(config.torch_dtype)
281
+ elif config.mm_projector_type == "qwen_merger":
282
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
283
+ Qwen2_5_VLPatchMerger,
284
+ )
285
+
286
+ self.mm_projector = Qwen2_5_VLPatchMerger(dim=text_config.hidden_size, context_dim=input_hidden_size)
287
+
288
+ def new_forward(self, inputs) -> torch.Tensor:
289
+ x, window_index = inputs
290
+ x = self.mlp(self.ln_q(x).view(-1, self.hidden_size))
291
+ reverse_indices = torch.argsort(window_index)
292
+ x = x[reverse_indices, :]
293
+ return x
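+ # Illustrative sketch: for any tensor t, t[window_index][argsort(window_index)] == t, so the
+ # argsort above inverts the window permutation applied in the patched vision forward. With a
+ # hypothetical window_index = [2, 0, 1], reverse_indices = [1, 2, 0] and x[reverse_indices]
+ # restores the original row order of the merged features.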
294
+
295
+ self.mm_projector.forward = types.MethodType(new_forward, self.mm_projector)
296
+
297
+ else:
298
+ self.mm_projector = VLM_Mlp(
299
+ config.mm_projector_type,
300
+ input_hidden_size,
301
+ hidden_features=input_hidden_size, # TODO: as in llava, use the LLM embedding size instead of input_hidden_size here
302
+ out_features=text_config.hidden_size,
303
+ )
304
+ self.use_nth_layer = config.use_nth_layer
305
+ self.model_parallel = False
306
+ self.device_map = None
307
+ self.vision_model_use_no_grad = None
308
+
309
+ self.text_config = text_config
310
+
311
+ self.anyres = config.anyres
312
+ self.unpad = config.unpad
313
+ self.vision_input_chunk_size = kwargs.pop("vision_input_chunk_size", None)
314
+ if self.anyres:
315
+ self.image_newline = nn.Parameter(torch.empty(text_config.hidden_size, dtype=self.dtype))
316
+
317
+ self.is_safetensor_save = kwargs.get("is_safetensor_save", True)
318
+ self._backward_compatibility_gradient_checkpointing() # part of self.post_init(); checks whether gradient checkpointing is possible and enables it
319
+ self.mm_projector.to(config.torch_dtype)
320
+
321
+ def forward(
322
+ self,
323
+ input_ids: Optional[torch.LongTensor] = None,
324
+ pixel_values: Optional[List[List[torch.FloatTensor]]] = None,
325
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
326
+ attention_mask: Optional[torch.FloatTensor] = None,
327
+ position_ids: Optional[torch.LongTensor] = None,
328
+ inputs_embeds: Optional[torch.FloatTensor] = None,
329
+ use_cache: Optional[bool] = None,
330
+ output_attentions: Optional[bool] = None,
331
+ output_hidden_states: Optional[bool] = None,
332
+ return_dict: Optional[bool] = True,
333
+ image_sizes: Optional[List[List[List[int]]]] = None,
334
+ vision_query_lengths: Optional[List[List[int]]] = None,
335
+ non_vision_query_lengths: Optional[List[List[int]]] = None,
336
+ img_start_ids_list: Optional[List[List[int]]] = None,
337
+ num_queries_vis_abstractors: Optional[List[List[int]]] = None,
338
+ num_queries_vis_abstractors_slow: Optional[List[List[int]]] = None,
339
+ first_last_frames_slows: Optional[List[List[bool]]] = None,
340
+ is_videos: Optional[List[List[bool]]] = None,
341
+ image_grid_thw: Optional[torch.LongTensor] = None,
342
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
343
+ video_grid_thw: Optional[torch.LongTensor] = None,
344
+ **kwargs,
345
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
346
+ """
347
+ :param input_ids: torch.int64 : torch.Size([batchsize, variable]) : system prompt and question text token indices for the tokenizer.
348
+ In positions where images are inputted, the value is replaced by config.img_start_id, which is a vocabulary index used to indicate the start of image data.
349
+ :param pixel_values: List of List of 4D tensor (torch.float32)
350
+ Each outer list corresponds to a batch and contains inner lists, each holding tensors for images in a sample. The structure accounts for samples with multiple images.
351
+ :param past_key_values: None
352
+ :param inputs_embeds: None
353
+ :param use_cache: None
354
+ :param output_attentions: Optional[bool] : get attention weights of each layer of the transformer network (True: included in the output, False: not included)
355
+ :param output_hidden_states: Optional[bool] : get hidden states of each layer of the transformer network (True: included in the output, False: not included)
356
+ :param image_sizes: Stacked as a List of List, representing image sizes (width, height).
357
+ In cases where a sample contains no images, a single dummy image is included.
358
+ :param vision_query_lengths: A List of List that stores the lengths when each image is converted into visual tokens for LLM input.
359
+ In cases where a sample does not contain any images, an empty list is included.
360
+ :param non_vision_query_lengths: contains the lengths of text tokens (excluding visual tokens) for each sample in a batch.
361
+ :img_start_ids_list: contains the indices of the img_start_id tokens for each sample.
362
+ :num_queries_vis_abstractors: A List of List that contains the number of visual tokens for each image grid.
363
+ :num_queries_vis_abstractors_slow: A List of List that contains the number of visual tokens for the slow part when applying the slowfast algorithm to video frames. If the slowfast algorithm is not applied, it will have a value of None.
364
+ :first_last_frames_slows: A List of List that contains the only first and last frames slow mode for each sample in a batch.
365
+ :is_videos: A List of List that contains the boolean value indicating whether each sample in a batch is a video.
366
+ :image_grid_thw: A 3D tensor (torch.int64) for qwen2.5-vl visual encoder.
367
+ :pixel_values_videos: A 2D tensor (torch.float32) for qwen2.5-vl visual encoder.
368
+ :video_grid_thw: A 3D tensor (torch.int64) for qwen2.5-vl visual encoder.
369
+ :return:
370
+ """
371
+ output_attentions = (
372
+ output_attentions if output_attentions is not None else self.config.vision_config.output_attentions
373
+ )
374
+ output_hidden_states = (
375
+ output_hidden_states if output_hidden_states is not None else self.config.vision_config.output_hidden_states
376
+ )
377
+
378
+ if inputs_embeds is None and past_key_values is None:
379
+ inputs_embeds = self.extract_inputs_embeds(
380
+ input_ids=input_ids,
381
+ pixel_values=pixel_values,
382
+ past_key_values=past_key_values,
383
+ image_sizes=image_sizes,
384
+ vision_query_lengths=vision_query_lengths,
385
+ non_vision_query_lengths=non_vision_query_lengths,
386
+ img_start_ids_list=img_start_ids_list,
387
+ num_queries_vis_abstractors=num_queries_vis_abstractors,
388
+ num_queries_vis_abstractors_slow=num_queries_vis_abstractors_slow,
389
+ first_last_frames_slows=first_last_frames_slows,
390
+ is_videos=is_videos,
391
+ image_grid_thw=image_grid_thw,
392
+ pixel_values_videos=pixel_values_videos,
393
+ video_grid_thw=video_grid_thw,
394
+ )
395
+
396
+ if inputs_embeds is not None:
397
+ input_ids = None
398
+
399
+ outputs = self.language_model.base_model(
400
+ input_ids=input_ids,
401
+ inputs_embeds=inputs_embeds,
402
+ attention_mask=attention_mask,
403
+ position_ids=position_ids,
404
+ past_key_values=past_key_values,
405
+ use_cache=use_cache,
406
+ output_attentions=output_attentions,
407
+ output_hidden_states=output_hidden_states,
408
+ return_dict=return_dict,
409
+ )
410
+ return outputs
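+ # Illustrative call sketch (hypothetical tensors, not part of the original code):
+ #     out = model(input_ids=ids,                    # (B, T) with img_start_id placeholders
+ #                 pixel_values=[[img_0], [img_1]],  # one inner list of image tensors per sample
+ #                 image_sizes=[[[w, h]], [[w, h]]])
+ #     out.last_hidden_state                         # hidden states returned by the LM backbone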
411
+
412
+ def determine_non_vision_query_lengths(self, input_ids, pad_id, img_start_id):
413
+ """non_vision_query_lengths 를 계산하는 함수
414
+ input_ids 가 collate 될때, 오른쪽에 pad_id 가 채워지기 때문에 이 값을 찾는 방식을 통해 계산됨
415
+ 또한 img_start_id 는 visual token 이 들어서는 자리이기 때문에, 해당 indices 은 제거
416
+ """
417
+ non_vision_query_lengths = []
418
+ batch_size, len_seq = input_ids.size(0), input_ids.size(1)
419
+
420
+ for i in range(batch_size):
421
+ temp_idx = (input_ids[i] == pad_id).nonzero()
422
+ eos_idx = temp_idx[0, 0].item() if len(temp_idx) > 0 else len_seq
423
+ num_imgs = (input_ids[i] == img_start_id).sum().item()
424
+ non_vision_query_lengths.append(eos_idx - num_imgs)
425
+
426
+ if all([pad_id in input_id for input_id in input_ids.tolist()]):
427
+ non_vision_query_lengths = [
428
+ non_vision_query_length + 1 for non_vision_query_length in non_vision_query_lengths
429
+ ]
430
+
431
+ return non_vision_query_lengths
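+ # Worked example (hypothetical ids): with pad_id = 0 and img_start_id = 7, the row
+ #     [5, 7, 9, 9, 0, 0]
+ # has its first pad at index 4 and one img_start token, so its text length is 4 - 1 = 3
+ # (plus 1 if every row in the batch contains pad_id).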
432
+
433
+ def determine_vision_query_lengths(self, image_features, image_cnts):
434
+ """vision_query_lengths 를 계산하는 함수
435
+ image_features tensor 의 shape 을 통해 계산된다.
436
+ 이미지가 1장도 없는 sample 의 경우 dummy image 1장이 들어가기 때문에, 따로 빈 list 처리 또한 추가
437
+ """
438
+ vision_query_lengths = [
439
+ [image_feature.size(0) for image_feature in image_feature_list] for image_feature_list in image_features
440
+ ]
441
+
442
+ for i, image_cnt in enumerate(image_cnts):
443
+ if image_cnt == 0:
444
+ assert len(vision_query_lengths[i]) == 1 # currently a single black dummy image is present
445
+ vision_query_lengths[i] = [] # convert to an empty list
446
+
447
+ return vision_query_lengths
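+ # Worked example (hypothetical shapes): image_features = [[(81, D)], [(81, D), (9, D)]]
+ # yields vision_query_lengths = [[81], [81, 9]]; a sample with image_cnt == 0 holds only the
+ # dummy image, so its entry is replaced by an empty list.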
448
+
449
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.get_input_embeddings
450
+ def get_input_embeddings(self):
451
+ if self.without_llm:
452
+ return None
453
+ else:
454
+ return self.language_model.get_input_embeddings()
455
+
456
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.set_input_embeddings
457
+ def set_input_embeddings(self, value):
458
+ self.language_model.set_input_embeddings(value)
459
+
460
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.get_output_embeddings
461
+ def get_output_embeddings(self):
462
+ if self.without_llm:
463
+ return None
464
+ else:
465
+ return self.language_model.get_output_embeddings()
466
+
467
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.set_output_embeddings
468
+ def set_output_embeddings(self, new_embeddings):
469
+ self.language_model.set_output_embeddings(new_embeddings)
470
+
471
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.set_decoder
472
+ def set_decoder(self, decoder):
473
+ self.language_model.set_decoder(decoder)
474
+
475
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.get_decoder
476
+ def get_decoder(self):
477
+ return self.language_model.get_decoder()
478
+
479
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.tie_weights
480
+ def tie_weights(self):
481
+ if self.without_llm:
482
+ return None
483
+ else:
484
+ return self.language_model.tie_weights()
485
+
486
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.resize_token_embeddings
487
+ def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
488
+ model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
489
+ # update vocab size
490
+ self.config.text_config.vocab_size = model_embeds.num_embeddings
491
+ self.vocab_size = model_embeds.num_embeddings
492
+ return model_embeds
493
+
494
+ def extract_inputs_embeds(
495
+ self,
496
+ input_ids: Optional[torch.LongTensor] = None,
497
+ pixel_values: Optional[List[List[torch.FloatTensor]]] = None, # list of list of 4D tensors
498
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
499
+ image_sizes: Optional[List[List[List[int]]]] = None,
500
+ vision_query_lengths: Optional[List[List[int]]] = None,
501
+ non_vision_query_lengths: Optional[List[int]] = None,
502
+ img_start_ids_list: Optional[List[List[int]]] = None,
503
+ num_queries_vis_abstractors: Optional[List[List[int]]] = None,
504
+ num_queries_vis_abstractors_slow: Optional[List[List[int]]] = None,
505
+ first_last_frames_slows: Optional[List[List[bool]]] = None,
506
+ is_videos: Optional[List[List[bool]]] = None,
507
+ image_grid_thw: Optional[torch.LongTensor] = None,
508
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
509
+ video_grid_thw: Optional[torch.LongTensor] = None,
510
+ ):
511
+ """
512
+ :param input_ids: torch.int64 : torch.Size([batchsize, variable]) : system prompt and question text token indices for the tokenizer.
513
+ In positions where images are inputted, the value is replaced by config.img_start_id, which is a vocabulary index used to indicate the start of image data.
514
+ In cases where a sample contains no images, a single dummy image is included.
515
+ :param pixel_values: List of List of 4D tensor (torch.float32)
516
+ Each outer list corresponds to a batch and contains inner lists, each holding tensors for images in a sample. The structure accounts for samples with multiple images.
517
+ :param past_key_values: None : (batch_size, num_heads, sequence_length - 1, embed_size_per_head): Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up
518
+ :param image_sizes: Stacked as a List of List, representing image sizes (width, height).
519
+ In cases where a sample contains no images, a single dummy image is included.
520
+ :param vision_query_lengths: A List of List that stores the lengths when each image is converted into visual tokens for LLM input.
521
+ In cases where a sample does not contain any images, an empty list is included.
522
+ :param non_vision_query_lengths: contains the lengths of text tokens (excluding visual tokens) for each sample in a batch.
523
+ :img_start_ids_list: contains the indices of the img_start_id tokens for each sample.
524
+ :num_queries_vis_abstractors: A List of List that contains the number of visual tokens for each image grid.
525
+ :num_queries_vis_abstractors_slow: A List of List that contains the number of visual tokens for the slow part when applying the slowfast algorithm to video frames. If the slowfast algorithm is not applied, it will have a value of None.
526
+ :first_last_frames_slows: A List of bool that contains the information of whether the slowfast algorithm is applied to the first or last frames of the video.
527
+ :is_videos: A List of List that contains the boolean value indicating whether each sample in a batch is a video.
528
+ :image_grid_thw: A 3D tensor (torch.int64) for qwen2.5-vl visual encoder.
529
+ :pixel_values_videos: A 2D tensor (torch.float32) for qwen2.5-vl visual encoder.
530
+ :video_grid_thw: A 3D tensor (torch.int64) for qwen2.5-vl visual encoder.
531
+ :return:
532
+ """
533
+ inputs_embeds = None
534
+ if past_key_values:
535
+ pass
536
+ else:
537
+ if self.is_qwen_visual:
538
+ inputs_embeds = self.get_input_embeddings()(input_ids)
539
+ context_vision_model = torch.no_grad() if self.config.freeze_encoder else contextlib.nullcontext()
540
+
541
+ if pixel_values is not None:
542
+ with context_vision_model:
543
+ image_features = self.vision_model(pixel_values, grid_thw=image_grid_thw)
544
+ image_features = self.mm_projector(image_features)
545
+
546
+ if img_start_ids_list is None:
547
+ image_cnts = (input_ids == self.config.img_start_id).sum(dim=1).tolist()
548
+ else:
549
+ image_cnts = [len(img_start_ids) for img_start_ids in img_start_ids_list]
550
+
551
+ mask = input_ids.eq(self.config.img_start_id)
552
+ positions = mask.nonzero(as_tuple=False)
553
+
554
+ batch_idx = positions[:, 0]
555
+ seq_idx = positions[:, 1]
556
+
557
+ if sum(image_cnts) == 0:
558
+ image_features = image_features[0:0] # trick for sft1 data
559
+ inputs_embeds[batch_idx, seq_idx, :] = image_features.to(device=inputs_embeds.device)
560
+
561
+ if pixel_values_videos is not None:
562
+ with context_vision_model:
563
+ video_features = self.vision_model(pixel_values_videos, grid_thw=video_grid_thw)
564
+ video_features = self.mm_projector(video_features)
565
+
566
+ video_cnts = (input_ids == self.config.video_start_id).sum(dim=1).tolist()
567
+ mask = input_ids.eq(self.config.video_start_id)
568
+ positions = mask.nonzero(as_tuple=False)
569
+
570
+ batch_idx = positions[:, 0]
571
+ seq_idx = positions[:, 1]
572
+
573
+ if sum(video_cnts) == 0:
574
+ video_features = video_features[0:0] # trick for no video batch
575
+ inputs_embeds[batch_idx, seq_idx, :] = video_features.to(device=inputs_embeds.device)
576
+ else:
577
+ # CLIP and the connector encode the flattened inputs, then the features are converted back into a list of lists
578
+ len_pixel_values = [len(pixel_value) for pixel_value in pixel_values]
579
+ concat_pixel_values = torch.cat(list(chain(*pixel_values)), dim=0) # list of list of 4D Tensor
580
+ visual_token_idx = 0 if "siglip" in self.vision_config.model_type else 1
581
+
582
+ # check whether the adaptive anyres path should be taken:
583
+ # num_queries_vis_abstractors is not None, and
584
+ # at least one of its values differs from self.num_queries_vis_abstractor
585
+ is_adaptive_anyres = num_queries_vis_abstractors is not None and any(
586
+ self.num_queries_vis_abstractor != num_queries_vis_abstractor
587
+ for sublist in num_queries_vis_abstractors
588
+ for num_queries_vis_abstractor in sublist
589
+ )
590
+ if not is_adaptive_anyres:
591
+ image_sizes = list(chain(*image_sizes))
592
+ if is_videos is not None:
593
+ is_videos = list(chain(*is_videos))
594
+ else:
595
+ is_videos = [False] * len(image_sizes)
596
+
597
+ group_ids = None
598
+ else:
599
+ # adaptive anyres is implemented only for CAbstractor; CAbstractor may be wrapped in a CheckpointWrapper
600
+ # assert isinstance(self.mm_projector, CAbstractor)
601
+ is_cabstractor = False
602
+ for submodule in self.mm_projector.modules():
603
+ if isinstance(submodule, CAbstractor):
604
+ is_cabstractor = True
605
+ break
606
+ assert is_cabstractor
607
+
608
+ assert num_queries_vis_abstractors_slow is not None
609
+
610
+ num_queries_vis_abstractors, num_grids, image_sizes, is_videos, group_ids = (
611
+ self.compute_adaptive_params(
612
+ pixel_values,
613
+ num_queries_vis_abstractors,
614
+ num_queries_vis_abstractors_slow,
615
+ image_sizes,
616
+ is_videos,
617
+ first_last_frames_slows,
618
+ )
619
+ )
620
+
621
+ # check whether all parameters of the vision encoder have requires_grad=False.
622
+ if torch.is_grad_enabled():
623
+ if self.vision_model_use_no_grad is None:
624
+ self.vision_model_use_no_grad = all(
625
+ not p.requires_grad for p in self.vision_model.vision_model.encoder.parameters()
626
+ )
627
+ context_vision_model = torch.no_grad() if self.vision_model_use_no_grad else contextlib.nullcontext()
628
+ if self.vision_input_chunk_size is not None:
629
+ # compute n_chunks (how many iterations of the for loop are needed)
630
+ chunk_size = self.vision_input_chunk_size
631
+
632
+ local_batch_size = torch.tensor([concat_pixel_values.size(0)], device=concat_pixel_values.device)
633
+ gathered_batch_sizes = [
634
+ torch.zeros_like(local_batch_size) for _ in range(torch.distributed.get_world_size())
635
+ ]
636
+ torch.distributed.all_gather(gathered_batch_sizes, local_batch_size)
637
+ gathered_batch_sizes = torch.stack(gathered_batch_sizes)
638
+ max_batch_size = gathered_batch_sizes.max().item()
639
+
640
+ n_chunks = math.ceil(max_batch_size / chunk_size)
641
+
642
+ if is_adaptive_anyres:
643
+ chunk_num_queries_vis_abstractors, chunk_num_grids, chunk_is_splits = (
644
+ self.split_adaptive_params(
645
+ num_queries_vis_abstractors,
646
+ num_grids,
647
+ chunk_size,
648
+ n_chunks,
649
+ )
650
+ )
651
+
652
+ # create a dummy tensor based on the shape of concat_pixel_values
653
+ dummy_shape = (1,) + tuple(concat_pixel_values.shape[1:])
654
+ dummy = torch.zeros(
655
+ dummy_shape, dtype=concat_pixel_values.dtype, device=concat_pixel_values.device
656
+ ).to(self.vision_model.dtype)
657
+
658
+ else:
659
+ # no chunking; process the original input as a single batch
660
+ chunk_size = concat_pixel_values.size(0)
661
+ n_chunks = 1
662
+
663
+ image_forward_outs = []
664
+
665
+ for i in range(n_chunks):
666
+ start = i * chunk_size
667
+ end = (i + 1) * chunk_size
668
+ # current chunk slice (may be an empty tensor if there is no data left)
669
+ chunk = concat_pixel_values[start:end].to(self.vision_model.dtype)
670
+ current_chunk_size = chunk.size(0)
671
+
672
+ # if the current chunk is empty, forward the dummy data instead
673
+ if current_chunk_size == 0:
674
+ chunk = dummy
675
+
676
+ # pass the chunk through the vision model (handled according to use_nth_layer)
677
+ if self.use_nth_layer == -1:
678
+ # replace post_layernorm, the final layer's post-processing, with Identity
679
+ self.vision_model.vision_model.post_layernorm = nn.Identity()
680
+ with context_vision_model:
681
+ outs = self.vision_model(chunk)
682
+ outs = outs.last_hidden_state[:, visual_token_idx:]
683
+ else:
684
+ with context_vision_model:
685
+ outs = self.vision_model(chunk, output_hidden_states=True)
686
+ outs = outs.hidden_states[self.use_nth_layer][:, visual_token_idx:]
687
+ if self.vision_model_use_no_grad:
688
+ outs = outs.detach().requires_grad_(True)
689
+ if not is_adaptive_anyres:
690
+ if self.freeze_before_sampler and self.training:
691
+ outs = self.mm_projector(outs, freeze_before_sampler=True)
692
+ else:
693
+ outs = self.mm_projector(outs)
694
+ if current_chunk_size > 0:
695
+ image_forward_outs.append(outs)
696
+ else:
697
+ if n_chunks != 1:
698
+ current_num_queries_vis_abstractors = chunk_num_queries_vis_abstractors[i]
699
+ current_num_grids = chunk_num_grids[i]
700
+ else:
701
+ current_num_queries_vis_abstractors = num_queries_vis_abstractors
702
+ current_num_grids = num_grids
703
+ if self.freeze_before_sampler and self.training:
704
+ outs = self.mm_projector(
705
+ outs,
706
+ num_queries_vis_abstractors=current_num_queries_vis_abstractors,
707
+ num_grids=current_num_grids,
708
+ freeze_before_sampler=True,
709
+ )
710
+ else:
711
+ outs = self.mm_projector(
712
+ outs,
713
+ num_queries_vis_abstractors=current_num_queries_vis_abstractors,
714
+ num_grids=current_num_grids,
715
+ )
716
+ if current_chunk_size > 0:
717
+ if i > 0 and chunk_is_splits[i - 1]:
718
+ # merge the first element into the previous result
719
+ image_forward_outs[-1] = torch.cat([image_forward_outs[-1], outs[0]], dim=0)
720
+ image_forward_outs.extend(outs[1:])
721
+ else:
722
+ image_forward_outs.extend(outs)
723
+ # concatenate the results of all chunks
724
+ if not is_adaptive_anyres:
725
+ # if not adaptive anyres, merge all results into a single tensor
726
+ # for adaptive anyres, the results are kept and used as a list
727
+ image_forward_outs = torch.cat(image_forward_outs, dim=0).to(image_forward_outs[0].dtype)
728
+
729
+ if img_start_ids_list is None:
730
+ image_cnts = (input_ids == self.config.img_start_id).sum(dim=1).tolist()
731
+ else:
732
+ image_cnts = [len(img_start_ids) for img_start_ids in img_start_ids_list]
733
+
734
+ if self.anyres:
735
+ split_sizes = [pixel_value.shape[0] for pixel_value in chain(*pixel_values)]
736
+
737
+ # if not is_adaptive_anyres:
738
+ # image_features = anyres_postprocessing(
739
+ # image_forward_outs=image_forward_outs,
740
+ # split_sizes=split_sizes,
741
+ # image_sizes=image_sizes,
742
+ # num_queries_vis_abstractor=self.num_queries_vis_abstractor,
743
+ # unpad=self.unpad,
744
+ # is_videos=is_videos,
745
+ # patch_size=self.vision_model.config.patch_size,
746
+ # grid_size=self.vision_model.config.image_size,
747
+ # image_newline=self.image_newline,
748
+ # possible_resolutions=self.config.possible_resolutions,
749
+ # )
750
+ # else:
751
+ # image_features = adaptive_anyres_postprocessing(
752
+ # image_forward_outs=image_forward_outs,
753
+ # image_sizes=image_sizes,
754
+ # num_queries_vis_abstractors=num_queries_vis_abstractors,
755
+ # unpad=self.unpad,
756
+ # is_videos=is_videos,
757
+ # patch_size=self.vision_model.config.patch_size,
758
+ # grid_size=self.vision_model.config.image_size,
759
+ # image_newline=self.image_newline,
760
+ # possible_resolutions=self.config.possible_resolutions,
761
+ # group_ids=group_ids,
762
+ # )
763
+ else:
764
+ if not is_adaptive_anyres:
765
+ image_features = [image_forward_out for image_forward_out in image_forward_outs]
766
+ else:
767
+ image_features = [image_forward_out.unsqueeze(0) for image_forward_out in image_forward_outs]
768
+
769
+ image_features = [
770
+ image_features[sum(len_pixel_values[:i]) : sum(len_pixel_values[: i + 1])]
771
+ for i in range(len(len_pixel_values))
772
+ ]
773
+
774
+ # when running inference without the LLM, the prompt is assembled outside, since its composition differs from training.
775
+ if self.without_llm:
776
+ return image_features
777
+
778
+ batch_size = input_ids.size(0)
779
+ image_feature_dim = image_features[0][0].size(1)
780
+ image_feature_dtype = image_features[0][0].dtype
781
+
782
+ if img_start_ids_list is None:
783
+ image_cnts = (input_ids == self.config.img_start_id).sum(dim=1).tolist()
784
+ else:
785
+ image_cnts = [len(img_start_ids) for img_start_ids in img_start_ids_list]
786
+
787
+ if non_vision_query_lengths is None:
788
+ non_vision_query_lengths = self.determine_non_vision_query_lengths(
789
+ input_ids, self.config.text_config.pad_token_id, self.config.img_start_id
790
+ )
791
+
792
+ if vision_query_lengths is None:
793
+ vision_query_lengths = self.determine_vision_query_lengths(image_features, image_cnts)
794
+
795
+ # slicing is faster than concat
796
+ len_inputs_embeds = max(
797
+ [
798
+ sum(vision_query_length) + non_vision_query_length
799
+ for non_vision_query_length, vision_query_length in zip(
800
+ non_vision_query_lengths, vision_query_lengths
801
+ )
802
+ ]
803
+ )
804
+
805
+ inputs_embeds = torch.zeros(
806
+ [batch_size, len_inputs_embeds, image_feature_dim],
807
+ dtype=image_feature_dtype,
808
+ device=self.device,
809
+ requires_grad=True,
810
+ ).clone()
811
+
812
+ # temp_embeds : torch.bfloat16 : [batchsize, 174, 3072]
813
+ temp_embeds = self.get_input_embeddings()(input_ids)
814
+
815
+ # the assembled form is <PROMPT><USER_PREFIX><VISION_QUERIES>Sentence
816
+ for batch_idx, sample in enumerate(input_ids):
817
+ # slice after concatenating with the visual tokens
818
+ non_vision_query_length = non_vision_query_lengths[batch_idx]
819
+ # to be safe, slice after concatenating with the visual tokens
820
+ sample = sample[: non_vision_query_length + image_cnts[batch_idx]]
821
+
822
+ if image_cnts[batch_idx] == 0: # text-only instruction data: no image feature is inserted
823
+ temp_idx = 0
824
+ # Reference: https://github.com/haotian-liu/LLaVA/commit/44e0562f9497fb79f042427307472a87d266d90a#diff-4477387d506ccb1897a13972cba26c9da3fad4d3e1c32ec4b8bd8ff7acd3f292
825
+ # https://github.com/intel/intel-extension-for-transformers/issues/1201#issuecomment-1915875119
826
+ inputs_embeds[batch_idx, :non_vision_query_length] = temp_embeds[batch_idx][
827
+ :non_vision_query_length
828
+ ]
829
+ inputs_embeds[batch_idx, temp_idx:temp_idx] = image_features[batch_idx][0][
830
+ 0:0
831
+ ] # the first image (dummy image) of sample batch_idx
832
+ else:
833
+ if img_start_ids_list is None:
834
+ img_start_ids = (sample == self.config.img_start_id).nonzero()
835
+ else:
836
+ img_start_ids = img_start_ids_list[batch_idx]
837
+ assert len(img_start_ids) == image_cnts[batch_idx] == len(image_features[batch_idx])
838
+ # initialize the start positions for the input and temporary embeddings
839
+ input_start, temp_start = 0, 0
840
+
841
+ # iterate over each image start position within the sample
842
+ for multi_img_idx, img_start_idx in enumerate(img_start_ids):
843
+ # compute the token length up to the current image start position
844
+ token_len = img_start_idx - temp_start
845
+
846
+ # copy the text tokens into inputs_embeds
847
+ inputs_embeds[batch_idx, input_start : input_start + token_len] = temp_embeds[
848
+ batch_idx, temp_start : temp_start + token_len
849
+ ]
850
+
851
+ # compute the insertion position and insert the image_features
852
+ inputs_embeds[
853
+ batch_idx,
854
+ input_start
855
+ + token_len : input_start
856
+ + token_len
857
+ + vision_query_lengths[batch_idx][multi_img_idx],
858
+ ] = image_features[batch_idx][multi_img_idx]
859
+
860
+ # update the start positions for processing the next tokens
861
+ input_start += token_len + vision_query_lengths[batch_idx][multi_img_idx]
862
+ temp_start += token_len + 1 # add 1 to step over the image start token
863
+
864
+ # handle the tokens after the last image token
865
+ token_len = min(sample[temp_start:].size(0), inputs_embeds.size(1) - input_start)
866
+ inputs_embeds[batch_idx, input_start : input_start + token_len] = temp_embeds[
867
+ batch_idx, temp_start : temp_start + token_len
868
+ ]
869
+ return inputs_embeds
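+ # Layout sketch (illustrative): for a sample "<sys> <img_start> question", the returned row is
+ #     [emb(<sys>), image_features[b][0] (vision_query_length rows), emb(question ...)]
+ # i.e. each img_start position is expanded into its block of visual tokens.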
870
+
871
+ @classmethod
872
+ def from_pretrained(
873
+ cls,
874
+ pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
875
+ *model_args,
876
+ **kwargs,
877
+ ):
878
+ model = super().from_pretrained(
879
+ pretrained_model_name_or_path,
880
+ *model_args,
881
+ **kwargs,
882
+ )
883
+
884
+ model.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
885
+ return model
886
+
887
+ def save_pretrained(
888
+ self,
889
+ save_directory: Union[str, os.PathLike],
890
+ *args,
891
+ **kwargs,
892
+ ):
893
+ super().register_for_auto_class("AutoModel")
894
+ self.config.register_for_auto_class()
895
+ super().save_pretrained(save_directory, *args, **kwargs)
896
+
897
+ def compute_adaptive_params(
898
+ self,
899
+ pixel_values: Optional[List[List[torch.FloatTensor]]] = None,
900
+ num_queries_vis_abstractors: Optional[List[List[int]]] = None,
901
+ num_queries_vis_abstractors_slow: Optional[List[List[int]]] = None,
902
+ image_sizes: Optional[List[List[List[int]]]] = None,
903
+ is_videos: Optional[List[List[bool]]] = None,
904
+ first_last_frames_slows: Optional[List[List[bool]]] = None,
905
+ ):
906
+ # verify that every inner value is a non-negative integer
907
+ assert all(
908
+ all(isinstance(value, int) and value >= 0 for value in sublist) for sublist in num_queries_vis_abstractors
909
+ ), "All values in num_queries_vis_abstractors must be integers >= 0."
910
+
911
+ assert all(
912
+ all(isinstance(value, int) and value >= 0 for value in sublist)
913
+ for sublist in num_queries_vis_abstractors_slow
914
+ ), "All values in num_queries_vis_abstractors_slow must be integers >= 0."
915
+
916
+ assert is_videos is not None
917
+
918
+ # is this the first or last image? (needed to apply slowfast to videos)
919
+ is_first_images = []
920
+ is_last_images = []
921
+ for is_video in is_videos:
922
+ for idx, is_video_item in enumerate(is_video):
923
+ if idx == 0:
924
+ is_first_images.append(True)
925
+ else:
926
+ is_first_images.append(False)
927
+ if idx == len(is_video) - 1:
928
+ is_last_images.append(True)
929
+ else:
930
+ is_last_images.append(False)
931
+
932
+ num_queries_vis_abstractors = list(chain(*num_queries_vis_abstractors))
933
+ num_queries_vis_abstractors_slow = list(chain(*num_queries_vis_abstractors_slow))
934
+ image_sizes = list(chain(*image_sizes))
935
+ is_videos = list(chain(*is_videos))
936
+ first_last_frames_slows = list(chain(*first_last_frames_slows))
937
+
938
+ # use slowfast mode if any entry in num_queries_vis_abstractors_slow is greater than 0
939
+ use_slowfast = any([num_query > 0 for num_query in num_queries_vis_abstractors_slow])
940
+
941
+ num_grids = [pixel_value.shape[0] for pixel_value in chain(*pixel_values)]
942
+ num_grids = [0] + num_grids
943
+ group_ids = []
944
+
945
+ if use_slowfast:
946
+ new_num_grids = [num_grids[0]]
947
+ new_num_queries = []
948
+ new_image_sizes = []
949
+ new_is_videos = []
950
+
951
+ # when slowfast is used, split more finely:
952
+ # the 0th local grid is the slow frame, the remaining local grids are fast frames
953
+ for (
954
+ num_query,
955
+ num_query_slow,
956
+ num_grid,
957
+ image_size,
958
+ is_video,
959
+ first_last_frames_slow,
960
+ is_first_image,
961
+ is_last_image,
962
+ ) in zip(
963
+ num_queries_vis_abstractors,
964
+ num_queries_vis_abstractors_slow,
965
+ num_grids[1:],
966
+ image_sizes,
967
+ is_videos,
968
+ first_last_frames_slows,
969
+ is_first_images,
970
+ is_last_images,
971
+ ):
972
+
973
+ if not first_last_frames_slow and num_query_slow > 0: # process all frames in slowfast mode
974
+ assert is_video is True # slowfast mode applies only to videos
975
+
976
+ this_group_ids = [group_ids[-1][-1] + 1 if group_ids else 0]
977
+
978
+ # slow frame (the very first grid)
979
+ new_num_grids.append(new_num_grids[-1] + 1)
980
+ new_num_queries.append(num_query_slow)
981
+ new_image_sizes.append(image_size)
982
+ new_is_videos.append(is_video)
983
+
984
+ if num_grid >= 2:
985
+ # fast frames
986
+ new_num_grids.append(new_num_grids[-1] + num_grid - 1)
987
+ new_num_queries.append(num_query)
988
+ new_image_sizes.append(image_size)
989
+ new_is_videos.append(is_video)
990
+ this_group_ids.append(this_group_ids[-1] + 1)
991
+
992
+ group_ids.append(this_group_ids)
993
+ elif (
994
+ first_last_frames_slow and num_query_slow > 0 and (is_first_image or is_last_image)
995
+ ): # Process only first/last image in slowfast mode
996
+ # slow-frame case where only the first and last frames are treated specially.
997
+ assert is_video is True # slowfast mode applies only to videos
998
+
999
+ this_group_ids = [group_ids[-1][-1] + 1 if group_ids else 0]
1000
+
1001
+ if num_grid == 1:
1002
+ # only a single grid here, so handling it as slow is all that is needed.
1003
+ new_num_grids.append(new_num_grids[-1] + 1)
1004
+ new_num_queries.append(num_query_slow)
1005
+ new_image_sizes.append(image_size)
1006
+ new_is_videos.append(is_video)
1007
+
1008
+ if num_grid >= 2:
1009
+ if is_first_image: # also covers the frame being both first and last.
1010
+ # slow frame (the very first grid)
1011
+ new_num_grids.append(new_num_grids[-1] + 1)
1012
+ new_num_queries.append(num_query_slow)
1013
+ new_image_sizes.append(image_size)
1014
+ new_is_videos.append(is_video)
1015
+ # fast frames
1016
+ new_num_grids.append(new_num_grids[-1] + num_grid - 1)
1017
+ new_num_queries.append(num_query)
1018
+ new_image_sizes.append(image_size)
1019
+ new_is_videos.append(is_video)
1020
+ this_group_ids.append(this_group_ids[-1] + 1)
1021
+ elif is_last_image:
1022
+ # fast frames
1023
+ new_num_grids.append(new_num_grids[-1] + num_grid - 1)
1024
+ new_num_queries.append(num_query)
1025
+ new_image_sizes.append(image_size)
1026
+ new_is_videos.append(is_video)
1027
+ # slow frame (the very last grid)
1028
+ new_num_grids.append(new_num_grids[-1] + 1)
1029
+ new_num_queries.append(num_query_slow)
1030
+ new_image_sizes.append(image_size)
1031
+ new_is_videos.append(is_video)
1032
+ this_group_ids.append(this_group_ids[-1] + 1)
1033
+ else:
1034
+ raise Exception("This case should not be reached.")
1035
+ group_ids.append(this_group_ids)
1036
+
1037
+ else:
1038
+ # not slowfast mode: reduce every grid to num_query tokens (fast)
1039
+ new_num_grids.append(new_num_grids[-1] + num_grid)
1040
+ new_num_queries.append(num_query)
1041
+ new_image_sizes.append(image_size)
1042
+ new_is_videos.append(is_video)
1043
+
1044
+ start_group_id = group_ids[-1][-1] + 1 if group_ids else 0
1045
+ group_ids.append([start_group_id])
1046
+
1047
+ num_grids = new_num_grids
1048
+ num_queries_vis_abstractors = new_num_queries
1049
+ image_sizes = new_image_sizes
1050
+ is_videos = new_is_videos
1051
+ else:
1052
+ num_grids = [sum(num_grids[:i]) for i in range(1, len(num_grids) + 1)]
1053
+ group_ids = [[group_id] for group_id in range(len(is_videos))]
1054
+
1055
+ return num_queries_vis_abstractors, num_grids, image_sizes, is_videos, group_ids
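+ # Worked example (hypothetical values): a video entry with num_grid = 4, num_query = 9,
+ # num_query_slow = 81 and first_last_frames_slow = False is split into a slow part
+ # (1 grid, 81 queries) and a fast part (3 grids, 9 queries), and its group_ids entry
+ # becomes [k, k + 1] so the two parts can be regrouped later.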
1056
+
1057
+ def split_adaptive_params(
1058
+ self, num_queries_vis_abstractors, num_grids, chunk_size: int, n_chunks: int # num_queries: len = n; num_grids: len = n + 1, first value 0
1059
+ ):
1060
+ """
1061
+ Split num_grids/num_queries into chunks of chunk_size, producing at most n_chunks chunks.
1062
+ If there is not enough real data, the remaining chunks are filled with a dummy ([0, 1]).
1063
+
1064
+ Returns
1065
+ -------
1066
+ chunk_qs : List[List[int]]
1067
+ chunk_grids: List[List[int]]
1068
+ Corresponding elements have the same length, and the total number of chunks is exactly n_chunks.
1069
+ """
1070
+ total_len = num_grids[-1] # position of the last grid
1071
+ chunk_qs, chunk_grids, is_splits = [], [], []
1072
+
1073
+ # (start, end) = (0,chunk_size), (chunk_size,2*chunk_size), ...
1074
+ # but only n_chunks of them are created.
1075
+ slices = list(zip(num_grids[:-1], num_grids[1:], num_queries_vis_abstractors))
1076
+ slice_idx = 0 # index of the slice currently being examined
1077
+
1078
+ for chunk_idx in range(n_chunks):
1079
+ start = chunk_idx * chunk_size
1080
+ end = start + chunk_size # [start, end)
1081
+
1082
+ # 1) the input has already been fully consumed: emit a dummy chunk (a single grid)
1083
+ if start >= total_len:
1084
+ chunk_grids.append([0, 1]) # minimum-length dummy of size 1
1085
+ chunk_qs.append([num_queries_vis_abstractors[-1]])
1086
+ is_splits.append(False)
1087
+ continue
1088
+
1089
+ grids_in_chunk = [0] # always starts from 0
1090
+ qs_in_chunk = []
1091
+
1092
+ # skip all slices that do not overlap the current chunk
1093
+ while slice_idx < len(slices) and slices[slice_idx][1] <= start:
1094
+ slice_idx += 1
1095
+
1096
+ is_split = False
1097
+ j = slice_idx
1098
+ while j < len(slices) and slices[j][0] < end:
1099
+ s, e, q = slices[j]
1100
+
1101
+ # boundaries inside the chunk
1102
+ left = max(s, start)
1103
+ right = min(e, end)
1104
+ off = right - start # chunk local offset
1105
+
1106
+ if off not in grids_in_chunk:
1107
+ grids_in_chunk.append(off)
1108
+ qs_in_chunk.append(q)
1109
+ if right == end and e != end:
1110
+ is_split = True # a segment that was not split in the original num_grids has been cut here.
1111
+
1112
+ # if the slice extends beyond the chunk, continue it in the next chunk
1113
+ if e > end:
1114
+ break
1115
+ j += 1
1116
+ slice_idx = j
1117
+
1118
+ # if the last offset differs from the chunk end (or the end of the actual data), correct it
1119
+ final_off = min(end, total_len) - start
1120
+ if grids_in_chunk[-1] != final_off:
1121
+ grids_in_chunk.append(final_off)
1122
+ qs_in_chunk.append(qs_in_chunk[-1] if qs_in_chunk else num_queries_vis_abstractors[-1])
1123
+ # record that a split occurred
1124
+ is_split = True
1125
+
1126
+ chunk_grids.append(grids_in_chunk)
1127
+ chunk_qs.append(qs_in_chunk)
1128
+ is_splits.append(is_split)
1129
+
1130
+ return chunk_qs, chunk_grids, is_splits
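+ # Worked example (hypothetical values): num_grids = [0, 3, 6], num_queries = [81, 9],
+ # chunk_size = 4, n_chunks = 2 gives
+ #     chunk 0 (grids [0, 4)): chunk_grids = [0, 3, 4], chunk_qs = [81, 9], is_split = True
+ #     chunk 1 (grids [4, 6)): chunk_grids = [0, 2],    chunk_qs = [9],     is_split = False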
1131
+
1132
+
1133
+ class HCXVisionForCausalLM(HCXVisionPreTrainedModel, GenerationMixin):
1134
+ def __init__(
1135
+ self,
1136
+ config: HCXVisionConfig,
1137
+ without_llm=False,
1138
+ **kwargs,
1139
+ ):
1140
+ super().__init__(config, without_llm=without_llm, **kwargs)
1141
+ text_config = config.get_text_config()
1142
+ self.model = HCXVisionModel(config=config, **kwargs)
1143
+
1144
+ def forward(
1145
+ self,
1146
+ input_ids: Optional[torch.LongTensor] = None,
1147
+ pixel_values: Optional[List[List[torch.FloatTensor]]] = None,
1148
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
1149
+ attention_mask: Optional[torch.FloatTensor] = None,
1150
+ position_ids: Optional[torch.LongTensor] = None,
1151
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1152
+ labels: Optional[torch.LongTensor] = None,
1153
+ use_cache: Optional[bool] = None,
1154
+ output_attentions: Optional[bool] = None,
1155
+ output_hidden_states: Optional[bool] = None,
1156
+ return_dict: Optional[bool] = True,
1157
+ image_sizes: Optional[List[List[List[int]]]] = None,
1158
+ vision_query_lengths: Optional[List[List[int]]] = None,
1159
+ non_vision_query_lengths: Optional[List[List[int]]] = None,
1160
+ img_start_ids_list: Optional[List[List[int]]] = None,
1161
+ num_queries_vis_abstractors: Optional[List[List[int]]] = None,
1162
+ num_queries_vis_abstractors_slow: Optional[List[List[int]]] = None,
1163
+ first_last_frames_slows: Optional[List[List[bool]]] = None,
1164
+ is_videos: Optional[List[List[bool]]] = None,
1165
+ image_grid_thw: Optional[torch.LongTensor] = None,
1166
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
1167
+ video_grid_thw: Optional[torch.LongTensor] = None,
1168
+ logits_to_keep: Union[int, torch.Tensor] = 0,
1169
+ **kwargs,
1170
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1171
+ """
1172
+ :param input_ids: torch.int64 : torch.Size([batchsize, variable]) : system prompt and question text token indices for the tokenizer.
1173
+ In positions where images are inputted, the value is replaced by config.img_start_id, which is a vocabulary index used to indicate the start of image data.
1174
+ :param pixel_values: List of List of 4D tensor (torch.float32)
1175
+ Each outer list corresponds to a batch and contains inner lists, each holding tensors for images in a sample. The structure accounts for samples with multiple images.
1176
+ :param past_key_values: None
1177
+ :param inputs_embeds: None
1178
+ :param labels: Optional[torch.int64] : [batchsize, variable (input_ids.size(1) + num visual tokens)]; all visual tokens are set to IGNORE_INDEX
1179
+ :param use_cache: None
1180
+ :param output_attentions: Optional[bool] : get attention weights of each layer of the transformer network (True: included in the output, False: not included)
1181
+ :param output_hidden_states: Optional[bool] : get hidden states of each layer of the transformer network (True: included in the output, False: not included)
1182
+ :param image_sizes: Stacked as a List of List, representing image sizes (width, height).
1183
+ In cases where a sample contains no images, a single dummy image is included.
1184
+ :param vision_query_lengths: A List of List that stores the lengths when each image is converted into visual tokens for LLM input.
1185
+ In cases where a sample does not contain any images, an empty list is included.
1186
+ :param non_vision_query_lengths: contains the lengths of text tokens (excluding visual tokens) for each sample in a batch.
1187
+ :img_start_ids_list: contains the indices of the img_start_id tokens for each sample.
1188
+ :num_queries_vis_abstractors: A List of List that contains the number of visual tokens for each image grid.
1189
+ :num_queries_vis_abstractors_slow: A List of List that contains the number of visual tokens for the slow part when applying the slowfast algorithm to video frames. If the slowfast algorithm is not applied, it will have a value of None.
1190
+ :first_last_frames_slows: A List of List that contains the only first and last frames slow mode for each sample in a batch.
1191
+ :is_videos: A List of List that contains the boolean value indicating whether each sample in a batch is a video.
1192
+ :image_grid_thw: A 3D tensor (torch.int64) for qwen2.5-vl visual encoder.
1193
+ :pixel_values_videos: A 2D tensor (torch.float32) for qwen2.5-vl visual encoder.
1194
+ :video_grid_thw: A 3D tensor (torch.int64) for qwen2.5-vl visual encoder.
1195
+ :return:
1196
+ """
1197
+ loss = None
1198
+ logits = None
1199
+ outputs = self.model.forward(
1200
+ input_ids=input_ids,
1201
+ pixel_values=pixel_values,
1202
+ past_key_values=past_key_values,
1203
+ attention_mask=attention_mask,
1204
+ position_ids=position_ids,
1205
+ inputs_embeds=inputs_embeds,
1206
+ use_cache=use_cache,
1207
+ output_attentions=output_attentions,
1208
+ output_hidden_states=output_hidden_states,
1209
+ return_dict=return_dict,
1210
+ image_sizes=image_sizes,
1211
+ vision_query_lengths=vision_query_lengths,
1212
+ non_vision_query_lengths=non_vision_query_lengths,
1213
+ img_start_ids_list=img_start_ids_list,
1214
+ num_queries_vis_abstractors=num_queries_vis_abstractors,
1215
+ num_queries_vis_abstractors_slow=num_queries_vis_abstractors_slow,
1216
+ first_last_frames_slows=first_last_frames_slows,
1217
+ is_videos=is_videos,
1218
+ image_grid_thw=image_grid_thw,
1219
+ pixel_values_videos=pixel_values_videos,
1220
+ video_grid_thw=video_grid_thw,
1221
+ )
1222
+ hidden_states = outputs.last_hidden_state
1223
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
1224
+ logits = self.model.language_model.lm_head(hidden_states[:, slice_indices, :]) * getattr(
1225
+ self.config.text_config, "logits_scaling", 1
1226
+ )
1227
+
1228
+ loss = None
1229
+ if labels is not None:
1230
+ loss = self.loss_function(
1231
+ logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
1232
+ )
1233
+ return CausalLMOutputWithPast(
1234
+ loss=loss,
1235
+ logits=logits,
1236
+ past_key_values=outputs.past_key_values,
1237
+ hidden_states=outputs.hidden_states,
1238
+ attentions=outputs.attentions,
1239
+ )
1240
+
1241
+ @torch.no_grad()
1242
+ def inference(
1243
+ self,
1244
+ input_ids: Optional[torch.LongTensor] = None,
1245
+ pixel_values: Optional[
1246
+ Union[List[List[torch.FloatTensor]], torch.FloatTensor]
1247
+ ] = None, # torch.FloatTensor for qwen2.5-vl visual encoder
1248
+ image_sizes: Optional[List[List[List[int]]]] = None,
1249
+ vision_query_lengths: Optional[List[List[int]]] = None,
1250
+ non_vision_query_lengths: Optional[List[int]] = None,
1251
+ num_queries_vis_abstractors: Optional[List[List[int]]] = None,
1252
+ num_queries_vis_abstractors_slow: Optional[List[List[int]]] = None,
1253
+ first_last_frames_slows: Optional[List[List[bool]]] = None,
1254
+ is_videos: Optional[List[List[bool]]] = None,
1255
+ img_start_ids_list: Optional[List[List[int]]] = None,
1256
+ image_grid_thw: Optional[torch.LongTensor] = None,
1257
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
1258
+ video_grid_thw: Optional[torch.LongTensor] = None,
1259
+ max_length: int = 196,
1260
+ min_length: int = 2,
1261
+ do_sample: bool = True,
1262
+ num_beams: int = 1,
1263
+ top_p: float = 0.6,
1264
+ top_k: int = 0,
1265
+ temperature: float = 0.5,
1266
+ repetition_penalty: float = 1.0,
1267
+ length_penalty: int = 1,
1268
+ early_stopping: Union[bool, str] = False,
1269
+ use_cache: bool = True,
1270
+ **kwargs,
1271
+ ):
1272
+ """
1273
+ :param input_ids: torch.int64 : torch.Size([batchsize, variable]) : system prompt and question text token indices for the tokenizer.
1274
+ In positions where images are inputted, the value is replaced by config.img_start_id, which is a vocabulary index used to indicate the start of image data.
1275
+ In cases where a sample contains no images, a single dummy image is included.
1276
+ :param pixel_values: List of List of 4D tensor (torch.float32)
1277
+ Each outer list corresponds to a batch and contains inner lists, each holding tensors for images in a sample. The structure accounts for samples with multiple images.
1278
+ :param attention_mask: not used
1279
+ :param max_length: int : The maximum length the generated tokens can have. Corresponds to the length of the input prompt + max_new_tokens.
1280
+ :param min_length: int : The minimum length of the sequence to be generated. Corresponds to the length of the input prompt + min_new_tokens.
1281
+ :param num_beams: int : Number of beams for beam search. 1 means no beam search.
1282
+ :param top_k: int : The number of highest probability vocabulary tokens to keep for top-k-filtering.
1283
+ :param temperature: float : The value used to modulate the next token probabilities. ( scores / self.temperature )
1284
+ :param repetition_penalty: float : The parameter for repetition penalty.
1285
+ :param length_penalty: int : It is applied as an exponent to the sequence length, which in turn is used to divide the score of the sequence.
1286
+ :param early_stopping: Union[bool, str] : True, where the generation stops as soon as there are num_beams complete candidates;
1287
+ False, where a heuristic is applied and the generation stops when it is very unlikely to find better candidates;
1288
+ "never", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm)
1289
+ :param use_cache: bool : Whether or not the model should use the past last key/values attentions (if applicable to the model) to speed up decoding.
1290
+ :param verbose: bool : print debug messages
1291
+ :param image_sizes: Stacked as a List of List, representing image sizes (width, height).
1292
+ In cases where a sample contains no images, a single dummy image is included.
1293
+ :param vision_query_lengths: A List of List that stores the lengths when each image is converted into visual tokens for LLM input.
1294
+ In cases where a sample does not contain any images, an empty list is included.
1295
+ :param non_vision_query_lengths: contains the lengths of text tokens (excluding visual tokens) for each sample in a batch.
1296
+ :param num_queries_vis_abstractors: A List of List that contains the number of visual tokens for each image grid.
1297
+ :param num_queries_vis_abstractors_slow: A List of List that contains the number of visual tokens for the slow part when applying the slowfast algorithm to video frames. If the slowfast algorithm is not applied, it will have a value of None.
1298
+ :param first_last_frames_slows: A List of List that stores the only first and last frames slow mode for each sample in a batch.
1299
+ :param is_videos: A List of List that stores the boolean value indicating whether each sample in a batch is a video.
1300
+ :image_grid_thw: A 3D tensor (torch.int64) for qwen2.5-vl visual encoder.
1301
+ :pixel_values_videos: A 2D tensor (torch.float32) for qwen2.5-vl visual encoder.
1302
+ :video_grid_thw: A 3D tensor (torch.int64) for qwen2.5-vl visual encoder.
1303
+ :param kwargs:
1304
+ :return:
1305
+ """
1306
+ # inputs_embeds: torch.bfloat16 : [batchsize, variable (including visual tokens, text tokens, and the system prompt)]
1308
+ # attention_mask: torch.float32 : [batchsize, variable (same as above)]
1308
+ inputs_embeds = self.model.extract_inputs_embeds(
1309
+ input_ids=input_ids,
1310
+ pixel_values=self.to_vision_model_device(pixel_values),
1311
+ image_sizes=image_sizes,
1312
+ vision_query_lengths=vision_query_lengths,
1313
+ non_vision_query_lengths=non_vision_query_lengths,
1314
+ img_start_ids_list=img_start_ids_list,
1315
+ num_queries_vis_abstractors=num_queries_vis_abstractors,
1316
+ num_queries_vis_abstractors_slow=num_queries_vis_abstractors_slow,
1317
+ first_last_frames_slows=first_last_frames_slows,
1318
+ is_videos=is_videos,
1319
+ image_grid_thw=image_grid_thw,
1320
+ pixel_values_videos=pixel_values_videos,
1321
+ video_grid_thw=video_grid_thw,
1322
+ )
1323
+ # since only inference is required, everything is assumed to be in eval mode. Also, inputs_embeds is a list of lists of tensors: [batchsize, [num_images, [num_sequence, num_channels]]]
1324
+ # inputs_embeds = inputs_embeds.detach()
1325
+ # inputs_embeds.requires_grad = False
1326
+
1327
+ # when running inference without the LLM, this is the image_feature value.
1328
+ # the GPU device assigned to self.vision_model may differ from the one assigned to the LLM
1329
+ if self.without_llm:
1330
+ inputs_embeds = (
1331
+ inputs_embeds.to(self.vision_model.device) if isinstance(inputs_embeds, torch.Tensor) else inputs_embeds
1332
+ )
1333
+ return inputs_embeds
1334
+
1335
+ inputs_embeds = (
1336
+ inputs_embeds.to(self.base_model.device) if isinstance(inputs_embeds, torch.Tensor) else inputs_embeds
1337
+ )
1338
+
1339
+ # pred : torch.int64 : [batchsize, generated token_length]
1340
+ pred = self.language_model.generate( # <|im_end|>
1341
+ inputs_embeds=inputs_embeds,
1342
+ pad_token_id=self.config.text_config.pad_token_id,
1343
+ eos_token_id=self.config.text_config.eos_token_id,
1344
+ bad_words_ids=[
1345
+ [
1346
+ self.config.text_config.bos_token_id,
1347
+ ],
1348
+ [
1349
+ self.config.text_config.eos_token_id,
1350
+ ],
1351
+ ],
1352
+ max_new_tokens=max_length,
1353
+ min_length=min_length,
1354
+ num_beams=num_beams,
1355
+ do_sample=False if temperature == 0.0 else do_sample, # set do_sample=False if invalid temperature
1356
+ top_k=top_k,
1357
+ top_p=top_p,
1358
+ temperature=temperature,
1359
+ repetition_penalty=repetition_penalty,
1360
+ length_penalty=length_penalty,
1361
+ early_stopping=False if num_beams <= 1 else True, # set early_stopping=False when not beam_search
1362
+ use_cache=use_cache,
1363
+ )
1364
+ return pred
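+ # Illustrative usage (hypothetical paths and inputs, not part of the original code):
+ #     model = HCXVisionForCausalLM.from_pretrained("path/to/checkpoint", trust_remote_code=True)
+ #     pred = model.inference(input_ids=ids, pixel_values=pixel_values,
+ #                            image_sizes=image_sizes, max_length=196, temperature=0.5)
+ #     text = model.tokenizer.batch_decode(pred, skip_special_tokens=True)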
1365
+
1366
+ def to_vision_model_device(self, input_tensor):
1367
+ if isinstance(input_tensor, list): # if the input is a list
1368
+ return [self.to_vision_model_device(item) for item in input_tensor] # recursively apply to each element
1369
+ elif isinstance(input_tensor, torch.Tensor): # if the input is a tensor
1370
+ return input_tensor.to(self.vision_model.device)
1371
+ else:
1372
+ raise TypeError(
1373
+ "Unsupported data type. Only tensors and lists are allowed."
1374
+ ) # error handling for unsupported data types
1375
+
1376
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.get_input_embeddings
1377
+ def get_input_embeddings(self):
1378
+ if self.without_llm:
1379
+ return None
1380
+ else:
1381
+ return self.language_model.get_input_embeddings()
1382
+
1383
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.set_input_embeddings
1384
+ def set_input_embeddings(self, value):
1385
+ self.language_model.set_input_embeddings(value)
1386
+
1387
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.get_output_embeddings
1388
+ def get_output_embeddings(self):
1389
+ if self.without_llm:
1390
+ return None
1391
+ else:
1392
+ return self.language_model.get_output_embeddings()
1393
+
1394
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.set_output_embeddings
1395
+ def set_output_embeddings(self, new_embeddings):
1396
+ self.language_model.set_output_embeddings(new_embeddings)
1397
+
1398
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.set_decoder
1399
+ def set_decoder(self, decoder):
1400
+ self.language_model.set_decoder(decoder)
1401
+
1402
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.get_decoder
1403
+ def get_decoder(self):
1404
+ return self.language_model.get_decoder()
1405
+
1406
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.tie_weights
1407
+ def tie_weights(self):
1408
+ if self.without_llm:
1409
+ return None
1410
+ else:
1411
+ return self.language_model.tie_weights()
1412
+
1413
+ @classmethod
1414
+ def from_pretrained(
1415
+ cls,
1416
+ pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
1417
+ *model_args,
1418
+ **kwargs,
1419
+ ):
1420
+ model = super().from_pretrained(
1421
+ pretrained_model_name_or_path,
1422
+ *model_args,
1423
+ **kwargs,
1424
+ )
1425
+
1426
+ model.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
1427
+ return model
1428
+
1429
+ def save_pretrained(
1430
+ self,
1431
+ save_directory: Union[str, os.PathLike],
1432
+ *args,
1433
+ **kwargs,
1434
+ ):
1435
+ super().register_for_auto_class("AutoModelForCausalLM")
1436
+ self.config.register_for_auto_class()
1437
+ super().save_pretrained(save_directory, *args, **kwargs)
1438
+ self.config.architectures = ["HCXVisionV2ForCausalLM"]
1439
+ self.config.auto_map["AutoModelForCausalLM"] = "modeling_vlm.HCXVisionForCausalLM"
1440
+ self.config.auto_map["AutoModelForSequenceClassification"] = "modeling_vlm.HCXVisionForSequenceClassification"
1441
+ self.config.save_pretrained(save_directory)
1442
+
1443
+ # https://github.com/huggingface/transformers/blob/v4.53.3/src/transformers/models/llava/modeling_llava.py#L379-L390
1444
+ @property
1445
+ def is_qwen_visual(self):
1446
+ return self.model.is_qwen_visual
1447
+
1448
+ @property
1449
+ def language_model(self):
1450
+ return self.model.language_model
1451
+
1452
+ @property
1453
+ def vision_model(self):
1454
+ return self.model.vision_model
1455
+
1456
+ @property
1457
+ def text_config(self):
1458
+ return self.model.text_config
1459
+
1460
+ @property
1461
+ def vision_config(self):
1462
+ return self.model.vision_config
1463
+
1464
+ @property
1465
+ def mm_projector(self):
1466
+ return self.model.mm_projector
1467
+
1468
+ @property
1469
+ def anyres(self):
1470
+ return self.model.anyres
1471
+
1472
+ @property
1473
+ def is_safetensor_save(self):
1474
+ return self.model.is_safetensor_save
1475
+
1476
+ @property
1477
+ def without_llm(self):
1478
+ return self.model.without_llm
1479
+
1480
+ @property
1481
+ def image_newline(self):
1482
+ return self.model.image_newline
1483
+
1484
+
1485
+ class HCXVisionForSequenceClassification(HCXVisionPreTrainedModel):
1486
+ """
1487
+ HCX Vision model for sequence classification tasks.
1488
+ """
1489
+
1490
+ def __init__(self, config, **kwargs):
1491
+ super().__init__(config, without_llm=True, **kwargs)
1492
+ self.num_labels = config.num_labels if hasattr(config, "num_labels") else 2
1493
+ self.model = HCXVisionModel(config=config, **kwargs)
1494
+ self.score = nn.Linear(config.text_config.hidden_size, self.num_labels, bias=False)
1495
+ self.post_init()
1496
+
1497
+ def forward(
1498
+ self,
1499
+ pixel_values: Optional[torch.FloatTensor] = None,
1500
+ input_ids: Optional[torch.LongTensor] = None,
1501
+ attention_mask: Optional[torch.Tensor] = None,
1502
+ position_ids: Optional[torch.LongTensor] = None,
1503
+ past_key_values: Optional[Cache] = None,
1504
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1505
+ labels: Optional[torch.LongTensor] = None,
1506
+ use_cache: Optional[bool] = None,
1507
+ output_attentions: Optional[bool] = None,
1508
+ output_hidden_states: Optional[bool] = None,
1509
+ return_dict: Optional[bool] = True,
1510
+ image_sizes: Optional[List[List[List[int]]]] = None,
1511
+ vision_query_lengths: Optional[List[List[int]]] = None,
1512
+ non_vision_query_lengths: Optional[List[List[int]]] = None,
1513
+ img_start_ids_list: Optional[List[List[int]]] = None,
1514
+ num_queries_vis_abstractors: Optional[List[List[int]]] = None,
1515
+ num_queries_vis_abstractors_slow: Optional[List[List[int]]] = None,
1516
+ first_last_frames_slows: Optional[List[List[bool]]] = None,
1517
+ is_videos: Optional[List[List[bool]]] = None,
1518
+ image_grid_thw: Optional[torch.LongTensor] = None,
1519
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
1520
+ video_grid_thw: Optional[torch.LongTensor] = None,
1521
+ ) -> SequenceClassifierOutputWithPast:
1522
+ """
1523
+ Forward pass for sequence classification.
1524
+ """
1525
+ transformer_outputs: BaseModelOutputWithPast = self.model(
1526
+ pixel_values=pixel_values,
1527
+ input_ids=input_ids,
1528
+ attention_mask=attention_mask,
1529
+ position_ids=position_ids,
1530
+ past_key_values=past_key_values,
1531
+ inputs_embeds=inputs_embeds,
1532
+ use_cache=use_cache,
1533
+ output_attentions=output_attentions,
1534
+ output_hidden_states=output_hidden_states,
1535
+ return_dict=return_dict,
1536
+ image_sizes=image_sizes,
1537
+ vision_query_lengths=vision_query_lengths,
1538
+ non_vision_query_lengths=non_vision_query_lengths,
1539
+ img_start_ids_list=img_start_ids_list,
1540
+ num_queries_vis_abstractors=num_queries_vis_abstractors,
1541
+ num_queries_vis_abstractors_slow=num_queries_vis_abstractors_slow,
1542
+ first_last_frames_slows=first_last_frames_slows,
1543
+ is_videos=is_videos,
1544
+ image_grid_thw=image_grid_thw,
1545
+ pixel_values_videos=pixel_values_videos,
1546
+ video_grid_thw=video_grid_thw,
1547
+ )
1548
+ hidden_states = transformer_outputs.last_hidden_state
1549
+ logits = self.score(hidden_states)
1550
+
1551
+ if input_ids is not None:
1552
+ batch_size = input_ids.shape[0]
1553
+ else:
1554
+ batch_size = inputs_embeds.shape[0]
1555
+
1556
+ if self.config.pad_token_id is None and batch_size != 1:
1557
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
1558
+ if self.config.pad_token_id is None:
1559
+ last_non_pad_token = -1
1560
+ elif input_ids is not None:
1561
+ # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
1562
+ non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
1563
+ token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
1564
+ last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
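# Worked example (editor's note, not part of the committed file): with
# input_ids = [[PAD, a, b, PAD, PAD]], non_pad_mask is [0, 1, 1, 0, 0] and
# token_indices * non_pad_mask is [0, 1, 2, 0, 0], so argmax(-1) returns 2,
# i.e. the rightmost non-pad position, regardless of left or right padding.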
1565
+ else:
1566
+ last_non_pad_token = -1
1567
+
1568
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]
1569
+
1570
+ loss = None
1571
+ if labels is not None:
1572
+ loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)
1573
+
1574
+ return SequenceClassifierOutputWithPast(
1575
+ loss=loss,
1576
+ logits=pooled_logits,
1577
+ past_key_values=transformer_outputs.past_key_values,
1578
+ hidden_states=transformer_outputs.hidden_states,
1579
+ attentions=transformer_outputs.attentions,
1580
+ )
1581
+
1582
+ def save_pretrained(
1583
+ self,
1584
+ save_directory: Union[str, os.PathLike],
1585
+ *args,
1586
+ **kwargs,
1587
+ ):
1588
+ super().register_for_auto_class("AutoModelForSequenceClassification")
1589
+ self.config.register_for_auto_class()
1590
+ super().save_pretrained(save_directory, *args, **kwargs)
1591
+
1592
+
1593
+ class HCXVisionForTokenClassification(HCXVisionPreTrainedModel):
1594
+ """
1595
+ HCX Vision model for token classification tasks (e.g., per-token value prediction for PPO critic).
1596
+ Returns logits for each token instead of pooled output.
1597
+ """
1598
+
1599
+ def __init__(self, config, **kwargs):
1600
+ super().__init__(config, without_llm=True, **kwargs)
1601
+ self.num_labels = config.num_labels if hasattr(config, "num_labels") else 1
1602
+ self.model = HCXVisionModel(config=config, **kwargs)
1603
+
1604
+ # Dropout for regularization
1605
+ if getattr(config, "classifier_dropout", None) is not None:
1606
+ classifier_dropout = config.classifier_dropout
1607
+ elif getattr(config.text_config, "hidden_dropout", None) is not None:
1608
+ classifier_dropout = config.text_config.hidden_dropout
1609
+ else:
1610
+ classifier_dropout = 0.1
1611
+ self.dropout = nn.Dropout(classifier_dropout)
1612
+
1613
+ # Token classification head - projects each token's hidden state to num_labels
1614
+ self.score = nn.Linear(config.text_config.hidden_size, self.num_labels, bias=False)
1615
+ self.post_init()
1616
+
1617
+ def forward(
1618
+ self,
1619
+ pixel_values: Optional[torch.FloatTensor] = None,
1620
+ input_ids: Optional[torch.LongTensor] = None,
1621
+ attention_mask: Optional[torch.Tensor] = None,
1622
+ position_ids: Optional[torch.LongTensor] = None,
1623
+ past_key_values: Optional[Cache] = None,
1624
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1625
+ labels: Optional[torch.LongTensor] = None,
1626
+ use_cache: Optional[bool] = None,
1627
+ output_attentions: Optional[bool] = None,
1628
+ output_hidden_states: Optional[bool] = None,
1629
+ return_dict: Optional[bool] = True,
1630
+ image_sizes: Optional[List[List[List[int]]]] = None,
1631
+ vision_query_lengths: Optional[List[List[int]]] = None,
1632
+ non_vision_query_lengths: Optional[List[List[int]]] = None,
1633
+ img_start_ids_list: Optional[List[List[int]]] = None,
1634
+ num_queries_vis_abstractors: Optional[List[List[int]]] = None,
1635
+ num_queries_vis_abstractors_slow: Optional[List[List[int]]] = None,
1636
+ first_last_frames_slows: Optional[List[List[bool]]] = None,
1637
+ is_videos: Optional[List[List[bool]]] = None,
1638
+ image_grid_thw: Optional[torch.LongTensor] = None,
1639
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
1640
+ video_grid_thw: Optional[torch.LongTensor] = None,
1641
+ ) -> TokenClassifierOutput:
1642
+ """
1643
+ Forward pass for token classification.
1644
+
1645
+ Returns:
1646
+ TokenClassifierOutput with logits of shape [batch_size, sequence_length, num_labels]
1647
+ """
1648
+ transformer_outputs: BaseModelOutputWithPast = self.model(
1649
+ pixel_values=pixel_values,
1650
+ input_ids=input_ids,
1651
+ attention_mask=attention_mask,
1652
+ position_ids=position_ids,
1653
+ past_key_values=past_key_values,
1654
+ inputs_embeds=inputs_embeds,
1655
+ use_cache=use_cache,
1656
+ output_attentions=output_attentions,
1657
+ output_hidden_states=output_hidden_states,
1658
+ return_dict=return_dict,
1659
+ image_sizes=image_sizes,
1660
+ vision_query_lengths=vision_query_lengths,
1661
+ non_vision_query_lengths=non_vision_query_lengths,
1662
+ img_start_ids_list=img_start_ids_list,
1663
+ num_queries_vis_abstractors=num_queries_vis_abstractors,
1664
+ num_queries_vis_abstractors_slow=num_queries_vis_abstractors_slow,
1665
+ first_last_frames_slows=first_last_frames_slows,
1666
+ is_videos=is_videos,
1667
+ image_grid_thw=image_grid_thw,
1668
+ pixel_values_videos=pixel_values_videos,
1669
+ video_grid_thw=video_grid_thw,
1670
+ )
1671
+
1672
+ # Get hidden states for all tokens
1673
+ hidden_states = transformer_outputs.last_hidden_state # [batch_size, seq_len, hidden_size]
1674
+
1675
+ # Project to num_labels for each token
1676
+ logits = self.score(hidden_states) # [batch_size, seq_len, num_labels]
1677
+
1678
+ return TokenClassifierOutput(
1679
+ loss=None,
1680
+ logits=logits, # [batch_size, seq_len, num_labels]: per-token logits, not pooled
1681
+ hidden_states=transformer_outputs.hidden_states,
1682
+ attentions=transformer_outputs.attentions,
1683
+ )
1684
+
1685
+ def save_pretrained(
1686
+ self,
1687
+ save_directory: Union[str, os.PathLike],
1688
+ *args,
1689
+ **kwargs,
1690
+ ):
1691
+ super().register_for_auto_class("AutoModelForTokenClassification")
1692
+ self.config.register_for_auto_class()
1693
+ super().save_pretrained(save_directory, *args, **kwargs)
1694
+
1695
+
1696
+
1697
+ class VLM_Mlp(nn.Module):
1698
+ """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
1699
+
1700
+ def __init__(
1701
+ self,
1702
+ mm_projector_type,
1703
+ in_features,
1704
+ hidden_features=None,
1705
+ out_features=None,
1706
+ act_layer=nn.GELU,
1707
+ ):
1708
+ super().__init__()
1709
+ out_features = out_features or in_features
1710
+ hidden_features = hidden_features or in_features
1711
+ self.mm_projector_type = mm_projector_type
1712
+ if self.mm_projector_type == "mlp":
1713
+ self.fc1 = nn.Linear(in_features, hidden_features)
1714
+ self.act = act_layer()
1715
+ self.fc2 = nn.Linear(hidden_features, out_features)
1716
+ elif self.mm_projector_type == "inverted_mlp":
1717
+ self.fc1 = nn.Linear(in_features, 2 * hidden_features)
1718
+ self.act = act_layer()
1719
+ self.fc2 = nn.Linear(2 * hidden_features, out_features)
1720
+ else:
1721
+ raise NotImplementedError("{} is not implemented".format(self.mm_projector_type))
1722
+
1723
+ def forward(self, x):
1724
+ x = self.fc1(x)
1725
+ x = self.act(x)
1726
+ x = self.fc2(x)
1727
+ return x
1728
+
1729
+
1730
+ class Projector(nn.Module):
1731
+ """Base projector class"""
1732
+
1733
+ def __init__(
1734
+ self,
1735
+ num_queries: int,
1736
+ num_input_tokens: int,
1737
+ encoder_hidden_size: int,
1738
+ hidden_size: int,
1739
+ output_hidden_size: int,
1740
+ pos_emb=True,
1741
+ prenorm=False,
1742
+ ):
1743
+ super().__init__()
1744
+ self.num_input_tokens = num_input_tokens
1745
+ self.output_hidden_size = output_hidden_size
1746
+
1747
+ # pos emb
1748
+ if pos_emb:
1749
+ self.pos_emb = torch.nn.Parameter(torch.zeros(1, num_input_tokens, encoder_hidden_size))
1750
+ # nn.init.trunc_normal_(self.pos_emb, mean=0.0, std=0.02)
1751
+ self.pos_emb.data.normal_(mean=0.0, std=0.02)
1752
+ else:
1753
+ self.pos_emb = None
1754
+
1755
+ if prenorm:
1756
+ self.prenorm = LayerNorm(encoder_hidden_size)
1757
+ else:
1758
+ self.prenorm = None
1759
+
1760
+ self.build_net(num_queries, encoder_hidden_size, hidden_size, output_hidden_size)
1761
+
1762
+ def build_net(self):
1763
+ raise NotImplementedError()
1764
+
1765
+ def _forward(
1766
+ self,
1767
+ x,
1768
+ num_queries_vis_abstractors: Optional[List[int]] = None,
1769
+ num_grids: Optional[List[int]] = None,
1770
+ freeze_before_sampler: bool = False,
1771
+ ):
1772
+ raise NotImplementedError()
1773
+
1774
+ def forward(
1775
+ self,
1776
+ x: torch.Tensor,
1777
+ num_queries_vis_abstractors: Optional[List[int]] = None,
1778
+ num_grids: Optional[List[int]] = None,
1779
+ freeze_before_sampler: bool = False,
1780
+ ) -> torch.Tensor:
1781
+ """
1782
+ Args:
1783
+ x: (B, L, encoder_hidden_size) tensor from the visual backbone (CLIP visual encoder), including cls token.
1784
+ """
1785
+ if self.prenorm is not None:
1786
+ x = self.prenorm(x)
1787
+
1788
+ if self.pos_emb is not None:
1789
+ x = x + self.pos_emb
1790
+
1791
+ x = self._forward(
1792
+ x,
1793
+ num_queries_vis_abstractors=num_queries_vis_abstractors,
1794
+ num_grids=num_grids,
1795
+ freeze_before_sampler=freeze_before_sampler,
1796
+ ) # (B, L, output_hidden_size)
1797
+
1798
+ return x
1799
+
1800
+
1801
+ class ConvProjector(Projector):
1802
+ def _forward(
1803
+ self,
1804
+ x,
1805
+ num_queries_vis_abstractors: Optional[List[int]] = None,
1806
+ num_grids: Optional[List[int]] = None,
1807
+ freeze_before_sampler: bool = False,
1808
+ ):
1809
+ # x: [B, L, dim]
1810
+ hw = int(x.size(1) ** 0.5)
1811
+ x = rearrange(x, "b (h w) d -> b d h w", h=hw, w=hw)
1812
+
1813
+ if num_queries_vis_abstractors is not None:
1814
+ assert num_grids is not None
1815
+
1816
+ return self._forward_adaptive_num_query(x, num_queries_vis_abstractors, num_grids, freeze_before_sampler)
1817
+
1818
+ if freeze_before_sampler:
1819
+ with torch.no_grad():
1820
+ x = self.net[0](x)
1821
+ x = self.net[1](x)
1822
+ x = self.net[2](x)
1823
+ else:
1824
+ x = self.net(x)
1825
+ x = rearrange(x, "b d h w -> b (h w) d")
1826
+ x = self.readout(x)
1827
+
1828
+ return x
1829
+
1830
+ def _forward_adaptive_num_query(
1831
+ self,
1832
+ x,
1833
+ num_queries_vis_abstractors: Optional[List[int]] = None,
1834
+ num_grids: Optional[List[int]] = None,
1835
+ freeze_before_sampler: bool = False,
1836
+ ):
1837
+ # self.net consists of three stages (s1, sampler, s2)
1838
+ # here the sampler (self.net[1]) is replaced with adaptive pooling sized per grid
1839
+ assert len(self.net) == 3
1840
+
1841
+ if freeze_before_sampler:
1842
+ with torch.no_grad():
1843
+ x = self.net[0](x)
1844
+ else:
1845
+ x = self.net[0](x)
1846
+
1847
+ new_x = []
1848
+ for i, num_queries in enumerate(num_queries_vis_abstractors):
1849
+ hw = int(num_queries**0.5)
1850
+ sampler = nn.AdaptiveAvgPool2d((hw, hw))
1851
+ out = sampler(x[num_grids[i] : num_grids[i + 1], :])
1852
+ out = self.net[2](out)
1853
+
1854
+ out = rearrange(out, "b d h w -> b (h w) d")
1855
+ out = self.readout(out)
1856
+
1857
+ new_x.append(out)
1858
+
1859
+ return new_x
1860
+
1861
+
1862
+ class CAbstractor(ConvProjector):
1863
+ """C-Abstractor"""
1864
+
1865
+ def build_net(self, n_queries, encoder_hidden_size, hidden_size, output_hidden_size, depth=3, mlp_depth=2):
1866
+ assert (n_queries**0.5).is_integer(), "n_queries must be square number"
1867
+ hw = int(n_queries**0.5)
1868
+
1869
+ # RegBlock = ResBlock + SE
1870
+ RegBlock = partial(
1871
+ RegStage,
1872
+ stride=1,
1873
+ dilation=1,
1874
+ act_layer=nn.SiLU,
1875
+ norm_layer=LayerNorm2d,
1876
+ )
1877
+
1878
+ s1 = RegBlock(
1879
+ depth,
1880
+ encoder_hidden_size,
1881
+ hidden_size,
1882
+ )
1883
+ sampler = nn.AdaptiveAvgPool2d((hw, hw))
1884
+ s2 = RegBlock(
1885
+ depth,
1886
+ hidden_size,
1887
+ hidden_size,
1888
+ )
1889
+
1890
+ self.net = nn.Sequential(s1, sampler, s2)
1891
+
1892
+ self.readout = self.build_mlp(mlp_depth, hidden_size, output_hidden_size)
1893
+
1894
+ def build_mlp(self, depth, hidden_size, output_hidden_size):
1895
+ layers = [nn.Linear(hidden_size, output_hidden_size)]
1896
+ for _ in range(1, depth):
1897
+ layers.append(nn.SiLU())
1898
+ layers.append(nn.Linear(output_hidden_size, output_hidden_size))
1899
+ return nn.Sequential(*layers)
1900
+
1901
+
1902
+ AutoConfig.register("vlm", HCXVisionConfig)
1903
+ try:
1904
+ from .configuration_hyperclovax import HyperCLOVAXConfig
1905
+ from .modeling_hyperclovax import HyperCLOVAXForCausalLM
1906
+
1907
+ AutoConfig.register("hyperclovax", HyperCLOVAXConfig)
1908
+ AutoModelForCausalLM.register(
1909
+ HyperCLOVAXConfig,
1910
+ HyperCLOVAXForCausalLM,
1911
+ )
1912
+ except Exception: # the optional hyperclovax modules may be missing or already registered
1913
+ pass
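Editor's note: `save_pretrained` above registers `modeling_vlm.HCXVisionForCausalLM` and `modeling_vlm.HCXVisionForSequenceClassification` in the config's `auto_map`, so a checkpoint saved with this code is meant to be loaded through the Auto classes with remote code enabled. A minimal loading sketch; the path is a placeholder, not a real repository id:

import torch
from transformers import AutoModelForCausalLM, AutoProcessor

model = AutoModelForCausalLM.from_pretrained(
    "<path-or-repo-id>",      # placeholder for the saved checkpoint directory or Hub repo
    trust_remote_code=True,   # needed so the repo's modeling_vlm / processing_vlm code is used
    torch_dtype=torch.bfloat16,
)
processor = AutoProcessor.from_pretrained("<path-or-repo-id>", trust_remote_code=True)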
preprocessor_config.json ADDED
@@ -0,0 +1,32 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_vlm.HCXVisionV2Processor"
4
+ },
5
+ "do_convert_rgb": true,
6
+ "do_normalize": true,
7
+ "do_rescale": true,
8
+ "do_resize": true,
9
+ "image_mean": [
10
+ 0.48145466,
11
+ 0.4578275,
12
+ 0.40821073
13
+ ],
14
+ "image_processor_type": "Qwen2VLImageProcessor",
15
+ "image_std": [
16
+ 0.26862954,
17
+ 0.26130258,
18
+ 0.27577711
19
+ ],
20
+ "max_pixels": 2073600,
21
+ "merge_size": 2,
22
+ "min_pixels": 3136,
23
+ "patch_size": 14,
24
+ "processor_class": "HCXVisionV2Processor",
25
+ "resample": 3,
26
+ "rescale_factor": 0.00392156862745098,
27
+ "size": {
28
+ "longest_edge": 2073600,
29
+ "shortest_edge": 3136
30
+ },
31
+ "temporal_patch_size": 2
32
+ }
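Editor's note: the `patch_size`, `merge_size`, `min_pixels`, and `max_pixels` fields above control how many image tokens the language model receives per image with this Qwen2VL-style image processor. A minimal sketch of that relationship (an illustration with an assumed helper name, not code from this repository):

def approx_image_tokens(resized_height: int, resized_width: int,
                        patch_size: int = 14, merge_size: int = 2) -> int:
    # The processor has already clamped total pixels to [min_pixels=3136, max_pixels=2073600]
    # and snapped the sides to multiples of patch_size * merge_size before this point.
    patches = (resized_height // patch_size) * (resized_width // patch_size)
    # The spatial merge turns every merge_size x merge_size patch block into one token.
    return patches // (merge_size * merge_size)

# e.g. approx_image_tokens(1092, 1932) == 78 * 138 // 4 == 2691 image tokens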
processing_vlm.py ADDED
@@ -0,0 +1,823 @@
1
+ import copy
2
+ import math
3
+ import os
4
+ from typing import Dict, List, Optional, Union
5
+
6
+ import numpy as np
7
+ import torch
8
+ from PIL import Image
9
+ from transformers import Qwen2_5_VLProcessor
10
+ from transformers.image_processing_utils import (
11
+ BaseImageProcessor,
12
+ BatchFeature,
13
+ get_size_dict,
14
+ )
15
+ from transformers.image_transforms import (
16
+ convert_to_rgb,
17
+ get_resize_output_image_size,
18
+ resize,
19
+ to_channel_dimension_format,
20
+ )
21
+ from transformers.image_utils import (
22
+ OPENAI_CLIP_MEAN,
23
+ OPENAI_CLIP_STD,
24
+ ChannelDimension,
25
+ ImageInput,
26
+ PILImageResampling,
27
+ get_image_size,
28
+ infer_channel_dimension_format,
29
+ is_scaled_image,
30
+ make_list_of_images,
31
+ to_numpy_array,
32
+ valid_images,
33
+ )
34
+ from transformers.models.qwen2_5_vl.processing_qwen2_5_vl import (
35
+ Qwen2_5_VLProcessorKwargs,
36
+ )
37
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
38
+ from transformers.utils import TensorType, logging
39
+ from transformers.video_utils import VideoInput
40
+ from typing_extensions import Unpack
41
+
42
+ logger = logging.get_logger(__name__)
43
+
44
+
45
+ def determine_possible_resolutions(anyres: bool, max_num_grids: int, grid_size: int, use_1x1_grid: bool = False):
46
+ """Find and return all possible resolution combinations with at most max_num_grids grids in total.
47
+ For example, with max_num_grids=4 the possible grid layouts are [1x1, 1x2, 1x3, 1x4, 2x1, 2x2, 3x1, 4x1], so the result is computed as follows.
48
+ >>> possible_resolutions = determine_possible_resolutions(anyres=True, max_num_grids=4, grid_size=336)
49
+ >>> print(possible_resolutions)
50
+ [[336, 336], [336, 672], [336, 1008], [336, 1344], [672, 336], [672, 672], [1008, 336], [1344, 336]]
51
+ """
52
+ possible_resolutions = []
53
+ if anyres:
54
+ assert max_num_grids > 0
55
+ for i in range(1, max_num_grids + 1):
56
+ for j in range(1, max_num_grids + 1):
57
+ if i == 1 and j == 1 and not use_1x1_grid:
58
+ continue
59
+ if i * j <= max_num_grids:
60
+ possible_resolutions.append([i, j])
61
+
62
+ possible_resolutions = [[ys * grid_size, xs * grid_size] for ys, xs in possible_resolutions]
63
+
64
+ return possible_resolutions
65
+
66
+
67
+ def divide_to_grids(image: np.array, grid_size: int, input_data_format=None) -> List[np.array]:
68
+ """Divide a local image into (grid_size x grid_size) grids."""
69
+ grids = []
70
+ height, width = get_image_size(image, channel_dim=input_data_format)
71
+ for i in range(0, height, grid_size):
72
+ for j in range(0, width, grid_size):
73
+ if input_data_format == ChannelDimension.LAST:
74
+ grid = image[i : i + grid_size, j : j + grid_size]
75
+ else:
76
+ grid = image[:, i : i + grid_size, j : j + grid_size]
77
+ grids.append(grid)
78
+
79
+ return grids
80
+
81
+
82
+ def pad(image: np.array, target_size: tuple, background_color=(127, 127, 127), input_data_format=None) -> np.array:
83
+ """Center the image on a (target_height, target_width) canvas, padding the surrounding area."""
84
+ target_height, target_width = target_size
85
+ height, width = get_image_size(image, channel_dim=input_data_format)
86
+
87
+ # result = np.ones((target_height, target_width, image.shape[2]), dtype=image.dtype) * background_color
88
+ result = np.empty((target_height, target_width, image.shape[2]), dtype=image.dtype)
89
+ for i in range(image.shape[2]):
90
+ result[..., i].fill(background_color[i])
91
+
92
+ paste_x = (target_width - width) // 2
93
+ paste_y = (target_height - height) // 2
94
+
95
+ result[paste_y : paste_y + height, paste_x : paste_x + width, :] = image
96
+
97
+ return result
98
+
99
+
100
+ def expand2square(
101
+ image: np.array, bboxes_dict=None, background_color=(127, 127, 127), input_data_format=None
102
+ ) -> np.array:
103
+ """
104
+ 새로운 canvas 를 만들어 두고, 거기에 이미지를 붙여넣는 방식으로 이미지를 정사각형으로 만드는 함수
105
+ 유의할 사항은, 이미지를 붙여 넣을 때 중앙으로 붙여넣는다는 점. 양옆 또는 위아래로 PADDING 이 들어가는 형태
106
+ Args:
107
+ pil_img: numpy array
108
+ bboxes_dict: dict, {"ocr": NDArray shape (N, 4, 2), "html": NDArray shape (N, 4, 2), ... }
109
+ `[[xtl, ytl], [xtr, ytr], [xbr, ybr], [xbl, ybl]]` 형태로 박스 형태는 통일. OCR, HTML 등 다양한 박스들을 한번에 처리 가능
110
+ background_color: tuple, RGB
111
+ # >>> _img = np.ones((80, 100), dtype=np.uint8) * 100
112
+ # >>> _bboxes_dict = {"words": np.array([[[10, 10], [20, 10], [20, 20], [10, 20]],
113
+ # ... [[30, 30], [40, 30], [40, 40], [30, 40]]])}
114
+ # >>> _img, _bboxes_dict = expand2square(_img, _bboxes_dict, (255, 255, 255))
115
+ # >>> _img.shape
116
+ # (100, 100)
117
+ # >>> guessed_ocr_bboxes = np.array([[[20, 10], [30, 10], [30, 20], [20, 20]],
118
+ # ... [[40, 30], [50, 30], [50, 40], [40, 40]]])
119
+ # >>> np.testing.assert_array_almost_equal(_bboxes_dict["words"], guessed_ocr_bboxes) is None
120
+ # True
121
+ """
122
+ height, width = get_image_size(image, channel_dim=input_data_format)
123
+ if width == height:
124
+ return image, bboxes_dict
125
+ elif width > height:
126
+ # result = np.ones((width, width, image.shape[2]), dtype=image.dtype) * background_color
127
+ result = np.empty((width, width, image.shape[2]), dtype=image.dtype)
128
+ for i in range(image.shape[2]):
129
+ result[..., i].fill(background_color[i])
130
+
131
+ result[(width - height) // 2 : (width - height) // 2 + height, :] = image
132
+ if bboxes_dict is not None:
133
+ for key in bboxes_dict:
134
+ bboxes_dict[key][:, :, 1] += (width - height) // 2
135
+ return result, bboxes_dict
136
+ else:
137
+ # result = np.ones((height, height, image.shape[2]), dtype=image.dtype) * background_color
138
+ result = np.empty((height, height, image.shape[2]), dtype=image.dtype)
139
+ for i in range(image.shape[2]):
140
+ result[..., i].fill(background_color[i])
141
+
142
+ result[:, (height - width) // 2 : (height - width) // 2 + width] = image
143
+ if bboxes_dict is not None:
144
+ for key in bboxes_dict:
145
+ bboxes_dict[key][:, :, 0] += (height - width) // 2
146
+ return result, bboxes_dict
147
+
148
+
149
+ def resize_longside(
150
+ image: np.array,
151
+ size: int,
152
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
153
+ data_format: Optional[Union[str, ChannelDimension]] = None,
154
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
155
+ ):
156
+ """
157
+ 장축 길이를 size 에 맞게 resize
158
+ """
159
+ height, width = get_image_size(image, channel_dim=input_data_format)
160
+
161
+ if width == height:
162
+ target_height, target_width = size, size
163
+ elif width > height:
164
+ target_width = size
165
+ target_height = math.ceil(height / width * size)
166
+ else:
167
+ target_width = math.ceil(width / height * size)
168
+ target_height = size
169
+
170
+ return resize(
171
+ image,
172
+ size=(target_height, target_width),
173
+ resample=resample,
174
+ data_format=data_format,
175
+ input_data_format=input_data_format,
176
+ )
177
+
178
+
179
+ def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
180
+ """From LLaVA-Next (https://github.com/huggingface/transformers/blob/v4.40.2/src/transformers/models/llava_next/image_processing_llava_next.py)
181
+ Selects the best resolution from a list of possible resolutions based on the original size.
182
+ This is done by calculating the effective and wasted resolution for each possible resolution.
183
+ The best fit resolution is the one that maximizes the effective resolution and minimizes the wasted resolution.
184
+
185
+ Args:
186
+ original_size (tuple):
187
+ The original size of the image in the format (height, width).
188
+ possible_resolutions (list):
189
+ A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].
190
+
191
+ Returns:
192
+ tuple: The best fit resolution in the format (height, width).
193
+ """
194
+ original_height, original_width = original_size
195
+ best_fit = None
196
+ max_effective_resolution = 0
197
+ min_wasted_resolution = float("inf")
198
+
199
+ for height, width in possible_resolutions:
200
+ scale = min(width / original_width, height / original_height)
201
+ downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
202
+ effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
203
+ wasted_resolution = (width * height) - effective_resolution
204
+
205
+ if effective_resolution > max_effective_resolution or (
206
+ effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution
207
+ ):
208
+ max_effective_resolution = effective_resolution
209
+ min_wasted_resolution = wasted_resolution
210
+ best_fit = (height, width)
211
+
212
+ return best_fit
213
+
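# Worked example (editor's note, not part of the committed file): for a 600x800 (h, w) image and
# candidates [[336, 672], [672, 336], [672, 672]], the 672x672 canvas preserves the most effective
# pixels (the image downscales to 504x672 inside it), which is the primary criterion, so
# select_best_resolution((600, 800), [[336, 672], [672, 336], [672, 672]]) returns (672, 672).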
214
+
215
+ def _get_local_grids_output_size(image: np.array, target_resolution: tuple, input_data_format=None):
216
+ original_height, original_width = get_image_size(image, channel_dim=input_data_format)
217
+ target_height, target_width = target_resolution
218
+
219
+ scale_w = target_width / original_width
220
+ scale_h = target_height / original_height
221
+
222
+ if scale_w < scale_h:
223
+ new_width = target_width
224
+ new_height = min(math.ceil(original_height * scale_w), target_height)
225
+ else:
226
+ new_height = target_height
227
+ new_width = min(math.ceil(original_width * scale_h), target_width)
228
+
229
+ return new_height, new_width
230
+
231
+
232
+ def determine_anyres_num_vision_patches(
233
+ num_grids,
234
+ image_size,
235
+ grid_size,
236
+ patch_size,
237
+ possible_resolutions,
238
+ anyres=False,
239
+ unpad=True,
240
+ num_queries_vis_abstractor=0,
241
+ num_queries_vis_abstractor_slow=0,
242
+ video=False,
243
+ first_last_frames_slow=False,
244
+ is_first_or_last_frames=False,
245
+ ):
246
+ """visual tokens 수를 계산해주는 함수"""
247
+ if not anyres:
248
+ return num_queries_vis_abstractor if num_queries_vis_abstractor > 0 else (grid_size // patch_size) ** 2
249
+
250
+ if num_queries_vis_abstractor > 0:
251
+ num_patch_per_grid = int(num_queries_vis_abstractor**0.5)
252
+ else:
253
+ num_patch_per_grid = grid_size // patch_size
254
+
255
+ num_global_per_grid = num_patch_per_grid
256
+
257
+ # anyres는 global image가 있어서 2개 이상이지만, video에는 global image가 없어서, 1개가 들어올 수 있어서 주석 처리
258
+ # assert num_grids > 1
259
+
260
+ # patch 수 계산
261
+ height, width = select_best_resolution(image_size, possible_resolutions)
262
+
263
+ num_patch_height = (height // grid_size) * num_patch_per_grid
264
+ num_patch_width = (width // grid_size) * num_patch_per_grid
265
+
266
+ # local images
267
+ if unpad:
268
+ original_height, original_width = image_size
269
+
270
+ original_aspect_ratio = original_width / original_height
271
+ current_aspect_ratio = num_patch_width / num_patch_height
272
+
273
+ if original_aspect_ratio > current_aspect_ratio:
274
+ scale_factor = num_patch_width / original_width
275
+ new_height = int(original_height * scale_factor)
276
+ padding = (num_patch_height - new_height) // 2
277
+ num_patch_height = num_patch_height - padding * 2
278
+ else:
279
+ scale_factor = num_patch_height / original_height
280
+ new_width = int(original_width * scale_factor)
281
+ padding = (num_patch_width - new_width) // 2
282
+ num_patch_width = num_patch_width - padding * 2
283
+
284
+ num_patches = num_patch_width * num_patch_height + num_patch_height
285
+ else:
286
+ num_patches = num_patch_width * num_patch_height
287
+
288
+ # slow는 첫프레임 마지막 프레임 적용 전략일때는 첫프레임과 마지막 프레임만 적용
289
+ if num_queries_vis_abstractor_slow > 0:
290
+ if first_last_frames_slow:
291
+ if is_first_or_last_frames:
292
+ num_patches += num_queries_vis_abstractor_slow - num_queries_vis_abstractor
293
+ else:
294
+ num_patches += num_queries_vis_abstractor_slow - num_queries_vis_abstractor
295
+ # slowfast 기능은 unpad False 에만 적용
296
+ assert unpad is False
297
+
298
+ # video 에는 global image 가 포함되지 않음
299
+ if not video:
300
+ num_patches += num_global_per_grid**2
301
+
302
+ return num_patches
303
+
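# Quick sanity check (editor's note, not part of the committed file): with anyres=False, no
# resampler (num_queries_vis_abstractor == 0), grid_size=336 and patch_size=14, a single image
# yields (336 // 14) ** 2 == 576 visual tokens via the early-return branch above.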
304
+
305
+ class HCXVisionImageProcessor(BaseImageProcessor):
306
+ r"""
307
+ Constructs a VLM image processor. Based on [`CLIPImageProcessor`], incorporating additional techniques for processing high-resolution images.
308
+
309
+ Args:
310
+ anyres: (bool) whether to enable the anyres feature
311
+ unpad: (bool) when anyres is used, whether to enable unpadding (visual tokens that correspond to pure padding are dropped from the LLM input)
312
+ num_queries_vis_abstractor: (int) number of visual queries per grid when a resampler is used
313
+ possible_resolutions: (List) candidate resolutions when anyres is used, e.g. [[336, 336], [336, 672], [672, 336]]
314
+ patch_size: (int) ViT patch size
315
+ pad_to_square: (bool) whether to pad the image to a square; if False the image is not square, so it is center-cropped before being fed to the ViT
316
+ """
317
+
318
+ model_input_names = ["pixel_values"]
319
+
320
+ def __init__(
321
+ self,
322
+ do_resize: bool = True,
323
+ size: Dict[str, int] = None,
324
+ anyres: bool = False,
325
+ unpad: bool = False,
326
+ num_queries_vis_abstractor: int = 0,
327
+ possible_resolutions: List = [],
328
+ patch_size: int = 14,
329
+ pad_to_square: bool = True,
330
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
331
+ do_center_crop: bool = True,
332
+ crop_size: Dict[str, int] = None,
333
+ do_rescale: bool = True,
334
+ rescale_factor: Union[int, float] = 1 / 255,
335
+ do_normalize: bool = True,
336
+ image_mean: Optional[Union[float, List[float]]] = None,
337
+ image_std: Optional[Union[float, List[float]]] = None,
338
+ do_convert_rgb: bool = True,
339
+ **kwargs,
340
+ ) -> None:
341
+ super().__init__(**kwargs)
342
+ size = size if size is not None else {"shortest_edge": 336}
343
+ size = get_size_dict(size, default_to_square=False)
344
+ crop_size = crop_size if crop_size is not None else {"height": 336, "width": 336}
345
+ crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
346
+
347
+ self.do_resize = do_resize
348
+ self.size = size
349
+ self.anyres = anyres
350
+ self.unpad = unpad
351
+ self.num_queries_vis_abstractor = num_queries_vis_abstractor
352
+ self.possible_resolutions = [_resolution for _resolution in possible_resolutions]
353
+ self.patch_size = patch_size
354
+ self.pad_to_square = pad_to_square
355
+ self.resample = resample
356
+ self.do_center_crop = do_center_crop
357
+ self.crop_size = crop_size
358
+ self.do_rescale = do_rescale
359
+ self.rescale_factor = rescale_factor
360
+ self.do_normalize = do_normalize
361
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
362
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
363
+ self.do_convert_rgb = do_convert_rgb
364
+
365
+ def resize(
366
+ self,
367
+ image: np.ndarray,
368
+ size: Dict[str, int],
369
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
370
+ data_format: Optional[Union[str, ChannelDimension]] = None,
371
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
372
+ **kwargs,
373
+ ) -> np.ndarray:
374
+ default_to_square = True
375
+ if "shortest_edge" in size:
376
+ size = size["shortest_edge"]
377
+ default_to_square = False
378
+ elif "height" in size and "width" in size:
379
+ size = (size["height"], size["width"])
380
+ else:
381
+ raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")
382
+
383
+ output_size = get_resize_output_image_size(
384
+ image,
385
+ size=size,
386
+ default_to_square=default_to_square,
387
+ input_data_format=input_data_format,
388
+ )
389
+
390
+ return resize(
391
+ image,
392
+ size=output_size,
393
+ resample=resample,
394
+ data_format=data_format,
395
+ input_data_format=input_data_format,
396
+ **kwargs,
397
+ )
398
+
399
+ def _preprocess(
400
+ self,
401
+ images: ImageInput,
402
+ do_resize: bool = None,
403
+ size: Dict[str, int] = None,
404
+ resample: PILImageResampling = None,
405
+ do_center_crop: bool = None,
406
+ crop_size: int = None,
407
+ do_rescale: bool = None,
408
+ rescale_factor: float = None,
409
+ do_normalize: bool = None,
410
+ image_mean: Optional[Union[float, List[float]]] = None,
411
+ image_std: Optional[Union[float, List[float]]] = None,
412
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
413
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
414
+ ) -> Image.Image:
415
+ images = make_list_of_images(images)
416
+
417
+ if do_resize:
418
+ images = [
419
+ self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
420
+ for image in images
421
+ ]
422
+
423
+ if do_center_crop:
424
+ images = [
425
+ self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images
426
+ ]
427
+
428
+ if do_rescale:
429
+ images = [
430
+ self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) for image in images
431
+ ]
432
+
433
+ if do_normalize:
434
+ images = [
435
+ self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
436
+ for image in images
437
+ ]
438
+
439
+ images = [
440
+ to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
441
+ ]
442
+
443
+ return images
444
+
445
+ def _resize_for_local_grids(
446
+ self, image: np.array, target_resolution: tuple, resample, input_data_format: ChannelDimension
447
+ ) -> np.array:
448
+ new_height, new_width = _get_local_grids_output_size(image, target_resolution, input_data_format)
449
+
450
+ # Resize the image
451
+ resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format)
452
+
453
+ return resized_image
454
+
455
+ def _pad_for_patching(
456
+ self, image: np.array, target_resolution: tuple, input_data_format: ChannelDimension
457
+ ) -> np.array:
458
+ """
459
+ Pad an image to a target resolution while maintaining aspect ratio.
460
+ """
461
+ target_height, target_width = target_resolution
462
+
463
+ background_color = tuple(int(x * 255) for x in self.image_mean)
464
+ padded_image = pad(
465
+ image,
466
+ target_size=(target_height, target_width),
467
+ background_color=background_color,
468
+ input_data_format=input_data_format,
469
+ )
470
+
471
+ return padded_image
472
+
473
+ def get_image_grids(
474
+ self,
475
+ image: np.array,
476
+ possible_resolutions,
477
+ grid_size: int,
478
+ resample: PILImageResampling,
479
+ data_format: ChannelDimension,
480
+ input_data_format: ChannelDimension,
481
+ ) -> List[np.array]:
482
+ if not isinstance(possible_resolutions, list):
483
+ raise ValueError("possible_resolutions must be a list of possible resolutions.")
484
+
485
+ image_size = get_image_size(image, channel_dim=input_data_format)
486
+ best_resolution = select_best_resolution(image_size, possible_resolutions)
487
+ resized_image = self._resize_for_local_grids(
488
+ image, best_resolution, resample=resample, input_data_format=input_data_format
489
+ )
490
+ padded_image = self._pad_for_patching(resized_image, best_resolution, input_data_format=input_data_format)
491
+ local_grids = divide_to_grids(padded_image, grid_size=grid_size, input_data_format=input_data_format)
492
+
493
+ # make sure that all patches are in the input data format
494
+ local_grids = [
495
+ to_channel_dimension_format(grid, channel_dim=data_format, input_channel_dim=input_data_format)
496
+ for grid in local_grids
497
+ ]
498
+
499
+ return local_grids
500
+
501
+ def preprocess(
502
+ self,
503
+ images: ImageInput,
504
+ do_resize: bool = None,
505
+ size: Dict[str, int] = None,
506
+ anyres: bool = None,
507
+ unpad: bool = None,
508
+ video: bool = None,
509
+ num_queries_vis_abstractor: int = None,
510
+ possible_resolutions: List = None,
511
+ patch_size: int = None,
512
+ pad_to_square: bool = None,
513
+ resample: PILImageResampling = None,
514
+ do_center_crop: bool = None,
515
+ crop_size: int = None,
516
+ do_rescale: bool = None,
517
+ rescale_factor: float = None,
518
+ do_normalize: bool = None,
519
+ image_mean: Optional[Union[float, List[float]]] = None,
520
+ image_std: Optional[Union[float, List[float]]] = None,
521
+ do_convert_rgb: bool = None,
522
+ return_tensors: Optional[Union[str, TensorType]] = None,
523
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
524
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
525
+ return_dummy_image: bool = False,
526
+ num_queries_vis_abstractor_slow: int = 0,
527
+ first_last_frames_slow: bool = False,
528
+ is_first_or_last_frames: bool = False,
529
+ ):
530
+ """
531
+ HCXVisionImageProcessor 로 image tensor, original image size (width, height), visual tokens
532
+
533
+ :return pixel_values: List of 4D tensor 로 image tensor
534
+ :return image_sizes: List of Dict 로 image width, height [{"width": image 1 의 width, "height": image 1 의 height}, {"width": image 2 의 width, "height": image 2 의 height}, ...]
535
+ :return vision_query_lengths: List of int 로 각 image 가 LLM 입력으로 전달될때 변환되는 visual token 수
536
+ """
537
+ do_resize = do_resize if do_resize is not None else self.do_resize
538
+ size = size if size is not None else self.size
539
+ size = get_size_dict(size, param_name="size", default_to_square=False)
540
+ anyres = anyres if anyres is not None else self.anyres
541
+ unpad = unpad if unpad is not None else self.unpad
542
+ if video:
543
+ unpad = False
544
+ num_queries_vis_abstractor = (
545
+ num_queries_vis_abstractor if num_queries_vis_abstractor is not None else self.num_queries_vis_abstractor
546
+ )
547
+ possible_resolutions = possible_resolutions if possible_resolutions is not None else self.possible_resolutions
548
+ patch_size = patch_size if patch_size is not None else self.patch_size
549
+ pad_to_square = pad_to_square if pad_to_square is not None else self.pad_to_square
550
+ resample = resample if resample is not None else self.resample
551
+ do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
552
+ crop_size = crop_size if crop_size is not None else self.crop_size
553
+ crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True)
554
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
555
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
556
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
557
+ image_mean = image_mean if image_mean is not None else self.image_mean
558
+ image_std = image_std if image_std is not None else self.image_std
559
+ do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
560
+
561
+ if return_dummy_image:
562
+ images = Image.new("RGB", (224, 224), (0, 0, 0))
563
+
564
+ images = make_list_of_images(images)
565
+
566
+ if not valid_images(images):
567
+ raise ValueError(
568
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
569
+ "torch.Tensor, tf.Tensor or jax.ndarray."
570
+ )
571
+
572
+ if do_convert_rgb:
573
+ images = [convert_to_rgb(image) for image in images]
574
+
575
+ # All transformations expect numpy arrays.
576
+ images = [to_numpy_array(image) for image in images]
577
+
578
+ if is_scaled_image(images[0]) and do_rescale:
579
+ logger.warning_once(
580
+ "It looks like you are trying to rescale already rescaled images. If the input"
581
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
582
+ )
583
+
584
+ if input_data_format is None:
585
+ # We assume that all images have the same channel dimension format.
586
+ input_data_format = infer_channel_dimension_format(images[0])
587
+
588
+ new_images = []
589
+ image_sizes = [get_image_size(image, channel_dim=input_data_format) for image in images]
590
+ vision_query_lengths = []
591
+
592
+ assert crop_size["height"] == crop_size["width"]
593
+
594
+ # global image 의 padding 연산은, image original width, height 가 클 때 bottleneck 이 될 수 있음
595
+ # 장축의 길이를 size["shortest_edge"] 로 resize 를 먼저 한 뒤에, padding
596
+ if anyres:
597
+ anyres_global_images = copy.deepcopy(images)
598
+ if pad_to_square:
599
+ background_color = tuple(int(x * 255) for x in self.image_mean)
600
+ anyres_global_images = [
601
+ resize_longside(copy.deepcopy(image), size["shortest_edge"], resample, input_data_format)
602
+ for image in anyres_global_images
603
+ ]
604
+ anyres_global_images = [
605
+ expand2square(image, background_color=background_color, input_data_format=input_data_format)[0]
606
+ for image in anyres_global_images
607
+ ]
608
+ else:
609
+ anyres_global_images = [
610
+ self.resize(
611
+ image=image,
612
+ size={"height": size["shortest_edge"], "width": size["shortest_edge"]},
613
+ resample=resample,
614
+ input_data_format=input_data_format,
615
+ )
616
+ for image in anyres_global_images
617
+ ]
618
+ else:
619
+ anyres_global_images = [None for _ in range(len(images))]
620
+ if pad_to_square:
621
+ background_color = tuple(int(x * 255) for x in self.image_mean)
622
+ images = [
623
+ resize_longside(image, size["shortest_edge"], resample, input_data_format) for image in images
624
+ ]
625
+ images = [
626
+ expand2square(image, background_color=background_color, input_data_format=input_data_format)[0]
627
+ for image in images
628
+ ]
629
+
630
+ for image, anyres_global_image, image_size in zip(images, anyres_global_images, image_sizes):
631
+ if anyres:
632
+ # convert image into a list of grids
633
+ # we intentionally use the same data format as the input data format
634
+ image_grids = self.get_image_grids(
635
+ image,
636
+ possible_resolutions,
637
+ grid_size=crop_size["height"],
638
+ resample=resample,
639
+ data_format=input_data_format,
640
+ input_data_format=input_data_format,
641
+ )
642
+ # video 에 대해서는 global image (thumbnail) 를 사용하지 않음
643
+ if not video:
644
+ image_grids = [anyres_global_image] + image_grids
645
+ else:
646
+ image_grids = [image]
647
+
648
+ pixel_values = self._preprocess(
649
+ image_grids,
650
+ do_resize=do_resize,
651
+ size=size,
652
+ resample=resample,
653
+ do_center_crop=do_center_crop,
654
+ crop_size=crop_size,
655
+ do_rescale=do_rescale,
656
+ rescale_factor=rescale_factor,
657
+ do_normalize=do_normalize,
658
+ image_mean=image_mean,
659
+ image_std=image_std,
660
+ data_format=data_format,
661
+ input_data_format=input_data_format,
662
+ )
663
+
664
+ pixel_values = np.array(pixel_values)
665
+ new_images.append(pixel_values)
666
+
667
+ num_grids = pixel_values.shape[0]
668
+
669
+ vision_query_length = determine_anyres_num_vision_patches(
670
+ num_grids=num_grids,
671
+ image_size=image_size,
672
+ grid_size=crop_size["height"],
673
+ patch_size=patch_size,
674
+ possible_resolutions=possible_resolutions,
675
+ anyres=anyres,
676
+ unpad=unpad,
677
+ num_queries_vis_abstractor=num_queries_vis_abstractor,
678
+ num_queries_vis_abstractor_slow=num_queries_vis_abstractor_slow,
679
+ video=video,
680
+ first_last_frames_slow=first_last_frames_slow,
681
+ is_first_or_last_frames=is_first_or_last_frames,
682
+ )
683
+
684
+ vision_query_lengths.append(vision_query_length)
685
+
686
+ if return_dummy_image:
687
+ vision_query_lengths = []
688
+
689
+ data = {
690
+ "pixel_values": [torch.tensor(new_image) for new_image in new_images],
691
+ "image_sizes": [{"width": image_size[1], "height": image_size[0]} for image_size in image_sizes],
692
+ "vision_query_lengths": vision_query_lengths,
693
+ }
694
+
695
+ return BatchFeature(data=data)
696
+
697
+ def save_pretrained(
698
+ self,
699
+ save_directory: Union[str, os.PathLike],
700
+ *args,
701
+ **kwargs,
702
+ ):
703
+ self.register_for_auto_class()
704
+ super().save_pretrained(save_directory, *args, **kwargs)
705
+
706
+
707
+ class HCXVisionV2Processor(Qwen2_5_VLProcessor):
708
+ attributes = ["image_processor", "tokenizer", "video_processor"]
709
+ image_processor_class = "AutoImageProcessor"
710
+ video_processor_class = "AutoVideoProcessor"
711
+ tokenizer_class = ("GPT2Tokenizer", "GPT2TokenizerFast", "PreTrainedTokenizer", "PreTrainedTokenizerFast")
712
+
713
+ def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
714
+ self.tokenizer = tokenizer
715
+ super().__init__(image_processor, tokenizer, video_processor, chat_template=self.tokenizer.chat_template)
716
+
717
+ def save_pretrained(
718
+ self,
719
+ save_directory: Union[str, os.PathLike],
720
+ *args,
721
+ **kwargs,
722
+ ):
723
+ self.register_for_auto_class()
724
+ super().save_pretrained(save_directory, *args, **kwargs)
725
+
726
+ def __call__(
727
+ self,
728
+ images: ImageInput = None,
729
+ text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
730
+ videos: VideoInput = None,
731
+ **kwargs: Unpack[Qwen2_5_VLProcessorKwargs],
732
+ ) -> BatchFeature:
733
+ """
734
+ Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
735
+ and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
736
+ the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
737
+ Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
738
+
739
+ Args:
740
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
741
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
742
+ tensor. Both channels-first and channels-last formats are supported.
743
+ text (`str`, `list[str]`, `list[list[str]]`):
744
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
745
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
746
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
747
+ videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
748
+ The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
749
+ tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
750
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
751
+ If set, will return tensors of a particular framework. Acceptable values are:
752
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
753
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
754
+ - `'np'`: Return NumPy `np.ndarray` objects.
755
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
756
+
757
+ Returns:
758
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
759
+
760
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
761
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
762
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
763
+ `None`).
764
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
765
+ - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
766
+ - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
767
+ - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
768
+ """
769
+ output_kwargs = self._merge_kwargs(
770
+ Qwen2_5_VLProcessorKwargs,
771
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
772
+ **kwargs,
773
+ )
774
+
775
+ image_inputs = videos_inputs = {}
776
+ if images is not None:
777
+ image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
778
+ image_grid_thw = image_inputs["image_grid_thw"]
779
+
780
+ if videos is not None:
781
+ videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
782
+ video_grid_thw = videos_inputs["video_grid_thw"]
783
+
784
+ if not isinstance(text, list):
785
+ text = [text]
786
+
787
+ text = text.copy() # below lines change text in-place
788
+
789
+ if images is not None:
790
+ merge_length = self.image_processor.merge_size**2
791
+ index = 0
792
+ for i in range(len(text)):
793
+ while self.image_token in text[i]:
794
+ num_image_tokens = image_grid_thw[index].prod() // merge_length
795
+ text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
796
+ text[i] = text[i].replace(
797
+ '{"resolution": [w, h]}', '{"resolution": ' + str(list(images[i].size)) + "}"
798
+ )
799
+ index += 1
800
+ text[i] = text[i].replace("<|placeholder|>", self.image_token)
801
+
802
+ if videos is not None:
803
+ merge_length = self.video_processor.merge_size**2
804
+ index = 0
805
+ for i in range(len(text)):
806
+ while self.video_token in text[i]:
807
+ num_video_tokens = video_grid_thw[index].prod() // merge_length
808
+ text[i] = text[i].replace(self.video_token, "<|placeholder|>" * num_video_tokens, 1)
809
+ index += 1
810
+ text[i] = text[i].replace("<|placeholder|>", self.video_token)
811
+
812
+ return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
813
+ return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
814
+ text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"], return_tensors=None)
815
+ self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
816
+
817
+ if return_mm_token_type_ids:
818
+ array_ids = np.array(text_inputs["input_ids"])
819
+ mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
820
+ mm_token_type_ids[array_ids == self.image_token_id] = 1
821
+ text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
822
+
823
+ return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
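Editor's note: `__call__` above expands each image placeholder into the right number of image tokens (computed from `image_grid_thw` and `merge_size`) and substitutes the actual resolution into any `{"resolution": [w, h]}` marker in the prompt. A minimal usage sketch; the checkpoint path is a placeholder and the chat-message layout is an assumption about the bundled chat template, not something verified against this repository:

from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("<path-or-repo-id>", trust_remote_code=True)

messages = [
    {"role": "user", "content": [
        {"type": "image"},  # the chat template is expected to emit the image token here
        {"type": "text", "text": "Describe this image."},
    ]},
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

inputs = processor(images=[Image.open("example.jpg")], text=[prompt], return_tensors="pt")
# inputs carries input_ids / attention_mask plus pixel_values and image_grid_thw for the model.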
processor_config.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_vlm.HCXVisionV2Processor"
4
+ },
5
+ "processor_class": "HCXVisionV2Processor"
6
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,39 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|im_end|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "image_token": "<|IMAGE_PAD|>",
17
+ "pad_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "sep_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ "unk_token": {
32
+ "content": "<|endoftext|>",
33
+ "lstrip": false,
34
+ "normalized": false,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ },
38
+ "video_token": "<|VIDEO_PAD|>"
39
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,2079 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "128000": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "128001": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "128002": {
29
+ "content": "<|stop|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "128003": {
37
+ "content": "<|endofturn|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "128004": {
45
+ "content": "<|fim_prefix|>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "128005": {
53
+ "content": "<|fim_middle|>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "128006": {
61
+ "content": "<|fim_suffix|>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "128007": {
69
+ "content": "<repo_name>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "128008": {
77
+ "content": "<file_sep>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "128009": {
85
+ "content": "<issue_start>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "128010": {
93
+ "content": "<issue_comment>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "128011": {
101
+ "content": "<issue_closed>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "128012": {
109
+ "content": "<jupyter_start>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "128013": {
117
+ "content": "<jupyter_text>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "128014": {
125
+ "content": "<jupyter_code>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "128015": {
133
+ "content": "<jupyter_output>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "128016": {
141
+ "content": "<jupyter_script>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "128017": {
149
+ "content": "<empty_output>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "128018": {
157
+ "content": "<code_to_intermediate>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "128019": {
165
+ "content": "<intermediate_to_code>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "128020": {
173
+ "content": "<pr>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "128021": {
181
+ "content": "<pr_status>",
182
+ "lstrip": false,
183
+ "normalized": false,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "128022": {
189
+ "content": "<pr_is_merged>",
190
+ "lstrip": false,
191
+ "normalized": false,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "128023": {
197
+ "content": "<pr_base>",
198
+ "lstrip": false,
199
+ "normalized": false,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "128024": {
205
+ "content": "<pr_file>",
206
+ "lstrip": false,
207
+ "normalized": false,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "128025": {
213
+ "content": "<pr_base_code>",
214
+ "lstrip": false,
215
+ "normalized": false,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "128026": {
221
+ "content": "<pr_diff>",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "128027": {
229
+ "content": "<pr_diff_hunk>",
230
+ "lstrip": false,
231
+ "normalized": false,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "128028": {
237
+ "content": "<pr_comment>",
238
+ "lstrip": false,
239
+ "normalized": false,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "128029": {
245
+ "content": "<pr_event_id>",
246
+ "lstrip": false,
247
+ "normalized": false,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "128030": {
253
+ "content": "<pr_review>",
254
+ "lstrip": false,
255
+ "normalized": false,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "128031": {
261
+ "content": "<pr_review_state>",
262
+ "lstrip": false,
263
+ "normalized": false,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": true
267
+ },
268
+ "128032": {
269
+ "content": "<pr_review_comment>",
270
+ "lstrip": false,
271
+ "normalized": false,
272
+ "rstrip": false,
273
+ "single_word": false,
274
+ "special": true
275
+ },
276
+ "128033": {
277
+ "content": "<pr_in_reply_to_review_id>",
278
+ "lstrip": false,
279
+ "normalized": false,
280
+ "rstrip": false,
281
+ "single_word": false,
282
+ "special": true
283
+ },
284
+ "128034": {
285
+ "content": "<pr_in_reply_to_comment_id>",
286
+ "lstrip": false,
287
+ "normalized": false,
288
+ "rstrip": false,
289
+ "single_word": false,
290
+ "special": true
291
+ },
292
+ "128035": {
293
+ "content": "<pr_diff_hunk_comment_line>",
294
+ "lstrip": false,
295
+ "normalized": false,
296
+ "rstrip": false,
297
+ "single_word": false,
298
+ "special": true
299
+ },
300
+ "128036": {
301
+ "content": "<NAME>",
302
+ "lstrip": false,
303
+ "normalized": false,
304
+ "rstrip": false,
305
+ "single_word": false,
306
+ "special": true
307
+ },
308
+ "128037": {
309
+ "content": "<EMAIL>",
310
+ "lstrip": false,
311
+ "normalized": false,
312
+ "rstrip": false,
313
+ "single_word": false,
314
+ "special": true
315
+ },
316
+ "128038": {
317
+ "content": "<KEY>",
318
+ "lstrip": false,
319
+ "normalized": false,
320
+ "rstrip": false,
321
+ "single_word": false,
322
+ "special": true
323
+ },
324
+ "128039": {
325
+ "content": "<PASSWORD>",
326
+ "lstrip": false,
327
+ "normalized": false,
328
+ "rstrip": false,
329
+ "single_word": false,
330
+ "special": true
331
+ },
332
+ "128040": {
333
+ "content": "<think>",
334
+ "lstrip": false,
335
+ "normalized": false,
336
+ "rstrip": false,
337
+ "single_word": false,
338
+ "special": false
339
+ },
340
+ "128041": {
341
+ "content": "</think>",
342
+ "lstrip": false,
343
+ "normalized": false,
344
+ "rstrip": false,
345
+ "single_word": false,
346
+ "special": false
347
+ },
348
+ "128042": {
349
+ "content": "<tool_call>",
350
+ "lstrip": false,
351
+ "normalized": false,
352
+ "rstrip": false,
353
+ "single_word": false,
354
+ "special": false
355
+ },
356
+ "128043": {
357
+ "content": "</tool_call>",
358
+ "lstrip": false,
359
+ "normalized": false,
360
+ "rstrip": false,
361
+ "single_word": false,
362
+ "special": false
363
+ },
364
+ "128044": {
365
+ "content": "<arg_key>",
366
+ "lstrip": false,
367
+ "normalized": false,
368
+ "rstrip": false,
369
+ "single_word": false,
370
+ "special": false
371
+ },
372
+ "128045": {
373
+ "content": "</arg_key>",
374
+ "lstrip": false,
375
+ "normalized": false,
376
+ "rstrip": false,
377
+ "single_word": false,
378
+ "special": false
379
+ },
380
+ "128046": {
381
+ "content": "<arg_value>",
382
+ "lstrip": false,
383
+ "normalized": false,
384
+ "rstrip": false,
385
+ "single_word": false,
386
+ "special": false
387
+ },
388
+ "128047": {
389
+ "content": "</arg_value>",
390
+ "lstrip": false,
391
+ "normalized": false,
392
+ "rstrip": false,
393
+ "single_word": false,
394
+ "special": false
395
+ },
396
+ "128048": {
397
+ "content": "<tool_response>",
398
+ "lstrip": false,
399
+ "normalized": false,
400
+ "rstrip": false,
401
+ "single_word": false,
402
+ "special": false
403
+ },
404
+ "128049": {
405
+ "content": "</tool_response>",
406
+ "lstrip": false,
407
+ "normalized": false,
408
+ "rstrip": false,
409
+ "single_word": false,
410
+ "special": false
411
+ },
412
+ "128050": {
413
+ "content": "<tools>",
414
+ "lstrip": false,
415
+ "normalized": false,
416
+ "rstrip": false,
417
+ "single_word": false,
418
+ "special": false
419
+ },
420
+ "128051": {
421
+ "content": "</tools>",
422
+ "lstrip": false,
423
+ "normalized": false,
424
+ "rstrip": false,
425
+ "single_word": false,
426
+ "special": false
427
+ },
428
+ "128052": {
429
+ "content": "<|mime_start|>",
430
+ "lstrip": false,
431
+ "normalized": false,
432
+ "rstrip": false,
433
+ "single_word": false,
434
+ "special": true
435
+ },
436
+ "128053": {
437
+ "content": "<|mime_end|>",
438
+ "lstrip": false,
439
+ "normalized": false,
440
+ "rstrip": false,
441
+ "single_word": false,
442
+ "special": true
443
+ },
444
+ "128054": {
445
+ "content": "<|document_start|>",
446
+ "lstrip": false,
447
+ "normalized": false,
448
+ "rstrip": false,
449
+ "single_word": false,
450
+ "special": true
451
+ },
452
+ "128055": {
453
+ "content": "<|document_end|>",
454
+ "lstrip": false,
455
+ "normalized": false,
456
+ "rstrip": false,
457
+ "single_word": false,
458
+ "special": true
459
+ },
460
+ "128056": {
461
+ "content": "<|image_start|>",
462
+ "lstrip": false,
463
+ "normalized": false,
464
+ "rstrip": false,
465
+ "single_word": false,
466
+ "special": true
467
+ },
468
+ "128057": {
469
+ "content": "<|image_end|>",
470
+ "lstrip": false,
471
+ "normalized": false,
472
+ "rstrip": false,
473
+ "single_word": false,
474
+ "special": true
475
+ },
476
+ "128058": {
477
+ "content": "<|video_start|>",
478
+ "lstrip": false,
479
+ "normalized": false,
480
+ "rstrip": false,
481
+ "single_word": false,
482
+ "special": true
483
+ },
484
+ "128059": {
485
+ "content": "<|video_end|>",
486
+ "lstrip": false,
487
+ "normalized": false,
488
+ "rstrip": false,
489
+ "single_word": false,
490
+ "special": true
491
+ },
492
+ "128060": {
493
+ "content": "<|IMAGE_PAD|>",
494
+ "lstrip": false,
495
+ "normalized": false,
496
+ "rstrip": false,
497
+ "single_word": false,
498
+ "special": true
499
+ },
500
+ "128061": {
501
+ "content": "<|VIDEO_PAD|>",
502
+ "lstrip": false,
503
+ "normalized": false,
504
+ "rstrip": false,
505
+ "single_word": false,
506
+ "special": true
507
+ },
508
+ "128062": {
509
+ "content": "<|vision_aux_start|>",
510
+ "lstrip": false,
511
+ "normalized": false,
512
+ "rstrip": false,
513
+ "single_word": false,
514
+ "special": true
515
+ },
516
+ "128063": {
517
+ "content": "<|vision_aux_end|>",
518
+ "lstrip": false,
519
+ "normalized": false,
520
+ "rstrip": false,
521
+ "single_word": false,
522
+ "special": true
523
+ },
524
+ "128064": {
525
+ "content": "<|code_switching|>",
526
+ "lstrip": false,
527
+ "normalized": false,
528
+ "rstrip": false,
529
+ "single_word": false,
530
+ "special": true
531
+ },
532
+ "128065": {
533
+ "content": "<|back_translation|>",
534
+ "lstrip": false,
535
+ "normalized": false,
536
+ "rstrip": false,
537
+ "single_word": false,
538
+ "special": true
539
+ },
540
+ "128066": {
541
+ "content": "<|instruction_pretraining|>",
542
+ "lstrip": false,
543
+ "normalized": false,
544
+ "rstrip": false,
545
+ "single_word": false,
546
+ "special": true
547
+ },
548
+ "128067": {
549
+ "content": "<|_placeholder_067|>",
550
+ "lstrip": false,
551
+ "normalized": false,
552
+ "rstrip": false,
553
+ "single_word": false,
554
+ "special": true
555
+ },
556
+ "128068": {
557
+ "content": "<|_placeholder_068|>",
558
+ "lstrip": false,
559
+ "normalized": false,
560
+ "rstrip": false,
561
+ "single_word": false,
562
+ "special": true
563
+ },
564
+ "128069": {
565
+ "content": "<|_placeholder_069|>",
566
+ "lstrip": false,
567
+ "normalized": false,
568
+ "rstrip": false,
569
+ "single_word": false,
570
+ "special": true
571
+ },
572
+ "128070": {
573
+ "content": "<|_placeholder_070|>",
574
+ "lstrip": false,
575
+ "normalized": false,
576
+ "rstrip": false,
577
+ "single_word": false,
578
+ "special": true
579
+ },
580
+ "128071": {
581
+ "content": "<|_placeholder_071|>",
582
+ "lstrip": false,
583
+ "normalized": false,
584
+ "rstrip": false,
585
+ "single_word": false,
586
+ "special": true
587
+ },
588
+ "128072": {
589
+ "content": "<|_placeholder_072|>",
590
+ "lstrip": false,
591
+ "normalized": false,
592
+ "rstrip": false,
593
+ "single_word": false,
594
+ "special": true
595
+ },
596
+ "128073": {
597
+ "content": "<|_placeholder_073|>",
598
+ "lstrip": false,
599
+ "normalized": false,
600
+ "rstrip": false,
601
+ "single_word": false,
602
+ "special": true
603
+ },
604
+ "128074": {
605
+ "content": "<|_placeholder_074|>",
606
+ "lstrip": false,
607
+ "normalized": false,
608
+ "rstrip": false,
609
+ "single_word": false,
610
+ "special": true
611
+ },
612
+ "128075": {
613
+ "content": "<|_placeholder_075|>",
614
+ "lstrip": false,
615
+ "normalized": false,
616
+ "rstrip": false,
617
+ "single_word": false,
618
+ "special": true
619
+ },
620
+ "128076": {
621
+ "content": "<|_placeholder_076|>",
622
+ "lstrip": false,
623
+ "normalized": false,
624
+ "rstrip": false,
625
+ "single_word": false,
626
+ "special": true
627
+ },
628
+ "128077": {
629
+ "content": "<|_placeholder_077|>",
630
+ "lstrip": false,
631
+ "normalized": false,
632
+ "rstrip": false,
633
+ "single_word": false,
634
+ "special": true
635
+ },
636
+ "128078": {
637
+ "content": "<|_placeholder_078|>",
638
+ "lstrip": false,
639
+ "normalized": false,
640
+ "rstrip": false,
641
+ "single_word": false,
642
+ "special": true
643
+ },
644
+ "128079": {
645
+ "content": "<|_placeholder_079|>",
646
+ "lstrip": false,
647
+ "normalized": false,
648
+ "rstrip": false,
649
+ "single_word": false,
650
+ "special": true
651
+ },
652
+ "128080": {
653
+ "content": "<|_placeholder_080|>",
654
+ "lstrip": false,
655
+ "normalized": false,
656
+ "rstrip": false,
657
+ "single_word": false,
658
+ "special": true
659
+ },
660
+ "128081": {
661
+ "content": "<|_placeholder_081|>",
662
+ "lstrip": false,
663
+ "normalized": false,
664
+ "rstrip": false,
665
+ "single_word": false,
666
+ "special": true
667
+ },
668
+ "128082": {
669
+ "content": "<|_placeholder_082|>",
670
+ "lstrip": false,
671
+ "normalized": false,
672
+ "rstrip": false,
673
+ "single_word": false,
674
+ "special": true
675
+ },
676
+ "128083": {
677
+ "content": "<|_placeholder_083|>",
678
+ "lstrip": false,
679
+ "normalized": false,
680
+ "rstrip": false,
681
+ "single_word": false,
682
+ "special": true
683
+ },
684
+ "128084": {
685
+ "content": "<|_placeholder_084|>",
686
+ "lstrip": false,
687
+ "normalized": false,
688
+ "rstrip": false,
689
+ "single_word": false,
690
+ "special": true
691
+ },
692
+ "128085": {
693
+ "content": "<|_placeholder_085|>",
694
+ "lstrip": false,
695
+ "normalized": false,
696
+ "rstrip": false,
697
+ "single_word": false,
698
+ "special": true
699
+ },
700
+ "128086": {
701
+ "content": "<|_placeholder_086|>",
702
+ "lstrip": false,
703
+ "normalized": false,
704
+ "rstrip": false,
705
+ "single_word": false,
706
+ "special": true
707
+ },
708
+ "128087": {
709
+ "content": "<|_placeholder_087|>",
710
+ "lstrip": false,
711
+ "normalized": false,
712
+ "rstrip": false,
713
+ "single_word": false,
714
+ "special": true
715
+ },
716
+ "128088": {
717
+ "content": "<|_placeholder_088|>",
718
+ "lstrip": false,
719
+ "normalized": false,
720
+ "rstrip": false,
721
+ "single_word": false,
722
+ "special": true
723
+ },
724
+ "128089": {
725
+ "content": "<|_placeholder_089|>",
726
+ "lstrip": false,
727
+ "normalized": false,
728
+ "rstrip": false,
729
+ "single_word": false,
730
+ "special": true
731
+ },
732
+ "128090": {
733
+ "content": "<|_placeholder_090|>",
734
+ "lstrip": false,
735
+ "normalized": false,
736
+ "rstrip": false,
737
+ "single_word": false,
738
+ "special": true
739
+ },
740
+ "128091": {
741
+ "content": "<|_placeholder_091|>",
742
+ "lstrip": false,
743
+ "normalized": false,
744
+ "rstrip": false,
745
+ "single_word": false,
746
+ "special": true
747
+ },
748
+ "128092": {
749
+ "content": "<|_placeholder_092|>",
750
+ "lstrip": false,
751
+ "normalized": false,
752
+ "rstrip": false,
753
+ "single_word": false,
754
+ "special": true
755
+ },
756
+ "128093": {
757
+ "content": "<|_placeholder_093|>",
758
+ "lstrip": false,
759
+ "normalized": false,
760
+ "rstrip": false,
761
+ "single_word": false,
762
+ "special": true
763
+ },
764
+ "128094": {
765
+ "content": "<|_placeholder_094|>",
766
+ "lstrip": false,
767
+ "normalized": false,
768
+ "rstrip": false,
769
+ "single_word": false,
770
+ "special": true
771
+ },
772
+ "128095": {
773
+ "content": "<|_placeholder_095|>",
774
+ "lstrip": false,
775
+ "normalized": false,
776
+ "rstrip": false,
777
+ "single_word": false,
778
+ "special": true
779
+ },
780
+ "128096": {
781
+ "content": "<|_placeholder_096|>",
782
+ "lstrip": false,
783
+ "normalized": false,
784
+ "rstrip": false,
785
+ "single_word": false,
786
+ "special": true
787
+ },
788
+ "128097": {
789
+ "content": "<|_placeholder_097|>",
790
+ "lstrip": false,
791
+ "normalized": false,
792
+ "rstrip": false,
793
+ "single_word": false,
794
+ "special": true
795
+ },
796
+ "128098": {
797
+ "content": "<|_placeholder_098|>",
798
+ "lstrip": false,
799
+ "normalized": false,
800
+ "rstrip": false,
801
+ "single_word": false,
802
+ "special": true
803
+ },
804
+ "128099": {
805
+ "content": "<|_placeholder_099|>",
806
+ "lstrip": false,
807
+ "normalized": false,
808
+ "rstrip": false,
809
+ "single_word": false,
810
+ "special": true
811
+ },
812
+ "128100": {
813
+ "content": "<|_placeholder_100|>",
814
+ "lstrip": false,
815
+ "normalized": false,
816
+ "rstrip": false,
817
+ "single_word": false,
818
+ "special": true
819
+ },
820
+ "128101": {
821
+ "content": "<|_placeholder_101|>",
822
+ "lstrip": false,
823
+ "normalized": false,
824
+ "rstrip": false,
825
+ "single_word": false,
826
+ "special": true
827
+ },
828
+ "128102": {
829
+ "content": "<|_placeholder_102|>",
830
+ "lstrip": false,
831
+ "normalized": false,
832
+ "rstrip": false,
833
+ "single_word": false,
834
+ "special": true
835
+ },
836
+ "128103": {
837
+ "content": "<|_placeholder_103|>",
838
+ "lstrip": false,
839
+ "normalized": false,
840
+ "rstrip": false,
841
+ "single_word": false,
842
+ "special": true
843
+ },
844
+ "128104": {
845
+ "content": "<|_placeholder_104|>",
846
+ "lstrip": false,
847
+ "normalized": false,
848
+ "rstrip": false,
849
+ "single_word": false,
850
+ "special": true
851
+ },
852
+ "128105": {
853
+ "content": "<|_placeholder_105|>",
854
+ "lstrip": false,
855
+ "normalized": false,
856
+ "rstrip": false,
857
+ "single_word": false,
858
+ "special": true
859
+ },
860
+ "128106": {
861
+ "content": "<|_placeholder_106|>",
862
+ "lstrip": false,
863
+ "normalized": false,
864
+ "rstrip": false,
865
+ "single_word": false,
866
+ "special": true
867
+ },
868
+ "128107": {
869
+ "content": "<|_placeholder_107|>",
870
+ "lstrip": false,
871
+ "normalized": false,
872
+ "rstrip": false,
873
+ "single_word": false,
874
+ "special": true
875
+ },
876
+ "128108": {
877
+ "content": "<|_placeholder_108|>",
878
+ "lstrip": false,
879
+ "normalized": false,
880
+ "rstrip": false,
881
+ "single_word": false,
882
+ "special": true
883
+ },
884
+ "128109": {
885
+ "content": "<|_placeholder_109|>",
886
+ "lstrip": false,
887
+ "normalized": false,
888
+ "rstrip": false,
889
+ "single_word": false,
890
+ "special": true
891
+ },
892
+ "128110": {
893
+ "content": "<|_placeholder_110|>",
894
+ "lstrip": false,
895
+ "normalized": false,
896
+ "rstrip": false,
897
+ "single_word": false,
898
+ "special": true
899
+ },
900
+ "128111": {
901
+ "content": "<|_placeholder_111|>",
902
+ "lstrip": false,
903
+ "normalized": false,
904
+ "rstrip": false,
905
+ "single_word": false,
906
+ "special": true
907
+ },
908
+ "128112": {
909
+ "content": "<|_placeholder_112|>",
910
+ "lstrip": false,
911
+ "normalized": false,
912
+ "rstrip": false,
913
+ "single_word": false,
914
+ "special": true
915
+ },
916
+ "128113": {
917
+ "content": "<|_placeholder_113|>",
918
+ "lstrip": false,
919
+ "normalized": false,
920
+ "rstrip": false,
921
+ "single_word": false,
922
+ "special": true
923
+ },
924
+ "128114": {
925
+ "content": "<|_placeholder_114|>",
926
+ "lstrip": false,
927
+ "normalized": false,
928
+ "rstrip": false,
929
+ "single_word": false,
930
+ "special": true
931
+ },
932
+ "128115": {
933
+ "content": "<|_placeholder_115|>",
934
+ "lstrip": false,
935
+ "normalized": false,
936
+ "rstrip": false,
937
+ "single_word": false,
938
+ "special": true
939
+ },
940
+ "128116": {
941
+ "content": "<|_placeholder_116|>",
942
+ "lstrip": false,
943
+ "normalized": false,
944
+ "rstrip": false,
945
+ "single_word": false,
946
+ "special": true
947
+ },
948
+ "128117": {
949
+ "content": "<|_placeholder_117|>",
950
+ "lstrip": false,
951
+ "normalized": false,
952
+ "rstrip": false,
953
+ "single_word": false,
954
+ "special": true
955
+ },
956
+ "128118": {
957
+ "content": "<|_placeholder_118|>",
958
+ "lstrip": false,
959
+ "normalized": false,
960
+ "rstrip": false,
961
+ "single_word": false,
962
+ "special": true
963
+ },
964
+ "128119": {
965
+ "content": "<|_placeholder_119|>",
966
+ "lstrip": false,
967
+ "normalized": false,
968
+ "rstrip": false,
969
+ "single_word": false,
970
+ "special": true
971
+ },
972
+ "128120": {
973
+ "content": "<|_placeholder_120|>",
974
+ "lstrip": false,
975
+ "normalized": false,
976
+ "rstrip": false,
977
+ "single_word": false,
978
+ "special": true
979
+ },
980
+ "128121": {
981
+ "content": "<|_placeholder_121|>",
982
+ "lstrip": false,
983
+ "normalized": false,
984
+ "rstrip": false,
985
+ "single_word": false,
986
+ "special": true
987
+ },
988
+ "128122": {
989
+ "content": "<|_placeholder_122|>",
990
+ "lstrip": false,
991
+ "normalized": false,
992
+ "rstrip": false,
993
+ "single_word": false,
994
+ "special": true
995
+ },
996
+ "128123": {
997
+ "content": "<|_placeholder_123|>",
998
+ "lstrip": false,
999
+ "normalized": false,
1000
+ "rstrip": false,
1001
+ "single_word": false,
1002
+ "special": true
1003
+ },
1004
+ "128124": {
1005
+ "content": "<|_placeholder_124|>",
1006
+ "lstrip": false,
1007
+ "normalized": false,
1008
+ "rstrip": false,
1009
+ "single_word": false,
1010
+ "special": true
1011
+ },
1012
+ "128125": {
1013
+ "content": "<|_placeholder_125|>",
1014
+ "lstrip": false,
1015
+ "normalized": false,
1016
+ "rstrip": false,
1017
+ "single_word": false,
1018
+ "special": true
1019
+ },
1020
+ "128126": {
1021
+ "content": "<|_placeholder_126|>",
1022
+ "lstrip": false,
1023
+ "normalized": false,
1024
+ "rstrip": false,
1025
+ "single_word": false,
1026
+ "special": true
1027
+ },
1028
+ "128127": {
1029
+ "content": "<|_placeholder_127|>",
1030
+ "lstrip": false,
1031
+ "normalized": false,
1032
+ "rstrip": false,
1033
+ "single_word": false,
1034
+ "special": true
1035
+ },
1036
+ "128128": {
1037
+ "content": "<|_placeholder_128|>",
1038
+ "lstrip": false,
1039
+ "normalized": false,
1040
+ "rstrip": false,
1041
+ "single_word": false,
1042
+ "special": true
1043
+ },
1044
+ "128129": {
1045
+ "content": "<|_placeholder_129|>",
1046
+ "lstrip": false,
1047
+ "normalized": false,
1048
+ "rstrip": false,
1049
+ "single_word": false,
1050
+ "special": true
1051
+ },
1052
+ "128130": {
1053
+ "content": "<|_placeholder_130|>",
1054
+ "lstrip": false,
1055
+ "normalized": false,
1056
+ "rstrip": false,
1057
+ "single_word": false,
1058
+ "special": true
1059
+ },
1060
+ "128131": {
1061
+ "content": "<|_placeholder_131|>",
1062
+ "lstrip": false,
1063
+ "normalized": false,
1064
+ "rstrip": false,
1065
+ "single_word": false,
1066
+ "special": true
1067
+ },
1068
+ "128132": {
1069
+ "content": "<|_placeholder_132|>",
1070
+ "lstrip": false,
1071
+ "normalized": false,
1072
+ "rstrip": false,
1073
+ "single_word": false,
1074
+ "special": true
1075
+ },
1076
+ "128133": {
1077
+ "content": "<|_placeholder_133|>",
1078
+ "lstrip": false,
1079
+ "normalized": false,
1080
+ "rstrip": false,
1081
+ "single_word": false,
1082
+ "special": true
1083
+ },
1084
+ "128134": {
1085
+ "content": "<|_placeholder_134|>",
1086
+ "lstrip": false,
1087
+ "normalized": false,
1088
+ "rstrip": false,
1089
+ "single_word": false,
1090
+ "special": true
1091
+ },
1092
+ "128135": {
1093
+ "content": "<|_placeholder_135|>",
1094
+ "lstrip": false,
1095
+ "normalized": false,
1096
+ "rstrip": false,
1097
+ "single_word": false,
1098
+ "special": true
1099
+ },
1100
+ "128136": {
1101
+ "content": "<|_placeholder_136|>",
1102
+ "lstrip": false,
1103
+ "normalized": false,
1104
+ "rstrip": false,
1105
+ "single_word": false,
1106
+ "special": true
1107
+ },
1108
+ "128137": {
1109
+ "content": "<|_placeholder_137|>",
1110
+ "lstrip": false,
1111
+ "normalized": false,
1112
+ "rstrip": false,
1113
+ "single_word": false,
1114
+ "special": true
1115
+ },
1116
+ "128138": {
1117
+ "content": "<|_placeholder_138|>",
1118
+ "lstrip": false,
1119
+ "normalized": false,
1120
+ "rstrip": false,
1121
+ "single_word": false,
1122
+ "special": true
1123
+ },
1124
+ "128139": {
1125
+ "content": "<|_placeholder_139|>",
1126
+ "lstrip": false,
1127
+ "normalized": false,
1128
+ "rstrip": false,
1129
+ "single_word": false,
1130
+ "special": true
1131
+ },
1132
+ "128140": {
1133
+ "content": "<|_placeholder_140|>",
1134
+ "lstrip": false,
1135
+ "normalized": false,
1136
+ "rstrip": false,
1137
+ "single_word": false,
1138
+ "special": true
1139
+ },
1140
+ "128141": {
1141
+ "content": "<|_placeholder_141|>",
1142
+ "lstrip": false,
1143
+ "normalized": false,
1144
+ "rstrip": false,
1145
+ "single_word": false,
1146
+ "special": true
1147
+ },
1148
+ "128142": {
1149
+ "content": "<|_placeholder_142|>",
1150
+ "lstrip": false,
1151
+ "normalized": false,
1152
+ "rstrip": false,
1153
+ "single_word": false,
1154
+ "special": true
1155
+ },
1156
+ "128143": {
1157
+ "content": "<|_placeholder_143|>",
1158
+ "lstrip": false,
1159
+ "normalized": false,
1160
+ "rstrip": false,
1161
+ "single_word": false,
1162
+ "special": true
1163
+ },
1164
+ "128144": {
1165
+ "content": "<|_placeholder_144|>",
1166
+ "lstrip": false,
1167
+ "normalized": false,
1168
+ "rstrip": false,
1169
+ "single_word": false,
1170
+ "special": true
1171
+ },
1172
+ "128145": {
1173
+ "content": "<|_placeholder_145|>",
1174
+ "lstrip": false,
1175
+ "normalized": false,
1176
+ "rstrip": false,
1177
+ "single_word": false,
1178
+ "special": true
1179
+ },
1180
+ "128146": {
1181
+ "content": "<|_placeholder_146|>",
1182
+ "lstrip": false,
1183
+ "normalized": false,
1184
+ "rstrip": false,
1185
+ "single_word": false,
1186
+ "special": true
1187
+ },
1188
+ "128147": {
1189
+ "content": "<|_placeholder_147|>",
1190
+ "lstrip": false,
1191
+ "normalized": false,
1192
+ "rstrip": false,
1193
+ "single_word": false,
1194
+ "special": true
1195
+ },
1196
+ "128148": {
1197
+ "content": "<|_placeholder_148|>",
1198
+ "lstrip": false,
1199
+ "normalized": false,
1200
+ "rstrip": false,
1201
+ "single_word": false,
1202
+ "special": true
1203
+ },
1204
+ "128149": {
1205
+ "content": "<|_placeholder_149|>",
1206
+ "lstrip": false,
1207
+ "normalized": false,
1208
+ "rstrip": false,
1209
+ "single_word": false,
1210
+ "special": true
1211
+ },
1212
+ "128150": {
1213
+ "content": "<|_placeholder_150|>",
1214
+ "lstrip": false,
1215
+ "normalized": false,
1216
+ "rstrip": false,
1217
+ "single_word": false,
1218
+ "special": true
1219
+ },
1220
+ "128151": {
1221
+ "content": "<|_placeholder_151|>",
1222
+ "lstrip": false,
1223
+ "normalized": false,
1224
+ "rstrip": false,
1225
+ "single_word": false,
1226
+ "special": true
1227
+ },
1228
+ "128152": {
1229
+ "content": "<|_placeholder_152|>",
1230
+ "lstrip": false,
1231
+ "normalized": false,
1232
+ "rstrip": false,
1233
+ "single_word": false,
1234
+ "special": true
1235
+ },
1236
+ "128153": {
1237
+ "content": "<|_placeholder_153|>",
1238
+ "lstrip": false,
1239
+ "normalized": false,
1240
+ "rstrip": false,
1241
+ "single_word": false,
1242
+ "special": true
1243
+ },
1244
+ "128154": {
1245
+ "content": "<|_placeholder_154|>",
1246
+ "lstrip": false,
1247
+ "normalized": false,
1248
+ "rstrip": false,
1249
+ "single_word": false,
1250
+ "special": true
1251
+ },
1252
+ "128155": {
1253
+ "content": "<|_placeholder_155|>",
1254
+ "lstrip": false,
1255
+ "normalized": false,
1256
+ "rstrip": false,
1257
+ "single_word": false,
1258
+ "special": true
1259
+ },
1260
+ "128156": {
1261
+ "content": "<|_placeholder_156|>",
1262
+ "lstrip": false,
1263
+ "normalized": false,
1264
+ "rstrip": false,
1265
+ "single_word": false,
1266
+ "special": true
1267
+ },
1268
+ "128157": {
1269
+ "content": "<|_placeholder_157|>",
1270
+ "lstrip": false,
1271
+ "normalized": false,
1272
+ "rstrip": false,
1273
+ "single_word": false,
1274
+ "special": true
1275
+ },
1276
+ "128158": {
1277
+ "content": "<|_placeholder_158|>",
1278
+ "lstrip": false,
1279
+ "normalized": false,
1280
+ "rstrip": false,
1281
+ "single_word": false,
1282
+ "special": true
1283
+ },
1284
+ "128159": {
1285
+ "content": "<|_placeholder_159|>",
1286
+ "lstrip": false,
1287
+ "normalized": false,
1288
+ "rstrip": false,
1289
+ "single_word": false,
1290
+ "special": true
1291
+ },
1292
+ "128160": {
1293
+ "content": "<|_placeholder_160|>",
1294
+ "lstrip": false,
1295
+ "normalized": false,
1296
+ "rstrip": false,
1297
+ "single_word": false,
1298
+ "special": true
1299
+ },
1300
+ "128161": {
1301
+ "content": "<|_placeholder_161|>",
1302
+ "lstrip": false,
1303
+ "normalized": false,
1304
+ "rstrip": false,
1305
+ "single_word": false,
1306
+ "special": true
1307
+ },
1308
+ "128162": {
1309
+ "content": "<|_placeholder_162|>",
1310
+ "lstrip": false,
1311
+ "normalized": false,
1312
+ "rstrip": false,
1313
+ "single_word": false,
1314
+ "special": true
1315
+ },
1316
+ "128163": {
1317
+ "content": "<|_placeholder_163|>",
1318
+ "lstrip": false,
1319
+ "normalized": false,
1320
+ "rstrip": false,
1321
+ "single_word": false,
1322
+ "special": true
1323
+ },
1324
+ "128164": {
1325
+ "content": "<|_placeholder_164|>",
1326
+ "lstrip": false,
1327
+ "normalized": false,
1328
+ "rstrip": false,
1329
+ "single_word": false,
1330
+ "special": true
1331
+ },
1332
+ "128165": {
1333
+ "content": "<|_placeholder_165|>",
1334
+ "lstrip": false,
1335
+ "normalized": false,
1336
+ "rstrip": false,
1337
+ "single_word": false,
1338
+ "special": true
1339
+ },
1340
+ "128166": {
1341
+ "content": "<|_placeholder_166|>",
1342
+ "lstrip": false,
1343
+ "normalized": false,
1344
+ "rstrip": false,
1345
+ "single_word": false,
1346
+ "special": true
1347
+ },
1348
+ "128167": {
1349
+ "content": "<|_placeholder_167|>",
1350
+ "lstrip": false,
1351
+ "normalized": false,
1352
+ "rstrip": false,
1353
+ "single_word": false,
1354
+ "special": true
1355
+ },
1356
+ "128168": {
1357
+ "content": "<|_placeholder_168|>",
1358
+ "lstrip": false,
1359
+ "normalized": false,
1360
+ "rstrip": false,
1361
+ "single_word": false,
1362
+ "special": true
1363
+ },
1364
+ "128169": {
1365
+ "content": "<|_placeholder_169|>",
1366
+ "lstrip": false,
1367
+ "normalized": false,
1368
+ "rstrip": false,
1369
+ "single_word": false,
1370
+ "special": true
1371
+ },
1372
+ "128170": {
1373
+ "content": "<|_placeholder_170|>",
1374
+ "lstrip": false,
1375
+ "normalized": false,
1376
+ "rstrip": false,
1377
+ "single_word": false,
1378
+ "special": true
1379
+ },
1380
+ "128171": {
1381
+ "content": "<|_placeholder_171|>",
1382
+ "lstrip": false,
1383
+ "normalized": false,
1384
+ "rstrip": false,
1385
+ "single_word": false,
1386
+ "special": true
1387
+ },
1388
+ "128172": {
1389
+ "content": "<|_placeholder_172|>",
1390
+ "lstrip": false,
1391
+ "normalized": false,
1392
+ "rstrip": false,
1393
+ "single_word": false,
1394
+ "special": true
1395
+ },
1396
+ "128173": {
1397
+ "content": "<|_placeholder_173|>",
1398
+ "lstrip": false,
1399
+ "normalized": false,
1400
+ "rstrip": false,
1401
+ "single_word": false,
1402
+ "special": true
1403
+ },
1404
+ "128174": {
1405
+ "content": "<|_placeholder_174|>",
1406
+ "lstrip": false,
1407
+ "normalized": false,
1408
+ "rstrip": false,
1409
+ "single_word": false,
1410
+ "special": true
1411
+ },
1412
+ "128175": {
1413
+ "content": "<|_placeholder_175|>",
1414
+ "lstrip": false,
1415
+ "normalized": false,
1416
+ "rstrip": false,
1417
+ "single_word": false,
1418
+ "special": true
1419
+ },
1420
+ "128176": {
1421
+ "content": "<|_placeholder_176|>",
1422
+ "lstrip": false,
1423
+ "normalized": false,
1424
+ "rstrip": false,
1425
+ "single_word": false,
1426
+ "special": true
1427
+ },
1428
+ "128177": {
1429
+ "content": "<|_placeholder_177|>",
1430
+ "lstrip": false,
1431
+ "normalized": false,
1432
+ "rstrip": false,
1433
+ "single_word": false,
1434
+ "special": true
1435
+ },
1436
+ "128178": {
1437
+ "content": "<|_placeholder_178|>",
1438
+ "lstrip": false,
1439
+ "normalized": false,
1440
+ "rstrip": false,
1441
+ "single_word": false,
1442
+ "special": true
1443
+ },
1444
+ "128179": {
1445
+ "content": "<|_placeholder_179|>",
1446
+ "lstrip": false,
1447
+ "normalized": false,
1448
+ "rstrip": false,
1449
+ "single_word": false,
1450
+ "special": true
1451
+ },
1452
+ "128180": {
1453
+ "content": "<|_placeholder_180|>",
1454
+ "lstrip": false,
1455
+ "normalized": false,
1456
+ "rstrip": false,
1457
+ "single_word": false,
1458
+ "special": true
1459
+ },
1460
+ "128181": {
1461
+ "content": "<|_placeholder_181|>",
1462
+ "lstrip": false,
1463
+ "normalized": false,
1464
+ "rstrip": false,
1465
+ "single_word": false,
1466
+ "special": true
1467
+ },
1468
+ "128182": {
1469
+ "content": "<|_placeholder_182|>",
1470
+ "lstrip": false,
1471
+ "normalized": false,
1472
+ "rstrip": false,
1473
+ "single_word": false,
1474
+ "special": true
1475
+ },
1476
+ "128183": {
1477
+ "content": "<|_placeholder_183|>",
1478
+ "lstrip": false,
1479
+ "normalized": false,
1480
+ "rstrip": false,
1481
+ "single_word": false,
1482
+ "special": true
1483
+ },
1484
+ "128184": {
1485
+ "content": "<|_placeholder_184|>",
1486
+ "lstrip": false,
1487
+ "normalized": false,
1488
+ "rstrip": false,
1489
+ "single_word": false,
1490
+ "special": true
1491
+ },
1492
+ "128185": {
1493
+ "content": "<|_placeholder_185|>",
1494
+ "lstrip": false,
1495
+ "normalized": false,
1496
+ "rstrip": false,
1497
+ "single_word": false,
1498
+ "special": true
1499
+ },
1500
+ "128186": {
1501
+ "content": "<|_placeholder_186|>",
1502
+ "lstrip": false,
1503
+ "normalized": false,
1504
+ "rstrip": false,
1505
+ "single_word": false,
1506
+ "special": true
1507
+ },
1508
+ "128187": {
1509
+ "content": "<|_placeholder_187|>",
1510
+ "lstrip": false,
1511
+ "normalized": false,
1512
+ "rstrip": false,
1513
+ "single_word": false,
1514
+ "special": true
1515
+ },
1516
+ "128188": {
1517
+ "content": "<|_placeholder_188|>",
1518
+ "lstrip": false,
1519
+ "normalized": false,
1520
+ "rstrip": false,
1521
+ "single_word": false,
1522
+ "special": true
1523
+ },
1524
+ "128189": {
1525
+ "content": "<|_placeholder_189|>",
1526
+ "lstrip": false,
1527
+ "normalized": false,
1528
+ "rstrip": false,
1529
+ "single_word": false,
1530
+ "special": true
1531
+ },
1532
+ "128190": {
1533
+ "content": "<|_placeholder_190|>",
1534
+ "lstrip": false,
1535
+ "normalized": false,
1536
+ "rstrip": false,
1537
+ "single_word": false,
1538
+ "special": true
1539
+ },
1540
+ "128191": {
1541
+ "content": "<|_placeholder_191|>",
1542
+ "lstrip": false,
1543
+ "normalized": false,
1544
+ "rstrip": false,
1545
+ "single_word": false,
1546
+ "special": true
1547
+ },
1548
+ "128192": {
1549
+ "content": "<|_placeholder_192|>",
1550
+ "lstrip": false,
1551
+ "normalized": false,
1552
+ "rstrip": false,
1553
+ "single_word": false,
1554
+ "special": true
1555
+ },
1556
+ "128193": {
1557
+ "content": "<|_placeholder_193|>",
1558
+ "lstrip": false,
1559
+ "normalized": false,
1560
+ "rstrip": false,
1561
+ "single_word": false,
1562
+ "special": true
1563
+ },
1564
+ "128194": {
1565
+ "content": "<|_placeholder_194|>",
1566
+ "lstrip": false,
1567
+ "normalized": false,
1568
+ "rstrip": false,
1569
+ "single_word": false,
1570
+ "special": true
1571
+ },
1572
+ "128195": {
1573
+ "content": "<|_placeholder_195|>",
1574
+ "lstrip": false,
1575
+ "normalized": false,
1576
+ "rstrip": false,
1577
+ "single_word": false,
1578
+ "special": true
1579
+ },
1580
+ "128196": {
1581
+ "content": "<|_placeholder_196|>",
1582
+ "lstrip": false,
1583
+ "normalized": false,
1584
+ "rstrip": false,
1585
+ "single_word": false,
1586
+ "special": true
1587
+ },
1588
+ "128197": {
1589
+ "content": "<|_placeholder_197|>",
1590
+ "lstrip": false,
1591
+ "normalized": false,
1592
+ "rstrip": false,
1593
+ "single_word": false,
1594
+ "special": true
1595
+ },
1596
+ "128198": {
1597
+ "content": "<|_placeholder_198|>",
1598
+ "lstrip": false,
1599
+ "normalized": false,
1600
+ "rstrip": false,
1601
+ "single_word": false,
1602
+ "special": true
1603
+ },
1604
+ "128199": {
1605
+ "content": "<|_placeholder_199|>",
1606
+ "lstrip": false,
1607
+ "normalized": false,
1608
+ "rstrip": false,
1609
+ "single_word": false,
1610
+ "special": true
1611
+ },
1612
+ "128200": {
1613
+ "content": "<|_placeholder_200|>",
1614
+ "lstrip": false,
1615
+ "normalized": false,
1616
+ "rstrip": false,
1617
+ "single_word": false,
1618
+ "special": true
1619
+ },
1620
+ "128201": {
1621
+ "content": "<|_placeholder_201|>",
1622
+ "lstrip": false,
1623
+ "normalized": false,
1624
+ "rstrip": false,
1625
+ "single_word": false,
1626
+ "special": true
1627
+ },
1628
+ "128202": {
1629
+ "content": "<|_placeholder_202|>",
1630
+ "lstrip": false,
1631
+ "normalized": false,
1632
+ "rstrip": false,
1633
+ "single_word": false,
1634
+ "special": true
1635
+ },
1636
+ "128203": {
1637
+ "content": "<|_placeholder_203|>",
1638
+ "lstrip": false,
1639
+ "normalized": false,
1640
+ "rstrip": false,
1641
+ "single_word": false,
1642
+ "special": true
1643
+ },
1644
+ "128204": {
1645
+ "content": "<|_placeholder_204|>",
1646
+ "lstrip": false,
1647
+ "normalized": false,
1648
+ "rstrip": false,
1649
+ "single_word": false,
1650
+ "special": true
1651
+ },
1652
+ "128205": {
1653
+ "content": "<|_placeholder_205|>",
1654
+ "lstrip": false,
1655
+ "normalized": false,
1656
+ "rstrip": false,
1657
+ "single_word": false,
1658
+ "special": true
1659
+ },
1660
+ "128206": {
1661
+ "content": "<|_placeholder_206|>",
1662
+ "lstrip": false,
1663
+ "normalized": false,
1664
+ "rstrip": false,
1665
+ "single_word": false,
1666
+ "special": true
1667
+ },
1668
+ "128207": {
1669
+ "content": "<|_placeholder_207|>",
1670
+ "lstrip": false,
1671
+ "normalized": false,
1672
+ "rstrip": false,
1673
+ "single_word": false,
1674
+ "special": true
1675
+ },
1676
+ "128208": {
1677
+ "content": "<|_placeholder_208|>",
1678
+ "lstrip": false,
1679
+ "normalized": false,
1680
+ "rstrip": false,
1681
+ "single_word": false,
1682
+ "special": true
1683
+ },
1684
+ "128209": {
1685
+ "content": "<|_placeholder_209|>",
1686
+ "lstrip": false,
1687
+ "normalized": false,
1688
+ "rstrip": false,
1689
+ "single_word": false,
1690
+ "special": true
1691
+ },
1692
+ "128210": {
1693
+ "content": "<|_placeholder_210|>",
1694
+ "lstrip": false,
1695
+ "normalized": false,
1696
+ "rstrip": false,
1697
+ "single_word": false,
1698
+ "special": true
1699
+ },
1700
+ "128211": {
1701
+ "content": "<|_placeholder_211|>",
1702
+ "lstrip": false,
1703
+ "normalized": false,
1704
+ "rstrip": false,
1705
+ "single_word": false,
1706
+ "special": true
1707
+ },
1708
+ "128212": {
1709
+ "content": "<|_placeholder_212|>",
1710
+ "lstrip": false,
1711
+ "normalized": false,
1712
+ "rstrip": false,
1713
+ "single_word": false,
1714
+ "special": true
1715
+ },
1716
+ "128213": {
1717
+ "content": "<|_placeholder_213|>",
1718
+ "lstrip": false,
1719
+ "normalized": false,
1720
+ "rstrip": false,
1721
+ "single_word": false,
1722
+ "special": true
1723
+ },
1724
+ "128214": {
1725
+ "content": "<|_placeholder_214|>",
1726
+ "lstrip": false,
1727
+ "normalized": false,
1728
+ "rstrip": false,
1729
+ "single_word": false,
1730
+ "special": true
1731
+ },
1732
+ "128215": {
1733
+ "content": "<|_placeholder_215|>",
1734
+ "lstrip": false,
1735
+ "normalized": false,
1736
+ "rstrip": false,
1737
+ "single_word": false,
1738
+ "special": true
1739
+ },
1740
+ "128216": {
1741
+ "content": "<|_placeholder_216|>",
1742
+ "lstrip": false,
1743
+ "normalized": false,
1744
+ "rstrip": false,
1745
+ "single_word": false,
1746
+ "special": true
1747
+ },
1748
+ "128217": {
1749
+ "content": "<|_placeholder_217|>",
1750
+ "lstrip": false,
1751
+ "normalized": false,
1752
+ "rstrip": false,
1753
+ "single_word": false,
1754
+ "special": true
1755
+ },
1756
+ "128218": {
1757
+ "content": "<|_placeholder_218|>",
1758
+ "lstrip": false,
1759
+ "normalized": false,
1760
+ "rstrip": false,
1761
+ "single_word": false,
1762
+ "special": true
1763
+ },
1764
+ "128219": {
1765
+ "content": "<|_placeholder_219|>",
1766
+ "lstrip": false,
1767
+ "normalized": false,
1768
+ "rstrip": false,
1769
+ "single_word": false,
1770
+ "special": true
1771
+ },
1772
+ "128220": {
1773
+ "content": "<|_placeholder_220|>",
1774
+ "lstrip": false,
1775
+ "normalized": false,
1776
+ "rstrip": false,
1777
+ "single_word": false,
1778
+ "special": true
1779
+ },
1780
+ "128221": {
1781
+ "content": "<|_placeholder_221|>",
1782
+ "lstrip": false,
1783
+ "normalized": false,
1784
+ "rstrip": false,
1785
+ "single_word": false,
1786
+ "special": true
1787
+ },
1788
+ "128222": {
1789
+ "content": "<|_placeholder_222|>",
1790
+ "lstrip": false,
1791
+ "normalized": false,
1792
+ "rstrip": false,
1793
+ "single_word": false,
1794
+ "special": true
1795
+ },
1796
+ "128223": {
1797
+ "content": "<|_placeholder_223|>",
1798
+ "lstrip": false,
1799
+ "normalized": false,
1800
+ "rstrip": false,
1801
+ "single_word": false,
1802
+ "special": true
1803
+ },
1804
+ "128224": {
1805
+ "content": "<|_placeholder_224|>",
1806
+ "lstrip": false,
1807
+ "normalized": false,
1808
+ "rstrip": false,
1809
+ "single_word": false,
1810
+ "special": true
1811
+ },
1812
+ "128225": {
1813
+ "content": "<|_placeholder_225|>",
1814
+ "lstrip": false,
1815
+ "normalized": false,
1816
+ "rstrip": false,
1817
+ "single_word": false,
1818
+ "special": true
1819
+ },
1820
+ "128226": {
1821
+ "content": "<|_placeholder_226|>",
1822
+ "lstrip": false,
1823
+ "normalized": false,
1824
+ "rstrip": false,
1825
+ "single_word": false,
1826
+ "special": true
1827
+ },
1828
+ "128227": {
1829
+ "content": "<|_placeholder_227|>",
1830
+ "lstrip": false,
1831
+ "normalized": false,
1832
+ "rstrip": false,
1833
+ "single_word": false,
1834
+ "special": true
1835
+ },
1836
+ "128228": {
1837
+ "content": "<|_placeholder_228|>",
1838
+ "lstrip": false,
1839
+ "normalized": false,
1840
+ "rstrip": false,
1841
+ "single_word": false,
1842
+ "special": true
1843
+ },
1844
+ "128229": {
1845
+ "content": "<|_placeholder_229|>",
1846
+ "lstrip": false,
1847
+ "normalized": false,
1848
+ "rstrip": false,
1849
+ "single_word": false,
1850
+ "special": true
1851
+ },
1852
+ "128230": {
1853
+ "content": "<|_placeholder_230|>",
1854
+ "lstrip": false,
1855
+ "normalized": false,
1856
+ "rstrip": false,
1857
+ "single_word": false,
1858
+ "special": true
1859
+ },
1860
+ "128231": {
1861
+ "content": "<|_placeholder_231|>",
1862
+ "lstrip": false,
1863
+ "normalized": false,
1864
+ "rstrip": false,
1865
+ "single_word": false,
1866
+ "special": true
1867
+ },
1868
+ "128232": {
1869
+ "content": "<|_placeholder_232|>",
1870
+ "lstrip": false,
1871
+ "normalized": false,
1872
+ "rstrip": false,
1873
+ "single_word": false,
1874
+ "special": true
1875
+ },
1876
+ "128233": {
1877
+ "content": "<|_placeholder_233|>",
1878
+ "lstrip": false,
1879
+ "normalized": false,
1880
+ "rstrip": false,
1881
+ "single_word": false,
1882
+ "special": true
1883
+ },
1884
+ "128234": {
1885
+ "content": "<|_placeholder_234|>",
1886
+ "lstrip": false,
1887
+ "normalized": false,
1888
+ "rstrip": false,
1889
+ "single_word": false,
1890
+ "special": true
1891
+ },
1892
+ "128235": {
1893
+ "content": "<|_placeholder_235|>",
1894
+ "lstrip": false,
1895
+ "normalized": false,
1896
+ "rstrip": false,
1897
+ "single_word": false,
1898
+ "special": true
1899
+ },
1900
+ "128236": {
1901
+ "content": "<|_placeholder_236|>",
1902
+ "lstrip": false,
1903
+ "normalized": false,
1904
+ "rstrip": false,
1905
+ "single_word": false,
1906
+ "special": true
1907
+ },
1908
+ "128237": {
1909
+ "content": "<|_placeholder_237|>",
1910
+ "lstrip": false,
1911
+ "normalized": false,
1912
+ "rstrip": false,
1913
+ "single_word": false,
1914
+ "special": true
1915
+ },
1916
+ "128238": {
1917
+ "content": "<|_placeholder_238|>",
1918
+ "lstrip": false,
1919
+ "normalized": false,
1920
+ "rstrip": false,
1921
+ "single_word": false,
1922
+ "special": true
1923
+ },
1924
+ "128239": {
1925
+ "content": "<|_placeholder_239|>",
1926
+ "lstrip": false,
1927
+ "normalized": false,
1928
+ "rstrip": false,
1929
+ "single_word": false,
1930
+ "special": true
1931
+ },
1932
+ "128240": {
1933
+ "content": "<|_placeholder_240|>",
1934
+ "lstrip": false,
1935
+ "normalized": false,
1936
+ "rstrip": false,
1937
+ "single_word": false,
1938
+ "special": true
1939
+ },
1940
+ "128241": {
1941
+ "content": "<|_placeholder_241|>",
1942
+ "lstrip": false,
1943
+ "normalized": false,
1944
+ "rstrip": false,
1945
+ "single_word": false,
1946
+ "special": true
1947
+ },
1948
+ "128242": {
1949
+ "content": "<|_placeholder_242|>",
1950
+ "lstrip": false,
1951
+ "normalized": false,
1952
+ "rstrip": false,
1953
+ "single_word": false,
1954
+ "special": true
1955
+ },
1956
+ "128243": {
1957
+ "content": "<|_placeholder_243|>",
1958
+ "lstrip": false,
1959
+ "normalized": false,
1960
+ "rstrip": false,
1961
+ "single_word": false,
1962
+ "special": true
1963
+ },
1964
+ "128244": {
1965
+ "content": "<|_placeholder_244|>",
1966
+ "lstrip": false,
1967
+ "normalized": false,
1968
+ "rstrip": false,
1969
+ "single_word": false,
1970
+ "special": true
1971
+ },
1972
+ "128245": {
1973
+ "content": "<|_placeholder_245|>",
1974
+ "lstrip": false,
1975
+ "normalized": false,
1976
+ "rstrip": false,
1977
+ "single_word": false,
1978
+ "special": true
1979
+ },
1980
+ "128246": {
1981
+ "content": "<|_placeholder_246|>",
1982
+ "lstrip": false,
1983
+ "normalized": false,
1984
+ "rstrip": false,
1985
+ "single_word": false,
1986
+ "special": true
1987
+ },
1988
+ "128247": {
1989
+ "content": "<|_placeholder_247|>",
1990
+ "lstrip": false,
1991
+ "normalized": false,
1992
+ "rstrip": false,
1993
+ "single_word": false,
1994
+ "special": true
1995
+ },
1996
+ "128248": {
1997
+ "content": "<|_placeholder_248|>",
1998
+ "lstrip": false,
1999
+ "normalized": false,
2000
+ "rstrip": false,
2001
+ "single_word": false,
2002
+ "special": true
2003
+ },
2004
+ "128249": {
2005
+ "content": "<|_placeholder_249|>",
2006
+ "lstrip": false,
2007
+ "normalized": false,
2008
+ "rstrip": false,
2009
+ "single_word": false,
2010
+ "special": true
2011
+ },
2012
+ "128250": {
2013
+ "content": "<|_placeholder_250|>",
2014
+ "lstrip": false,
2015
+ "normalized": false,
2016
+ "rstrip": false,
2017
+ "single_word": false,
2018
+ "special": true
2019
+ },
2020
+ "128251": {
2021
+ "content": "<|_placeholder_251|>",
2022
+ "lstrip": false,
2023
+ "normalized": false,
2024
+ "rstrip": false,
2025
+ "single_word": false,
2026
+ "special": true
2027
+ },
2028
+ "128252": {
2029
+ "content": "<|_placeholder_252|>",
2030
+ "lstrip": false,
2031
+ "normalized": false,
2032
+ "rstrip": false,
2033
+ "single_word": false,
2034
+ "special": true
2035
+ },
2036
+ "128253": {
2037
+ "content": "<|_placeholder_253|>",
2038
+ "lstrip": false,
2039
+ "normalized": false,
2040
+ "rstrip": false,
2041
+ "single_word": false,
2042
+ "special": true
2043
+ },
2044
+ "128254": {
2045
+ "content": "<|_placeholder_254|>",
2046
+ "lstrip": false,
2047
+ "normalized": false,
2048
+ "rstrip": false,
2049
+ "single_word": false,
2050
+ "special": true
2051
+ },
2052
+ "128255": {
2053
+ "content": "<|_placeholder_255|>",
2054
+ "lstrip": false,
2055
+ "normalized": false,
2056
+ "rstrip": false,
2057
+ "single_word": false,
2058
+ "special": true
2059
+ }
2060
+ },
2061
+ "auto_map": {
2062
+ "AutoProcessor": "processing_vlm.HCXVisionV2Processor"
2063
+ },
2064
+ "bos_token": "<|endoftext|>",
2065
+ "clean_up_tokenization_spaces": true,
2066
+ "eos_token": "<|im_end|>",
2067
+ "extra_special_tokens": {
2068
+ "image_token": "<|IMAGE_PAD|>",
2069
+ "video_token": "<|VIDEO_PAD|>"
2070
+ },
2071
+ "image_token": "<|IMAGE_PAD|>",
2072
+ "model_max_length": 1000000000000000019884624838656,
2073
+ "pad_token": "<|endoftext|>",
2074
+ "processor_class": "HCXVisionV2Processor",
2075
+ "sep_token": "<|endoftext|>",
2076
+ "tokenizer_class": "GPT2Tokenizer",
2077
+ "unk_token": "<|endoftext|>",
2078
+ "video_token": "<|VIDEO_PAD|>"
2079
+ }
video_preprocessor_config.json ADDED
@@ -0,0 +1,89 @@
+ {
+   "_valid_kwargs_names": [
+     "do_convert_rgb",
+     "do_resize",
+     "size",
+     "size_divisor",
+     "default_to_square",
+     "resample",
+     "do_rescale",
+     "rescale_factor",
+     "do_normalize",
+     "image_mean",
+     "image_std",
+     "do_pad",
+     "do_center_crop",
+     "crop_size",
+     "data_format",
+     "input_data_format",
+     "device",
+     "min_pixels",
+     "max_pixels",
+     "patch_size",
+     "temporal_patch_size",
+     "merge_size"
+   ],
+   "auto_map": {
+     "AutoProcessor": "processing_vlm.HCXVisionV2Processor"
+   },
+   "crop_size": null,
+   "data_format": "channels_first",
+   "default_to_square": true,
+   "device": null,
+   "do_center_crop": null,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_pad": null,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_processor_type": "Qwen2VLImageProcessor",
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "input_data_format": null,
+   "max_pixels": 12845056,
+   "merge_size": 2,
+   "min_pixels": 3136,
+   "model_valid_processing_keys": [
+     "do_convert_rgb",
+     "do_resize",
+     "size",
+     "size_divisor",
+     "default_to_square",
+     "resample",
+     "do_rescale",
+     "rescale_factor",
+     "do_normalize",
+     "image_mean",
+     "image_std",
+     "do_pad",
+     "do_center_crop",
+     "crop_size",
+     "data_format",
+     "input_data_format",
+     "device",
+     "min_pixels",
+     "max_pixels",
+     "patch_size",
+     "temporal_patch_size",
+     "merge_size"
+   ],
+   "patch_size": 14,
+   "processor_class": "HCXVisionV2Processor",
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "longest_edge": 12845056,
+     "shortest_edge": 3136
+   },
+   "size_divisor": null,
+   "temporal_patch_size": 2,
+   "video_processor_type": "Qwen2VLVideoProcessor"
+ }
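video_preprocessor_config.json mirrors the Qwen2VL-style pipeline (Qwen2VLImageProcessor / Qwen2VLVideoProcessor): frames are resized so their pixel area stays between min_pixels (3136 = 56x56) and max_pixels (12845056 = 3584x3584), cut into 14x14 patches, and 2x2 groups of patches are merged into one visual token. A rough, back-of-the-envelope sketch of the resulting per-frame token budget (an approximation of the usual Qwen2VL accounting, not code from this repository; the real resize also snaps dimensions to multiples of patch_size * merge_size):

def approx_visual_tokens(height, width, patch_size=14, merge_size=2,
                         min_pixels=3136, max_pixels=12845056):
    """Approximate visual-token count for one frame under the config above."""
    area = height * width
    # scale the frame so its area lands inside [min_pixels, max_pixels]
    scale = 1.0
    if area > max_pixels:
        scale = (max_pixels / area) ** 0.5
    elif area < min_pixels:
        scale = (min_pixels / area) ** 0.5
    h, w = height * scale, width * scale
    grid_h, grid_w = round(h / patch_size), round(w / patch_size)
    return (grid_h * grid_w) // (merge_size ** 2)

print(approx_visual_tokens(560, 840))  # 40x60 patches -> ~600 tokens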
vocab.json ADDED
The diff for this file is too large to render. See raw diff