import os
# ✅ Point the Hugging Face cache/token paths at a writable location (/data is safe on Spaces)
os.environ["HF_HOME"] = "/data"
os.environ["TRANSFORMERS_CACHE"] = "/data/transformers"
os.environ["HF_HUB_CACHE"] = "/data/hub"
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
os.environ.setdefault("PYTORCH_FORCE_MPS_FALLBACK", "1")
import json
import threading
from typing import List, Dict, Iterator

import torch
from fastapi import FastAPI, Body
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, Field
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextIteratorStreamer,
)
from peft import PeftModel

# ----------------- Environment defaults -----------------

# CPU-only: disable 4-bit, use float32
USE_4BIT = False
COMPUTE_DTYPE = torch.float32

# Base model / adapter paths
MODEL_ID = os.environ.get("MODEL_ID", "unsloth/Qwen2.5-1.5B-Instruct")
ADAPTER_ID = os.environ.get("ADAPTER_ID", "WildOjisan/qwen2_5_lora_adapter_test1")

# Thread count
try:
    torch.set_num_threads(max(1, os.cpu_count() or 1))
except Exception:
    pass

# ----------------- Load -----------------
print(f"[BOOT] Base: {MODEL_ID}")
print(f"[BOOT] LoRA: {ADAPTER_ID}")

device_map = "cpu"

# Tokenizer: the adapter may ship custom tokens/templates, so try it first
try:
    tokenizer = AutoTokenizer.from_pretrained(ADAPTER_ID, use_fast=False, trust_remote_code=True)
    print("[BOOT] Tokenizer loaded from ADAPTER_ID.")
except Exception:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False, trust_remote_code=True)
    print("[BOOT] Tokenizer loaded from MODEL_ID.")

# Fix up the pad token (avoids the same warning as in the Colab code)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the base model on CPU (float32)
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map=device_map,
    trust_remote_code=True,
    torch_dtype=COMPUTE_DTYPE,
    low_cpu_mem_usage=True,
)

# Attach the LoRA adapter (no merge: same behavior as in Colab)
model = PeftModel.from_pretrained(base_model, ADAPTER_ID)
model.eval()

# ----------------- API schema / app -----------------
class ChatMessage(BaseModel):
    role: str = Field(..., description="system | user | assistant")
    content: str

class ChatRequest(BaseModel):
    messages: List[ChatMessage]
    max_new_tokens: int = 128
    temperature: float = 0.7        # matches the Colab default
    top_p: float = 0.9              # matches the Colab default
    repetition_penalty: float = 1.1

class ChatResponse(BaseModel):
    text: str

app = FastAPI(title="Qwen2.5-1.5B + LoRA API (CPU)")

@app.get("/")
def health():
    return {"status": "ok", "base": MODEL_ID, "adapter": ADAPTER_ID, "use_4bit": USE_4BIT}

def build_prompt(messages: List[Dict[str, str]]) -> str:
    # Qwen's recommended chat template (same as in Colab)
    return tokenizer.apply_chat_template(
        [{"role": m["role"], "content": m["content"]} for m in messages],
        tokenize=False,
        add_generation_prompt=True,
    )

@app.post("/v1/chat", response_model=ChatResponse)
def chat(req: ChatRequest):
    prompt = build_prompt([m.dict() for m in req.messages])
    inputs = tokenizer(prompt, return_tensors="pt")
    # Move input tensors to the model's device
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=req.max_new_tokens,
            do_sample=True,
            temperature=req.temperature,
            top_p=req.top_p,
            repetition_penalty=req.repetition_penalty,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )

    text = tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    return ChatResponse(text=text)
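
# A minimal usage sketch for the /v1/chat endpoint above (illustration only; the
# host and port are assumptions about your deployment, not part of this app):
#
#   curl -X POST http://localhost:7860/v1/chat \
#     -H "Content-Type: application/json" \
#     -d '{"messages":[{"role":"user","content":"Hello!"}],"max_new_tokens":64}'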

def stream_generate(req: ChatRequest) -> Iterator[str]:
    prompt = build_prompt([m.dict() for m in req.messages])
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    gen_kwargs = dict(
        **inputs,
        max_new_tokens=req.max_new_tokens,
        do_sample=True,
        temperature=req.temperature,
        top_p=req.top_p,
        repetition_penalty=req.repetition_penalty,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    )

    thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()

    for token_text in streamer:
        # Emit valid NDJSON (one JSON object per line) so clients can json.loads() each line
        yield json.dumps({"delta": token_text}, ensure_ascii=False) + "\n"
    thread.join()

@app.post("/v1/chat/stream")
def chat_stream(req: ChatRequest = Body(...)):
    return StreamingResponse(stream_generate(req), media_type="application/x-ndjson")
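
# A minimal client sketch for the streaming endpoint (illustration only; the URL,
# port, and module name below are assumptions, not part of this app). Each NDJSON
# line carries one {"delta": "..."} chunk:
#
#   import json, requests
#   resp = requests.post(
#       "http://localhost:7860/v1/chat/stream",
#       json={"messages": [{"role": "user", "content": "Hi"}], "max_new_tokens": 64},
#       stream=True,
#   )
#   for line in resp.iter_lines():
#       if line:
#           print(json.loads(line)["delta"], end="", flush=True)
#
# For local runs outside Spaces, the server could be started with uvicorn, e.g.:
#
#   uvicorn app:app --host 0.0.0.0 --port 7860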