import os
# ✅ Point the Hugging Face cache/token paths at a writable location (/data is safe on Spaces)
os.environ["HF_HOME"] = "/data"
os.environ["TRANSFORMERS_CACHE"] = "/data/transformers"
os.environ["HF_HUB_CACHE"] = "/data/hub"
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
os.environ.setdefault("PYTORCH_FORCE_MPS_FALLBACK", "1")

import json
import threading
from typing import List, Optional, Dict, Any, Iterator

import torch
from fastapi import FastAPI, Body
from fastapi.responses import StreamingResponse, JSONResponse
from pydantic import BaseModel, Field
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextIteratorStreamer,
)

MODEL_ID = "unsloth/Qwen2.5-1.5B-Instruct"

try:
    torch.set_num_threads(max(1, os.cpu_count() or 1))
except Exception:
    pass

# ---- Load the global model/tokenizer ----
print(f"[BOOT] Loading {MODEL_ID} on CPU(float32)...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    use_fast=False,
    trust_remote_code=True,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,
    device_map="cpu",
    low_cpu_mem_usage=True,  # requires accelerate (already listed in requirements)
    trust_remote_code=True,
)
model.eval()

# ---- API models ----
class ChatMessage(BaseModel):
    role: str = Field(..., description="system | user | assistant")
    content: str

class ChatRequest(BaseModel):
    messages: List[ChatMessage]
    max_new_tokens: int = 256
    temperature: float = 0.7
    top_p: float = 0.95
    repetition_penalty: float = 1.1

class ChatResponse(BaseModel):
    text: str
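
# For reference, a request body matching the ChatRequest model above might look
# like this (illustrative values only):
# {
#   "messages": [
#     {"role": "system", "content": "You are a helpful assistant."},
#     {"role": "user", "content": "Hello!"}
#   ],
#   "max_new_tokens": 128,
#   "temperature": 0.7,
#   "top_p": 0.95,
#   "repetition_penalty": 1.1
# }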

app = FastAPI(title="Qwen2.5-1.5B CPU API")

@app.get("/")
def health():
    return {"status": "ok", "model": MODEL_ID}

def build_prompt(messages: List[Dict[str, str]]) -> str:
    # Recommended for Qwen-family models: build the prompt with the chat template
    # (same approach as the upload script).
    return tokenizer.apply_chat_template(
        [{"role": m["role"], "content": m["content"]} for m in messages],
        tokenize=False,
        add_generation_prompt=True,
    )

@app.post("/v1/chat", response_model=ChatResponse)
def chat(req: ChatRequest):
    prompt = build_prompt([m.dict() for m in req.messages])
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        output_ids = model.generate(
            **{k: v.to("cpu") for k, v in inputs.items()},
            max_new_tokens=req.max_new_tokens,
            do_sample=True,
            temperature=req.temperature,
            top_p=req.top_p,
            repetition_penalty=req.repetition_penalty,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )
    text = tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    return ChatResponse(text=text)

def stream_generate(req: ChatRequest) -> Iterator[str]:
    prompt = build_prompt([m.dict() for m in req.messages])
    inputs = tokenizer(prompt, return_tensors="pt")
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    gen_kwargs = dict(
        **{k: v.to("cpu") for k, v in inputs.items()},
        max_new_tokens=req.max_new_tokens,
        do_sample=True,
        temperature=req.temperature,
        top_p=req.top_p,
        repetition_penalty=req.repetition_penalty,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    )

    # model.generate() blocks until completion, so run it in a background thread
    # and read decoded text chunks from the streamer as they are produced.
    thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()

    # Stream as NDJSON: one {"delta": "..."} JSON object per line.
    for token_text in streamer:
        yield json.dumps({"delta": token_text}, ensure_ascii=False) + "\n"
    thread.join()

@app.post("/v1/chat/stream")
def chat_stream(req: ChatRequest = Body(...)):
    return StreamingResponse(
        stream_generate(req),
        media_type="application/x-ndjson",
    )
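
# Optional local launcher: a minimal sketch, assuming uvicorn is installed and
# port 7860 (the Hugging Face Spaces default). On a Docker Space the container
# CMD usually starts the server instead.
#
# Example calls once the server is reachable at http://localhost:7860:
#   curl -X POST http://localhost:7860/v1/chat \
#     -H "Content-Type: application/json" \
#     -d '{"messages":[{"role":"user","content":"Hello"}],"max_new_tokens":64}'
#   curl -N -X POST http://localhost:7860/v1/chat/stream \
#     -H "Content-Type: application/json" \
#     -d '{"messages":[{"role":"user","content":"Hello"}]}'
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", "7860")))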