WildOjisan committed on
Commit ac3772f · 1 Parent(s): e65aa9f
Files changed (4)
  1. main.py +55 -29
  2. main_old1.py +122 -0
  3. requirements.txt +3 -0
  4. simplerequest.txt +13 -0
main.py CHANGED
@@ -5,67 +5,92 @@ os.environ["TRANSFORMERS_CACHE"] = "/data/transformers"
 os.environ["HF_HUB_CACHE"] = "/data/hub"
 os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
 os.environ.setdefault("PYTORCH_FORCE_MPS_FALLBACK", "1")
-
 import threading
-from typing import List, Optional, Dict, Any, Iterator
+from typing import List, Dict, Iterator
 
 import torch
 from fastapi import FastAPI, Body
-from fastapi.responses import StreamingResponse, JSONResponse
+from fastapi.responses import StreamingResponse
 from pydantic import BaseModel, Field
 from transformers import (
     AutoTokenizer,
     AutoModelForCausalLM,
     TextIteratorStreamer,
 )
+from peft import PeftModel
+
+# ----------------- Environment defaults -----------------
+os.environ.setdefault("PYTORCH_FORCE_MPS_FALLBACK", "1")
+os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
+
+# CPU-only: disable 4-bit, use float32
+USE_4BIT = False
+COMPUTE_DTYPE = torch.float32
 
-MODEL_ID = "unsloth/Qwen2.5-1.5B-Instruct"
+# Base model / adapter paths
+MODEL_ID = os.environ.get("MODEL_ID", "unsloth/Qwen2.5-1.5B-Instruct")
+ADAPTER_ID = os.environ.get("ADAPTER_ID", "WildOjisan/qwen2_5_lora_adapter_test1")
 
+# Thread count
 try:
     torch.set_num_threads(max(1, os.cpu_count() or 1))
 except Exception:
     pass
 
-# ---- Load global model/tokenizer ----
-print(f"[BOOT] Loading {MODEL_ID} on CPU(float32)...")
-tokenizer = AutoTokenizer.from_pretrained(
-    MODEL_ID,
-    use_fast=False,
-    trust_remote_code=True,
-)
+# ----------------- Load -----------------
+print(f"[BOOT] Base: {MODEL_ID}")
+print(f"[BOOT] LoRA: {ADAPTER_ID}")
+
+device_map = "cpu"
 
-model = AutoModelForCausalLM.from_pretrained(
+# Tokenizer: the adapter may ship custom tokens/templates, so try it first
+try:
+    tokenizer = AutoTokenizer.from_pretrained(ADAPTER_ID, use_fast=False, trust_remote_code=True)
+    print("[BOOT] Tokenizer loaded from ADAPTER_ID.")
+except Exception:
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False, trust_remote_code=True)
+    print("[BOOT] Tokenizer loaded from MODEL_ID.")
+
+# Pad-token fix (avoids the same warning as in the Colab code)
+if tokenizer.pad_token is None:
+    tokenizer.pad_token = tokenizer.eos_token
+
+# Load the base model on CPU (float32)
+base_model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
-    torch_dtype=torch.float32,
-    device_map="cpu",
-    low_cpu_mem_usage=True,  # requires accelerate (reflected in requirements)
+    device_map=device_map,
     trust_remote_code=True,
+    torch_dtype=COMPUTE_DTYPE,
+    low_cpu_mem_usage=True,
 )
+
+# Attach the LoRA adapter (no merge: same behavior as in Colab)
+model = PeftModel.from_pretrained(base_model, ADAPTER_ID)
 model.eval()
 
-# ---- API models ----
+# ----------------- API schema / app -----------------
 class ChatMessage(BaseModel):
     role: str = Field(..., description="system | user | assistant")
     content: str
 
 class ChatRequest(BaseModel):
     messages: List[ChatMessage]
-    max_new_tokens: int = 256
-    temperature: float = 0.7
-    top_p: float = 0.95
+    max_new_tokens: int = 128
+    temperature: float = 0.7   # matches the Colab default
+    top_p: float = 0.9         # matches the Colab default
     repetition_penalty: float = 1.1
 
 class ChatResponse(BaseModel):
     text: str
 
-app = FastAPI(title="Qwen2.5-1.5B CPU API")
+app = FastAPI(title="Qwen2.5-1.5B 4bit + LoRA API")
 
 @app.get("/")
 def health():
-    return {"status": "ok", "model": MODEL_ID}
+    return {"status": "ok", "base": MODEL_ID, "adapter": ADAPTER_ID, "use_4bit": USE_4BIT}
 
 def build_prompt(messages: List[Dict[str, str]]) -> str:
-    # Recommended for the Qwen family: use the chat template (same idea as the upload script)
+    # Qwen's recommended chat template (same as in Colab)
     return tokenizer.apply_chat_template(
         [{"role": m["role"], "content": m["content"]} for m in messages],
         tokenize=False,
@@ -76,9 +101,12 @@ def build_prompt(messages: List[Dict[str, str]]) -> str:
 def chat(req: ChatRequest):
     prompt = build_prompt([m.dict() for m in req.messages])
     inputs = tokenizer(prompt, return_tensors="pt")
+    # Move inputs to the model's device
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
+
     with torch.no_grad():
         output_ids = model.generate(
-            **{k: v.to("cpu") for k, v in inputs.items()},
+            **inputs,
             max_new_tokens=req.max_new_tokens,
             do_sample=True,
             temperature=req.temperature,
@@ -87,16 +115,18 @@ def chat(req: ChatRequest):
             eos_token_id=tokenizer.eos_token_id,
             pad_token_id=tokenizer.eos_token_id,
         )
+
     text = tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
     return ChatResponse(text=text)
 
 def stream_generate(req: ChatRequest) -> Iterator[str]:
     prompt = build_prompt([m.dict() for m in req.messages])
     inputs = tokenizer(prompt, return_tensors="pt")
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
     gen_kwargs = dict(
-        **{k: v.to("cpu") for k, v in inputs.items()},
+        **inputs,
         max_new_tokens=req.max_new_tokens,
         do_sample=True,
         temperature=req.temperature,
@@ -110,13 +140,9 @@ def stream_generate(req: ChatRequest) -> Iterator[str]:
     thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
     thread.start()
 
-    # Send as NDJSON (one {"delta": "..."} object per line)
    for token_text in streamer:
        yield f'{{"delta": {token_text.__repr__()}}}\n'
 
 @app.post("/v1/chat/stream")
 def chat_stream(req: ChatRequest = Body(...)):
-    return StreamingResponse(
-        stream_generate(req),
-        media_type="application/x-ndjson",
-    )
+    return StreamingResponse(stream_generate(req), media_type="application/x-ndjson")
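
Note on the streaming format (unchanged in this commit): the delta lines are built with token_text.__repr__(), which emits Python-style quoting rather than strict JSON, so a strict NDJSON consumer may reject them. A minimal sketch of a stricter formatter, assuming the same {"delta": ...} line shape; the helper name is hypothetical and not part of this commit:

import json

def ndjson_delta(token_text: str) -> str:
    # json.dumps gives valid JSON string escaping; ensure_ascii=False keeps Korean text readable
    return json.dumps({"delta": token_text}, ensure_ascii=False) + "\n"

# inside stream_generate(): yield ndjson_delta(token_text)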
 
 
 
main_old1.py ADDED
@@ -0,0 +1,122 @@
+import os
+# ✅ Point the Hugging Face cache/token paths at a writable location (/data is safe on Spaces)
+os.environ["HF_HOME"] = "/data"
+os.environ["TRANSFORMERS_CACHE"] = "/data/transformers"
+os.environ["HF_HUB_CACHE"] = "/data/hub"
+os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
+os.environ.setdefault("PYTORCH_FORCE_MPS_FALLBACK", "1")
+
+import threading
+from typing import List, Optional, Dict, Any, Iterator
+
+import torch
+from fastapi import FastAPI, Body
+from fastapi.responses import StreamingResponse, JSONResponse
+from pydantic import BaseModel, Field
+from transformers import (
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    TextIteratorStreamer,
+)
+
+MODEL_ID = "unsloth/Qwen2.5-1.5B-Instruct"
+
+try:
+    torch.set_num_threads(max(1, os.cpu_count() or 1))
+except Exception:
+    pass
+
+# ---- Load global model/tokenizer ----
+print(f"[BOOT] Loading {MODEL_ID} on CPU(float32)...")
+tokenizer = AutoTokenizer.from_pretrained(
+    MODEL_ID,
+    use_fast=False,
+    trust_remote_code=True,
+)
+
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    torch_dtype=torch.float32,
+    device_map="cpu",
+    low_cpu_mem_usage=True,  # requires accelerate (reflected in requirements)
+    trust_remote_code=True,
+)
+model.eval()
+
+# ---- API models ----
+class ChatMessage(BaseModel):
+    role: str = Field(..., description="system | user | assistant")
+    content: str
+
+class ChatRequest(BaseModel):
+    messages: List[ChatMessage]
+    max_new_tokens: int = 256
+    temperature: float = 0.7
+    top_p: float = 0.95
+    repetition_penalty: float = 1.1
+
+class ChatResponse(BaseModel):
+    text: str
+
+app = FastAPI(title="Qwen2.5-1.5B CPU API")
+
+@app.get("/")
+def health():
+    return {"status": "ok", "model": MODEL_ID}
+
+def build_prompt(messages: List[Dict[str, str]]) -> str:
+    # Recommended for the Qwen family: use the chat template (same idea as the upload script)
+    return tokenizer.apply_chat_template(
+        [{"role": m["role"], "content": m["content"]} for m in messages],
+        tokenize=False,
+        add_generation_prompt=True,
+    )
+
+@app.post("/v1/chat", response_model=ChatResponse)
+def chat(req: ChatRequest):
+    prompt = build_prompt([m.dict() for m in req.messages])
+    inputs = tokenizer(prompt, return_tensors="pt")
+    with torch.no_grad():
+        output_ids = model.generate(
+            **{k: v.to("cpu") for k, v in inputs.items()},
+            max_new_tokens=req.max_new_tokens,
+            do_sample=True,
+            temperature=req.temperature,
+            top_p=req.top_p,
+            repetition_penalty=req.repetition_penalty,
+            eos_token_id=tokenizer.eos_token_id,
+            pad_token_id=tokenizer.eos_token_id,
+        )
+    text = tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
+    return ChatResponse(text=text)
+
+def stream_generate(req: ChatRequest) -> Iterator[str]:
+    prompt = build_prompt([m.dict() for m in req.messages])
+    inputs = tokenizer(prompt, return_tensors="pt")
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+    gen_kwargs = dict(
+        **{k: v.to("cpu") for k, v in inputs.items()},
+        max_new_tokens=req.max_new_tokens,
+        do_sample=True,
+        temperature=req.temperature,
+        top_p=req.top_p,
+        repetition_penalty=req.repetition_penalty,
+        eos_token_id=tokenizer.eos_token_id,
+        pad_token_id=tokenizer.eos_token_id,
+        streamer=streamer,
+    )
+
+    thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
+    thread.start()
+
+    # Send as NDJSON (one {"delta": "..."} object per line)
+    for token_text in streamer:
+        yield f'{{"delta": {token_text.__repr__()}}}\n'
+
+@app.post("/v1/chat/stream")
+def chat_stream(req: ChatRequest = Body(...)):
+    return StreamingResponse(
+        stream_generate(req),
+        media_type="application/x-ndjson",
+    )
requirements.txt CHANGED
@@ -10,3 +10,6 @@ protobuf>=4.25.3
 
 fastapi>=0.112
 uvicorn[standard]>=0.30
+peft>=0.11.1
+unsloth
+bitsandbytes==0.43.3
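
Of the three new packages, only peft is actually imported by main.py; unsloth and bitsandbytes are not used on the CPU-only path (USE_4BIT stays False). A minimal import sanity check, assuming only the package names listed above (a hypothetical helper, not part of this commit):

import importlib

for pkg in ("peft", "bitsandbytes", "unsloth"):
    try:
        mod = importlib.import_module(pkg)
        print(pkg, getattr(mod, "__version__", "unknown"))
    except Exception as exc:
        print(f"{pkg} failed to import: {exc}")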
simplerequest.txt ADDED
@@ -0,0 +1,13 @@
+$body = @{
+    messages = @(
+        @{ role = "system"; content = "" },
+        @{ role = "user"; content = "Introduce yourself briefly" }
+    )
+    max_new_tokens = 128
+    temperature = 0.7
+} | ConvertTo-Json -Depth 3
+
+Invoke-RestMethod -Uri https://wildojisan-qwen2-5-1-5b-instruct-basic-test.hf.space/v1/chat `
+    -Method POST `
+    -ContentType "application/json" `
+    -Body $body
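
The streaming endpoint has no sample request yet; a rough Python client sketch against the same Space URL (the requests dependency and the output handling are assumptions, not part of this commit):

import requests

BASE = "https://wildojisan-qwen2-5-1-5b-instruct-basic-test.hf.space"

payload = {
    "messages": [
        {"role": "system", "content": ""},
        {"role": "user", "content": "Introduce yourself briefly"},
    ],
    "max_new_tokens": 128,
    "temperature": 0.7,
}

# Non-streaming call, mirroring simplerequest.txt
print(requests.post(f"{BASE}/v1/chat", json=payload, timeout=300).json()["text"])

# Streaming call: one delta per NDJSON line; printed raw because the server's
# repr()-based lines are not guaranteed to be strict JSON
with requests.post(f"{BASE}/v1/chat/stream", json=payload, stream=True, timeout=300) as r:
    for line in r.iter_lines(decode_unicode=True):
        if line:
            print(line)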