Raphael Glon committed on
Commit 22111fb · unverified · 1 Parent(s): 3f40f7e

Signed-off-by: Raphael Glon <[email protected]>

Files changed (2)
  1. app.py +133 -0
  2. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,133 @@
+ import spaces
+
+ import logging
+ import os
+ import threading
+ from typing import List, Tuple, Dict
+
+ import torch
+ import gradio as gr
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
+ from huggingface_hub import login
+
+ MODEL_ID = "openai/gpt-oss-20b"
+
+ logging.basicConfig(level=logging.DEBUG)
+
+ LOG = logging.getLogger(__name__)
+
+ MAX_NEW_TOKENS = 256
+ TEMPERATURE = 0.7
+ TOP_P = 0.95
+
+ # --- Silent Hub auth via env/Space Secret (no UI) ---
+ HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")
+ if HF_TOKEN:
+     try:
+         login(token=HF_TOKEN)
+     except Exception:
+         pass  # stay silent
+
+ # Globals so we only load once
+ _tokenizer = None
+ _model = None
+ _device = None
+
+
+ def _ensure_loaded():
+     global _tokenizer, _model, _device
+     if _tokenizer is not None and _model is not None:
+         return
+     LOG.info("Loading model and tokenizer")
+     _tokenizer = AutoTokenizer.from_pretrained(
+         MODEL_ID, trust_remote_code=True
+     )
+     _model = AutoModelForCausalLM.from_pretrained(
+         MODEL_ID,
+         trust_remote_code=True,
+         torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+         low_cpu_mem_usage=True,
+         device_map="auto" if torch.cuda.is_available() else None,
+     )
+     # Reuse EOS as PAD if the tokenizer ships without a pad token
+     if _tokenizer.pad_token_id is None and _tokenizer.eos_token_id is not None:
+         _tokenizer.pad_token = _tokenizer.eos_token
+     _model.eval()
+     _device = next(_model.parameters()).device
+
+
+ _ensure_loaded()
+
+ LOG.info("DEVICE %s", _device)
+
+
+ def _history_to_messages(history: List[Tuple[str, str]]) -> List[Dict[str, str]]:
+     msgs: List[Dict[str, str]] = []
+     for user_msg, bot_msg in history:
+         if user_msg:
+             msgs.append({"role": "user", "content": user_msg})
+         if bot_msg:
+             msgs.append({"role": "assistant", "content": bot_msg})
+     return msgs
+
+
+ @spaces.GPU(duration=120)
+ def generate_stream(message: str, history: List[Tuple[str, str]]):
+     """
+     Minimal streaming chat function for gr.ChatInterface.
+     Uses the instruct chat template. No token UI. No extra controls.
+     """
+     _ensure_loaded()
+
+     messages = _history_to_messages(history) + [{"role": "user", "content": message}]
+     inputs = _tokenizer.apply_chat_template(
+         messages,
+         return_tensors="pt",
+         add_generation_prompt=True,
+     )
+     input_ids = inputs["input_ids"] if isinstance(inputs, dict) else inputs
+     input_ids = input_ids.to(_device)
+
+     # IMPORTANT: don't stream the prompt (prevents system/user text from appearing)
+     streamer = TextIteratorStreamer(
+         _tokenizer,
+         skip_special_tokens=True,
+         skip_prompt=True,  # <-- key fix
+     )
+
+     gen_kwargs = dict(
+         input_ids=input_ids,
+         max_new_tokens=MAX_NEW_TOKENS,
+         do_sample=TEMPERATURE > 0.0,
+         temperature=float(TEMPERATURE),
+         top_p=float(TOP_P),
+         pad_token_id=_tokenizer.pad_token_id,
+         eos_token_id=_tokenizer.eos_token_id,
+         streamer=streamer,
+     )
+
+     # Generate on a background thread; the streamer yields text as it arrives
+     thread = threading.Thread(target=_model.generate, kwargs=gen_kwargs)
+     thread.start()
+
+     output = ""
+     for new_text in streamer:
+         output += new_text
+         yield output
+     thread.join()
+
+
+ with gr.Blocks(title="gpt-oss-20b Chat") as demo:
+     gr.Markdown(
+         """
+         # Chat
+         Streaming chat with openai/gpt-oss-20b (instruct)
+         """)
+     gr.ChatInterface(
+         fn=generate_stream,
+         chatbot=gr.Chatbot(height=420, label="OpenAI"),
+         title=None,  # header handled by Markdown above
+         description=None,
+     )
+
+ if __name__ == "__main__":
+     demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))
+
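
A minimal client sketch (an illustration under stated assumptions, not part of the commit): gr.ChatInterface registers its function under the default endpoint name "/chat", so the running app can be queried with gradio_client. The URL here assumes the launch() defaults above.

from gradio_client import Client

# Assumes app.py is running locally on its default port
client = Client("http://localhost:7860")

# predict() blocks until streaming finishes and returns the final reply string
reply = client.predict("Hello, what can you do?", api_name="/chat")
print(reply)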
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ transformers==4.57.1
+ torch==2.9.0
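
The pinned list omits packages app.py imports that a Gradio Space typically provides (gradio, spaces) or pulls in transitively (huggingface_hub); note that device_map="auto" additionally requires accelerate to be installed. A local smoke-test sketch under those assumptions, reusing the loading arguments from app.py:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Assumption: accelerate is installed (required by device_map="auto")
MODEL_ID = "openai/gpt-oss-20b"
tok = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    low_cpu_mem_usage=True,
    device_map="auto" if torch.cuda.is_available() else None,
)

# One short generation confirms tokenizer, weights, and device placement
ids = tok.apply_chat_template(
    [{"role": "user", "content": "Say hi."}],
    return_tensors="pt",
    add_generation_prompt=True,
).to(model.device)
out = model.generate(ids, max_new_tokens=16)
print(tok.decode(out[0][ids.shape[-1]:], skip_special_tokens=True))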