yongdong committed
Commit · 1f6f70b
1 Parent(s): c6b828a

Disable sampling for deterministic JSON output
app.py CHANGED
@@ -92,7 +92,7 @@ def load_model_on_gpu():
         raise load_error
 
 @spaces.GPU(duration=60)  # GPU inference
-def generate_response_gpu(prompt, max_tokens=200, temperature=0.7, top_p=0.9):
+def generate_response_gpu(prompt, max_tokens=512):
     """Generate response - executed on GPU"""
     global model
 
@@ -109,7 +109,6 @@ def generate_response_gpu(prompt, max_tokens=200, temperature=0.7, top_p=0.9):
 
     try:
         formatted_prompt = (
-            "You are a JSON generator. Please output only a valid JSON object and no additional text.\n\n"
             "### Instruction:\n"
             f"{prompt.strip()}\n\n"
             "### Response:\n"
@@ -128,9 +127,7 @@ def generate_response_gpu(prompt, max_tokens=200, temperature=0.7, top_p=0.9):
         outputs = model.generate(
             **inputs,
             max_new_tokens=max_tokens,
-            do_sample=True,
-            temperature=temperature,
-            top_p=top_p,
+            do_sample=False,
             pad_token_id=tokenizer.pad_token_id,
             eos_token_id=tokenizer.eos_token_id,
             repetition_penalty=1.1,
@@ -152,7 +149,7 @@ def generate_response_gpu(prompt, max_tokens=200, temperature=0.7, top_p=0.9):
     except Exception as generation_error:
         return f"❌ Generation Error: {str(generation_error)}"
 
-def chat_interface(message, history, max_tokens, temperature, top_p):
+def chat_interface(message, history, max_tokens):
     """Chat interface - runs on CPU, calls GPU functions"""
     if not message.strip():
         return history, ""
@@ -163,7 +160,7 @@ def chat_interface(message, history, max_tokens, temperature, top_p):
 
     try:
         # Call GPU function to generate response
-        response = generate_response_gpu(message, max_tokens, temperature, top_p)
+        response = generate_response_gpu(message, max_tokens)
         history.append((message, response))
         return history, ""
     except Exception as chat_error:
@@ -226,31 +223,13 @@ with gr.Blocks(
 
             max_tokens = gr.Slider(
                 minimum=50,
-                maximum=
-                value=
+                maximum=5000,
+                value=512,
                 step=10,
                 label="Max Tokens",
                 info="Maximum number of tokens to generate"
             )
 
-            temperature = gr.Slider(
-                minimum=0.1,
-                maximum=2.0,
-                value=0.7,
-                step=0.1,
-                label="Temperature",
-                info="Controls randomness (lower = more focused)"
-            )
-
-            top_p = gr.Slider(
-                minimum=0.1,
-                maximum=1.0,
-                value=0.9,
-                step=0.05,
-                label="Top-p",
-                info="Nucleus sampling threshold"
-            )
-
             gr.Markdown("""
             ### 📊 Model Status
             - **Hardware**: ZeroGPU (Dynamic Nvidia H200)
@@ -268,13 +247,13 @@ with gr.Blocks(
     # Event handling
     msg.submit(
         chat_interface,
-        inputs=[msg, chatbot, max_tokens, temperature, top_p],
+        inputs=[msg, chatbot, max_tokens],
         outputs=[chatbot, msg]
     )
 
     send_btn.click(
         chat_interface,
-        inputs=[msg, chatbot, max_tokens, temperature, top_p],
+        inputs=[msg, chatbot, max_tokens],
         outputs=[chatbot, msg]
    )
 
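For context, a minimal sketch of the generation path after this commit, under an assumed setup (the checkpoint name below is a placeholder, not the Space's actual model): with do_sample=False, transformers' generate() falls back to greedy decoding, so temperature and top_p no longer apply and the same prompt yields the same JSON on every run.

# Minimal sketch, not the Space's actual code; "gpt2" is a placeholder checkpoint.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # placeholder for illustration
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

prompt = (
    "### Instruction:\n"
    "Return a JSON object with keys 'name' and 'age'.\n\n"
    "### Response:\n"
)
inputs = tokenizer(prompt, return_tensors="pt")

outputs = model.generate(
    **inputs,
    max_new_tokens=64,
    do_sample=False,                      # greedy decoding: temperature/top_p are ignored
    pad_token_id=tokenizer.eos_token_id,  # gpt2 has no pad token, reuse EOS
    eos_token_id=tokenizer.eos_token_id,
    repetition_penalty=1.1,
)
# Strip the prompt tokens and print only the newly generated text
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))

Greedy decoding trades diversity for repeatability, which is the point of the commit: the Space wants stable, parseable JSON rather than varied text, so the temperature and top-p controls are dropped from the UI as well.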