Kingoteam committed on
Commit c42200e · verified · 1 Parent(s): 408ef77

Create app.py

Files changed (1)
  1. app.py +61 -0
app.py ADDED
@@ -0,0 +1,61 @@
+ import gradio as gr
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+ from threading import Thread
+
+ MODEL_ID = "ministral/Ministral-3b-instruct"
+
+ # ===== Load model & tokenizer =====
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_ID,
+     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+     device_map="auto",
+ )
+
+ # ===== Streaming chat function =====
+ def chat_stream(message, history):
+     # Flatten the (user, bot) history into a single prompt with chat markers.
+     prompt = ""
+     for user, bot in history:
+         prompt += f"<|user|>\n{user}\n<|assistant|>\n{bot}\n"
+     prompt += f"<|user|>\n{message}\n<|assistant|>\n"
+
+     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+     # Run generation on a background thread; the streamer yields decoded text chunks.
+     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+     generation_kwargs = dict(
+         **inputs,
+         streamer=streamer,
+         max_new_tokens=256,
+         temperature=0.7,
+         do_sample=True,
+         top_p=0.9,
+     )
+     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     thread.start()
+
+     partial_text = ""
+     for token in streamer:
+         partial_text += token
+         yield partial_text
+
+ # ===== Gradio UI =====
+ with gr.Blocks(title="Ministral 3B Chat") as demo:
+     gr.Markdown("## 🧠 Ministral 3B Instruct — Chat Demo\nLightweight model for Hugging Face Spaces.")
+     chatbot = gr.Chatbot(height=400)
+     msg = gr.Textbox(placeholder="Write your message and press Enter...", label="Your message")
+
+     # chat_stream yields bare strings, which gr.Chatbot cannot display directly,
+     # so wrap each partial reply into the (user, bot) history and clear the textbox.
+     def respond(message, chat_history):
+         chat_history = chat_history + [(message, "")]
+         for partial in chat_stream(message, chat_history[:-1]):
+             chat_history[-1] = (message, partial)
+             yield "", chat_history
+
+     msg.submit(respond, [msg, chatbot], [msg, chatbot])
+
+ demo.queue(max_size=32)  # concurrency_count existed in Gradio 3.x; Gradio 4+ uses per-event concurrency_limit
+ demo.launch()