broadfield-dev committed
Commit e2ba8a5 · verified · 1 Parent(s): b13a27e

Update app.py

Files changed (1):
  1. app.py +19 -21
app.py CHANGED

@@ -1,7 +1,7 @@
 import gradio as gr
 import torch
 from PIL import Image
-# --- ADDED: Import BitsAndBytesConfig for CPU quantization ---
+# Import BitsAndBytesConfig for on-the-fly quantization
 from transformers import Qwen3VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
 import os
 from playwright.sync_api import sync_playwright
@@ -9,10 +9,11 @@ import time
 import numpy as np
 
 # --- Configuration ---
-# --- CHANGED: Updated model options to reflect INT8 CPU quantization ---
+# --- ADDED: New INT4 option pointing to the same base model ---
 MODEL_OPTIONS = {
     "Standard (BF16)": "Qwen/Qwen3-VL-2B-Instruct",
-    "Quantized (INT8) - Faster on CPU": "Qwen/Qwen3-VL-2B-Instruct", # We use the same base model for quantization
+    "Quantized (INT8) - Faster on CPU": "Qwen/Qwen3-VL-2B-Instruct",
+    "Quantized (INT4) - Fastest on CPU": "Qwen/Qwen3-VL-2B-Instruct",
 }
 
 # --- DETAILED PROMPT TEMPLATE ---
@@ -63,25 +64,21 @@ def load_model(model_name):
 
     model, processor = None, None
     try:
-        # --- CHANGED: New logic for CPU-compatible quantization ---
-        if "Quantized" in model_name:
-            # Use bitsandbytes for 8-bit quantization on CPU
+        quantization_config = None
+        # --- ADDED: Logic to handle INT4, INT8, or no quantization ---
+        if "INT4" in model_name:
+            quantization_config = BitsAndBytesConfig(load_in_4bit=True)
+        elif "INT8" in model_name:
             quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-            model = Qwen3VLForConditionalGeneration.from_pretrained(
-                model_id,
-                quantization_config=quantization_config,
-                device_map="auto",
-                trust_remote_code=True
-            )
-        else:
-            # Standard loading for the full-precision model
-            model = Qwen3VLForConditionalGeneration.from_pretrained(
-                model_id,
-                device_map="auto",
-                trust_remote_code=True
-            )
+
+        # A single, clean call to from_pretrained handles all cases
+        model = Qwen3VLForConditionalGeneration.from_pretrained(
+            model_id,
+            quantization_config=quantization_config, # Will be None if not quantizing
+            device_map="auto",
+            trust_remote_code=True
+        )
 
-        # The processor is the same for both versions
         processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
 
         yield f"Status: {model_name} model loaded successfully.", gr.update(interactive=True), model, processor
@@ -153,7 +150,8 @@ with gr.Blocks() as demo:
     with gr.Row():
         model_selector = gr.Radio(
            choices=list(MODEL_OPTIONS.keys()),
-            value="Quantized (INT8) - Faster on CPU", # Default to the faster option
+            # --- CHANGED: Default to the fastest option ---
+            value="Quantized (INT4) - Fastest on CPU",
             label="Select Model"
         )
         load_model_button = gr.Button("Load/Switch Model")
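
For anyone wanting to try the new loading path outside the app, the sketch below isolates the pattern this commit lands on: map the UI label to an optional BitsAndBytesConfig, then make a single from_pretrained call (passing quantization_config=None loads the model unquantized, which is the transformers default). The build_quant_config helper and the footprint printout are illustrative additions, not part of the committed app.py; they assume transformers, torch, and bitsandbytes are installed.

```python
# Minimal, self-contained sketch of the loading pattern from this commit.
# build_quant_config is a hypothetical helper added for illustration;
# the committed app.py inlines the same logic inside load_model().
from typing import Optional

from transformers import (
    AutoProcessor,
    BitsAndBytesConfig,
    Qwen3VLForConditionalGeneration,
)

MODEL_ID = "Qwen/Qwen3-VL-2B-Instruct"

def build_quant_config(model_name: str) -> Optional[BitsAndBytesConfig]:
    """Map a UI label to a bitsandbytes config; None means full precision."""
    if "INT4" in model_name:
        return BitsAndBytesConfig(load_in_4bit=True)
    if "INT8" in model_name:
        return BitsAndBytesConfig(load_in_8bit=True)
    return None

# One call covers all three options; quantization_config=None is the default.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=build_quant_config("Quantized (INT4) - Fastest on CPU"),
    device_map="auto",
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Quick sanity check that quantization actually shrank the weights.
print(f"Memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")
```

With load_in_4bit=True, weight storage should drop to roughly a quarter of the BF16 size (plus a small overhead for quantization constants). Note that bitsandbytes support on non-CUDA hardware varies by version, so the "Faster/Fastest on CPU" labels are worth verifying on the target machine.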