broadfield-dev committed on
Commit
93a2bbb
·
verified ·
1 Parent(s): 020dfd6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -164
app.py CHANGED
@@ -8,9 +8,11 @@ import time
8
  import numpy as np
9
 
10
  # --- Configuration ---
11
- MODEL_PATH = "Qwen/Qwen3-VL-2B-Instruct"
12
- CPU_DEVICE = "cpu"
13
-
 
 
14
 
15
  # --- DETAILED PROMPT TEMPLATE ---
16
  DETAILED_ANALYSIS_PROMPT = """
@@ -52,33 +54,41 @@ Describe the content of the sidebar, including any navigation, filters, or adver
52
  - **Layout:** Describe the overall structure (e.g., single-column, grid-based, etc.).
53
  """
54
 
55
-
56
- # --- Model and Processor Loading ---
57
- print("Loading model and processor...")
58
- try:
59
- processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
60
- model = Qwen3VLForConditionalGeneration.from_pretrained(
61
- MODEL_PATH,
62
- trust_remote_code=True,
63
- dtype="auto",
64
- device_map="auto"
65
- )
66
- print("Model and processor loaded successfully.")
67
- except Exception as e:
68
- print(f"Error loading model: {e}")
69
- # Exit or handle the error appropriately
70
- exit()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
 
73
  # --- Playwright Screenshot Function ---
74
  def take_screenshot(url, max_dimension=1024, full_page_capture=True):
75
- """
76
- Takes a screenshot of a webpage.
77
- - If full_page_capture is True, captures the full scrollable page and resizes
78
- based on the max_dimension applied to the width.
79
- - If False, it captures the visible viewport and resizes based on the
80
- largest dimension.
81
- """
82
  if not url.startswith('http://') and not url.startswith('https://'):
83
  url = 'http://' + url
84
  try:
@@ -89,200 +99,116 @@ def take_screenshot(url, max_dimension=1024, full_page_capture=True):
89
  screenshot_path = f"screenshot_{int(time.time())}.png"
90
  page.screenshot(path=screenshot_path, full_page=full_page_capture)
91
  browser.close()
92
-
93
- # --- Resize the Screenshot ---
94
  with Image.open(screenshot_path) as img:
95
  width, height = img.size
96
- if full_page_capture:
97
- # For full page, we only care about constraining the width
98
- if width > max_dimension:
 
 
 
 
99
  new_width = max_dimension
100
  new_height = int(height * (max_dimension / width))
101
- img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
102
- img.save(screenshot_path)
103
- else:
104
- # For viewport capture, constrain the largest dimension
105
- if max(width, height) > max_dimension:
106
- if width > height:
107
- new_width = max_dimension
108
- new_height = int(height * (max_dimension / width))
109
- else:
110
- new_height = max_dimension
111
- new_width = int(width * (max_dimension / height))
112
- img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
113
- img.save(screenshot_path)
114
-
115
  return screenshot_path
116
  except Exception as e:
117
  return f"Error taking screenshot: {str(e)}"
118
 
119
  # --- Inference Function ---
120
- def process_and_generate(image_input, text_prompt, processing_size=512):
121
- """
122
- Processes the image and text prompt, and generates a response from the model.
123
- Resizes the input image to the specified processing size.
124
- """
125
  if image_input is None or not text_prompt.strip():
126
  return "Please provide both an image and a text prompt."
 
 
 
127
 
128
- pil_image = Image.fromarray(image_input)
129
-
130
- # --- Resize for Processing ---
131
- pil_image = pil_image.resize((processing_size, processing_size), Image.Resampling.LANCZOS)
132
-
133
- messages = [
134
- {
135
- "role": "user",
136
- "content": [
137
- {"type": "image", "image": pil_image},
138
- {"type": "text", "text": text_prompt},
139
- ],
140
- }
141
- ]
142
-
143
- print("Processing inputs and generating response...")
144
  try:
145
- inputs = processor.apply_chat_template(
146
- messages,
147
- tokenize=True,
148
- add_generation_prompt=True,
149
- return_dict=True,
150
- return_tensors="pt"
151
- )
152
- inputs = inputs.to(model.device)
153
-
154
  if model.config.pad_token_id is None:
155
  model.config.pad_token_id = model.config.eos_token_id
156
-
157
  generated_ids = model.generate(**inputs, max_new_tokens=2048, do_sample=True, top_p=0.8, temperature=0.7)
158
-
159
- generated_ids_trimmed = [
160
- out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
161
- ]
162
-
163
- output_text = processor.batch_decode(
164
- generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
165
- )
166
-
167
  return output_text[0]
168
-
169
  except Exception as e:
170
- import traceback
171
- traceback.print_exc()
172
  return f"An error occurred during generation: {str(e)}"
173
 
174
  # --- Gradio Interface ---
175
  with gr.Blocks() as demo:
176
- gr.Markdown(
177
- """
178
- # Screenshot to Markdown with Qwen3-VL (CPU Optimized)
179
- Enter a URL to take a screenshot, then provide a prompt to generate a markdown document from the image.
180
- **Warning:** Running this on a free CPU Space can be slow. Use the controls below to manage performance.
181
- """
182
- )
183
 
 
 
184
  with gr.Accordion("Controls", open=True):
185
  with gr.Row():
186
- use_template_checkbox = gr.Checkbox(
187
- value=True,
188
- label="Use Detailed Analysis Template",
189
- info="If checked, uses a comprehensive prompt to dissect the page into sections. If unchecked, uses the prompt in the textbox below."
190
- )
191
- full_page_checkbox = gr.Checkbox(
192
- value=True,
193
- label="Enable Full Height Page Capture",
194
- info="If checked, captures the entire scrollable webpage. If unchecked, captures only the visible part."
195
- )
196
- max_dim_slider = gr.Slider(
197
- minimum=512,
198
- maximum=2048,
199
- value=1024,
200
- step=128,
201
- label="Max Screenshot Dimension (Width)",
202
- info="The maximum width of the captured screenshot. Larger values capture more detail but are slower."
203
- )
204
- processing_size_slider = gr.Slider(
205
- minimum=256,
206
- maximum=1024,
207
- value=512,
208
- step=64,
209
- label="Processing Image Size",
210
- info="The size the image is resized to before being fed to the model. Smaller values are much faster on CPU."
211
- )
212
 
213
  with gr.Row():
214
  url_input = gr.Textbox(label="Website URL", placeholder="e.g., www.google.com")
215
  screenshot_button = gr.Button("Capture Screenshot")
216
 
217
  with gr.Row():
 
218
  with gr.Column(scale=1):
219
- image_output = gr.Image(type="numpy", label="Screenshot")
220
- with gr.Column(scale=1):
221
- text_prompt = gr.Textbox(label="Custom Prompt", placeholder="e.g., Describe this webpage in detail as a markdown document.", value="Describe this page's color scheme.")
222
  submit_button = gr.Button("Generate Markdown")
223
 
224
- with gr.Row():
225
- # --- CHANGED: Switched from gr.Markdown to gr.Textbox for copyable output ---
226
- output_text = gr.Textbox(
227
- label="Model Output",
228
- lines=20,
229
- interactive=False,
230
- placeholder="Generated markdown will appear here..."
231
- )
232
-
233
 
 
234
  def update_image(url, max_dimension, full_page_capture):
235
- screenshot_path = take_screenshot(url, max_dimension, full_page_capture)
236
- if isinstance(screenshot_path, str) and os.path.exists(screenshot_path):
237
- return screenshot_path
238
- else:
239
- raise gr.Error(screenshot_path)
240
 
241
- # --- Function to handle the loading UI and processing ---
242
- def generate_markdown_with_loading(image, user_prompt, processing_size, use_template):
243
- # Return a dictionary of updates to show the loading state
244
- yield {
245
- # --- CHANGED: Updated loading message for Textbox ---
246
- output_text: "Processing, please wait... ⏳",
247
- submit_button: gr.update(interactive=False)
248
- }
249
-
250
- # Determine which prompt to use
251
  final_prompt = DETAILED_ANALYSIS_PROMPT if use_template else user_prompt
 
 
252
 
253
- # Process the data
254
- result = process_and_generate(image, final_prompt, processing_size)
255
-
256
- # Return a dictionary of updates with the final result
257
- yield {
258
- output_text: result,
259
- submit_button: gr.update(interactive=True)
260
- }
261
-
262
-
263
  screenshot_button.click(
264
  fn=update_image,
265
  inputs=[url_input, max_dim_slider, full_page_checkbox],
266
  outputs=image_output
267
  )
268
-
269
  submit_button.click(
270
  fn=generate_markdown_with_loading,
271
- inputs=[image_output, text_prompt, processing_size_slider, use_template_checkbox],
272
  outputs=[output_text, submit_button]
273
  )
274
 
275
-
276
  if __name__ == "__main__":
277
- # Install playwright browsers
278
  import subprocess
279
  try:
280
  print("Installing Playwright browsers...")
281
  subprocess.run(["playwright", "install"], check=True)
282
- # For Debian/Ubuntu based systems, install dependencies
283
  subprocess.run(["playwright", "install-deps"], check=True)
284
  print("Playwright installation complete.")
285
  except Exception as e:
286
  print(f"Could not install playwright dependencies: {e}")
287
-
288
  demo.launch()
 
8
  import numpy as np
9
 
10
  # --- Configuration ---
# Selectable models: each key is the human-readable label, each value is the
# Hugging Face repository id handed to ``from_pretrained``.
MODEL_OPTIONS = {
    "Standard (BF16)": "Qwen/Qwen3-VL-2B-Instruct",
    "Quantized (FP8) - Faster": "Qwen/Qwen3-VL-2B-Instruct-FP8",
}
16
 
17
  # --- DETAILED PROMPT TEMPLATE ---
18
  DETAILED_ANALYSIS_PROMPT = """
 
54
  - **Layout:** Describe the overall structure (e.g., single-column, grid-based, etc.).
55
  """
56
 
57
+ # --- Model Loading Function ---
def load_model(model_name):
    """Load the selected model and processor, streaming status to the UI.

    This is a generator used as a Gradio event handler. The click handler
    that calls it is wired to four outputs, in this order: status textbox,
    load button, model state, processor state. Every ``yield`` therefore
    produces exactly four values; yielding fewer leaves the state outputs
    without a value, and a generator's ``return`` value is never delivered
    to the UI.

    Args:
        model_name: A key of ``MODEL_OPTIONS`` (the label picked in the UI).

    Yields:
        ``(status, button_update, model, processor)``. ``model`` and
        ``processor`` are ``None`` while loading and after a failure, and
        the loaded objects on success.

    Returns:
        ``(model, processor)`` as the generator's return value. Kept for
        backward compatibility with callers that drive the generator
        directly (e.g. ``yield from``); both are ``None`` on failure.

    Raises:
        KeyError: If ``model_name`` is not a key of ``MODEL_OPTIONS``.
    """
    model_id = MODEL_OPTIONS[model_name]
    model, processor = None, None

    # Disable the button while loading. The two ``None`` values reset the
    # model/processor state so a previously loaded model is no longer
    # referenced by the UI state during the switch.
    yield (
        f"Status: Loading {model_name} model ({model_id})... Please wait.",
        gr.update(interactive=False),
        None,
        None,
    )

    try:
        load_kwargs = {"device_map": "auto", "trust_remote_code": True}
        if "FP8" in model_id:
            # NOTE(review): assumes ``torch`` is imported at the top of the
            # file (imports are not visible here) and that this checkpoint
            # can be loaded through ``from_pretrained`` with a float8 dtype;
            # confirm against the model card.
            load_kwargs["torch_dtype"] = torch.float8_e4m3fn

        model = Qwen3VLForConditionalGeneration.from_pretrained(model_id, **load_kwargs)
        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
    except Exception as e:
        # Never expose a half-loaded pair: reset both before reporting.
        model, processor = None, None
        yield (
            f"Status: Error loading model: {e}",
            gr.update(interactive=True),
            None,
            None,
        )
    else:
        yield (
            f"Status: {model_name} model loaded successfully.",
            gr.update(interactive=True),
            model,
            processor,
        )

    return model, processor
88
 
89
 
90
  # --- Playwright Screenshot Function ---
91
  def take_screenshot(url, max_dimension=1024, full_page_capture=True):
 
 
 
 
 
 
 
92
  if not url.startswith('http://') and not url.startswith('https://'):
93
  url = 'http://' + url
94
  try:
 
99
  screenshot_path = f"screenshot_{int(time.time())}.png"
100
  page.screenshot(path=screenshot_path, full_page=full_page_capture)
101
  browser.close()
 
 
102
  with Image.open(screenshot_path) as img:
103
  width, height = img.size
104
+ if full_page_capture and width > max_dimension:
105
+ new_width = max_dimension
106
+ new_height = int(height * (max_dimension / width))
107
+ img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
108
+ img.save(screenshot_path)
109
+ elif not full_page_capture and max(width, height) > max_dimension:
110
+ if width > height:
111
  new_width = max_dimension
112
  new_height = int(height * (max_dimension / width))
113
+ else:
114
+ new_height = max_dimension
115
+ new_width = int(width * (max_dimension / height))
116
+ img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
117
+ img.save(screenshot_path)
 
 
 
 
 
 
 
 
 
118
  return screenshot_path
119
  except Exception as e:
120
  return f"Error taking screenshot: {str(e)}"
121
 
122
  # --- Inference Function ---
def process_and_generate(model, processor, image_input, text_prompt, processing_size=512):
    """Run one image + prompt through the model and return the generated text.

    Args:
        model: Loaded generation model, or ``None`` if nothing is loaded yet.
        processor: Processor matching ``model``, or ``None``.
        image_input: Image array accepted by ``Image.fromarray``, or ``None``.
        text_prompt: Instruction text sent alongside the image.
        processing_size: Side length of the square the image is resized to
            before inference (the aspect ratio is not preserved).

    Returns:
        The first decoded completion, or a human-readable message when the
        model is missing, an input is missing, or generation raises.
    """
    model_ready = model is not None and processor is not None
    if not model_ready:
        return "Error: Model is not loaded. Please select a model and click 'Load Model'."

    if image_input is None:
        return "Please provide both an image and a text prompt."
    if not text_prompt.strip():
        return "Please provide both an image and a text prompt."

    # Image preparation happens before the ``try`` on purpose: a failure here
    # propagates to the caller instead of becoming an error string.
    target_size = (processing_size, processing_size)
    frame = Image.fromarray(image_input)
    frame = frame.resize(target_size, Image.Resampling.LANCZOS)

    image_part = {"type": "image", "image": frame}
    text_part = {"type": "text", "text": text_prompt}
    conversation = [{"role": "user", "content": [image_part, text_part]}]

    sampling = {
        "max_new_tokens": 2048,
        "do_sample": True,
        "top_p": 0.8,
        "temperature": 0.7,
    }

    try:
        encoded = processor.apply_chat_template(
            conversation,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        )
        inputs = encoded.to(model.device)

        # Fall back to the EOS id when the config defines no pad token.
        if model.config.pad_token_id is None:
            model.config.pad_token_id = model.config.eos_token_id

        generated_ids = model.generate(**inputs, **sampling)

        # Drop the first len(input) ids of every sequence (presumably the
        # echoed prompt) so only the newly generated ids are decoded.
        completions = []
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids):
            completions.append(out_ids[len(in_ids):])

        decoded = processor.batch_decode(
            completions,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        return decoded[0]
    except Exception as e:
        return f"An error occurred during generation: {str(e)}"
142
 
143
  # --- Gradio Interface ---
144
  with gr.Blocks() as demo:
145
+ # State components to hold the loaded model and processor
146
+ model_state = gr.State(None)
147
+ processor_state = gr.State(None)
 
 
 
 
148
 
149
+ gr.Markdown("# Screenshot to Markdown with Qwen3-VL (CPU Optimized)")
150
+
151
  with gr.Accordion("Controls", open=True):
152
  with gr.Row():
153
+ model_selector = gr.Radio(choices=list(MODEL_OPTIONS.keys()), value="Quantized (FP8) - Faster", label="Select Model")
154
+ load_model_button = gr.Button("Load/Switch Model")
155
+ status_text = gr.Textbox(label="Status", value="Status: No model loaded.", interactive=False)
156
+
157
+ with gr.Row():
158
+ use_template_checkbox = gr.Checkbox(value=True, label="Use Detailed Analysis Template")
159
+ full_page_checkbox = gr.Checkbox(value=True, label="Enable Full Height Page Capture")
160
+ max_dim_slider = gr.Slider(512, 2048, 1024, step=128, label="Max Screenshot Dimension (Width)")
161
+ processing_size_slider = gr.Slider(256, 1024, 512, step=64, label="Processing Image Size")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
  with gr.Row():
164
  url_input = gr.Textbox(label="Website URL", placeholder="e.g., www.google.com")
165
  screenshot_button = gr.Button("Capture Screenshot")
166
 
167
  with gr.Row():
168
+ image_output = gr.Image(type="numpy", label="Screenshot", scale=1)
169
  with gr.Column(scale=1):
170
+ text_prompt = gr.Textbox(label="Custom Prompt", value="Describe this page's color scheme.")
 
 
171
  submit_button = gr.Button("Generate Markdown")
172
 
173
+ output_text = gr.Textbox(label="Model Output", lines=20, interactive=False, placeholder="Generated markdown will appear here...")
 
 
 
 
 
 
 
 
174
 
175
+ # --- UI Event Handlers ---
176
  def update_image(url, max_dimension, full_page_capture):
177
+ path = take_screenshot(url, max_dimension, full_page_capture)
178
+ if "Error" in path: raise gr.Error(path)
179
+ return path
 
 
180
 
181
+ def generate_markdown_with_loading(model, processor, image, user_prompt, processing_size, use_template):
182
+ yield "Processing, please wait... ⏳", gr.update(interactive=False)
 
 
 
 
 
 
 
 
183
  final_prompt = DETAILED_ANALYSIS_PROMPT if use_template else user_prompt
184
+ result = process_and_generate(model, processor, image, final_prompt, processing_size)
185
+ yield result, gr.update(interactive=True)
186
 
187
+ load_model_button.click(
188
+ fn=load_model,
189
+ inputs=[model_selector],
190
+ outputs=[status_text, load_model_button, model_state, processor_state]
191
+ )
192
+
 
 
 
 
193
  screenshot_button.click(
194
  fn=update_image,
195
  inputs=[url_input, max_dim_slider, full_page_checkbox],
196
  outputs=image_output
197
  )
198
+
199
  submit_button.click(
200
  fn=generate_markdown_with_loading,
201
+ inputs=[model_state, processor_state, image_output, text_prompt, processing_size_slider, use_template_checkbox],
202
  outputs=[output_text, submit_button]
203
  )
204
 
 
if __name__ == "__main__":
    import subprocess

    # Best-effort setup: run the Playwright CLI install steps before starting
    # the app. Any failure is reported and the app is launched regardless.
    try:
        print("Installing Playwright browsers...")
        for subcommand in ("install", "install-deps"):
            subprocess.run(["playwright", subcommand], check=True)
        print("Playwright installation complete.")
    except Exception as e:
        print(f"Could not install playwright dependencies: {e}")

    demo.launch()