broadfield-dev committed on
Commit
00920d5
·
verified ·
1 Parent(s): 1865678

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +138 -0
app.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from PIL import Image
4
+ from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
5
+ from playwright.sync_api import sync_playwright
6
+ import os
7
+ import time
8
+
9
# --- Configuration ---
# Hub identifier of the checkpoint used for both the processor and the model.
MODEL_PATH = "Qwen/Qwen3-VL-2B-Instruct"
CPU_DEVICE = "cpu"

# --- Model and Processor Loading ---
# Both objects are created once, at import time, and reused by every request.
print("Loading model and processor...")

processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)

# Keyword arguments for the model are collected first so the load call itself
# stays on a single line.
_model_load_options = {
    "trust_remote_code": True,
    "dtype": "auto",
    "device_map": "auto",
}
model = Qwen3VLForConditionalGeneration.from_pretrained(MODEL_PATH, **_model_load_options)

print("Model and processor loaded successfully.")
23
+
24
+ # --- Playwright Screenshot Function ---
25
+ def take_screenshot(url):
26
+ """
27
+ Takes a screenshot of a webpage at the given URL using Playwright.
28
+ """
29
+ if not url.startswith('http://') and not url.startswith('https://'):
30
+ url = 'http://' + url
31
+ try:
32
+ with sync_playwright() as p:
33
+ browser = p.chromium.launch(headless=True)
34
+ page = browser.new_page()
35
+ page.goto(url, wait_until='networkidle')
36
+ screenshot_path = f"screenshot_{int(time.time())}.png"
37
+ page.screenshot(path=screenshot_path, full_page=True)
38
+ browser.close()
39
+ return screenshot_path
40
+ except Exception as e:
41
+ return f"Error taking screenshot: {str(e)}"
42
+
43
+ # --- Inference Function ---
44
+ def process_and_generate(image_input, text_prompt):
45
+ """
46
+ Processes the image and text prompt, and generates a response from the model.
47
+ """
48
+ if image_input is None or not text_prompt.strip():
49
+ return "Please provide both an image and a text prompt."
50
+
51
+ pil_image = Image.fromarray(image_input)
52
+
53
+ messages = [
54
+ {
55
+ "role": "user",
56
+ "content": [
57
+ {"type": "image", "image": pil_image},
58
+ {"type": "text", "text": text_prompt},
59
+ ],
60
+ }
61
+ ]
62
+
63
+ print("Processing inputs and generating response...")
64
+ try:
65
+ inputs = processor.apply_chat_template(
66
+ messages,
67
+ tokenize=True,
68
+ add_generation_prompt=True,
69
+ return_dict=True,
70
+ return_tensors="pt"
71
+ )
72
+ inputs = inputs.to(model.device)
73
+
74
+ generated_ids = model.generate(**inputs, max_new_tokens=1024)
75
+
76
+ generated_ids_trimmed = [
77
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
78
+ ]
79
+
80
+ output_text = processor.batch_decode(
81
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
82
+ )
83
+
84
+ return output_text[0]
85
+
86
+ except Exception as e:
87
+ return f"An error occurred during generation: {str(e)}"
88
+
89
+ # --- Gradio Interface ---
90
+ with gr.Blocks() as demo:
91
+ gr.Markdown(
92
+ """
93
+ # Screenshot to Markdown with Qwen3-VL
94
+ Enter a URL to take a screenshot, then provide a prompt to generate a markdown document from the image.
95
+ **Warning:** Running this on a free CPU Space can be slow.
96
+ """
97
+ )
98
+
99
+ with gr.Row():
100
+ url_input = gr.Textbox(label="Website URL", placeholder="e.g., www.google.com")
101
+ screenshot_button = gr.Button("Capture Screenshot")
102
+
103
+ with gr.Row():
104
+ with gr.Column():
105
+ image_output = gr.Image(type="numpy", label="Screenshot")
106
+ with gr.Column():
107
+ text_prompt = gr.Textbox(label="Prompt", placeholder="e.g., Describe this webpage in detail as a markdown document.")
108
+ submit_button = gr.Button("Generate Markdown")
109
+
110
+ with gr.Row():
111
+ output_text = gr.Markdown(label="Model Output")
112
+
113
+
114
+ def update_image(url):
115
+ screenshot_path = take_screenshot(url)
116
+ if isinstance(screenshot_path, str) and os.path.exists(screenshot_path):
117
+ return screenshot_path
118
+ else:
119
+ raise gr.Error(screenshot_path)
120
+
121
+ screenshot_button.click(
122
+ fn=update_image,
123
+ inputs=url_input,
124
+ outputs=image_output
125
+ )
126
+
127
+ submit_button.click(
128
+ fn=process_and_generate,
129
+ inputs=[image_output, text_prompt],
130
+ outputs=output_text
131
+ )
132
+
133
+ if __name__ == "__main__":
134
+ # Install playwright browsers
135
+ import subprocess
136
+ subprocess.run(["playwright", "install"], check=True)
137
+ subprocess.run(["playwright", "install-deps"], check=True)
138
+ demo.launch()