Spaces:

broadfield-dev
/

url2markdown

Sleeping

App Files Files Community

broadfield-dev committed on Oct 26

Commit

93a2bbb

verified ·

1 Parent(s): 020dfd6

Update app.py

Browse files

Files changed (1) hide show

app.py +90 -164

app.py CHANGED Viewed

@@ -8,9 +8,11 @@ import time
 import numpy as np
 # --- Configuration ---
-MODEL_PATH = "Qwen/Qwen3-VL-2B-Instruct"
-CPU_DEVICE = "cpu"
 # --- DETAILED PROMPT TEMPLATE ---
 DETAILED_ANALYSIS_PROMPT = """
@@ -52,33 +54,41 @@ Describe the content of the sidebar, including any navigation, filters, or adver
 - **Layout:** Describe the overall structure (e.g., single-column, grid-based, etc.).
 """
-# --- Model and Processor Loading ---
-print("Loading model and processor...")
-try:
-    processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
-    model = Qwen3VLForConditionalGeneration.from_pretrained(
-        MODEL_PATH,
-        trust_remote_code=True,
-        dtype="auto",
-        device_map="auto"
-    )
-    print("Model and processor loaded successfully.")
-except Exception as e:
-    print(f"Error loading model: {e}")
-    # Exit or handle the error appropriately
-    exit()
 # --- Playwright Screenshot Function ---
 def take_screenshot(url, max_dimension=1024, full_page_capture=True):
-    """
-    Takes a screenshot of a webpage.
-    - If full_page_capture is True, captures the full scrollable page and resizes
-      based on the max_dimension applied to the width.
-    - If False, it captures the visible viewport and resizes based on the
-      largest dimension.
-    """
     if not url.startswith('http://') and not url.startswith('https://'):
         url = 'http://' + url
     try:
@@ -89,200 +99,116 @@ def take_screenshot(url, max_dimension=1024, full_page_capture=True):
             screenshot_path = f"screenshot_{int(time.time())}.png"
             page.screenshot(path=screenshot_path, full_page=full_page_capture)
             browser.close()
-            # --- Resize the Screenshot ---
             with Image.open(screenshot_path) as img:
                 width, height = img.size
-                if full_page_capture:
-                    # For full page, we only care about constraining the width
-                    if width > max_dimension:
                         new_width = max_dimension
                         new_height = int(height * (max_dimension / width))
-                        img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
-                        img.save(screenshot_path)
-                else:
-                    # For viewport capture, constrain the largest dimension
-                    if max(width, height) > max_dimension:
-                        if width > height:
-                            new_width = max_dimension
-                            new_height = int(height * (max_dimension / width))
-                        else:
-                            new_height = max_dimension
-                            new_width = int(width * (max_dimension / height))
-                        img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
-                        img.save(screenshot_path)
             return screenshot_path
     except Exception as e:
         return f"Error taking screenshot: {str(e)}"
 # --- Inference Function ---
-def process_and_generate(image_input, text_prompt, processing_size=512):
-    """
-    Processes the image and text prompt, and generates a response from the model.
-    Resizes the input image to the specified processing size.
-    """
     if image_input is None or not text_prompt.strip():
         return "Please provide both an image and a text prompt."
-    pil_image = Image.fromarray(image_input)
-    # --- Resize for Processing ---
-    pil_image = pil_image.resize((processing_size, processing_size), Image.Resampling.LANCZOS)
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "image", "image": pil_image},
-                {"type": "text", "text": text_prompt},
-            ],
-        }
-    ]
-    print("Processing inputs and generating response...")
     try:
-        inputs = processor.apply_chat_template(
-            messages,
-            tokenize=True,
-            add_generation_prompt=True,
-            return_dict=True,
-            return_tensors="pt"
-        )
-        inputs = inputs.to(model.device)
         if model.config.pad_token_id is None:
             model.config.pad_token_id = model.config.eos_token_id
         generated_ids = model.generate(**inputs, max_new_tokens=2048, do_sample=True, top_p=0.8, temperature=0.7)
-        generated_ids_trimmed = [
-            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-        ]
-        output_text = processor.batch_decode(
-            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )
         return output_text[0]
     except Exception as e:
-        import traceback
-        traceback.print_exc()
         return f"An error occurred during generation: {str(e)}"
 # --- Gradio Interface ---
 with gr.Blocks() as demo:
-    gr.Markdown(
-        """
-        # Screenshot to Markdown with Qwen3-VL (CPU Optimized)
-        Enter a URL to take a screenshot, then provide a prompt to generate a markdown document from the image.
-        **Warning:** Running this on a free CPU Space can be slow. Use the controls below to manage performance.
-        """
-    )
     with gr.Accordion("Controls", open=True):
         with gr.Row():
-            use_template_checkbox = gr.Checkbox(
-                value=True,
-                label="Use Detailed Analysis Template",
-                info="If checked, uses a comprehensive prompt to dissect the page into sections. If unchecked, uses the prompt in the textbox below."
-            )
-            full_page_checkbox = gr.Checkbox(
-                value=True,
-                label="Enable Full Height Page Capture",
-                info="If checked, captures the entire scrollable webpage. If unchecked, captures only the visible part."
-            )
-        max_dim_slider = gr.Slider(
-            minimum=512,
-            maximum=2048,
-            value=1024,
-            step=128,
-            label="Max Screenshot Dimension (Width)",
-            info="The maximum width of the captured screenshot. Larger values capture more detail but are slower."
-        )
-        processing_size_slider = gr.Slider(
-            minimum=256,
-            maximum=1024,
-            value=512,
-            step=64,
-            label="Processing Image Size",
-            info="The size the image is resized to before being fed to the model. Smaller values are much faster on CPU."
-        )
     with gr.Row():
         url_input = gr.Textbox(label="Website URL", placeholder="e.g., www.google.com")
         screenshot_button = gr.Button("Capture Screenshot")
     with gr.Row():
         with gr.Column(scale=1):
-            image_output = gr.Image(type="numpy", label="Screenshot")
-        with gr.Column(scale=1):
-            text_prompt = gr.Textbox(label="Custom Prompt", placeholder="e.g., Describe this webpage in detail as a markdown document.", value="Describe this page's color scheme.")
             submit_button = gr.Button("Generate Markdown")
-    with gr.Row():
-        # --- CHANGED: Switched from gr.Markdown to gr.Textbox for copyable output ---
-        output_text = gr.Textbox(
-            label="Model Output",
-            lines=20,
-            interactive=False,
-            placeholder="Generated markdown will appear here..."
-        )
     def update_image(url, max_dimension, full_page_capture):
-        screenshot_path = take_screenshot(url, max_dimension, full_page_capture)
-        if isinstance(screenshot_path, str) and os.path.exists(screenshot_path):
-            return screenshot_path
-        else:
-            raise gr.Error(screenshot_path)
-    # --- Function to handle the loading UI and processing ---
-    def generate_markdown_with_loading(image, user_prompt, processing_size, use_template):
-        # Return a dictionary of updates to show the loading state
-        yield {
-            # --- CHANGED: Updated loading message for Textbox ---
-            output_text: "Processing, please wait... ⏳",
-            submit_button: gr.update(interactive=False)
-        }
-        # Determine which prompt to use
         final_prompt = DETAILED_ANALYSIS_PROMPT if use_template else user_prompt
-        # Process the data
-        result = process_and_generate(image, final_prompt, processing_size)
-        # Return a dictionary of updates with the final result
-        yield {
-            output_text: result,
-            submit_button: gr.update(interactive=True)
-        }
     screenshot_button.click(
         fn=update_image,
         inputs=[url_input, max_dim_slider, full_page_checkbox],
         outputs=image_output
     )
     submit_button.click(
         fn=generate_markdown_with_loading,
-        inputs=[image_output, text_prompt, processing_size_slider, use_template_checkbox],
         outputs=[output_text, submit_button]
     )
 if __name__ == "__main__":
-    # Install playwright browsers
     import subprocess
     try:
         print("Installing Playwright browsers...")
         subprocess.run(["playwright", "install"], check=True)
-        # For Debian/Ubuntu based systems, install dependencies
         subprocess.run(["playwright", "install-deps"], check=True)
         print("Playwright installation complete.")
     except Exception as e:
         print(f"Could not install playwright dependencies: {e}")
     demo.launch()

 import numpy as np
 # --- Configuration ---
+# Define the model options
 # Keys are the labels offered by the "Select Model" radio control; values are
 # the model ids that load_model passes to from_pretrained().
+MODEL_OPTIONS = {
+    "Standard (BF16)": "Qwen/Qwen3-VL-2B-Instruct",
+    "Quantized (FP8) - Faster": "Qwen/Qwen3-VL-2B-Instruct-FP8",
+}
 # --- DETAILED PROMPT TEMPLATE ---
 DETAILED_ANALYSIS_PROMPT = """
 - **Layout:** Describe the overall structure (e.g., single-column, grid-based, etc.).
 """
+# --- Model Loading Function ---
+def load_model(model_name):
+    """Loads the specified model and processor from Hugging Face."""
+    model_id = MODEL_OPTIONS[model_name]
+    yield f"Status: Loading {model_name} model ({model_id})... Please wait.", gr.update(interactive=False)
+    try:
+        # Specific loading instructions for the FP8 model
+        if "FP8" in model_id:
+            model = Qwen3VLForConditionalGeneration.from_pretrained(
+                model_id,
+                torch_dtype=torch.float8_e4m3fn,
+                device_map="auto",
+                trust_remote_code=True
+            )
+        else: # Standard loading for other models
+            model = Qwen3VLForConditionalGeneration.from_pretrained(
+                model_id,
+                device_map="auto",
+                trust_remote_code=True
+            )
+        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+        yield f"Status: {model_name} model loaded successfully.", gr.update(interactive=True)
+    except Exception as e:
+        yield f"Status: Error loading model: {e}", gr.update(interactive=True)
+        model, processor = None, None
+    return model, processor
 # --- Playwright Screenshot Function ---
 def take_screenshot(url, max_dimension=1024, full_page_capture=True):
     if not url.startswith('http://') and not url.startswith('https://'):
         url = 'http://' + url
     try:
             screenshot_path = f"screenshot_{int(time.time())}.png"
             page.screenshot(path=screenshot_path, full_page=full_page_capture)
             browser.close()
             with Image.open(screenshot_path) as img:
                 width, height = img.size
+                if full_page_capture and width > max_dimension:
+                    new_width = max_dimension
+                    new_height = int(height * (max_dimension / width))
+                    img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
+                    img.save(screenshot_path)
+                elif not full_page_capture and max(width, height) > max_dimension:
+                    if width > height:
                         new_width = max_dimension
                         new_height = int(height * (max_dimension / width))
+                    else:
+                        new_height = max_dimension
+                        new_width = int(width * (max_dimension / height))
+                    img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
+                    img.save(screenshot_path)
             return screenshot_path
     except Exception as e:
         return f"Error taking screenshot: {str(e)}"
 # --- Inference Function ---
+def process_and_generate(model, processor, image_input, text_prompt, processing_size=512):
     # Run one image + text prompt through the model and return the decoded text.
     # Every failure path returns a human-readable message string instead of
     # raising, so the caller can display the result directly.
+    if model is None or processor is None:
         # NOTE(review): the message says 'Load Model' but the button is
         # labelled "Load/Switch Model" -- confirm the intended wording.
+        return "Error: Model is not loaded. Please select a model and click 'Load Model'."
     if image_input is None or not text_prompt.strip():
         return "Please provide both an image and a text prompt."
     # The image is resized to a processing_size x processing_size square, so
     # the original aspect ratio is not preserved.
+    pil_image = Image.fromarray(image_input).resize((processing_size, processing_size), Image.Resampling.LANCZOS)
+    messages = [{"role": "user", "content": [{"type": "image", "image": pil_image}, {"type": "text", "text": text_prompt}]}]
     try:
+        inputs = processor.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt").to(model.device)
         # Fall back to the EOS token id when the config defines no pad token id.
         if model.config.pad_token_id is None:
             model.config.pad_token_id = model.config.eos_token_id
         # Sampling is enabled, so repeated calls can produce different text.
         generated_ids = model.generate(**inputs, max_new_tokens=2048, do_sample=True, top_p=0.8, temperature=0.7)
         # Drop the leading len(in_ids) tokens of each output so only the newly
         # generated continuation is decoded.
+        generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+        output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
         # A single conversation was submitted, so the first decoded entry is the answer.
         return output_text[0]
     except Exception as e:
         return f"An error occurred during generation: {str(e)}"
 # --- Gradio Interface ---
 # Builds the Gradio UI: model selection/loading controls, screenshot capture,
 # and prompt-driven generation, then wires the three buttons to their handlers.
 with gr.Blocks() as demo:
+    # State components to hold the loaded model and processor
+    model_state = gr.State(None)
+    processor_state = gr.State(None)
+    gr.Markdown("# Screenshot to Markdown with Qwen3-VL (CPU Optimized)")
     with gr.Accordion("Controls", open=True):
         with gr.Row():
             # The radio offers the MODEL_OPTIONS labels; the FP8 entry is preselected.
+            model_selector = gr.Radio(choices=list(MODEL_OPTIONS.keys()), value="Quantized (FP8) - Faster", label="Select Model")
+            load_model_button = gr.Button("Load/Switch Model")
+        status_text = gr.Textbox(label="Status", value="Status: No model loaded.", interactive=False)
+        with gr.Row():
+            use_template_checkbox = gr.Checkbox(value=True, label="Use Detailed Analysis Template")
+            full_page_checkbox = gr.Checkbox(value=True, label="Enable Full Height Page Capture")
+        max_dim_slider = gr.Slider(512, 2048, 1024, step=128, label="Max Screenshot Dimension (Width)")
+        processing_size_slider = gr.Slider(256, 1024, 512, step=64, label="Processing Image Size")
     with gr.Row():
         url_input = gr.Textbox(label="Website URL", placeholder="e.g., www.google.com")
         screenshot_button = gr.Button("Capture Screenshot")
     with gr.Row():
+        image_output = gr.Image(type="numpy", label="Screenshot", scale=1)
         with gr.Column(scale=1):
+            text_prompt = gr.Textbox(label="Custom Prompt", value="Describe this page's color scheme.")
             submit_button = gr.Button("Generate Markdown")
+    output_text = gr.Textbox(label="Model Output", lines=20, interactive=False, placeholder="Generated markdown will appear here...")
+    # --- UI Event Handlers ---
     def update_image(url, max_dimension, full_page_capture):
         # take_screenshot returns either a file path or a message beginning
         # "Error taking screenshot: ..."; the message is surfaced as a gr.Error.
+        path = take_screenshot(url, max_dimension, full_page_capture)
+        if "Error" in path: raise gr.Error(path)
+        return path
+    def generate_markdown_with_loading(model, processor, image, user_prompt, processing_size, use_template):
         # First yield: show a waiting message and disable the submit button.
+        yield "Processing, please wait... ⏳", gr.update(interactive=False)
         # The detailed template replaces the custom prompt when the checkbox is set.
         final_prompt = DETAILED_ANALYSIS_PROMPT if use_template else user_prompt
+        result = process_and_generate(model, processor, image, final_prompt, processing_size)
         # Second yield: show the result and re-enable the submit button.
+        yield result, gr.update(interactive=True)
     # NOTE(review): four outputs are wired here, so each value load_model yields
     # needs four entries (status, button, model, processor) -- confirm it matches.
+    load_model_button.click(
+        fn=load_model,
+        inputs=[model_selector],
+        outputs=[status_text, load_model_button, model_state, processor_state]
+    )
     screenshot_button.click(
         fn=update_image,
         inputs=[url_input, max_dim_slider, full_page_checkbox],
         outputs=image_output
     )
     # Input order must match generate_markdown_with_loading's parameter order.
     submit_button.click(
         fn=generate_markdown_with_loading,
+        inputs=[model_state, processor_state, image_output, text_prompt, processing_size_slider, use_template_checkbox],
         outputs=[output_text, submit_button]
     )
 # Script entry point: try to install the Playwright browsers and their system
 # dependencies, then start the Gradio app.
 if __name__ == "__main__":
     import subprocess
     try:
         print("Installing Playwright browsers...")
         # check=True raises if a command exits non-zero; that is caught below.
         subprocess.run(["playwright", "install"], check=True)
         subprocess.run(["playwright", "install-deps"], check=True)
         print("Playwright installation complete.")
     except Exception as e:
         # Best effort: an installation failure is only printed, and the app
         # is still launched afterwards.
         print(f"Could not install playwright dependencies: {e}")
     demo.launch()