bharatcoder committed on
Commit
1f42ce9
·
verified ·
1 Parent(s): 2c602f3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -22
app.py CHANGED
@@ -1,27 +1,60 @@
1
  import gradio as gr
2
  from transformers import AutoProcessor, AutoModelForImageTextToText
3
  from PIL import Image
4
- import PIL.Image
 
 
 
5
 
6
  # Load model & processor once at startup
7
  processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
8
  model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
9
 
10
- def smoldocling_readimage(image: PIL.Image.Image, prompt_text: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  """
12
  Extract text and structured content from document images using SmolDocling model.
13
 
14
- This function processes document images (PDFs, scanned documents, screenshots, etc.)
15
- and converts them to structured text format based on the provided prompt. It uses
16
- the SmolDocling-256M-preview model for image-to-text conversion with chat-based
17
- prompting.
18
 
19
  Args:
20
- image (PIL.Image.Image): The input document image to process. Should be a PIL
21
- Image object containing a document, text, or any visual content that needs
22
- to be converted to text.
23
- prompt_text (str): The instruction or prompt text that guides the model's
24
- output format. Supported prompts include:
25
 
26
  Content Conversion:
27
  - "Convert this page to docling." - Full conversion to DocTags representation
@@ -31,28 +64,30 @@ def smoldocling_readimage(image: PIL.Image.Image, prompt_text: str) -> str:
31
  - "Convert table to OTSL." - Convert tables to OTSL format (Lysak et al., 2023)
32
 
33
  OCR and Location-based Actions:
34
- - "OCR the text in a specific location: <loc_155><loc_233><loc_206><loc_237>" - Extract text from specific coordinates
35
- - "Identify element at: <loc_247><loc_482><loc_252><loc_486>" - Identify element type at coordinates
36
- - "Find all 'text' elements on the page, retrieve all section headers." - Extract section headers
 
 
 
37
  - "Detect footer elements on the page." - Identify footer content
38
 
39
  Returns:
40
- str: The extracted and formatted text content from the image, cleaned of
41
- special tokens and whitespace. The format depends on the prompt_text
42
- provided.
43
 
44
  Example:
45
- >>> from PIL import Image
46
- >>> img = Image.open("document.pdf")
47
- >>> result = smoldocling_readimage(img, "Convert to docling")
48
  >>> print(result) # Returns structured document content
49
 
50
  Note:
51
- - The function is optimized for document images but can handle any image
52
- containing text
53
  - Processing time depends on image size and complexity
54
  - Maximum output length is limited to 1024 new tokens
55
  """
 
 
 
56
  messages = [
57
  {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
58
  ]
 
1
  import gradio as gr
2
  from transformers import AutoProcessor, AutoModelForImageTextToText
3
  from PIL import Image
4
+ import base64
5
+ from io import BytesIO
6
+ import os
7
+
8
 
9
  # Load model & processor once at startup.
  # Both are module-level globals created from the same Hugging Face checkpoint
  # id, so the download/initialisation cost is paid at import time rather than
  # per call. NOTE(review): presumably shared by the request handler below —
  # confirm against the rest of the file.
10
  processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
11
  model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
12
 
13
+
14
def convert_to_pil(image_input: str) -> "Image.Image":
    """
    Convert a base64 string or a file path into a PIL image.

    The input is interpreted, in order, as:

    1. a data URI (``data:image/...;base64,<payload>``),
    2. raw base64 image data without any prefix,
    3. a path to an existing file.

    Args:
        image_input: Data URI, raw base64 encoded image, or file path.

    Returns:
        PIL.Image.Image object opened from the decoded bytes or the file.

    Raises:
        ValueError: If the input is not a non-empty string, is a data URI
            with a missing/empty/undecodable payload, or matches none of
            the supported forms.
    """
    if not isinstance(image_input, str):
        raise ValueError(
            "Could not convert image input to PIL.Image: expected str, "
            f"got {type(image_input).__name__}"
        )
    if not image_input.strip():
        raise ValueError("Could not convert image input to PIL.Image: empty input")

    if image_input.startswith('data:image'):
        # Everything after the first comma is the payload; the header
        # (e.g. "data:image/jpeg;base64") is discarded. partition() avoids the
        # IndexError that split(...)[1] raises when the comma is missing.
        _header, separator, payload = image_input.partition(',')
        image_data = base64.b64decode(payload) if separator else b""
        if not image_data:
            raise ValueError(
                "Could not convert image input to PIL.Image: data URI has no "
                "base64 payload"
            )
        return Image.open(BytesIO(image_data))

    # Raw base64 never contains a comma, so it is detected by strict decoding
    # instead of a comma/length heuristic. Whitespace is removed first because
    # encoders commonly wrap long base64 output across lines.
    compact = "".join(image_input.split())
    try:
        image_data = base64.b64decode(compact, validate=True)
    except ValueError:
        # Not base64 (binascii.Error is a ValueError subclass) or not ASCII.
        image_data = b""
    if image_data:
        try:
            return Image.open(BytesIO(image_data))
        except OSError:
            # Decoded cleanly but is not an image (e.g. a short file name made
            # only of base64 characters) — fall through to the path check.
            pass

    # Assume it's a file path.
    # NOTE(review): if this input can come from remote clients, accepting
    # arbitrary paths lets them probe local files — consider restricting it.
    if os.path.exists(image_input):
        return Image.open(image_input)

    raise ValueError(
        "Could not convert image input to PIL.Image: input is not a data URI, "
        "valid base64 image data, or an existing file path "
        f"(length {len(image_input)})"
    )
43
+
44
+
45
+ def smoldocling_readimage(image: str, prompt_text: str) -> str:
46
  """
47
  Extract text and structured content from document images using SmolDocling model.
48
 
49
+ This function processes document images (PDFs, scanned documents, screenshots, etc.)
50
+ and converts them to structured text format based on the provided prompt. It uses
51
+ the SmolDocling-256M-preview model for image-to-text conversion with chat-based prompting.
 
52
 
53
  Args:
54
+ image (str): The input document image as base64 encoded string or file path.
55
+ MCP clients will send this as base64.
56
+ prompt_text (str): The instruction or prompt text that guides the model's output format.
57
+ Supported prompts include:
 
58
 
59
  Content Conversion:
60
  - "Convert this page to docling." - Full conversion to DocTags representation
 
64
  - "Convert table to OTSL." - Convert tables to OTSL format (Lysak et al., 2023)
65
 
66
  OCR and Location-based Actions:
67
+ - "OCR the text in a specific location: <loc_155><loc_233><loc_206><loc_237>"
68
+ - Extract text from specific coordinates
69
+ - "Identify element at: <loc_247><loc_482><loc_252><loc_486>"
70
+ - Identify element type at coordinates
71
+ - "Find all 'text' elements on the page, retrieve all section headers."
72
+ - Extract section headers
73
  - "Detect footer elements on the page." - Identify footer content
74
 
75
  Returns:
76
+ str: The extracted and formatted text content from the image, cleaned of special
77
+ tokens and whitespace. The format depends on the prompt_text provided.
 
78
 
79
  Example:
80
+ >>> result = smoldocling_readimage("data:image/jpeg;base64,/9j/4AAQ...", "Convert to docling")
 
 
81
  >>> print(result) # Returns structured document content
82
 
83
  Note:
84
+ - The function is optimized for document images but can handle any image containing text
 
85
  - Processing time depends on image size and complexity
86
  - Maximum output length is limited to 1024 new tokens
87
  """
88
+ # Convert string input (base64 or path) to PIL.Image
89
+ pil_image = convert_to_pil(image)
90
+
91
  messages = [
92
  {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
93
  ]