bharatcoder committed on
Commit
fcf0972
·
verified ·
1 Parent(s): 5e6cb9d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -1
app.py CHANGED
@@ -7,6 +7,51 @@ processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
7
  model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
8
 
9
  def smoldocling_readimage(image, prompt_text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  messages = [
11
  {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
12
  ]
@@ -30,4 +75,4 @@ demo = gr.Interface(
30
  description="Upload a document image and convert it to structured docling format."
31
  )
32
 
33
- demo.launch(mcp_server=True)
 
7
  model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
8
 
9
  def smoldocling_readimage(image, prompt_text):
10
+ """
11
+ Extract text and structured content from document images using SmolDocling model.
12
+
13
+ This function processes document images (scanned documents, rendered PDF pages, screenshots, etc.)
14
+ and converts them to structured text format based on the provided prompt. It uses
15
+ the SmolDocling-256M-preview model for image-to-text conversion with chat-based
16
+ prompting.
17
+
18
+ Args:
19
+ image (PIL.Image.Image): The input document image to process. Should be a PIL
20
+ Image object containing a document, text, or any visual content that needs
21
+ to be converted to text.
22
+ prompt_text (str): The instruction or prompt text that guides the model's
23
+ output format. Supported prompts include:
24
+
25
+ Content Conversion:
26
+ - "Convert this page to docling." - Full conversion to DocTags representation
27
+ - "Convert chart to table." - Convert charts to table format
28
+ - "Convert formula to LaTeX." - Convert mathematical formulas to LaTeX
29
+ - "Convert code to text." - Convert code blocks to readable text
30
+ - "Convert table to OTSL." - Convert tables to OTSL format (Lysak et al., 2023)
31
+
32
+ OCR and Location-based Actions:
33
+ - "OCR the text in a specific location: <loc_155><loc_233><loc_206><loc_237>" - Extract text from specific coordinates
34
+ - "Identify element at: <loc_247><loc_482><loc_252><loc_486>" - Identify element type at coordinates
35
+ - "Find all 'text' elements on the page, retrieve all section headers." - Extract section headers
36
+ - "Detect footer elements on the page." - Identify footer content
37
+
38
+ Returns:
39
+ str: The extracted and formatted text content from the image, cleaned of
40
+ special tokens and whitespace. The format depends on the prompt_text
41
+ provided.
42
+
43
+ Example:
44
+ >>> from PIL import Image
45
+ >>> img = Image.open("document.png")
46
+ >>> result = smoldocling_readimage(img, "Convert this page to docling.")
47
+ >>> print(result) # Returns structured document content
48
+
49
+ Note:
50
+ - The function is optimized for document images but can handle any image
51
+ containing text
52
+ - Processing time depends on image size and complexity
53
+ - Maximum output length is limited to 1024 new tokens
54
+ """
55
  messages = [
56
  {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
57
  ]
 
75
  description="Upload a document image and convert it to structured docling format."
76
  )
77
 
78
+ demo.launch(mcp_server=True)