prithivMLmods committed
Commit e38b6e6 · verified · 1 Parent(s): 47ab751

update app
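
Removes video inference (downsample_video, generate_video, the Video Inference tab, and the video examples) along with the now-unused cv2 and numpy imports, leaving an image-only OCR demo.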

Files changed (1)
  1. app.py +6 -110
app.py CHANGED
@@ -11,9 +11,7 @@ from typing import Iterable
 import gradio as gr
 import spaces
 import torch
-import numpy as np
 from PIL import Image
-import cv2
 
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
@@ -142,27 +140,6 @@ model_o = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-def downsample_video(video_path):
-    """
-    Downsamples the video to evenly spaced frames.
-    Each frame is returned as a PIL image along with its timestamp.
-    """
-    vidcap = cv2.VideoCapture(video_path)
-    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
-    fps = vidcap.get(cv2.CAP_PROP_FPS)
-    frames = []
-    frame_indices = np.linspace(0, total_frames - 1, min(total_frames, 10), dtype=int)
-    for i in frame_indices:
-        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
-        success, image = vidcap.read()
-        if success:
-            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-            pil_image = Image.fromarray(image)
-            timestamp = round(i / fps, 2)
-            frames.append((pil_image, timestamp))
-    vidcap.release()
-    return frames
-
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
                    max_new_tokens: int = 1024,
@@ -217,72 +194,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         time.sleep(0.01)
         yield buffer, buffer
 
-@spaces.GPU
-def generate_video(model_name: str, text: str, video_path: str,
-                   max_new_tokens: int = 1024,
-                   temperature: float = 0.6,
-                   top_p: float = 0.9,
-                   top_k: int = 50,
-                   repetition_penalty: float = 1.2):
-    """
-    Generates responses using the selected model for video input.
-    Yields raw text and Markdown-formatted text.
-    """
-    if model_name == "docscopeOCR-7B-050425-exp":
-        processor, model = processor_m, model_m
-    elif model_name == "coreOCR-7B-050325-preview":
-        processor, model = processor_x, model_x
-    elif model_name == "MonkeyOCR-Recognition":
-        processor, model = processor_g, model_g
-    elif model_name == "Camel-Doc-OCR-080125(v2)":
-        processor, model = processor_o, model_o
-    else:
-        yield "Invalid model selected.", "Invalid model selected."
-        return
-
-    if video_path is None:
-        yield "Please upload a video.", "Please upload a video."
-        return
-
-    frames = downsample_video(video_path)
-    messages = [
-        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
-        {"role": "user", "content": [{"type": "text", "text": text}]}
-    ]
-    for frame in frames:
-        image, timestamp = frame
-        messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
-        messages[1]["content"].append({"type": "image", "image": image})
-    inputs = processor.apply_chat_template(
-        messages,
-        tokenize=True,
-        add_generation_prompt=True,
-        return_dict=True,
-        return_tensors="pt",
-        truncation=True,
-        max_length=MAX_INPUT_TOKEN_LENGTH
-    ).to(device)
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {
-        **inputs,
-        "streamer": streamer,
-        "max_new_tokens": max_new_tokens,
-        "do_sample": True,
-        "temperature": temperature,
-        "top_p": top_p,
-        "top_k": top_k,
-        "repetition_penalty": repetition_penalty,
-    }
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-    buffer = ""
-    for new_text in streamer:
-        buffer += new_text
-        buffer = buffer.replace("<|im_end|>", "")
-        time.sleep(0.01)
-        yield buffer, buffer
-
-# Define examples for image and video inference
+# Define examples for image inference
 image_examples = [
     ["Reconstruct the content [table] as it is.", "images/doc.jpg"],
     ["Reconstruct the doc [table] as it is.", "images/zh.png"],
@@ -292,11 +204,6 @@ image_examples = [
     ["OCR the image", "images/image1.png"]
 ]
 
-video_examples = [
-    ["Explain the video in detail", "videos/2.mp4"],
-    ["Explain the video in detail", "videos/1.mp4"]
-]
-
 css = """
 #main-title h1 {
     font-size: 2.3em !important;
@@ -311,17 +218,11 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     gr.Markdown("# **core [OCR](https://huggingface.co/collections/prithivMLmods/multimodal-implementations)**", elem_id="main-title")
     with gr.Row():
         with gr.Column(scale=2):
-            with gr.Tabs():
-                with gr.TabItem("Image Inference"):
-                    image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
-                    image_upload = gr.Image(type="pil", label="Upload Image", height=290)
-                    image_submit = gr.Button("Submit", variant="primary")
-                    gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
-                with gr.TabItem("Video Inference"):
-                    video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
-                    video_upload = gr.Video(label="Upload Video", height=290)
-                    video_submit = gr.Button("Submit", variant="primary")
-                    gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
+            image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
+            image_upload = gr.Image(type="pil", label="Upload Image", height=290)
+            image_submit = gr.Button("Submit", variant="primary")
+            gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
+
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
@@ -346,11 +247,6 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         outputs=[output, markdown_output]
     )
-    video_submit.click(
-        fn=generate_video,
-        inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-        outputs=[output, markdown_output]
-    )
 
 if __name__ == "__main__":
     demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)
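
For reference, the surviving image path follows the same streaming pattern the removed generate_video used: build a chat-template prompt, run model.generate on a worker thread, and stream partial text through a TextIteratorStreamer. A minimal sketch assembled from the pieces visible in this diff; the wrapper name stream_image_response and its defaults are assumptions, and processor/model stand for any of the pairs loaded in app.py:

import time
from threading import Thread
from PIL import Image
from transformers import TextIteratorStreamer

def stream_image_response(processor, model, text: str, image: Image.Image,
                          device: str = "cuda", max_new_tokens: int = 1024):
    # Single image in the user turn, instead of the sampled video frames
    # the removed generate_video appended.
    messages = [
        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
        {"role": "user", "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": text},
        ]},
    ]
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(device)
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    # generate() runs on a background thread so tokens can be yielded as they arrive.
    Thread(target=model.generate,
           kwargs={**inputs, "streamer": streamer,
                   "max_new_tokens": max_new_tokens, "do_sample": True}).start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text.replace("<|im_end|>", "")
        time.sleep(0.01)
        yield buffer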
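
Since the app launches with queuing enabled, the remaining click handler can also be driven programmatically. A hypothetical gradio_client call; the Space id and the api_name (which Gradio derives from the handler's function name when none is set) are assumptions, not taken from this diff:

from gradio_client import Client, handle_file

client = Client("prithivMLmods/core-OCR")  # assumed Space id
raw, markdown = client.predict(
    "coreOCR-7B-050325-preview",        # model_choice
    "OCR the image",                    # image_query
    handle_file("images/image1.png"),   # image_upload
    1024, 0.6, 0.9, 50, 1.2,            # max_new_tokens, temperature, top_p, top_k, repetition_penalty
    api_name="/generate_image",         # assumed, derived from the fn name
)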