BaoNhan committed on
Commit
13a7fd4
·
verified ·
1 Parent(s): d3a4d4d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -21
app.py CHANGED
@@ -1,4 +1,3 @@
1
- # --- RAG / Semantic Search imports ---
2
  import numpy as np
3
  import traceback
4
  import torch
@@ -18,7 +17,6 @@ client = OpenAI(
18
  base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
19
  )
20
 
21
- # --- Functions for RAG ---
22
  def md_to_kb_safe(md_text, embedding_model_name="sentence-transformers/all-MiniLM-L6-v2"):
23
  try:
24
  headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
@@ -157,11 +155,7 @@ async def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, formula_enable, t
157
 
158
 
159
  def compress_directory_to_zip(directory_path, output_zip_path):
160
- """压缩指定目录到一个 ZIP 文件。
161
 
162
- :param directory_path: 要压缩的目录路径
163
- :param output_zip_path: 输出的 ZIP 文件路径
164
- """
165
  try:
166
  with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
167
 
@@ -186,23 +180,19 @@ def image_to_base64(image_path):
186
 
187
 
188
def replace_image_with_base64(markdown_text, image_dir_path):
    """Inline every Markdown image reference as a base64 data URI.

    Each ``![alt](relative/path)`` occurrence is rewritten so the link target
    becomes a ``data:image/jpeg;base64,...`` URI, resolving the relative path
    against *image_dir_path*. Text without image tags is returned unchanged.
    """
    image_tag_pattern = r'\!\[(?:[^\]]*)\]\(([^)]+)\)'

    def _inline(match):
        # Group 1 captures the (relative) image path inside the parentheses.
        rel_path = match.group(1)
        encoded = image_to_base64(os.path.join(image_dir_path, rel_path))
        return f'![{rel_path}](data:image/jpeg;base64,{encoded})'

    return re.sub(image_tag_pattern, _inline, markdown_text)
201
 
202
 
203
  async def to_markdown(file_path, end_pages=10, is_ocr=False, formula_enable=True, table_enable=True, language="ch", backend="pipeline", url=None):
204
  file_path = to_pdf(file_path)
205
- # 获取识别的md文件以及压缩包文件路径
206
  local_md_dir, file_name = await parse_pdf(file_path, './output', end_pages - 1, is_ocr, formula_enable, table_enable, language, backend, url)
207
  archive_zip_path = os.path.join('./output', str_sha256(local_md_dir) + '.zip')
208
  zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
@@ -214,7 +204,6 @@ async def to_markdown(file_path, end_pages=10, is_ocr=False, formula_enable=True
214
  with open(md_path, 'r', encoding='utf-8') as f:
215
  txt_content = f.read()
216
  md_content = replace_image_with_base64(txt_content, local_md_dir)
217
- # 返回转换后的PDF路径
218
  new_pdf_path = os.path.join(local_md_dir, file_name + '_layout.pdf')
219
 
220
  return md_content, txt_content, archive_zip_path, new_pdf_path
@@ -231,7 +220,6 @@ async def to_markdown_safe(file_path, end_pages=10, is_ocr=False,
231
  except Exception as e:
232
  err_msg = traceback.format_exc()
233
  logger.error(f"Error in to_markdown: {err_msg}")
234
- # trả về giá trị mặc định để Gradio không crash
235
  return f"Error: {str(e)}", err_msg, None, None
236
 
237
 
@@ -297,15 +285,12 @@ devanagari_lang = [
297
# OCR language codes: language-specific models plus grouped script families.
other_lang = ['ch', 'ch_lite', 'ch_server', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka', "el", "th"]
add_lang = ['latin', 'arabic', 'east_slavic', 'cyrillic', 'devanagari']

# Combined list offered in the UI dropdown.
all_lang = [*other_lang, *add_lang]
304
 
305
 
306
def safe_stem(file_path):
    """Return a filesystem-safe stem of *file_path*.

    Keeps word characters (letters, digits, underscore) and dots; every other
    character in the stem is replaced with an underscore.
    """
    return re.sub(r'[^\w.]', '_', Path(file_path).stem)
310
 
311
 
@@ -316,20 +301,16 @@ def to_pdf(file_path):
316
 
317
  pdf_bytes = read_fn(file_path)
318
 
319
- # unique_filename = f'{uuid.uuid4()}.pdf'
320
  unique_filename = f'{safe_stem(file_path)}.pdf'
321
 
322
- # 构建完整的文件路径
323
  tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
324
 
325
- # 将字节数据写入文件
326
  with open(tmp_file_path, 'wb') as tmp_pdf_file:
327
  tmp_pdf_file.write(pdf_bytes)
328
 
329
  return tmp_file_path
330
 
331
 
332
- # 更新界面函数
333
  def update_interface(backend_choice):
334
  if backend_choice in ["vlm-transformers", "vlm-vllm-async-engine"]:
335
  return gr.update(visible=False), gr.update(visible=False)
@@ -487,14 +468,12 @@ def main(ctx,
487
  inputs=[rag_md_text, rag_query, rag_true],
488
  outputs=[rag_answer_out, rag_score_out, rag_timing_out]
489
  )
490
- # 添加事件处理
491
  backend.change(
492
  fn=update_interface,
493
  inputs=[backend],
494
  outputs=[client_options, ocr_options],
495
  api_name=False
496
  )
497
- # 添加demo.load事件,在页面加载时触发一次界面更新
498
  demo.load(
499
  fn=update_interface,
500
  inputs=[backend],
 
 
1
  import numpy as np
2
  import traceback
3
  import torch
 
17
  base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
18
  )
19
 
 
20
  def md_to_kb_safe(md_text, embedding_model_name="sentence-transformers/all-MiniLM-L6-v2"):
21
  try:
22
  headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
 
155
 
156
 
157
  def compress_directory_to_zip(directory_path, output_zip_path):
 
158
 
 
 
 
159
  try:
160
  with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
161
 
 
180
 
181
 
182
def replace_image_with_base64(markdown_text, image_dir_path):
    """Inline every Markdown image reference as a base64 data URI.

    Each ``![alt](relative/path)`` occurrence is rewritten so the link target
    becomes a ``data:image/jpeg;base64,...`` URI, resolving the relative path
    against *image_dir_path*. Text without image tags is returned unchanged.
    """
    image_tag_pattern = r'\!\[(?:[^\]]*)\]\(([^)]+)\)'

    def _inline(match):
        # Group 1 captures the (relative) image path inside the parentheses.
        rel_path = match.group(1)
        encoded = image_to_base64(os.path.join(image_dir_path, rel_path))
        return f'![{rel_path}](data:image/jpeg;base64,{encoded})'

    return re.sub(image_tag_pattern, _inline, markdown_text)
192
 
193
 
194
  async def to_markdown(file_path, end_pages=10, is_ocr=False, formula_enable=True, table_enable=True, language="ch", backend="pipeline", url=None):
195
  file_path = to_pdf(file_path)
 
196
  local_md_dir, file_name = await parse_pdf(file_path, './output', end_pages - 1, is_ocr, formula_enable, table_enable, language, backend, url)
197
  archive_zip_path = os.path.join('./output', str_sha256(local_md_dir) + '.zip')
198
  zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
 
204
  with open(md_path, 'r', encoding='utf-8') as f:
205
  txt_content = f.read()
206
  md_content = replace_image_with_base64(txt_content, local_md_dir)
 
207
  new_pdf_path = os.path.join(local_md_dir, file_name + '_layout.pdf')
208
 
209
  return md_content, txt_content, archive_zip_path, new_pdf_path
 
220
  except Exception as e:
221
  err_msg = traceback.format_exc()
222
  logger.error(f"Error in to_markdown: {err_msg}")
 
223
  return f"Error: {str(e)}", err_msg, None, None
224
 
225
 
 
285
# OCR language codes: language-specific models plus grouped script families.
other_lang = ['ch', 'ch_lite', 'ch_server', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka', "el", "th"]
add_lang = ['latin', 'arabic', 'east_slavic', 'cyrillic', 'devanagari']

# Combined list offered in the UI dropdown.
all_lang = [*other_lang, *add_lang]
290
 
291
 
292
def safe_stem(file_path):
    """Return a filesystem-safe stem of *file_path*.

    Keeps word characters (letters, digits, underscore) and dots; every other
    character in the stem is replaced with an underscore.
    """
    return re.sub(r'[^\w.]', '_', Path(file_path).stem)
295
 
296
 
 
301
 
302
  pdf_bytes = read_fn(file_path)
303
 
 
304
  unique_filename = f'{safe_stem(file_path)}.pdf'
305
 
 
306
  tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
307
 
 
308
  with open(tmp_file_path, 'wb') as tmp_pdf_file:
309
  tmp_pdf_file.write(pdf_bytes)
310
 
311
  return tmp_file_path
312
 
313
 
 
314
  def update_interface(backend_choice):
315
  if backend_choice in ["vlm-transformers", "vlm-vllm-async-engine"]:
316
  return gr.update(visible=False), gr.update(visible=False)
 
468
  inputs=[rag_md_text, rag_query, rag_true],
469
  outputs=[rag_answer_out, rag_score_out, rag_timing_out]
470
  )
 
471
  backend.change(
472
  fn=update_interface,
473
  inputs=[backend],
474
  outputs=[client_options, ocr_options],
475
  api_name=False
476
  )
 
477
  demo.load(
478
  fn=update_interface,
479
  inputs=[backend],