Update app.py
app.py CHANGED
@@ -1,4 +1,3 @@
-# --- RAG / Semantic Search imports ---
 import numpy as np
 import traceback
 import torch
@@ -18,7 +17,6 @@ client = OpenAI(
     base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
 )
 
-# --- Functions for RAG ---
 def md_to_kb_safe(md_text, embedding_model_name="sentence-transformers/all-MiniLM-L6-v2"):
     try:
         headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
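
The `headers_to_split_on` format in `md_to_kb_safe` matches LangChain's `MarkdownHeaderTextSplitter`. A minimal sketch of the chunk-then-embed step the function presumably performs (the helper name `build_kb` and the exact return layout are assumptions, not taken from this diff):

# Sketch only: chunk Markdown by headers, then embed the chunks.
# Assumes langchain-text-splitters and sentence-transformers are installed.
from langchain_text_splitters import MarkdownHeaderTextSplitter
from sentence_transformers import SentenceTransformer

def build_kb(md_text, embedding_model_name="sentence-transformers/all-MiniLM-L6-v2"):
    headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
    splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    chunks = splitter.split_text(md_text)                        # Documents with header metadata
    model = SentenceTransformer(embedding_model_name)
    embeddings = model.encode([c.page_content for c in chunks])  # (n_chunks, 384) numpy array
    return chunks, embeddings
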
@@ -157,11 +155,7 @@ async def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, formula_enable, t
 
 
 def compress_directory_to_zip(directory_path, output_zip_path):
-    """Compress the specified directory into a ZIP file.
 
-    :param directory_path: path of the directory to compress
-    :param output_zip_path: path of the output ZIP file
-    """
     try:
         with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
 
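
The hunk is cut off before the body of the `with` block. A typical implementation walks the directory and writes each file under a path relative to the directory root; the sketch below shows that pattern, with the 0 / -1 status codes being an assumption based on how `zip_archive_success` is used later:

# Sketch of the usual os.walk + zipf.write pattern inside the `with` block above.
import os
import zipfile

def compress_directory_to_zip(directory_path, output_zip_path):
    try:
        with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, _dirs, files in os.walk(directory_path):
                for name in files:
                    file_path = os.path.join(root, name)
                    # Store entries relative to the directory being compressed
                    zipf.write(file_path, os.path.relpath(file_path, directory_path))
        return 0    # assumed success code
    except Exception:
        return -1   # assumed failure code
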
@@ -186,23 +180,19 @@ def image_to_base64(image_path):
 
 
 def replace_image_with_base64(markdown_text, image_dir_path):
-    # Match image tags in the Markdown
     pattern = r'\!\[(?:[^\]]*)\]\(([^)]+)\)'
 
-    # Replace image links
     def replace(match):
         relative_path = match.group(1)
         full_path = os.path.join(image_dir_path, relative_path)
         base64_image = image_to_base64(full_path)
         return f'![{relative_path}](data:image/jpeg;base64,{base64_image})'
 
-    # Apply the replacement
     return re.sub(pattern, replace, markdown_text)
 
 
 async def to_markdown(file_path, end_pages=10, is_ocr=False, formula_enable=True, table_enable=True, language="ch", backend="pipeline", url=None):
     file_path = to_pdf(file_path)
-    # Get the paths of the recognized md file and of the archive
     local_md_dir, file_name = await parse_pdf(file_path, './output', end_pages - 1, is_ocr, formula_enable, table_enable, language, backend, url)
     archive_zip_path = os.path.join('./output', str_sha256(local_md_dir) + '.zip')
     zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
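
With the Markdown image tag restored in the `replace` callback, every `![alt](relative/path)` reference in the generated Markdown is rewritten to an inline `data:` URI, so Gradio can render the preview without serving the output directory. An illustrative call (paths are made up):

# Illustrative usage; "./output/example_doc/auto" stands in for a real local_md_dir.
md = "# Page 1\n\n![figure](images/fig_1.jpg)\n"
embedded_md = replace_image_with_base64(md, "./output/example_doc/auto")
# embedded_md now contains ![images/fig_1.jpg](data:image/jpeg;base64,...)
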
@@ -214,7 +204,6 @@ async def to_markdown(file_path, end_pages=10, is_ocr=False, formula_enable=True
     with open(md_path, 'r', encoding='utf-8') as f:
         txt_content = f.read()
     md_content = replace_image_with_base64(txt_content, local_md_dir)
-    # Return the path of the converted PDF
     new_pdf_path = os.path.join(local_md_dir, file_name + '_layout.pdf')
 
     return md_content, txt_content, archive_zip_path, new_pdf_path
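
`str_sha256` is a helper defined elsewhere in app.py; it presumably hashes the output directory path so each document gets a stable, filesystem-safe archive name. A sketch of a plausible implementation (an assumption, the real one is not shown in this diff):

import hashlib

def str_sha256(text: str) -> str:
    # Assumed implementation: hex digest of the UTF-8 encoded string.
    return hashlib.sha256(text.encode("utf-8")).hexdigest()
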
@@ -231,7 +220,6 @@ async def to_markdown_safe(file_path, end_pages=10, is_ocr=False,
     except Exception as e:
         err_msg = traceback.format_exc()
         logger.error(f"Error in to_markdown: {err_msg}")
-        # Return default values so Gradio does not crash
         return f"Error: {str(e)}", err_msg, None, None
 
 
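
`to_markdown_safe` follows the usual Gradio wrapper pattern: catch everything, log the traceback, and return one placeholder per output component instead of raising. A condensed sketch (the real function takes the explicit parameters shown in the hunk header, and the logger import is an assumption):

import traceback
from loguru import logger   # assumption: any logger with .error() works

async def to_markdown_safe(*args, **kwargs):
    # Never let an exception reach Gradio; surface the traceback in the UI instead.
    try:
        return await to_markdown(*args, **kwargs)
    except Exception as e:
        err_msg = traceback.format_exc()
        logger.error(f"Error in to_markdown: {err_msg}")
        # One placeholder per output: md_content, txt_content, archive_zip_path, new_pdf_path
        return f"Error: {str(e)}", err_msg, None, None
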
@@ -297,15 +285,12 @@ devanagari_lang = [
 other_lang = ['ch', 'ch_lite', 'ch_server', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka', "el", "th"]
 add_lang = ['latin', 'arabic', 'east_slavic', 'cyrillic', 'devanagari']
 
-# all_lang = ['', 'auto']
 all_lang = []
-# all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang])
 all_lang.extend([*other_lang, *add_lang])
 
 
 def safe_stem(file_path):
     stem = Path(file_path).stem
-    # Keep only letters, digits, underscores and dots; replace everything else with an underscore
     return re.sub(r'[^\w.]', '_', stem)
 
 
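
`safe_stem` keeps only word characters and dots in the file stem and replaces everything else with underscores, which is what makes the temporary PDF name filesystem-safe. A self-contained example with a made-up path:

import re
from pathlib import Path

def safe_stem(file_path):
    stem = Path(file_path).stem
    # Keep word characters and dots; everything else becomes an underscore.
    return re.sub(r'[^\w.]', '_', stem)

print(safe_stem("/tmp/my report (v2).pdf"))   # my_report__v2_
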
@@ -316,20 +301,16 @@ def to_pdf(file_path):
 
     pdf_bytes = read_fn(file_path)
 
-    # unique_filename = f'{uuid.uuid4()}.pdf'
     unique_filename = f'{safe_stem(file_path)}.pdf'
 
-    # Build the full file path
     tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
 
-    # Write the byte data to the file
     with open(tmp_file_path, 'wb') as tmp_pdf_file:
         tmp_pdf_file.write(pdf_bytes)
 
     return tmp_file_path
 
 
-# Function to update the interface
 def update_interface(backend_choice):
     if backend_choice in ["vlm-transformers", "vlm-vllm-async-engine"]:
         return gr.update(visible=False), gr.update(visible=False)
@@ -487,14 +468,12 @@ def main(ctx,
         inputs=[rag_md_text, rag_query, rag_true],
         outputs=[rag_answer_out, rag_score_out, rag_timing_out]
     )
-    # Add event handling
     backend.change(
         fn=update_interface,
         inputs=[backend],
         outputs=[client_options, ocr_options],
         api_name=False
     )
-    # Add a demo.load event so the interface updates once when the page loads
     demo.load(
         fn=update_interface,
         inputs=[backend],
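
The `backend.change` / `demo.load` wiring implements a standard Gradio pattern: one callback returns a `gr.update(visible=...)` per component whose visibility depends on the selected backend, and `demo.load` runs it once so the initial state matches the default choice. A stripped-down sketch of the same pattern (component names and the else branch are illustrative, not copied from app.py):

import gradio as gr

def update_interface(backend_choice):
    # Hide client/OCR options for fully local VLM backends; show them otherwise (else branch assumed).
    if backend_choice in ["vlm-transformers", "vlm-vllm-async-engine"]:
        return gr.update(visible=False), gr.update(visible=False)
    return gr.update(visible=True), gr.update(visible=True)

with gr.Blocks() as demo:
    backend = gr.Dropdown(
        choices=["pipeline", "vlm-transformers", "vlm-vllm-async-engine", "vlm-http-client"],
        value="pipeline", label="Backend",
    )
    with gr.Group() as client_options:
        url = gr.Textbox(label="Server URL")
    with gr.Group() as ocr_options:
        is_ocr = gr.Checkbox(label="Force OCR")

    backend.change(update_interface, inputs=[backend], outputs=[client_options, ocr_options])
    # Run once on page load so the initial visibility matches the default choice.
    demo.load(update_interface, inputs=[backend], outputs=[client_options, ocr_options])

if __name__ == "__main__":
    demo.launch()
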