Update app.py
app.py CHANGED
@@ -1,4 +1,3 @@
-# --- RAG / Semantic Search imports ---
 import numpy as np
 import traceback
 import torch
@@ -18,7 +17,6 @@ client = OpenAI(
     base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
 )
 
-# --- Functions for RAG ---
 def md_to_kb_safe(md_text, embedding_model_name="sentence-transformers/all-MiniLM-L6-v2"):
     try:
         headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
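
The `headers_to_split_on` format in `md_to_kb_safe` matches LangChain's `MarkdownHeaderTextSplitter`. A minimal sketch of the chunk-then-embed step the function presumably performs (the helper name `build_kb` and the exact return layout are assumptions, not taken from this diff):

# Sketch only: chunk Markdown by headers, then embed the chunks.
# Assumes langchain-text-splitters and sentence-transformers are installed.
from langchain_text_splitters import MarkdownHeaderTextSplitter
from sentence_transformers import SentenceTransformer

def build_kb(md_text, embedding_model_name="sentence-transformers/all-MiniLM-L6-v2"):
    headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
    splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    chunks = splitter.split_text(md_text)                        # Documents with header metadata
    model = SentenceTransformer(embedding_model_name)
    embeddings = model.encode([c.page_content for c in chunks])  # (n_chunks, 384) numpy array
    return chunks, embeddings
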
@@ -157,11 +155,7 @@ async def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, formula_enable, t
 
 
 def compress_directory_to_zip(directory_path, output_zip_path):
-    """Compress the specified directory into a ZIP file.
 
-    :param directory_path: path of the directory to compress
-    :param output_zip_path: path of the output ZIP file
-    """
     try:
         with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
 
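
The hunk is cut off before the body of the `with` block. A typical implementation walks the directory and writes each file under a path relative to the directory root; the sketch below shows that pattern, with the 0 / -1 status codes being an assumption based on how `zip_archive_success` is used later:

# Sketch of the usual os.walk + zipf.write pattern inside the `with` block above.
import os
import zipfile

def compress_directory_to_zip(directory_path, output_zip_path):
    try:
        with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, _dirs, files in os.walk(directory_path):
                for name in files:
                    file_path = os.path.join(root, name)
                    # Store entries relative to the directory being compressed
                    zipf.write(file_path, os.path.relpath(file_path, directory_path))
        return 0    # assumed success code
    except Exception:
        return -1   # assumed failure code
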
@@ -186,23 +180,19 @@ def image_to_base64(image_path):
 
 
 def replace_image_with_base64(markdown_text, image_dir_path):
-    # Match image tags in the Markdown
     pattern = r'\!\[(?:[^\]]*)\]\(([^)]+)\)'
 
-    # Replace image links
     def replace(match):
         relative_path = match.group(1)
         full_path = os.path.join(image_dir_path, relative_path)
         base64_image = image_to_base64(full_path)
         return f'![{relative_path}](data:image/jpeg;base64,{base64_image})'
 
-    # Apply the replacement
     return re.sub(pattern, replace, markdown_text)
 
 
 async def to_markdown(file_path, end_pages=10, is_ocr=False, formula_enable=True, table_enable=True, language="ch", backend="pipeline", url=None):
     file_path = to_pdf(file_path)
-    # Get the paths of the recognized md file and of the archive
     local_md_dir, file_name = await parse_pdf(file_path, './output', end_pages - 1, is_ocr, formula_enable, table_enable, language, backend, url)
     archive_zip_path = os.path.join('./output', str_sha256(local_md_dir) + '.zip')
     zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
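
With the Markdown image tag restored in the `replace` callback, every `![alt](relative/path)` reference in the generated Markdown is rewritten to an inline `data:` URI, so Gradio can render the preview without serving the output directory. An illustrative call (paths are made up):

# Illustrative usage; "./output/example_doc/auto" stands in for a real local_md_dir.
md = "# Page 1\n\n![figure](images/fig_1.jpg)\n"
embedded_md = replace_image_with_base64(md, "./output/example_doc/auto")
# embedded_md now contains ![images/fig_1.jpg](data:image/jpeg;base64,...)
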
@@ -214,7 +204,6 @@ async def to_markdown(file_path, end_pages=10, is_ocr=False, formula_enable=True
     with open(md_path, 'r', encoding='utf-8') as f:
         txt_content = f.read()
     md_content = replace_image_with_base64(txt_content, local_md_dir)
-    # Return the path of the converted PDF
     new_pdf_path = os.path.join(local_md_dir, file_name + '_layout.pdf')
 
     return md_content, txt_content, archive_zip_path, new_pdf_path
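
`str_sha256` is a helper defined elsewhere in app.py; it presumably hashes the output directory path so each document gets a stable, filesystem-safe archive name. A sketch of a plausible implementation (an assumption, the real one is not shown in this diff):

import hashlib

def str_sha256(text: str) -> str:
    # Assumed implementation: hex digest of the UTF-8 encoded string.
    return hashlib.sha256(text.encode("utf-8")).hexdigest()
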
@@ -231,7 +220,6 @@ async def to_markdown_safe(file_path, end_pages=10, is_ocr=False,
     except Exception as e:
         err_msg = traceback.format_exc()
         logger.error(f"Error in to_markdown: {err_msg}")
-        # Return default values so Gradio does not crash
         return f"Error: {str(e)}", err_msg, None, None
 
 
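
`to_markdown_safe` follows the usual Gradio wrapper pattern: catch everything, log the traceback, and return one placeholder per output component instead of raising. A condensed sketch (the real function takes the explicit parameters shown in the hunk header, and the logger import is an assumption):

import traceback
from loguru import logger   # assumption: any logger with .error() works

async def to_markdown_safe(*args, **kwargs):
    # Never let an exception reach Gradio; surface the traceback in the UI instead.
    try:
        return await to_markdown(*args, **kwargs)
    except Exception as e:
        err_msg = traceback.format_exc()
        logger.error(f"Error in to_markdown: {err_msg}")
        # One placeholder per output: md_content, txt_content, archive_zip_path, new_pdf_path
        return f"Error: {str(e)}", err_msg, None, None
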
@@ -297,15 +285,12 @@ devanagari_lang = [
 other_lang = ['ch', 'ch_lite', 'ch_server', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka', "el", "th"]
 add_lang = ['latin', 'arabic', 'east_slavic', 'cyrillic', 'devanagari']
 
-# all_lang = ['', 'auto']
 all_lang = []
-# all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang])
 all_lang.extend([*other_lang, *add_lang])
 
 
 def safe_stem(file_path):
     stem = Path(file_path).stem
-    # Keep only letters, digits, underscores and dots; replace everything else with an underscore
     return re.sub(r'[^\w.]', '_', stem)
 
 
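
`safe_stem` keeps only word characters and dots in the file stem and replaces everything else with underscores, which is what makes the temporary PDF name filesystem-safe. A self-contained example with a made-up path:

import re
from pathlib import Path

def safe_stem(file_path):
    stem = Path(file_path).stem
    # Keep word characters and dots; everything else becomes an underscore.
    return re.sub(r'[^\w.]', '_', stem)

print(safe_stem("/tmp/my report (v2).pdf"))   # my_report__v2_
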
@@ -316,20 +301,16 @@ def to_pdf(file_path):
 
     pdf_bytes = read_fn(file_path)
 
-    # unique_filename = f'{uuid.uuid4()}.pdf'
     unique_filename = f'{safe_stem(file_path)}.pdf'
 
-    # Build the full file path
     tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
 
-    # Write the byte data to the file
     with open(tmp_file_path, 'wb') as tmp_pdf_file:
         tmp_pdf_file.write(pdf_bytes)
 
     return tmp_file_path
 
 
-# Function to update the interface
 def update_interface(backend_choice):
     if backend_choice in ["vlm-transformers", "vlm-vllm-async-engine"]:
         return gr.update(visible=False), gr.update(visible=False)
@@ -487,14 +468,12 @@ def main(ctx,
         inputs=[rag_md_text, rag_query, rag_true],
         outputs=[rag_answer_out, rag_score_out, rag_timing_out]
     )
-    # Add event handling
     backend.change(
         fn=update_interface,
         inputs=[backend],
         outputs=[client_options, ocr_options],
         api_name=False
     )
-    # Add a demo.load event so the interface updates once when the page loads
     demo.load(
         fn=update_interface,
         inputs=[backend],
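
The `backend.change` / `demo.load` wiring implements a standard Gradio pattern: one callback returns a `gr.update(visible=...)` per component whose visibility depends on the selected backend, and `demo.load` runs it once so the initial state matches the default choice. A stripped-down sketch of the same pattern (component names and the else branch are illustrative, not copied from app.py):

import gradio as gr

def update_interface(backend_choice):
    # Hide client/OCR options for fully local VLM backends; show them otherwise (else branch assumed).
    if backend_choice in ["vlm-transformers", "vlm-vllm-async-engine"]:
        return gr.update(visible=False), gr.update(visible=False)
    return gr.update(visible=True), gr.update(visible=True)

with gr.Blocks() as demo:
    backend = gr.Dropdown(
        choices=["pipeline", "vlm-transformers", "vlm-vllm-async-engine", "vlm-http-client"],
        value="pipeline", label="Backend",
    )
    with gr.Group() as client_options:
        url = gr.Textbox(label="Server URL")
    with gr.Group() as ocr_options:
        is_ocr = gr.Checkbox(label="Force OCR")

    backend.change(update_interface, inputs=[backend], outputs=[client_options, ocr_options])
    # Run once on page load so the initial visibility matches the default choice.
    demo.load(update_interface, inputs=[backend], outputs=[client_options, ocr_options])

if __name__ == "__main__":
    demo.launch()
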