Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,7 +5,7 @@ import re
|
|
| 5 |
import tempfile
|
| 6 |
import gc
|
| 7 |
from collections.abc import Iterator
|
| 8 |
-
from threading import Thread
|
| 9 |
import json
|
| 10 |
import requests
|
| 11 |
import cv2
|
|
@@ -21,7 +21,6 @@ import warnings
|
|
| 21 |
from typing import Dict, List, Optional, Union
|
| 22 |
import librosa
|
| 23 |
import scipy.signal as sps
|
| 24 |
-
from threading import Thread, Lock
|
| 25 |
import queue
|
| 26 |
|
| 27 |
# CSV/TXT ๋ถ์
|
|
@@ -55,6 +54,11 @@ model_loaded = False
|
|
| 55 |
whisper_loaded = False
|
| 56 |
model_name = "Gemma3-R1984-4B"
|
| 57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
##############################################################################
|
| 59 |
# ๋ฉ๋ชจ๋ฆฌ ๊ด๋ฆฌ
|
| 60 |
##############################################################################
|
|
@@ -64,6 +68,24 @@ def clear_cuda_cache():
|
|
| 64 |
torch.cuda.empty_cache()
|
| 65 |
gc.collect()
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
##############################################################################
|
| 68 |
# Whisper ๋ชจ๋ธ ๋ก๋
|
| 69 |
##############################################################################
|
|
@@ -93,21 +115,8 @@ def load_whisper():
|
|
| 93 |
return False
|
| 94 |
|
| 95 |
##############################################################################
|
| 96 |
-
# ์ค๋์ค ์ฒ๋ฆฌ ํจ์
|
| 97 |
##############################################################################
|
| 98 |
-
import scipy.signal as sps
|
| 99 |
-
from threading import Thread, Lock
|
| 100 |
-
import queue
|
| 101 |
-
|
| 102 |
-
# ์ค๋์ค ๋ฒํผ ๊ด๋ฆฌ
|
| 103 |
-
audio_buffer_lock = Lock()
|
| 104 |
-
audio_buffer_a = []
|
| 105 |
-
audio_buffer_b = []
|
| 106 |
-
current_buffer = 'a' # ํ์ฌ ๋
น์ ์ค์ธ ๋ฒํผ
|
| 107 |
-
processing_queue = queue.Queue() # ์ฒ๋ฆฌ ๋๊ธฐ ํ
|
| 108 |
-
ready_audio_queue = queue.Queue() # ์ ์ฌ ์ค๋น๋ ์ค๋์ค
|
| 109 |
-
last_transcription = "" # ๋ง์ง๋ง ์ ์ฌ ๊ฒฐ๊ณผ
|
| 110 |
-
|
| 111 |
def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int = 16000) -> np.ndarray:
|
| 112 |
"""์ค๋์ค ๋ฆฌ์ํ๋ง"""
|
| 113 |
if orig_sr == target_sr:
|
|
@@ -146,152 +155,56 @@ def transcribe_audio_whisper(audio_array: np.ndarray, sr: int = 16000):
|
|
| 146 |
logger.error(traceback.format_exc())
|
| 147 |
return None
|
| 148 |
|
| 149 |
-
def
|
| 150 |
-
"""์ค๋์ค
|
| 151 |
-
global
|
| 152 |
-
|
| 153 |
-
if audio_chunk is None:
|
| 154 |
-
return
|
| 155 |
|
| 156 |
-
|
| 157 |
-
if isinstance(audio_chunk, tuple) and len(audio_chunk) == 2:
|
| 158 |
-
sr, audio = audio_chunk
|
| 159 |
-
else:
|
| 160 |
-
logger.warning(f"์์์น ๋ชปํ ์ค๋์ค ํ์: {type(audio_chunk)}")
|
| 161 |
-
return
|
| 162 |
-
|
| 163 |
-
# ์ค๋์ค ๋ฐ์ดํฐ ๊ฒ์ฆ
|
| 164 |
-
if audio is None or len(audio) == 0:
|
| 165 |
-
return
|
| 166 |
-
|
| 167 |
-
# numpy ๋ฐฐ์ด๋ก ๋ณํ
|
| 168 |
-
if not isinstance(audio, np.ndarray):
|
| 169 |
-
audio = np.array(audio)
|
| 170 |
-
|
| 171 |
-
# ์คํ
๋ ์ค๋ฅผ ๋ชจ๋
ธ๋ก ๋ณํ
|
| 172 |
-
if audio.ndim > 1:
|
| 173 |
-
audio = audio.mean(axis=1)
|
| 174 |
-
|
| 175 |
-
# ๋ฌด์ ์ฒดํฌ (๋๋ฌด ์์ ์๋ฆฌ๋ ๋ฌด์)
|
| 176 |
-
max_val = np.max(np.abs(audio))
|
| 177 |
-
if max_val < 0.001:
|
| 178 |
-
return
|
| 179 |
-
|
| 180 |
-
with audio_buffer_lock:
|
| 181 |
-
if current_buffer == 'a':
|
| 182 |
-
audio_buffer_a.append((audio, sr))
|
| 183 |
-
if len(audio_buffer_a) % 10 == 0: # 10์ฒญํฌ๋ง๋ค ๋ก๊ทธ
|
| 184 |
-
logger.info(f"๋ฒํผ A: {len(audio_buffer_a)} ์ฒญํฌ, ์ต๋๊ฐ: {max_val:.4f}")
|
| 185 |
-
else:
|
| 186 |
-
audio_buffer_b.append((audio, sr))
|
| 187 |
-
if len(audio_buffer_b) % 10 == 0: # 10์ฒญํฌ๋ง๋ค ๋ก๊ทธ
|
| 188 |
-
logger.info(f"๋ฒํผ B: {len(audio_buffer_b)} ์ฒญํฌ, ์ต๋๊ฐ: {max_val:.4f}")
|
| 189 |
-
|
| 190 |
-
def switch_buffers():
|
| 191 |
-
"""๋ฒํผ ์ ํ ๋ฐ ์ฒ๋ฆฌ ํ์ ์ถ๊ฐ"""
|
| 192 |
-
global current_buffer, audio_buffer_a, audio_buffer_b
|
| 193 |
-
|
| 194 |
-
with audio_buffer_lock:
|
| 195 |
-
if current_buffer == 'a':
|
| 196 |
-
# A ๋ฒํผ๋ฅผ ์ฒ๋ฆฌ ํ์ ์ถ๊ฐ
|
| 197 |
-
if audio_buffer_a:
|
| 198 |
-
logger.info(f"๋ฒํผ A ์ ํ: {len(audio_buffer_a)} ์ฒญํฌ")
|
| 199 |
-
processing_queue.put(('a', audio_buffer_a.copy()))
|
| 200 |
-
audio_buffer_a.clear()
|
| 201 |
-
current_buffer = 'b'
|
| 202 |
-
else:
|
| 203 |
-
# B ๋ฒํผ๋ฅผ ์ฒ๋ฆฌ ํ์ ์ถ๊ฐ
|
| 204 |
-
if audio_buffer_b:
|
| 205 |
-
logger.info(f"๋ฒํผ B ์ ํ: {len(audio_buffer_b)} ์ฒญํฌ")
|
| 206 |
-
processing_queue.put(('b', audio_buffer_b.copy()))
|
| 207 |
-
audio_buffer_b.clear()
|
| 208 |
-
current_buffer = 'a'
|
| 209 |
-
|
| 210 |
-
def process_audio_buffer(buffer_data):
|
| 211 |
-
"""๋ฒํผ์ ์ค๋์ค ๋ฐ์ดํฐ ์ฒ๋ฆฌ"""
|
| 212 |
-
buffer_name, audio_chunks = buffer_data
|
| 213 |
-
|
| 214 |
-
if not audio_chunks:
|
| 215 |
-
logger.warning(f"๋ฒํผ {buffer_name} ๋น์ด์์")
|
| 216 |
return None
|
| 217 |
|
| 218 |
try:
|
| 219 |
-
#
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
except Exception as e:
|
| 246 |
-
logger.error(f"์ค๋์ค
|
| 247 |
import traceback
|
| 248 |
logger.error(traceback.format_exc())
|
| 249 |
|
| 250 |
return None
|
| 251 |
|
| 252 |
-
# ๋ฐฑ๊ทธ๋ผ์ด๋ ์ฒ๋ฆฌ ์ค๋ ๋
|
| 253 |
-
def audio_processing_worker():
|
| 254 |
-
"""๋ฐฑ๊ทธ๋ผ์ด๋์์ ์ค๋์ค ๋ฒํผ ์ฒ๋ฆฌ"""
|
| 255 |
-
global ready_audio_queue
|
| 256 |
-
|
| 257 |
-
while True:
|
| 258 |
-
try:
|
| 259 |
-
# ์ฒ๋ฆฌํ ๋ฒํผ ๊ฐ์ ธ์ค๊ธฐ
|
| 260 |
-
buffer_data = processing_queue.get(timeout=1)
|
| 261 |
-
|
| 262 |
-
# ์ค๋์ค ์ฒ๋ฆฌ (์ค๋น๋ง)
|
| 263 |
-
prepared_audio = process_audio_buffer(buffer_data)
|
| 264 |
-
|
| 265 |
-
if prepared_audio is not None:
|
| 266 |
-
# ์ค๋น๋ ์ค๋์ค๋ฅผ ํ์ ์ถ๊ฐ
|
| 267 |
-
ready_audio_queue.put(prepared_audio)
|
| 268 |
-
logger.info("์ค๋์ค ์ ์ฌ ์ค๋น ์๋ฃ")
|
| 269 |
-
|
| 270 |
-
except queue.Empty:
|
| 271 |
-
continue
|
| 272 |
-
except Exception as e:
|
| 273 |
-
logger.error(f"์ค๋์ค ์ฒ๋ฆฌ ์์ปค ์ค๋ฅ: {e}")
|
| 274 |
-
import traceback
|
| 275 |
-
logger.error(traceback.format_exc())
|
| 276 |
-
|
| 277 |
-
##############################################################################
|
| 278 |
-
# ํค์๋ ์ถ์ถ ํจ์
|
| 279 |
-
##############################################################################
|
| 280 |
-
def extract_keywords(text: str, top_k: int = 5) -> str:
|
| 281 |
-
"""ํค์๋ ์ถ์ถ"""
|
| 282 |
-
text = re.sub(r"[^a-zA-Z0-9๊ฐ-ํฃ\s]", "", text)
|
| 283 |
-
tokens = text.split()
|
| 284 |
-
|
| 285 |
-
seen = set()
|
| 286 |
-
unique_tokens = []
|
| 287 |
-
for token in tokens:
|
| 288 |
-
if token not in seen and len(token) > 1:
|
| 289 |
-
seen.add(token)
|
| 290 |
-
unique_tokens.append(token)
|
| 291 |
-
|
| 292 |
-
key_tokens = unique_tokens[:top_k]
|
| 293 |
-
return " ".join(key_tokens)
|
| 294 |
-
|
| 295 |
##############################################################################
|
| 296 |
# ์น ๊ฒ์ ํจ์
|
| 297 |
##############################################################################
|
|
@@ -305,8 +218,8 @@ def do_web_search(query: str) -> str:
|
|
| 305 |
"domain": "google.com",
|
| 306 |
"serp_type": "web",
|
| 307 |
"device": "desktop",
|
| 308 |
-
"lang": "ko",
|
| 309 |
-
"num": "10"
|
| 310 |
}
|
| 311 |
|
| 312 |
headers = {
|
|
@@ -410,20 +323,6 @@ def pdf_to_markdown(pdf_path: str) -> str:
|
|
| 410 |
|
| 411 |
return f"**[PDF ํ์ผ: {os.path.basename(pdf_path)}]**\n\n{full_text}"
|
| 412 |
|
| 413 |
-
# ์์ปค ์ค๋ ๋ ์์
|
| 414 |
-
audio_worker_thread = None
|
| 415 |
-
|
| 416 |
-
def start_audio_worker():
|
| 417 |
-
"""์ค๋์ค ์์ปค ์ค๋ ๋ ์์"""
|
| 418 |
-
global audio_worker_thread
|
| 419 |
-
if audio_worker_thread is None or not audio_worker_thread.is_alive():
|
| 420 |
-
audio_worker_thread = Thread(target=audio_processing_worker, daemon=True)
|
| 421 |
-
audio_worker_thread.start()
|
| 422 |
-
logger.info("์ค๋์ค ์์ปค ์ค๋ ๋ ์์๋จ")
|
| 423 |
-
|
| 424 |
-
# ์ด๊ธฐ ์์
|
| 425 |
-
start_audio_worker()
|
| 426 |
-
|
| 427 |
##############################################################################
|
| 428 |
# ๋ชจ๋ธ ๋ก๋
|
| 429 |
##############################################################################
|
|
@@ -883,27 +782,6 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 883 |
'<div class="status-box" style="background:#d4edda; color:#155724;">๐ฎ ์์คํ
์ค๋น</div>'
|
| 884 |
)
|
| 885 |
|
| 886 |
-
# ์จ๊ฒจ์ง ์ค๋์ค ์
๋ ฅ
|
| 887 |
-
audio_input = gr.Audio(
|
| 888 |
-
sources=["microphone"],
|
| 889 |
-
streaming=True,
|
| 890 |
-
visible=False,
|
| 891 |
-
label="๋ง์ดํฌ ์
๋ ฅ"
|
| 892 |
-
)
|
| 893 |
-
|
| 894 |
-
# ์ค๋์ค ์คํธ๋ฆฌ๋ฐ ์ฒ๋ฆฌ
|
| 895 |
-
def audio_stream_callback(audio_chunk):
|
| 896 |
-
"""์ค๋์ค ์คํธ๋ฆผ ์ฝ๋ฐฑ - ๋ฒํผ์ ๋์ """
|
| 897 |
-
accumulate_audio(audio_chunk)
|
| 898 |
-
return None # ์ํ ์
๋ฐ์ดํธ ์์
|
| 899 |
-
|
| 900 |
-
# ์ค๋์ค ์คํธ๋ฆฌ๋ฐ ์ฐ๊ฒฐ
|
| 901 |
-
audio_input.stream(
|
| 902 |
-
fn=audio_stream_callback,
|
| 903 |
-
inputs=[audio_input],
|
| 904 |
-
outputs=None
|
| 905 |
-
)
|
| 906 |
-
|
| 907 |
# ๋ฌธ์ ๋ถ์ ํญ (์จ๊น)
|
| 908 |
with gr.Tab("๐ ๋ฌธ์ ๋ถ์", visible=False):
|
| 909 |
with gr.Row():
|
|
@@ -946,25 +824,17 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 946 |
|
| 947 |
def clear_capture():
|
| 948 |
"""์บก์ฒ ์ด๊ธฐํ"""
|
| 949 |
-
global last_transcription,
|
| 950 |
|
| 951 |
-
with
|
| 952 |
last_transcription = ""
|
| 953 |
-
|
| 954 |
-
audio_buffer_b.clear()
|
| 955 |
-
|
| 956 |
-
# ๋๊ธฐ ์ค์ธ ์ค๋์ค๋ ์ด๊ธฐํ
|
| 957 |
-
while not ready_audio_queue.empty():
|
| 958 |
-
try:
|
| 959 |
-
ready_audio_queue.get_nowait()
|
| 960 |
-
except:
|
| 961 |
-
break
|
| 962 |
|
| 963 |
return None, gr.update(visible=False), '<div class="status-box" style="background:#d4edda; color:#155724;">๐ฎ ์์คํ
์ค๋น</div>', ""
|
| 964 |
|
| 965 |
def analyze_with_task(image, prompt, task_type, use_search, thinking, tokens):
|
| 966 |
"""ํน์ ํ์คํฌ๋ก ์ด๋ฏธ์ง ๋ถ์"""
|
| 967 |
-
global last_transcription
|
| 968 |
|
| 969 |
if image is None:
|
| 970 |
return "โ ๋จผ์ ์ด๋ฏธ์ง๋ฅผ ์บก์ฒํ์ธ์.", '<div class="status-box" style="background:#f8d7da; color:#721c24;">โ ์ด๋ฏธ์ง ์์</div>'
|
|
@@ -973,7 +843,7 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 973 |
|
| 974 |
# ํ์ฌ ์ ์ฌ ํ
์คํธ ๊ฐ์ ธ์ค๊ธฐ
|
| 975 |
transcript = ""
|
| 976 |
-
with
|
| 977 |
transcript = last_transcription
|
| 978 |
|
| 979 |
result = analyze_image_for_robot(
|
|
@@ -1005,9 +875,9 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 1005 |
|
| 1006 |
# ์๋ ์บก์ฒ ๋ฐ ๋ถ์ ํจ์
|
| 1007 |
@spaces.GPU(duration=60)
|
| 1008 |
-
def auto_capture_and_analyze(webcam_frame, task_prompt, use_search, thinking, tokens, use_audio):
|
| 1009 |
-
"""์๋ ์บก์ฒ ๋ฐ ๋ถ์
|
| 1010 |
-
global last_transcription,
|
| 1011 |
|
| 1012 |
if webcam_frame is None:
|
| 1013 |
return (
|
|
@@ -1016,54 +886,23 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 1016 |
'<div class="status-box" style="background:#fff3cd; color:#856404;">โณ ์น์บ ๋๊ธฐ ์ค</div>',
|
| 1017 |
'<div class="auto-capture-status">๐ ์๋ ์บก์ฒ: ์น์บ ๋๊ธฐ ์ค</div>',
|
| 1018 |
"๋๊ธฐ ์ค...",
|
| 1019 |
-
|
| 1020 |
)
|
| 1021 |
|
| 1022 |
# ์บก์ฒ ์ํ
|
| 1023 |
timestamp = time.strftime("%H:%M:%S")
|
| 1024 |
|
| 1025 |
-
#
|
| 1026 |
-
|
| 1027 |
-
|
| 1028 |
-
|
| 1029 |
-
|
| 1030 |
-
|
| 1031 |
-
active = current_buffer
|
| 1032 |
-
buffer_status = f'<div class="buffer-info">๋ฒํผ ์ํ: {active.upper()} ํ์ฑ | A: {a_chunks}์ฒญํฌ, B: {b_chunks}์ฒญํฌ</div>'
|
| 1033 |
-
|
| 1034 |
-
# ๋ฒํผ ์ ํ (10์ด๋ง๋ค)
|
| 1035 |
-
if use_audio:
|
| 1036 |
-
logger.info(f"[{timestamp}] ์ค๋์ค ๋ฒํผ ์ ํ")
|
| 1037 |
-
switch_buffers()
|
| 1038 |
-
|
| 1039 |
-
# ์ค๋น๋ ์ค๋์ค๊ฐ ์์ผ๋ฉด ์ ์ฌ
|
| 1040 |
-
try:
|
| 1041 |
-
if not ready_audio_queue.empty():
|
| 1042 |
-
audio_data = ready_audio_queue.get_nowait()
|
| 1043 |
-
logger.info(f"์ค๋์ค ์ ์ฌ ์์... ๊ธธ์ด: {len(audio_data)/16000:.1f}์ด")
|
| 1044 |
-
|
| 1045 |
-
# GPU์์ Whisper ์คํ
|
| 1046 |
-
transcription = transcribe_audio_whisper(audio_data, 16000)
|
| 1047 |
-
|
| 1048 |
-
if transcription:
|
| 1049 |
-
logger.info(f"์ ์ฌ ์๋ฃ: {transcription[:50]}...")
|
| 1050 |
-
with audio_buffer_lock:
|
| 1051 |
-
last_transcription = transcription
|
| 1052 |
-
else:
|
| 1053 |
-
logger.warning("์ ์ฌ ๊ฒฐ๊ณผ ์์")
|
| 1054 |
-
else:
|
| 1055 |
-
logger.debug("์ ์ฌํ ์ค๋์ค ์์")
|
| 1056 |
-
except queue.Empty:
|
| 1057 |
-
logger.debug("์ ์ฌ ํ๊ฐ ๋น์ด์์")
|
| 1058 |
-
except Exception as e:
|
| 1059 |
-
logger.error(f"์ค๋์ค ์ ์ฌ ์ค๋ฅ: {e}")
|
| 1060 |
-
import traceback
|
| 1061 |
-
logger.error(traceback.format_exc())
|
| 1062 |
|
| 1063 |
# ๋ง์ง๋ง ์ ์ฌ ๊ฒฐ๊ณผ ๊ฐ์ ธ์ค๊ธฐ
|
| 1064 |
audio_transcript = ""
|
| 1065 |
if use_audio:
|
| 1066 |
-
with
|
| 1067 |
audio_transcript = last_transcription
|
| 1068 |
if audio_transcript:
|
| 1069 |
logger.info(f"๋ถ์์ ์ฌ์ฉํ ์์ฑ: {audio_transcript[:50]}...")
|
|
@@ -1093,7 +932,7 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 1093 |
'<div class="status-box" style="background:#d4edda; color:#155724;">โ
์๋ ๋ถ์ ์๋ฃ</div>',
|
| 1094 |
f'<div class="auto-capture-status">๐ ์๋ ์บก์ฒ: ๋ง์ง๋ง ๋ถ์ {timestamp}</div>',
|
| 1095 |
transcript_display,
|
| 1096 |
-
|
| 1097 |
)
|
| 1098 |
|
| 1099 |
# ์น์บ ์คํธ๋ฆฌ๋ฐ
|
|
@@ -1103,39 +942,6 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 1103 |
outputs=[webcam_state]
|
| 1104 |
)
|
| 1105 |
|
| 1106 |
-
# ์ค๋์ค ์คํธ๋ฆฌ๋ฐ ์ฒ๋ฆฌ
|
| 1107 |
-
def audio_stream_callback(audio_chunk):
|
| 1108 |
-
"""์ค๋์ค ์คํธ๋ฆผ ์ฝ๋ฐฑ - ๋ฒํผ์ ๋์ """
|
| 1109 |
-
try:
|
| 1110 |
-
if audio_chunk is not None:
|
| 1111 |
-
# ์ฒ์ ๋ช ๋ฒ๋ง ๋ก๊น
|
| 1112 |
-
accumulate_audio(audio_chunk)
|
| 1113 |
-
except Exception as e:
|
| 1114 |
-
logger.error(f"์ค๋์ค ์คํธ๋ฆผ ์ฝ๋ฐฑ ์ค๋ฅ: {e}")
|
| 1115 |
-
import traceback
|
| 1116 |
-
logger.error(traceback.format_exc())
|
| 1117 |
-
return None
|
| 1118 |
-
|
| 1119 |
-
# ์ค๋์ค ์คํธ๋ฆฌ๋ฐ ์ฐ๊ฒฐ
|
| 1120 |
-
audio_input.stream(
|
| 1121 |
-
fn=audio_stream_callback,
|
| 1122 |
-
inputs=[audio_input],
|
| 1123 |
-
outputs=None
|
| 1124 |
-
)
|
| 1125 |
-
|
| 1126 |
-
# ์ค๋์ค ์คํธ๋ฆฌ๋ฐ ์ฒ๋ฆฌ
|
| 1127 |
-
def audio_stream_callback(audio_chunk):
|
| 1128 |
-
"""์ค๋์ค ์คํธ๋ฆผ ์ฝ๋ฐฑ - ๋ฒํผ์ ๋์ """
|
| 1129 |
-
accumulate_audio(audio_chunk)
|
| 1130 |
-
return None # ์ํ ์
๋ฐ์ดํธ ์์
|
| 1131 |
-
|
| 1132 |
-
# ์ค๋์ค ์คํธ๋ฆฌ๋ฐ ์ฐ๊ฒฐ
|
| 1133 |
-
audio_input.stream(
|
| 1134 |
-
fn=audio_stream_callback,
|
| 1135 |
-
inputs=[audio_input],
|
| 1136 |
-
outputs=None
|
| 1137 |
-
)
|
| 1138 |
-
|
| 1139 |
# ์๋ ์บก์ฒ ๋ฒํผ
|
| 1140 |
capture_btn.click(
|
| 1141 |
fn=capture_webcam,
|
|
@@ -1196,7 +1002,7 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 1196 |
|
| 1197 |
# ์ค๋์ค ํ ๊ธ ์ด๋ฒคํธ
|
| 1198 |
def toggle_audio(enabled):
|
| 1199 |
-
global last_transcription, last_audio_data
|
| 1200 |
|
| 1201 |
if enabled:
|
| 1202 |
# Whisper ๋ชจ๋ธ ๋ก๋
|
|
@@ -1235,7 +1041,7 @@ with gr.Blocks(title="๐ค ๋ก๋ด ์๊ฐ ์์คํ
(Gemma3-4B)", css=css) as dem
|
|
| 1235 |
# ์ค๋์ค ๋
น์ ์๋ฃ ์ ์ฒ๋ฆฌ
|
| 1236 |
def on_audio_recorded(audio_data):
|
| 1237 |
"""์ค๋์ค ๋
น์ ์๋ฃ ์ ์๋ ์ฒ๋ฆฌ"""
|
| 1238 |
-
global last_transcription
|
| 1239 |
|
| 1240 |
if audio_data is not None:
|
| 1241 |
logger.info("์ ์ค๋์ค ๋
น์ ๊ฐ์ง")
|
|
|
|
| 5 |
import tempfile
|
| 6 |
import gc
|
| 7 |
from collections.abc import Iterator
|
| 8 |
+
from threading import Thread, Lock
|
| 9 |
import json
|
| 10 |
import requests
|
| 11 |
import cv2
|
|
|
|
| 21 |
from typing import Dict, List, Optional, Union
|
| 22 |
import librosa
|
| 23 |
import scipy.signal as sps
|
|
|
|
| 24 |
import queue
|
| 25 |
|
| 26 |
# CSV/TXT ๋ถ์
|
|
|
|
| 54 |
whisper_loaded = False
|
| 55 |
model_name = "Gemma3-R1984-4B"
|
| 56 |
|
| 57 |
+
# ์ค๋์ค ๊ด๋ จ ์ ์ญ ๋ณ์
|
| 58 |
+
audio_lock = Lock()
|
| 59 |
+
last_audio_data = None
|
| 60 |
+
last_transcription = ""
|
| 61 |
+
|
| 62 |
##############################################################################
|
| 63 |
# ๋ฉ๋ชจ๋ฆฌ ๊ด๋ฆฌ
|
| 64 |
##############################################################################
|
|
|
|
| 68 |
torch.cuda.empty_cache()
|
| 69 |
gc.collect()
|
| 70 |
|
| 71 |
+
##############################################################################
|
| 72 |
+
# ํค์๋ ์ถ์ถ ํจ์
|
| 73 |
+
##############################################################################
|
| 74 |
+
def extract_keywords(text: str, top_k: int = 5) -> str:
|
| 75 |
+
"""ํค์๋ ์ถ์ถ"""
|
| 76 |
+
text = re.sub(r"[^a-zA-Z0-9๊ฐ-ํฃ\s]", "", text)
|
| 77 |
+
tokens = text.split()
|
| 78 |
+
|
| 79 |
+
seen = set()
|
| 80 |
+
unique_tokens = []
|
| 81 |
+
for token in tokens:
|
| 82 |
+
if token not in seen and len(token) > 1:
|
| 83 |
+
seen.add(token)
|
| 84 |
+
unique_tokens.append(token)
|
| 85 |
+
|
| 86 |
+
key_tokens = unique_tokens[:top_k]
|
| 87 |
+
return " ".join(key_tokens)
|
| 88 |
+
|
| 89 |
##############################################################################
|
| 90 |
# Whisper ๋ชจ๋ธ ๋ก๋
|
| 91 |
##############################################################################
|
|
|
|
| 115 |
return False
|
| 116 |
|
| 117 |
##############################################################################
|
| 118 |
+
# ์ค๋์ค ์ฒ๋ฆฌ ํจ์ (๊ฐ์ํ)
|
| 119 |
##############################################################################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int = 16000) -> np.ndarray:
|
| 121 |
"""์ค๋์ค ๋ฆฌ์ํ๋ง"""
|
| 122 |
if orig_sr == target_sr:
|
|
|
|
| 155 |
logger.error(traceback.format_exc())
|
| 156 |
return None
|
| 157 |
|
| 158 |
+
def process_audio_recording(audio_data):
|
| 159 |
+
"""๋
น์๋ ์ค๋์ค ์ฒ๋ฆฌ"""
|
| 160 |
+
global last_audio_data, last_transcription, audio_lock
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
+
if audio_data is None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
return None
|
| 164 |
|
| 165 |
try:
|
| 166 |
+
# ์ค๋์ค ๋ฐ์ดํฐ ์ถ์ถ
|
| 167 |
+
if isinstance(audio_data, tuple) and len(audio_data) == 2:
|
| 168 |
+
sr, audio = audio_data
|
| 169 |
+
else:
|
| 170 |
+
logger.warning(f"์์์น ๋ชปํ ์ค๋์ค ํ์: {type(audio_data)}")
|
| 171 |
+
return None
|
| 172 |
+
|
| 173 |
+
if audio is None or len(audio) == 0:
|
| 174 |
+
return None
|
| 175 |
+
|
| 176 |
+
# numpy ๋ฐฐ์ด๋ก ๋ณํ
|
| 177 |
+
if not isinstance(audio, np.ndarray):
|
| 178 |
+
audio = np.array(audio)
|
| 179 |
+
|
| 180 |
+
# ์คํ
๋ ์ค๋ฅผ ๋ชจ๋
ธ๋ก ๋ณํ
|
| 181 |
+
if audio.ndim > 1:
|
| 182 |
+
audio = audio.mean(axis=1)
|
| 183 |
+
|
| 184 |
+
# 16kHz๋ก ๋ฆฌ์ํ๋ง
|
| 185 |
+
if sr != 16000:
|
| 186 |
+
audio = resample_audio(audio, sr, 16000)
|
| 187 |
+
|
| 188 |
+
# ์ ์ฅ
|
| 189 |
+
with audio_lock:
|
| 190 |
+
last_audio_data = (audio, 16000)
|
| 191 |
+
|
| 192 |
+
logger.info(f"์ค๋์ค ์ ์ฅ ์๋ฃ: {len(audio)/16000:.1f}์ด")
|
| 193 |
+
|
| 194 |
+
# ์ ์ฌ ์๋
|
| 195 |
+
transcription = transcribe_audio_whisper(audio, 16000)
|
| 196 |
+
if transcription:
|
| 197 |
+
with audio_lock:
|
| 198 |
+
last_transcription = transcription
|
| 199 |
+
return transcription
|
| 200 |
+
|
| 201 |
except Exception as e:
|
| 202 |
+
logger.error(f"์ค๋์ค ์ฒ๋ฆฌ ์ค๋ฅ: {e}")
|
| 203 |
import traceback
|
| 204 |
logger.error(traceback.format_exc())
|
| 205 |
|
| 206 |
return None
|
| 207 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
##############################################################################
|
| 209 |
# ์น ๊ฒ์ ํจ์
|
| 210 |
##############################################################################
|
|
|
|
| 218 |
"domain": "google.com",
|
| 219 |
"serp_type": "web",
|
| 220 |
"device": "desktop",
|
| 221 |
+
"lang": "ko", # ํ๊ตญ์ด ์ฐ์
|
| 222 |
+
"num": "10" # 10๊ฐ๋ก ์ ํ
|
| 223 |
}
|
| 224 |
|
| 225 |
headers = {
|
|
|
|
| 323 |
|
| 324 |
return f"**[PDF ํ์ผ: {os.path.basename(pdf_path)}]**\n\n{full_text}"
|
| 325 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 326 |
##############################################################################
|
| 327 |
# ๋ชจ๋ธ ๋ก๋
|
| 328 |
##############################################################################
|
|
|
|
| 782 |
'<div class="status-box" style="background:#d4edda; color:#155724;">๐ฎ ์์คํ
์ค๋น</div>'
|
| 783 |
)
|
| 784 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 785 |
# ๋ฌธ์ ๋ถ์ ํญ (์จ๊น)
|
| 786 |
with gr.Tab("๐ ๋ฌธ์ ๋ถ์", visible=False):
|
| 787 |
with gr.Row():
|
|
|
|
| 824 |
|
| 825 |
def clear_capture():
|
| 826 |
"""์บก์ฒ ์ด๊ธฐํ"""
|
| 827 |
+
global last_transcription, last_audio_data, audio_lock
|
| 828 |
|
| 829 |
+
with audio_lock:
|
| 830 |
last_transcription = ""
|
| 831 |
+
last_audio_data = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 832 |
|
| 833 |
return None, gr.update(visible=False), '<div class="status-box" style="background:#d4edda; color:#155724;">๐ฎ ์์คํ
์ค๋น</div>', ""
|
| 834 |
|
| 835 |
def analyze_with_task(image, prompt, task_type, use_search, thinking, tokens):
|
| 836 |
"""ํน์ ํ์คํฌ๋ก ์ด๋ฏธ์ง ๋ถ์"""
|
| 837 |
+
global last_transcription, audio_lock
|
| 838 |
|
| 839 |
if image is None:
|
| 840 |
return "โ ๋จผ์ ์ด๋ฏธ์ง๋ฅผ ์บก์ฒํ์ธ์.", '<div class="status-box" style="background:#f8d7da; color:#721c24;">โ ์ด๋ฏธ์ง ์์</div>'
|
|
|
|
| 843 |
|
| 844 |
# ํ์ฌ ์ ์ฌ ํ
์คํธ ๊ฐ์ ธ์ค๊ธฐ
|
| 845 |
transcript = ""
|
| 846 |
+
with audio_lock:
|
| 847 |
transcript = last_transcription
|
| 848 |
|
| 849 |
result = analyze_image_for_robot(
|
|
|
|
| 875 |
|
| 876 |
# ์๋ ์บก์ฒ ๋ฐ ๋ถ์ ํจ์
|
| 877 |
@spaces.GPU(duration=60)
|
| 878 |
+
def auto_capture_and_analyze(webcam_frame, task_prompt, use_search, thinking, tokens, use_audio, audio_data):
|
| 879 |
+
"""์๋ ์บก์ฒ ๋ฐ ๋ถ์"""
|
| 880 |
+
global last_transcription, audio_lock
|
| 881 |
|
| 882 |
if webcam_frame is None:
|
| 883 |
return (
|
|
|
|
| 886 |
'<div class="status-box" style="background:#fff3cd; color:#856404;">โณ ์น์บ ๋๊ธฐ ์ค</div>',
|
| 887 |
'<div class="auto-capture-status">๐ ์๋ ์บก์ฒ: ์น์บ ๋๊ธฐ ์ค</div>',
|
| 888 |
"๋๊ธฐ ์ค...",
|
| 889 |
+
None # ์ค๋์ค ๋ฆฌ์
|
| 890 |
)
|
| 891 |
|
| 892 |
# ์บก์ฒ ์ํ
|
| 893 |
timestamp = time.strftime("%H:%M:%S")
|
| 894 |
|
| 895 |
+
# ์ค๋์ค ์ฒ๋ฆฌ (์์ผ๋ฉด)
|
| 896 |
+
if use_audio and audio_data is not None:
|
| 897 |
+
logger.info(f"[{timestamp}] ์ค๋์ค ์ฒ๋ฆฌ ์์")
|
| 898 |
+
transcription = process_audio_recording(audio_data)
|
| 899 |
+
if transcription:
|
| 900 |
+
logger.info(f"์๋ก์ด ์ ์ฌ: {transcription[:50]}...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 901 |
|
| 902 |
# ๋ง์ง๋ง ์ ์ฌ ๊ฒฐ๊ณผ ๊ฐ์ ธ์ค๊ธฐ
|
| 903 |
audio_transcript = ""
|
| 904 |
if use_audio:
|
| 905 |
+
with audio_lock:
|
| 906 |
audio_transcript = last_transcription
|
| 907 |
if audio_transcript:
|
| 908 |
logger.info(f"๋ถ์์ ์ฌ์ฉํ ์์ฑ: {audio_transcript[:50]}...")
|
|
|
|
| 932 |
'<div class="status-box" style="background:#d4edda; color:#155724;">โ
์๋ ๋ถ์ ์๋ฃ</div>',
|
| 933 |
f'<div class="auto-capture-status">๐ ์๋ ์บก์ฒ: ๋ง์ง๋ง ๋ถ์ {timestamp}</div>',
|
| 934 |
transcript_display,
|
| 935 |
+
None # ์ค๋์ค ๋ฆฌ์
(๋ค์ ๋
น์ ์ค๋น)
|
| 936 |
)
|
| 937 |
|
| 938 |
# ์น์บ ์คํธ๋ฆฌ๋ฐ
|
|
|
|
| 942 |
outputs=[webcam_state]
|
| 943 |
)
|
| 944 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 945 |
# ์๋ ์บก์ฒ ๋ฒํผ
|
| 946 |
capture_btn.click(
|
| 947 |
fn=capture_webcam,
|
|
|
|
| 1002 |
|
| 1003 |
# ์ค๋์ค ํ ๊ธ ์ด๋ฒคํธ
|
| 1004 |
def toggle_audio(enabled):
|
| 1005 |
+
global last_transcription, last_audio_data, audio_lock
|
| 1006 |
|
| 1007 |
if enabled:
|
| 1008 |
# Whisper ๋ชจ๋ธ ๋ก๋
|
|
|
|
| 1041 |
# ์ค๋์ค ๋
น์ ์๋ฃ ์ ์ฒ๋ฆฌ
|
| 1042 |
def on_audio_recorded(audio_data):
|
| 1043 |
"""์ค๋์ค ๋
น์ ์๋ฃ ์ ์๋ ์ฒ๋ฆฌ"""
|
| 1044 |
+
global last_transcription, audio_lock
|
| 1045 |
|
| 1046 |
if audio_data is not None:
|
| 1047 |
logger.info("์ ์ค๋์ค ๋
น์ ๊ฐ์ง")
|