openfree committed on
Commit
591769c
·
verified ·
1 Parent(s): 46b8860

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -280
app.py CHANGED
@@ -5,7 +5,7 @@ import re
5
  import tempfile
6
  import gc
7
  from collections.abc import Iterator
8
- from threading import Thread
9
  import json
10
  import requests
11
  import cv2
@@ -21,7 +21,6 @@ import warnings
21
  from typing import Dict, List, Optional, Union
22
  import librosa
23
  import scipy.signal as sps
24
- from threading import Thread, Lock
25
  import queue
26
 
27
  # CSV/TXT 분석
@@ -55,6 +54,11 @@ model_loaded = False
55
  whisper_loaded = False
56
  model_name = "Gemma3-R1984-4B"
57
 
 
 
 
 
 
58
  ##############################################################################
59
  # 메모리 관리
60
  ##############################################################################
@@ -64,6 +68,24 @@ def clear_cuda_cache():
64
  torch.cuda.empty_cache()
65
  gc.collect()
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  ##############################################################################
68
  # Whisper 모델 로드
69
  ##############################################################################
@@ -93,21 +115,8 @@ def load_whisper():
93
  return False
94
 
95
  ##############################################################################
96
- # 오디오 처리 함수
97
  ##############################################################################
98
- import scipy.signal as sps
99
- from threading import Thread, Lock
100
- import queue
101
-
102
- # 오디오 버퍼 관리
103
- audio_buffer_lock = Lock()
104
- audio_buffer_a = []
105
- audio_buffer_b = []
106
- current_buffer = 'a' # ํ˜„์žฌ ๋…น์Œ ์ค‘์ธ ๋ฒ„ํผ
107
- processing_queue = queue.Queue() # ์ฒ˜๋ฆฌ ๋Œ€๊ธฐ ํ
108
- ready_audio_queue = queue.Queue() # ์ „์‚ฌ ์ค€๋น„๋œ ์˜ค๋””์˜ค
109
- last_transcription = "" # ๋งˆ์ง€๋ง‰ ์ „์‚ฌ ๊ฒฐ๊ณผ
110
-
111
  def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int = 16000) -> np.ndarray:
112
  """์˜ค๋””์˜ค ๋ฆฌ์ƒ˜ํ”Œ๋ง"""
113
  if orig_sr == target_sr:
@@ -146,152 +155,56 @@ def transcribe_audio_whisper(audio_array: np.ndarray, sr: int = 16000):
146
  logger.error(traceback.format_exc())
147
  return None
148
 
149
- def accumulate_audio(audio_chunk):
150
- """์˜ค๋””์˜ค ์ฒญํฌ๋ฅผ ๋ฒ„ํผ์— ๋ˆ„์ """
151
- global current_buffer, audio_buffer_a, audio_buffer_b
152
-
153
- if audio_chunk is None:
154
- return
155
 
156
- # Gradio ์ŠคํŠธ๋ฆฌ๋ฐ ํ˜•์‹ ์ฒ˜๋ฆฌ
157
- if isinstance(audio_chunk, tuple) and len(audio_chunk) == 2:
158
- sr, audio = audio_chunk
159
- else:
160
- logger.warning(f"์˜ˆ์ƒ์น˜ ๋ชปํ•œ ์˜ค๋””์˜ค ํ˜•์‹: {type(audio_chunk)}")
161
- return
162
-
163
- # ์˜ค๋””์˜ค ๋ฐ์ดํ„ฐ ๊ฒ€์ฆ
164
- if audio is None or len(audio) == 0:
165
- return
166
-
167
- # numpy ๋ฐฐ์—ด๋กœ ๋ณ€ํ™˜
168
- if not isinstance(audio, np.ndarray):
169
- audio = np.array(audio)
170
-
171
- # ์Šคํ…Œ๋ ˆ์˜ค๋ฅผ ๋ชจ๋…ธ๋กœ ๋ณ€ํ™˜
172
- if audio.ndim > 1:
173
- audio = audio.mean(axis=1)
174
-
175
- # ๋ฌด์Œ ์ฒดํฌ (๋„ˆ๋ฌด ์ž‘์€ ์†Œ๋ฆฌ๋Š” ๋ฌด์‹œ)
176
- max_val = np.max(np.abs(audio))
177
- if max_val < 0.001:
178
- return
179
-
180
- with audio_buffer_lock:
181
- if current_buffer == 'a':
182
- audio_buffer_a.append((audio, sr))
183
- if len(audio_buffer_a) % 10 == 0: # 10์ฒญํฌ๋งˆ๋‹ค ๋กœ๊ทธ
184
- logger.info(f"๋ฒ„ํผ A: {len(audio_buffer_a)} ์ฒญํฌ, ์ตœ๋Œ€๊ฐ’: {max_val:.4f}")
185
- else:
186
- audio_buffer_b.append((audio, sr))
187
- if len(audio_buffer_b) % 10 == 0: # 10์ฒญํฌ๋งˆ๋‹ค ๋กœ๊ทธ
188
- logger.info(f"๋ฒ„ํผ B: {len(audio_buffer_b)} ์ฒญํฌ, ์ตœ๋Œ€๊ฐ’: {max_val:.4f}")
189
-
190
- def switch_buffers():
191
- """๋ฒ„ํผ ์ „ํ™˜ ๋ฐ ์ฒ˜๋ฆฌ ํ์— ์ถ”๊ฐ€"""
192
- global current_buffer, audio_buffer_a, audio_buffer_b
193
-
194
- with audio_buffer_lock:
195
- if current_buffer == 'a':
196
- # A ๋ฒ„ํผ๋ฅผ ์ฒ˜๋ฆฌ ํ์— ์ถ”๊ฐ€
197
- if audio_buffer_a:
198
- logger.info(f"๋ฒ„ํผ A ์ „ํ™˜: {len(audio_buffer_a)} ์ฒญํฌ")
199
- processing_queue.put(('a', audio_buffer_a.copy()))
200
- audio_buffer_a.clear()
201
- current_buffer = 'b'
202
- else:
203
- # B ๋ฒ„ํผ๋ฅผ ์ฒ˜๋ฆฌ ํ์— ์ถ”๊ฐ€
204
- if audio_buffer_b:
205
- logger.info(f"๋ฒ„ํผ B ์ „ํ™˜: {len(audio_buffer_b)} ์ฒญํฌ")
206
- processing_queue.put(('b', audio_buffer_b.copy()))
207
- audio_buffer_b.clear()
208
- current_buffer = 'a'
209
-
210
- def process_audio_buffer(buffer_data):
211
- """๋ฒ„ํผ์˜ ์˜ค๋””์˜ค ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ"""
212
- buffer_name, audio_chunks = buffer_data
213
-
214
- if not audio_chunks:
215
- logger.warning(f"๋ฒ„ํผ {buffer_name} ๋น„์–ด์žˆ์Œ")
216
  return None
217
 
218
  try:
219
- # ๋ชจ๋“  ์ฒญํฌ๋ฅผ ํ•˜๋‚˜๋กœ ๊ฒฐํ•ฉ
220
- combined_audio = []
221
- sample_rate = 16000
222
-
223
- logger.info(f"๋ฒ„ํผ {buffer_name} ์ฒ˜๋ฆฌ ์‹œ์ž‘: {len(audio_chunks)} ์ฒญํฌ")
224
-
225
- for audio, sr in audio_chunks:
226
- # 16kHz๋กœ ๋ฆฌ์ƒ˜ํ”Œ๋ง
227
- if sr != 16000:
228
- audio = resample_audio(audio, sr, 16000)
229
- combined_audio.append(audio)
230
-
231
- # ๊ฒฐํ•ฉ
232
- if combined_audio:
233
- full_audio = np.concatenate(combined_audio)
234
- logger.info(f"์˜ค๋””์˜ค ๊ธธ์ด: {len(full_audio)/16000:.1f}์ดˆ")
235
-
236
- # ๋„ˆ๋ฌด ์งง์€ ์˜ค๋””์˜ค๋Š” ๋ฌด์‹œ
237
- if len(full_audio) < 16000 * 0.5: # 0.5์ดˆ ๋ฏธ๋งŒ
238
- logger.warning("์˜ค๋””์˜ค๊ฐ€ ๋„ˆ๋ฌด ์งง์Œ")
239
- return None
240
-
241
- # Whisper๋กœ ์ „์‚ฌ (GPU ํ•จ์ˆ˜ ํ˜ธ์ถœ)
242
- # ์—ฌ๊ธฐ์„œ๋Š” ์˜ค๋””์˜ค ๋ฐ์ดํ„ฐ๋งŒ ์ค€๋น„ํ•˜๊ณ  ์‹ค์ œ ์ „์‚ฌ๋Š” ๋ฉ”์ธ ์Šค๋ ˆ๋“œ์—์„œ
243
- return full_audio
244
-
 
 
 
 
 
 
 
 
 
245
  except Exception as e:
246
- logger.error(f"์˜ค๋””์˜ค ๋ฒ„ํผ ์ฒ˜๋ฆฌ ์˜ค๋ฅ˜: {e}")
247
  import traceback
248
  logger.error(traceback.format_exc())
249
 
250
  return None
251
 
252
- # ๋ฐฑ๊ทธ๋ผ์šด๋“œ ์ฒ˜๋ฆฌ ์Šค๋ ˆ๋“œ
253
- def audio_processing_worker():
254
- """๋ฐฑ๊ทธ๋ผ์šด๋“œ์—์„œ ์˜ค๋””์˜ค ๋ฒ„ํผ ์ฒ˜๋ฆฌ"""
255
- global ready_audio_queue
256
-
257
- while True:
258
- try:
259
- # ์ฒ˜๋ฆฌํ•  ๋ฒ„ํผ ๊ฐ€์ ธ์˜ค๊ธฐ
260
- buffer_data = processing_queue.get(timeout=1)
261
-
262
- # ์˜ค๋””์˜ค ์ฒ˜๋ฆฌ (์ค€๋น„๋งŒ)
263
- prepared_audio = process_audio_buffer(buffer_data)
264
-
265
- if prepared_audio is not None:
266
- # ์ค€๋น„๋œ ์˜ค๋””์˜ค๋ฅผ ํ์— ์ถ”๊ฐ€
267
- ready_audio_queue.put(prepared_audio)
268
- logger.info("์˜ค๋””์˜ค ์ „์‚ฌ ์ค€๋น„ ์™„๋ฃŒ")
269
-
270
- except queue.Empty:
271
- continue
272
- except Exception as e:
273
- logger.error(f"์˜ค๋””์˜ค ์ฒ˜๋ฆฌ ์›Œ์ปค ์˜ค๋ฅ˜: {e}")
274
- import traceback
275
- logger.error(traceback.format_exc())
276
-
277
- ##############################################################################
278
- # 키워드 추출 함수
279
- ##############################################################################
280
- def extract_keywords(text: str, top_k: int = 5) -> str:
281
- """ํ‚ค์›Œ๋“œ ์ถ”์ถœ"""
282
- text = re.sub(r"[^a-zA-Z0-9가-힣\s]", "", text)
283
- tokens = text.split()
284
-
285
- seen = set()
286
- unique_tokens = []
287
- for token in tokens:
288
- if token not in seen and len(token) > 1:
289
- seen.add(token)
290
- unique_tokens.append(token)
291
-
292
- key_tokens = unique_tokens[:top_k]
293
- return " ".join(key_tokens)
294
-
295
  ##############################################################################
296
  # 웹 검색 함수
297
  ##############################################################################
@@ -305,8 +218,8 @@ def do_web_search(query: str) -> str:
305
  "domain": "google.com",
306
  "serp_type": "web",
307
  "device": "desktop",
308
- "lang": "ko",
309
- "num": "10"
310
  }
311
 
312
  headers = {
@@ -410,20 +323,6 @@ def pdf_to_markdown(pdf_path: str) -> str:
410
 
411
  return f"**[PDF ํŒŒ์ผ: {os.path.basename(pdf_path)}]**\n\n{full_text}"
412
 
413
- # ์›Œ์ปค ์Šค๋ ˆ๋“œ ์‹œ์ž‘
414
- audio_worker_thread = None
415
-
416
- def start_audio_worker():
417
- """์˜ค๋””์˜ค ์›Œ์ปค ์Šค๋ ˆ๋“œ ์‹œ์ž‘"""
418
- global audio_worker_thread
419
- if audio_worker_thread is None or not audio_worker_thread.is_alive():
420
- audio_worker_thread = Thread(target=audio_processing_worker, daemon=True)
421
- audio_worker_thread.start()
422
- logger.info("์˜ค๋””์˜ค ์›Œ์ปค ์Šค๋ ˆ๋“œ ์‹œ์ž‘๋จ")
423
-
424
- # ์ดˆ๊ธฐ ์‹œ์ž‘
425
- start_audio_worker()
426
-
427
  ##############################################################################
428
  # 모델 로드
429
  ##############################################################################
@@ -883,27 +782,6 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
883
  '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„</div>'
884
  )
885
 
886
- # ์ˆจ๊ฒจ์ง„ ์˜ค๋””์˜ค ์ž…๋ ฅ
887
- audio_input = gr.Audio(
888
- sources=["microphone"],
889
- streaming=True,
890
- visible=False,
891
- label="๋งˆ์ดํฌ ์ž…๋ ฅ"
892
- )
893
-
894
- # ์˜ค๋””์˜ค ์ŠคํŠธ๋ฆฌ๋ฐ ์ฒ˜๋ฆฌ
895
- def audio_stream_callback(audio_chunk):
896
- """์˜ค๋””์˜ค ์ŠคํŠธ๋ฆผ ์ฝœ๋ฐฑ - ๋ฒ„ํผ์— ๋ˆ„์ """
897
- accumulate_audio(audio_chunk)
898
- return None # ์ƒํƒœ ์—…๋ฐ์ดํŠธ ์—†์Œ
899
-
900
- # ์˜ค๋””์˜ค ์ŠคํŠธ๋ฆฌ๋ฐ ์—ฐ๊ฒฐ
901
- audio_input.stream(
902
- fn=audio_stream_callback,
903
- inputs=[audio_input],
904
- outputs=None
905
- )
906
-
907
  # ๋ฌธ์„œ ๋ถ„์„ ํƒญ (์ˆจ๊น€)
908
  with gr.Tab("๐Ÿ“„ ๋ฌธ์„œ ๋ถ„์„", visible=False):
909
  with gr.Row():
@@ -946,25 +824,17 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
946
 
947
  def clear_capture():
948
  """์บก์ฒ˜ ์ดˆ๊ธฐํ™”"""
949
- global last_transcription, audio_buffer_a, audio_buffer_b, ready_audio_queue
950
 
951
- with audio_buffer_lock:
952
  last_transcription = ""
953
- audio_buffer_a.clear()
954
- audio_buffer_b.clear()
955
-
956
- # ๋Œ€๊ธฐ ์ค‘์ธ ์˜ค๋””์˜ค๋„ ์ดˆ๊ธฐํ™”
957
- while not ready_audio_queue.empty():
958
- try:
959
- ready_audio_queue.get_nowait()
960
- except:
961
- break
962
 
963
  return None, gr.update(visible=False), '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„</div>', ""
964
 
965
  def analyze_with_task(image, prompt, task_type, use_search, thinking, tokens):
966
  """ํŠน์ • ํƒœ์Šคํฌ๋กœ ์ด๋ฏธ์ง€ ๋ถ„์„"""
967
- global last_transcription
968
 
969
  if image is None:
970
  return "โŒ ๋จผ์ € ์ด๋ฏธ์ง€๋ฅผ ์บก์ฒ˜ํ•˜์„ธ์š”.", '<div class="status-box" style="background:#f8d7da; color:#721c24;">โŒ ์ด๋ฏธ์ง€ ์—†์Œ</div>'
@@ -973,7 +843,7 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
973
 
974
  # ํ˜„์žฌ ์ „์‚ฌ ํ…์ŠคํŠธ ๊ฐ€์ ธ์˜ค๊ธฐ
975
  transcript = ""
976
- with audio_buffer_lock:
977
  transcript = last_transcription
978
 
979
  result = analyze_image_for_robot(
@@ -1005,9 +875,9 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
1005
 
1006
  # ์ž๋™ ์บก์ฒ˜ ๋ฐ ๋ถ„์„ ํ•จ์ˆ˜
1007
  @spaces.GPU(duration=60)
1008
- def auto_capture_and_analyze(webcam_frame, task_prompt, use_search, thinking, tokens, use_audio):
1009
- """์ž๋™ ์บก์ฒ˜ ๋ฐ ๋ถ„์„ (10์ดˆ๋งˆ๋‹ค ์˜ค๋””์˜ค ๋ฒ„ํผ ์ „ํ™˜)"""
1010
- global last_transcription, ready_audio_queue, current_buffer, audio_buffer_a, audio_buffer_b
1011
 
1012
  if webcam_frame is None:
1013
  return (
@@ -1016,54 +886,23 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
1016
  '<div class="status-box" style="background:#fff3cd; color:#856404;">โณ ์›น์บ  ๋Œ€๊ธฐ ์ค‘</div>',
1017
  '<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ์›น์บ  ๋Œ€๊ธฐ ์ค‘</div>',
1018
  "๋Œ€๊ธฐ ์ค‘...",
1019
- '<div class="buffer-info">๋ฒ„ํผ ์ƒํƒœ: ๋Œ€๊ธฐ ์ค‘</div>'
1020
  )
1021
 
1022
  # ์บก์ฒ˜ ์ˆ˜ํ–‰
1023
  timestamp = time.strftime("%H:%M:%S")
1024
 
1025
- # ๋ฒ„ํผ ์ƒํƒœ ์ •๋ณด
1026
- buffer_status = ""
1027
- if use_audio:
1028
- with audio_buffer_lock:
1029
- a_chunks = len(audio_buffer_a)
1030
- b_chunks = len(audio_buffer_b)
1031
- active = current_buffer
1032
- buffer_status = f'<div class="buffer-info">๋ฒ„ํผ ์ƒํƒœ: {active.upper()} ํ™œ์„ฑ | A: {a_chunks}์ฒญํฌ, B: {b_chunks}์ฒญํฌ</div>'
1033
-
1034
- # ๋ฒ„ํผ ์ „ํ™˜ (10์ดˆ๋งˆ๋‹ค)
1035
- if use_audio:
1036
- logger.info(f"[{timestamp}] ์˜ค๋””์˜ค ๋ฒ„ํผ ์ „ํ™˜")
1037
- switch_buffers()
1038
-
1039
- # ์ค€๋น„๋œ ์˜ค๋””์˜ค๊ฐ€ ์žˆ์œผ๋ฉด ์ „์‚ฌ
1040
- try:
1041
- if not ready_audio_queue.empty():
1042
- audio_data = ready_audio_queue.get_nowait()
1043
- logger.info(f"์˜ค๋””์˜ค ์ „์‚ฌ ์‹œ์ž‘... ๊ธธ์ด: {len(audio_data)/16000:.1f}์ดˆ")
1044
-
1045
- # GPU์—์„œ Whisper ์‹คํ–‰
1046
- transcription = transcribe_audio_whisper(audio_data, 16000)
1047
-
1048
- if transcription:
1049
- logger.info(f"์ „์‚ฌ ์™„๋ฃŒ: {transcription[:50]}...")
1050
- with audio_buffer_lock:
1051
- last_transcription = transcription
1052
- else:
1053
- logger.warning("์ „์‚ฌ ๊ฒฐ๊ณผ ์—†์Œ")
1054
- else:
1055
- logger.debug("์ „์‚ฌํ•  ์˜ค๋””์˜ค ์—†์Œ")
1056
- except queue.Empty:
1057
- logger.debug("์ „์‚ฌ ํ๊ฐ€ ๋น„์–ด์žˆ์Œ")
1058
- except Exception as e:
1059
- logger.error(f"์˜ค๋””์˜ค ์ „์‚ฌ ์˜ค๋ฅ˜: {e}")
1060
- import traceback
1061
- logger.error(traceback.format_exc())
1062
 
1063
  # ๋งˆ์ง€๋ง‰ ์ „์‚ฌ ๊ฒฐ๊ณผ ๊ฐ€์ ธ์˜ค๊ธฐ
1064
  audio_transcript = ""
1065
  if use_audio:
1066
- with audio_buffer_lock:
1067
  audio_transcript = last_transcription
1068
  if audio_transcript:
1069
  logger.info(f"๋ถ„์„์— ์‚ฌ์šฉํ•  ์Œ์„ฑ: {audio_transcript[:50]}...")
@@ -1093,7 +932,7 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
1093
  '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ์ž๋™ ๋ถ„์„ ์™„๋ฃŒ</div>',
1094
  f'<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ๋งˆ์ง€๋ง‰ ๋ถ„์„ {timestamp}</div>',
1095
  transcript_display,
1096
- buffer_status
1097
  )
1098
 
1099
  # ์›น์บ  ์ŠคํŠธ๋ฆฌ๋ฐ
@@ -1103,39 +942,6 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
1103
  outputs=[webcam_state]
1104
  )
1105
 
1106
- # ์˜ค๋””์˜ค ์ŠคํŠธ๋ฆฌ๋ฐ ์ฒ˜๋ฆฌ
1107
- def audio_stream_callback(audio_chunk):
1108
- """์˜ค๋””์˜ค ์ŠคํŠธ๋ฆผ ์ฝœ๋ฐฑ - ๋ฒ„ํผ์— ๋ˆ„์ """
1109
- try:
1110
- if audio_chunk is not None:
1111
- # ์ฒ˜์Œ ๋ช‡ ๋ฒˆ๋งŒ ๋กœ๊น…
1112
- accumulate_audio(audio_chunk)
1113
- except Exception as e:
1114
- logger.error(f"์˜ค๋””์˜ค ์ŠคํŠธ๋ฆผ ์ฝœ๋ฐฑ ์˜ค๋ฅ˜: {e}")
1115
- import traceback
1116
- logger.error(traceback.format_exc())
1117
- return None
1118
-
1119
- # ์˜ค๋””์˜ค ์ŠคํŠธ๋ฆฌ๋ฐ ์—ฐ๊ฒฐ
1120
- audio_input.stream(
1121
- fn=audio_stream_callback,
1122
- inputs=[audio_input],
1123
- outputs=None
1124
- )
1125
-
1126
- # ์˜ค๋””์˜ค ์ŠคํŠธ๋ฆฌ๋ฐ ์ฒ˜๋ฆฌ
1127
- def audio_stream_callback(audio_chunk):
1128
- """์˜ค๋””์˜ค ์ŠคํŠธ๋ฆผ ์ฝœ๋ฐฑ - ๋ฒ„ํผ์— ๋ˆ„์ """
1129
- accumulate_audio(audio_chunk)
1130
- return None # ์ƒํƒœ ์—…๋ฐ์ดํŠธ ์—†์Œ
1131
-
1132
- # ์˜ค๋””์˜ค ์ŠคํŠธ๋ฆฌ๋ฐ ์—ฐ๊ฒฐ
1133
- audio_input.stream(
1134
- fn=audio_stream_callback,
1135
- inputs=[audio_input],
1136
- outputs=None
1137
- )
1138
-
1139
  # ์ˆ˜๋™ ์บก์ฒ˜ ๋ฒ„ํŠผ
1140
  capture_btn.click(
1141
  fn=capture_webcam,
@@ -1196,7 +1002,7 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
1196
 
1197
  # ์˜ค๋””์˜ค ํ† ๊ธ€ ์ด๋ฒคํŠธ
1198
  def toggle_audio(enabled):
1199
- global last_transcription, last_audio_data
1200
 
1201
  if enabled:
1202
  # Whisper ๋ชจ๋ธ ๋กœ๋“œ
@@ -1235,7 +1041,7 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as dem
1235
  # ์˜ค๋””์˜ค ๋…น์Œ ์™„๋ฃŒ ์‹œ ์ฒ˜๋ฆฌ
1236
  def on_audio_recorded(audio_data):
1237
  """์˜ค๋””์˜ค ๋…น์Œ ์™„๋ฃŒ ์‹œ ์ž๋™ ์ฒ˜๋ฆฌ"""
1238
- global last_transcription
1239
 
1240
  if audio_data is not None:
1241
  logger.info("์ƒˆ ์˜ค๋””์˜ค ๋…น์Œ ๊ฐ์ง€")
 
5
  import tempfile
6
  import gc
7
  from collections.abc import Iterator
8
+ from threading import Thread, Lock
9
  import json
10
  import requests
11
  import cv2
 
21
  from typing import Dict, List, Optional, Union
22
  import librosa
23
  import scipy.signal as sps
 
24
  import queue
25
 
26
  # CSV/TXT 분석
 
54
  whisper_loaded = False
55
  model_name = "Gemma3-R1984-4B"
56
 
57
+ # ์˜ค๋””์˜ค ๊ด€๋ จ ์ „์—ญ ๋ณ€์ˆ˜
58
+ audio_lock = Lock()
59
+ last_audio_data = None
60
+ last_transcription = ""
61
+
62
  ##############################################################################
63
  # 메모리 관리
64
  ##############################################################################
 
68
  torch.cuda.empty_cache()
69
  gc.collect()
70
 
71
+ ##############################################################################
72
+ # 키워드 추출 함수
73
+ ##############################################################################
74
+ def extract_keywords(text: str, top_k: int = 5) -> str:
75
+ """ํ‚ค์›Œ๋“œ ์ถ”์ถœ"""
76
+ text = re.sub(r"[^a-zA-Z0-9가-힣\s]", "", text)
77
+ tokens = text.split()
78
+
79
+ seen = set()
80
+ unique_tokens = []
81
+ for token in tokens:
82
+ if token not in seen and len(token) > 1:
83
+ seen.add(token)
84
+ unique_tokens.append(token)
85
+
86
+ key_tokens = unique_tokens[:top_k]
87
+ return " ".join(key_tokens)
88
+
89
  ##############################################################################
90
  # Whisper 모델 로드
91
  ##############################################################################
 
115
  return False
116
 
117
  ##############################################################################
118
+ # 오디오 처리 함수 (간소화)
119
  ##############################################################################
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int = 16000) -> np.ndarray:
121
  """์˜ค๋””์˜ค ๋ฆฌ์ƒ˜ํ”Œ๋ง"""
122
  if orig_sr == target_sr:
 
155
  logger.error(traceback.format_exc())
156
  return None
157
 
158
+ def process_audio_recording(audio_data):
159
+ """๋…น์Œ๋œ ์˜ค๋””์˜ค ์ฒ˜๋ฆฌ"""
160
+ global last_audio_data, last_transcription, audio_lock
 
 
 
161
 
162
+ if audio_data is None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  return None
164
 
165
  try:
166
+ # ์˜ค๋””์˜ค ๋ฐ์ดํ„ฐ ์ถ”์ถœ
167
+ if isinstance(audio_data, tuple) and len(audio_data) == 2:
168
+ sr, audio = audio_data
169
+ else:
170
+ logger.warning(f"์˜ˆ์ƒ์น˜ ๋ชปํ•œ ์˜ค๋””์˜ค ํ˜•์‹: {type(audio_data)}")
171
+ return None
172
+
173
+ if audio is None or len(audio) == 0:
174
+ return None
175
+
176
+ # numpy 배열로 변환
177
+ if not isinstance(audio, np.ndarray):
178
+ audio = np.array(audio)
179
+
180
+ # 스테레오를 모노로 변환
181
+ if audio.ndim > 1:
182
+ audio = audio.mean(axis=1)
183
+
184
+ # 16kHz๋กœ ๋ฆฌ์ƒ˜ํ”Œ๋ง
185
+ if sr != 16000:
186
+ audio = resample_audio(audio, sr, 16000)
187
+
188
+ # ์ €์žฅ
189
+ with audio_lock:
190
+ last_audio_data = (audio, 16000)
191
+
192
+ logger.info(f"์˜ค๋””์˜ค ์ €์žฅ ์™„๋ฃŒ: {len(audio)/16000:.1f}์ดˆ")
193
+
194
+ # ์ „์‚ฌ ์‹œ๋„
195
+ transcription = transcribe_audio_whisper(audio, 16000)
196
+ if transcription:
197
+ with audio_lock:
198
+ last_transcription = transcription
199
+ return transcription
200
+
201
  except Exception as e:
202
+ logger.error(f"์˜ค๋””์˜ค ์ฒ˜๋ฆฌ ์˜ค๋ฅ˜: {e}")
203
  import traceback
204
  logger.error(traceback.format_exc())
205
 
206
  return None
207
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  ##############################################################################
209
  # 웹 검색 함수
210
  ##############################################################################
 
218
  "domain": "google.com",
219
  "serp_type": "web",
220
  "device": "desktop",
221
+ "lang": "ko", # ํ•œ๊ตญ์–ด ์šฐ์„ 
222
+ "num": "10" # 10๊ฐœ๋กœ ์ œํ•œ
223
  }
224
 
225
  headers = {
 
323
 
324
  return f"**[PDF ํŒŒ์ผ: {os.path.basename(pdf_path)}]**\n\n{full_text}"
325
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
  ##############################################################################
327
  # 모델 로드
328
  ##############################################################################
 
782
  '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„</div>'
783
  )
784
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
785
  # ๋ฌธ์„œ ๋ถ„์„ ํƒญ (์ˆจ๊น€)
786
  with gr.Tab("๐Ÿ“„ ๋ฌธ์„œ ๋ถ„์„", visible=False):
787
  with gr.Row():
 
824
 
825
  def clear_capture():
826
  """์บก์ฒ˜ ์ดˆ๊ธฐํ™”"""
827
+ global last_transcription, last_audio_data, audio_lock
828
 
829
+ with audio_lock:
830
  last_transcription = ""
831
+ last_audio_data = None
 
 
 
 
 
 
 
 
832
 
833
  return None, gr.update(visible=False), '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„</div>', ""
834
 
835
  def analyze_with_task(image, prompt, task_type, use_search, thinking, tokens):
836
  """ํŠน์ • ํƒœ์Šคํฌ๋กœ ์ด๋ฏธ์ง€ ๋ถ„์„"""
837
+ global last_transcription, audio_lock
838
 
839
  if image is None:
840
  return "โŒ ๋จผ์ € ์ด๋ฏธ์ง€๋ฅผ ์บก์ฒ˜ํ•˜์„ธ์š”.", '<div class="status-box" style="background:#f8d7da; color:#721c24;">โŒ ์ด๋ฏธ์ง€ ์—†์Œ</div>'
 
843
 
844
  # ํ˜„์žฌ ์ „์‚ฌ ํ…์ŠคํŠธ ๊ฐ€์ ธ์˜ค๊ธฐ
845
  transcript = ""
846
+ with audio_lock:
847
  transcript = last_transcription
848
 
849
  result = analyze_image_for_robot(
 
875
 
876
  # ์ž๋™ ์บก์ฒ˜ ๋ฐ ๋ถ„์„ ํ•จ์ˆ˜
877
  @spaces.GPU(duration=60)
878
+ def auto_capture_and_analyze(webcam_frame, task_prompt, use_search, thinking, tokens, use_audio, audio_data):
879
+ """์ž๋™ ์บก์ฒ˜ ๋ฐ ๋ถ„์„"""
880
+ global last_transcription, audio_lock
881
 
882
  if webcam_frame is None:
883
  return (
 
886
  '<div class="status-box" style="background:#fff3cd; color:#856404;">โณ ์›น์บ  ๋Œ€๊ธฐ ์ค‘</div>',
887
  '<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ์›น์บ  ๋Œ€๊ธฐ ์ค‘</div>',
888
  "๋Œ€๊ธฐ ์ค‘...",
889
+ None # ์˜ค๋””์˜ค ๋ฆฌ์…‹
890
  )
891
 
892
  # ์บก์ฒ˜ ์ˆ˜ํ–‰
893
  timestamp = time.strftime("%H:%M:%S")
894
 
895
+ # ์˜ค๋””์˜ค ์ฒ˜๋ฆฌ (์žˆ์œผ๋ฉด)
896
+ if use_audio and audio_data is not None:
897
+ logger.info(f"[{timestamp}] ์˜ค๋””์˜ค ์ฒ˜๋ฆฌ ์‹œ์ž‘")
898
+ transcription = process_audio_recording(audio_data)
899
+ if transcription:
900
+ logger.info(f"์ƒˆ๋กœ์šด ์ „์‚ฌ: {transcription[:50]}...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
901
 
902
  # ๋งˆ์ง€๋ง‰ ์ „์‚ฌ ๊ฒฐ๊ณผ ๊ฐ€์ ธ์˜ค๊ธฐ
903
  audio_transcript = ""
904
  if use_audio:
905
+ with audio_lock:
906
  audio_transcript = last_transcription
907
  if audio_transcript:
908
  logger.info(f"๋ถ„์„์— ์‚ฌ์šฉํ•  ์Œ์„ฑ: {audio_transcript[:50]}...")
 
932
  '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ์ž๋™ ๋ถ„์„ ์™„๋ฃŒ</div>',
933
  f'<div class="auto-capture-status">๐Ÿ”„ ์ž๋™ ์บก์ฒ˜: ๋งˆ์ง€๋ง‰ ๋ถ„์„ {timestamp}</div>',
934
  transcript_display,
935
+ None # ์˜ค๋””์˜ค ๋ฆฌ์…‹ (๋‹ค์Œ ๋…น์Œ ์ค€๋น„)
936
  )
937
 
938
  # ์›น์บ  ์ŠคํŠธ๋ฆฌ๋ฐ
 
942
  outputs=[webcam_state]
943
  )
944
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
945
  # ์ˆ˜๋™ ์บก์ฒ˜ ๋ฒ„ํŠผ
946
  capture_btn.click(
947
  fn=capture_webcam,
 
1002
 
1003
  # ์˜ค๋””์˜ค ํ† ๊ธ€ ์ด๋ฒคํŠธ
1004
  def toggle_audio(enabled):
1005
+ global last_transcription, last_audio_data, audio_lock
1006
 
1007
  if enabled:
1008
  # Whisper ๋ชจ๋ธ ๋กœ๋“œ
 
1041
  # ์˜ค๋””์˜ค ๋…น์Œ ์™„๋ฃŒ ์‹œ ์ฒ˜๋ฆฌ
1042
  def on_audio_recorded(audio_data):
1043
  """์˜ค๋””์˜ค ๋…น์Œ ์™„๋ฃŒ ์‹œ ์ž๋™ ์ฒ˜๋ฆฌ"""
1044
+ global last_transcription, audio_lock
1045
 
1046
  if audio_data is not None:
1047
  logger.info("์ƒˆ ์˜ค๋””์˜ค ๋…น์Œ ๊ฐ์ง€")