ROBO-R1984

Runtime error

App Files Community

openfree committed on Jun 14, 2025

Commit

591769c

verified ·

1 Parent(s): 46b8860

Update app.py

Browse files

Files changed (1) hide show

app.py +86 -280

app.py CHANGED Viewed

@@ -5,7 +5,7 @@ import re
 import tempfile
 import gc
 from collections.abc import Iterator
-from threading import Thread
 import json
 import requests
 import cv2
@@ -21,7 +21,6 @@ import warnings
 from typing import Dict, List, Optional, Union
 import librosa
 import scipy.signal as sps
-from threading import Thread, Lock
 import queue
 # CSV/TXT 분석
@@ -55,6 +54,11 @@ model_loaded = False
 whisper_loaded = False
 model_name = "Gemma3-R1984-4B"
 ##############################################################################
 # 메모리 관리
 ##############################################################################
@@ -64,6 +68,24 @@ def clear_cuda_cache():
         torch.cuda.empty_cache()
         gc.collect()
 ##############################################################################
 # Whisper 모델 로드
 ##############################################################################
@@ -93,21 +115,8 @@ def load_whisper():
         return False
 ##############################################################################
-# 오디오 처리 함수
 ##############################################################################
-import scipy.signal as sps
-from threading import Thread, Lock
-import queue
-# 오디오 버퍼 관리
-audio_buffer_lock = Lock()
-audio_buffer_a = []
-audio_buffer_b = []
-current_buffer = 'a'  # 현재 녹음 중인 버퍼
-processing_queue = queue.Queue()  # 처리 대기 큐
-ready_audio_queue = queue.Queue()  # 전사 준비된 오디오
-last_transcription = ""  # 마지막 전사 결과
 def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int = 16000) -> np.ndarray:
     """오디오 리샘플링"""
     if orig_sr == target_sr:
@@ -146,152 +155,56 @@ def transcribe_audio_whisper(audio_array: np.ndarray, sr: int = 16000):
         logger.error(traceback.format_exc())
         return None
-def accumulate_audio(audio_chunk):
-    """오디오 청크를 버퍼에 누적"""
-    global current_buffer, audio_buffer_a, audio_buffer_b
-    if audio_chunk is None:
-        return
-    # Gradio 스트리밍 형식 처리
-    if isinstance(audio_chunk, tuple) and len(audio_chunk) == 2:
-        sr, audio = audio_chunk
-    else:
-        logger.warning(f"예상치 못한 오디오 형식: {type(audio_chunk)}")
-        return
-    # 오디오 데이터 검증
-    if audio is None or len(audio) == 0:
-        return
-    # numpy 배열로 변환
-    if not isinstance(audio, np.ndarray):
-        audio = np.array(audio)
-    # 스테레오를 모노로 변환
-    if audio.ndim > 1:
-        audio = audio.mean(axis=1)
-    # 무음 체크 (너무 작은 소리는 무시)
-    max_val = np.max(np.abs(audio))
-    if max_val < 0.001:
-        return
-    with audio_buffer_lock:
-        if current_buffer == 'a':
-            audio_buffer_a.append((audio, sr))
-            if len(audio_buffer_a) % 10 == 0:  # 10청크마다 로그
-                logger.info(f"버퍼 A: {len(audio_buffer_a)} 청크, 최대값: {max_val:.4f}")
-        else:
-            audio_buffer_b.append((audio, sr))
-            if len(audio_buffer_b) % 10 == 0:  # 10청크마다 로그
-                logger.info(f"버퍼 B: {len(audio_buffer_b)} 청크, 최대값: {max_val:.4f}")
-def switch_buffers():
-    """버퍼 전환 및 처리 큐에 추가"""
-    global current_buffer, audio_buffer_a, audio_buffer_b
-    with audio_buffer_lock:
-        if current_buffer == 'a':
-            # A 버퍼를 처리 큐에 추가
-            if audio_buffer_a:
-                logger.info(f"버퍼 A 전환: {len(audio_buffer_a)} 청크")
-                processing_queue.put(('a', audio_buffer_a.copy()))
-                audio_buffer_a.clear()
-            current_buffer = 'b'
-        else:
-            # B 버퍼를 처리 큐에 추가
-            if audio_buffer_b:
-                logger.info(f"버퍼 B 전환: {len(audio_buffer_b)} 청크")
-                processing_queue.put(('b', audio_buffer_b.copy()))
-                audio_buffer_b.clear()
-            current_buffer = 'a'
-def process_audio_buffer(buffer_data):
-    """버퍼의 오디오 데이터 처리"""
-    buffer_name, audio_chunks = buffer_data
-    if not audio_chunks:
-        logger.warning(f"버퍼 {buffer_name} 비어있음")
         return None
     try:
-        # 모든 청크를 하나로 결합
-        combined_audio = []
-        sample_rate = 16000
-        logger.info(f"버퍼 {buffer_name} 처리 시작: {len(audio_chunks)} 청크")
-        for audio, sr in audio_chunks:
-            # 16kHz로 리샘플링
-            if sr != 16000:
-                audio = resample_audio(audio, sr, 16000)
-            combined_audio.append(audio)
-        # 결합
-        if combined_audio:
-            full_audio = np.concatenate(combined_audio)
-            logger.info(f"오디오 길이: {len(full_audio)/16000:.1f}초")
-            # 너무 짧은 오디오는 무시
-            if len(full_audio) < 16000 * 0.5:  # 0.5초 미만
-                logger.warning("오디오가 너무 짧음")
-                return None
-            # Whisper로 전사 (GPU 함수 호출)
-            # 여기서는 오디오 데이터만 준비하고 실제 전사는 메인 스레드에서
-            return full_audio
     except Exception as e:
-        logger.error(f"오디오 버퍼 처리 오류: {e}")
         import traceback
         logger.error(traceback.format_exc())
     return None
-# 백그라운드 처리 스레드
-def audio_processing_worker():
-    """백그라운드에서 오디오 버퍼 처리"""
-    global ready_audio_queue
-    while True:
-        try:
-            # 처리할 버퍼 가져오기
-            buffer_data = processing_queue.get(timeout=1)
-            # 오디오 처리 (준비만)
-            prepared_audio = process_audio_buffer(buffer_data)
-            if prepared_audio is not None:
-                # 준비된 오디오를 큐에 추가
-                ready_audio_queue.put(prepared_audio)
-                logger.info("오디오 전사 준비 완료")
-        except queue.Empty:
-            continue
-        except Exception as e:
-            logger.error(f"오디오 처리 워커 오류: {e}")
-            import traceback
-            logger.error(traceback.format_exc())
-##############################################################################
-# 키워드 추출 함수
-##############################################################################
-def extract_keywords(text: str, top_k: int = 5) -> str:
-    """키워드 추출"""
-    text = re.sub(r"[^a-zA-Z0-9가-힣\s]", "", text)
-    tokens = text.split()
-    seen = set()
-    unique_tokens = []
-    for token in tokens:
-        if token not in seen and len(token) > 1:
-            seen.add(token)
-            unique_tokens.append(token)
-    key_tokens = unique_tokens[:top_k]
-    return " ".join(key_tokens)
 ##############################################################################
 # 웹 검색 함수
 ##############################################################################
@@ -305,8 +218,8 @@ def do_web_search(query: str) -> str:
             "domain": "google.com",
             "serp_type": "web",
             "device": "desktop",
-            "lang": "ko",
-            "num": "10"
         }
         headers = {
@@ -410,20 +323,6 @@ def pdf_to_markdown(pdf_path: str) -> str:
     return f"**[PDF 파일: {os.path.basename(pdf_path)}]**\n\n{full_text}"
-# 워커 스레드 시작
-audio_worker_thread = None
-def start_audio_worker():
-    """오디오 워커 스레드 시작"""
-    global audio_worker_thread
-    if audio_worker_thread is None or not audio_worker_thread.is_alive():
-        audio_worker_thread = Thread(target=audio_processing_worker, daemon=True)
-        audio_worker_thread.start()
-        logger.info("오디오 워커 스레드 시작됨")
-# 초기 시작
-start_audio_worker()
 ##############################################################################
 # 모델 로드
 ##############################################################################
@@ -883,27 +782,6 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
                 '<div class="status-box" style="background:#d4edda; color:#155724;">🎮 시스템 준비</div>'
             )
-    # 숨겨진 오디오 입력
-    audio_input = gr.Audio(
-        sources=["microphone"],
-        streaming=True,
-        visible=False,
-        label="마이크 입력"
-    )
-    # 오디오 스트리밍 처리
-    def audio_stream_callback(audio_chunk):
-        """오디오 스트림 콜백 - 버퍼에 누적"""
-        accumulate_audio(audio_chunk)
-        return None  # 상태 업데이트 없음
-    # 오디오 스트리밍 연결
-    audio_input.stream(
-        fn=audio_stream_callback,
-        inputs=[audio_input],
-        outputs=None
-    )
     # 문서 분석 탭 (숨김)
     with gr.Tab("📄 문서 분석", visible=False):
         with gr.Row():
@@ -946,25 +824,17 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
     def clear_capture():
         """캡처 초기화"""
-        global last_transcription, audio_buffer_a, audio_buffer_b, ready_audio_queue
-        with audio_buffer_lock:
             last_transcription = ""
-            audio_buffer_a.clear()
-            audio_buffer_b.clear()
-        # 대기 중인 오디오도 초기화
-        while not ready_audio_queue.empty():
-            try:
-                ready_audio_queue.get_nowait()
-            except:
-                break
         return None, gr.update(visible=False), '<div class="status-box" style="background:#d4edda; color:#155724;">🎮 시스템 준비</div>', ""
     def analyze_with_task(image, prompt, task_type, use_search, thinking, tokens):
         """특정 태스크로 이미지 분석"""
-        global last_transcription
         if image is None:
             return "❌ 먼저 이미지를 캡처하세요.", '<div class="status-box" style="background:#f8d7da; color:#721c24;">❌ 이미지 없음</div>'
@@ -973,7 +843,7 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
         # 현재 전사 텍스트 가져오기
         transcript = ""
-        with audio_buffer_lock:
             transcript = last_transcription
         result = analyze_image_for_robot(
@@ -1005,9 +875,9 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
     # 자동 캡처 및 분석 함수
     @spaces.GPU(duration=60)
-    def auto_capture_and_analyze(webcam_frame, task_prompt, use_search, thinking, tokens, use_audio):
-        """자동 캡처 및 분석 (10초마다 오디오 버퍼 전환)"""
-        global last_transcription, ready_audio_queue, current_buffer, audio_buffer_a, audio_buffer_b
         if webcam_frame is None:
             return (
@@ -1016,54 +886,23 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
                 '<div class="status-box" style="background:#fff3cd; color:#856404;">⏳ 웹캠 대기 중</div>',
                 '<div class="auto-capture-status">🔄 자동 캡처: 웹캠 대기 중</div>',
                 "대기 중...",
-                '<div class="buffer-info">버퍼 상태: 대기 중</div>'
             )
         # 캡처 수행
         timestamp = time.strftime("%H:%M:%S")
-        # 버퍼 상태 정보
-        buffer_status = ""
-        if use_audio:
-            with audio_buffer_lock:
-                a_chunks = len(audio_buffer_a)
-                b_chunks = len(audio_buffer_b)
-                active = current_buffer
-                buffer_status = f'<div class="buffer-info">버퍼 상태: {active.upper()} 활성 | A: {a_chunks}청크, B: {b_chunks}청크</div>'
-        # 버퍼 전환 (10초마다)
-        if use_audio:
-            logger.info(f"[{timestamp}] 오디오 버퍼 전환")
-            switch_buffers()
-            # 준비된 오디오가 있으면 전사
-            try:
-                if not ready_audio_queue.empty():
-                    audio_data = ready_audio_queue.get_nowait()
-                    logger.info(f"오디오 전사 시작... 길이: {len(audio_data)/16000:.1f}초")
-                    # GPU에서 Whisper 실행
-                    transcription = transcribe_audio_whisper(audio_data, 16000)
-                    if transcription:
-                        logger.info(f"전사 완료: {transcription[:50]}...")
-                        with audio_buffer_lock:
-                            last_transcription = transcription
-                    else:
-                        logger.warning("전사 결과 없음")
-                else:
-                    logger.debug("전사할 오디오 없음")
-            except queue.Empty:
-                logger.debug("전사 큐가 비어있음")
-            except Exception as e:
-                logger.error(f"오디오 전사 오류: {e}")
-                import traceback
-                logger.error(traceback.format_exc())
         # 마지막 전사 결과 가져오기
         audio_transcript = ""
         if use_audio:
-            with audio_buffer_lock:
                 audio_transcript = last_transcription
                 if audio_transcript:
                     logger.info(f"분석에 사용할 음성: {audio_transcript[:50]}...")
@@ -1093,7 +932,7 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
             '<div class="status-box" style="background:#d4edda; color:#155724;">✅ 자동 분석 완료</div>',
             f'<div class="auto-capture-status">🔄 자동 캡처: 마지막 분석 {timestamp}</div>',
             transcript_display,
-            buffer_status
         )
     # 웹캠 스트리밍
@@ -1103,39 +942,6 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
         outputs=[webcam_state]
     )
-    # 오디오 스트리밍 처리
-    def audio_stream_callback(audio_chunk):
-        """오디오 스트림 콜백 - 버퍼에 누적"""
-        try:
-            if audio_chunk is not None:
-                # 처음 몇 번만 로깅
-                accumulate_audio(audio_chunk)
-        except Exception as e:
-            logger.error(f"오디오 스트림 콜백 오류: {e}")
-            import traceback
-            logger.error(traceback.format_exc())
-        return None
-    # 오디오 스트리밍 연결
-    audio_input.stream(
-        fn=audio_stream_callback,
-        inputs=[audio_input],
-        outputs=None
-    )
-    # 오디오 스트리밍 처리
-    def audio_stream_callback(audio_chunk):
-        """오디오 스트림 콜백 - 버퍼에 누적"""
-        accumulate_audio(audio_chunk)
-        return None  # 상태 업데이트 없음
-    # 오디오 스트리밍 연결
-    audio_input.stream(
-        fn=audio_stream_callback,
-        inputs=[audio_input],
-        outputs=None
-    )
     # 수동 캡처 버튼
     capture_btn.click(
         fn=capture_webcam,
@@ -1196,7 +1002,7 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
     # 오디오 토글 이벤트
     def toggle_audio(enabled):
-        global last_transcription, last_audio_data
         if enabled:
             # Whisper 모델 로드
@@ -1235,7 +1041,7 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
     # 오디오 녹음 완료 시 처리
     def on_audio_recorded(audio_data):
         """오디오 녹음 완료 시 자동 처리"""
-        global last_transcription
         if audio_data is not None:
             logger.info("새 오디오 녹음 감지")

 import tempfile
 import gc
 from collections.abc import Iterator
+from threading import Thread, Lock
 import json
 import requests
 import cv2
 from typing import Dict, List, Optional, Union
 import librosa
 import scipy.signal as sps
 import queue
 # CSV/TXT 분석
 whisper_loaded = False
 model_name = "Gemma3-R1984-4B"
+# 오디오 관련 전역 변수
+audio_lock = Lock()
+last_audio_data = None
+last_transcription = ""
 ##############################################################################
 # 메모리 관리
 ##############################################################################
         torch.cuda.empty_cache()
         gc.collect()
+##############################################################################
+# 키워드 추출 함수
+##############################################################################
+def extract_keywords(text: str, top_k: int = 5) -> str:
+    """키워드 추출"""
+    text = re.sub(r"[^a-zA-Z0-9가-힣\s]", "", text)
+    tokens = text.split()
+    seen = set()
+    unique_tokens = []
+    for token in tokens:
+        if token not in seen and len(token) > 1:
+            seen.add(token)
+            unique_tokens.append(token)
+    key_tokens = unique_tokens[:top_k]
+    return " ".join(key_tokens)
 ##############################################################################
 # Whisper 모델 로드
 ##############################################################################
         return False
 ##############################################################################
+# 오디오 처리 함수 (간소화)
 ##############################################################################
 def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int = 16000) -> np.ndarray:
     """오디오 리샘플링"""
     if orig_sr == target_sr:
         logger.error(traceback.format_exc())
         return None
+def process_audio_recording(audio_data):
+    """녹음된 오디오 처리"""
+    global last_audio_data, last_transcription, audio_lock
+    if audio_data is None:
         return None
     try:
+        # 오디오 데이터 추출
+        if isinstance(audio_data, tuple) and len(audio_data) == 2:
+            sr, audio = audio_data
+        else:
+            logger.warning(f"예상치 못한 오디오 형식: {type(audio_data)}")
+            return None
+        if audio is None or len(audio) == 0:
+            return None
+        # numpy 배열로 변환
+        if not isinstance(audio, np.ndarray):
+            audio = np.array(audio)
+        # 스테레오를 모노로 변환
+        if audio.ndim > 1:
+            audio = audio.mean(axis=1)
+        # 16kHz로 리샘플링
+        if sr != 16000:
+            audio = resample_audio(audio, sr, 16000)
+        # 저장
+        with audio_lock:
+            last_audio_data = (audio, 16000)
+        logger.info(f"오디오 저장 완료: {len(audio)/16000:.1f}초")
+        # 전사 시도
+        transcription = transcribe_audio_whisper(audio, 16000)
+        if transcription:
+            with audio_lock:
+                last_transcription = transcription
+            return transcription
     except Exception as e:
+        logger.error(f"오디오 처리 오류: {e}")
         import traceback
         logger.error(traceback.format_exc())
     return None
 ##############################################################################
 # 웹 검색 함수
 ##############################################################################
             "domain": "google.com",
             "serp_type": "web",
             "device": "desktop",
+            "lang": "ko",  # 한국어 우선
+            "num": "10"   # 10개로 제한
         }
         headers = {
     return f"**[PDF 파일: {os.path.basename(pdf_path)}]**\n\n{full_text}"
 ##############################################################################
 # 모델 로드
 ##############################################################################
                 '<div class="status-box" style="background:#d4edda; color:#155724;">🎮 시스템 준비</div>'
             )
     # 문서 분석 탭 (숨김)
     with gr.Tab("📄 문서 분석", visible=False):
         with gr.Row():
     def clear_capture():
         """캡처 초기화"""
+        global last_transcription, last_audio_data, audio_lock
+        with audio_lock:
             last_transcription = ""
+            last_audio_data = None
         return None, gr.update(visible=False), '<div class="status-box" style="background:#d4edda; color:#155724;">🎮 시스템 준비</div>', ""
     def analyze_with_task(image, prompt, task_type, use_search, thinking, tokens):
         """특정 태스크로 이미지 분석"""
+        global last_transcription, audio_lock
         if image is None:
             return "❌ 먼저 이미지를 캡처하세요.", '<div class="status-box" style="background:#f8d7da; color:#721c24;">❌ 이미지 없음</div>'
         # 현재 전사 텍스트 가져오기
         transcript = ""
+        with audio_lock:
             transcript = last_transcription
         result = analyze_image_for_robot(
     # 자동 캡처 및 분석 함수
     @spaces.GPU(duration=60)
+    def auto_capture_and_analyze(webcam_frame, task_prompt, use_search, thinking, tokens, use_audio, audio_data):
+        """자동 캡처 및 분석"""
+        global last_transcription, audio_lock
         if webcam_frame is None:
             return (
                 '<div class="status-box" style="background:#fff3cd; color:#856404;">⏳ 웹캠 대기 중</div>',
                 '<div class="auto-capture-status">🔄 자동 캡처: 웹캠 대기 중</div>',
                 "대기 중...",
+                None  # 오디오 리셋
             )
         # 캡처 수행
         timestamp = time.strftime("%H:%M:%S")
+        # 오디오 처리 (있으면)
+        if use_audio and audio_data is not None:
+            logger.info(f"[{timestamp}] 오디오 처리 시작")
+            transcription = process_audio_recording(audio_data)
+            if transcription:
+                logger.info(f"새로운 전사: {transcription[:50]}...")
         # 마지막 전사 결과 가져오기
         audio_transcript = ""
         if use_audio:
+            with audio_lock:
                 audio_transcript = last_transcription
                 if audio_transcript:
                     logger.info(f"분석에 사용할 음성: {audio_transcript[:50]}...")
             '<div class="status-box" style="background:#d4edda; color:#155724;">✅ 자동 분석 완료</div>',
             f'<div class="auto-capture-status">🔄 자동 캡처: 마지막 분석 {timestamp}</div>',
             transcript_display,
+            None  # 오디오 리셋 (다음 녹음 준비)
         )
     # 웹캠 스트리밍
         outputs=[webcam_state]
     )
     # 수동 캡처 버튼
     capture_btn.click(
         fn=capture_webcam,
     # 오디오 토글 이벤트
     def toggle_audio(enabled):
+        global last_transcription, last_audio_data, audio_lock
         if enabled:
             # Whisper 모델 로드
     # 오디오 녹음 완료 시 처리
     def on_audio_recorded(audio_data):
         """오디오 녹음 완료 시 자동 처리"""
+        global last_transcription, audio_lock
         if audio_data is not None:
             logger.info("새 오디오 녹음 감지")