""" Speech Recognition Module using OpenAI Whisper This module provides speech-to-text functionality with support for multiple languages and automatic language detection. """ import os import logging from typing import Optional, Dict, Any, Union from pathlib import Path import whisper import torch import numpy as np from whisper.utils import format_timestamp from ..config import WHISPER_MODEL_SIZE, WHISPER_DEVICE from ..audio_processing.processor import AudioProcessor class SpeechRecognizer: """Speech recognition using OpenAI Whisper model.""" def __init__( self, model_size: str = WHISPER_MODEL_SIZE, device: str = WHISPER_DEVICE, cache_dir: Optional[str] = None ): """ Initialize the speech recognizer. Args: model_size: Whisper model size (tiny, base, small, medium, large) device: Device to run the model on (auto, cpu, cuda) cache_dir: Directory to cache downloaded models """ self.model_size = model_size self.device = self._setup_device(device) self.cache_dir = cache_dir self.model = None self.audio_processor = AudioProcessor() self.logger = logging.getLogger(__name__) self.logger.info(f"Initializing SpeechRecognizer with model={model_size}, device={self.device}") def _setup_device(self, device: str) -> str: """Setup and validate device configuration.""" if device == "auto": return "cuda" if torch.cuda.is_available() else "cpu" elif device == "cuda" and not torch.cuda.is_available(): self.logger.warning("CUDA requested but not available, falling back to CPU") return "cpu" return device def load_model(self) -> None: """Load the Whisper model.""" try: self.logger.info(f"Loading Whisper model: {self.model_size}") # Set cache directory if specified if self.cache_dir: os.environ['WHISPER_CACHE_DIR'] = self.cache_dir self.model = whisper.load_model( self.model_size, device=self.device ) self.logger.info("Whisper model loaded successfully") except Exception as e: self.logger.error(f"Failed to load Whisper model: {str(e)}") raise RuntimeError(f"Model loading failed: {str(e)}") def transcribe( self, audio_path: Union[str, Path], language: Optional[str] = None, task: str = "transcribe", **kwargs ) -> Dict[str, Any]: """ Transcribe audio file to text. Args: audio_path: Path to audio file language: Source language code (optional, auto-detected if None) task: Task type ('transcribe' or 'translate') **kwargs: Additional arguments for whisper.transcribe() Returns: Dictionary containing transcription results """ if self.model is None: self.load_model() try: # Preprocess audio audio_path = Path(audio_path) if not audio_path.exists(): raise FileNotFoundError(f"Audio file not found: {audio_path}") self.logger.info(f"Transcribing audio: {audio_path}") # Load and preprocess audio audio_data = self.audio_processor.load_audio(str(audio_path)) # Prepare transcription options options = { "language": language, "task": task, "fp16": self.device == "cuda", **kwargs } # Remove None values options = {k: v for k, v in options.items() if v is not None} # Transcribe result = self.model.transcribe(audio_data, **options) # Process results processed_result = self._process_result(result, audio_path) self.logger.info(f"Transcription completed. 
    def _process_result(self, result: Dict[str, Any], audio_path: Path) -> Dict[str, Any]:
        """Process and format transcription results."""
        # Extract segments with timestamps
        segments = []
        for segment in result.get("segments", []):
            segments.append({
                "id": segment["id"],
                "start": segment["start"],
                "end": segment["end"],
                "text": segment["text"].strip(),
                "confidence": segment.get("avg_logprob", 0.0)
            })

        # Calculate an overall confidence score
        confidence = self._calculate_confidence(result.get("segments", []))

        return {
            "text": result["text"].strip(),
            "language": result["language"],
            "segments": segments,
            "confidence": confidence,
            "audio_path": str(audio_path),
            "model_size": self.model_size,
            "processing_info": {
                "device": self.device,
                "num_segments": len(segments),
                "total_duration": segments[-1]["end"] if segments else 0.0
            }
        }

    def _calculate_confidence(self, segments: list) -> float:
        """Calculate an overall confidence score from segment log-probabilities."""
        if not segments:
            return 0.0

        avg_logprob = sum(
            segment.get("avg_logprob", 0.0) for segment in segments
        ) / len(segments)
        # Roughly map the average log-probability onto a 0-1 scale;
        # this is a heuristic, not a calibrated probability.
        return max(0.0, min(1.0, avg_logprob + 1.0))

    def detect_language(self, audio_path: Union[str, Path]) -> Dict[str, Any]:
        """
        Detect the language of the audio file.

        Args:
            audio_path: Path to audio file

        Returns:
            Dictionary with language detection results
        """
        if self.model is None:
            self.load_model()

        try:
            audio_path = Path(audio_path)
            self.logger.info(f"Detecting language for: {audio_path}")

            # Load audio
            audio_data = self.audio_processor.load_audio(str(audio_path))

            # Whisper's language detection expects exactly 30 seconds of audio:
            # pad_or_trim truncates longer clips and zero-pads shorter ones.
            audio_segment = whisper.pad_or_trim(audio_data)
            mel = whisper.log_mel_spectrogram(audio_segment).to(self.model.device)
            _, probs = self.model.detect_language(mel)

            # Get the top 3 language predictions
            top_languages = sorted(probs.items(), key=lambda x: x[1], reverse=True)[:3]

            result = {
                "detected_language": top_languages[0][0],
                "confidence": top_languages[0][1],
                "top_languages": [
                    {"language": lang, "confidence": conf}
                    for lang, conf in top_languages
                ],
                "audio_path": str(audio_path)
            }

            self.logger.info(
                f"Detected language: {result['detected_language']} "
                f"(confidence: {result['confidence']:.3f})"
            )
            return result

        except Exception as e:
            self.logger.error(f"Language detection failed: {e}")
            raise RuntimeError(f"Language detection failed: {e}") from e
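    # Example call (a minimal sketch; "clip.wav" is a placeholder path):
    #
    #   recognizer = SpeechRecognizer(model_size="base")
    #   info = recognizer.detect_language("clip.wav")
    #   info["top_languages"]  # e.g. [{"language": "en", "confidence": 0.97}, ...]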
    def transcribe_with_timestamps(
        self,
        audio_path: Union[str, Path],
        language: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Transcribe audio with detailed timestamp information.

        Args:
            audio_path: Path to audio file
            language: Source language code (optional)

        Returns:
            Dictionary with transcription and timestamp data
        """
        result = self.transcribe(
            audio_path,
            language=language,
            word_timestamps=True,
            verbose=True
        )

        # Add human-readable timestamps alongside the raw second offsets
        for segment in result["segments"]:
            segment["start_time"] = format_timestamp(segment["start"])
            segment["end_time"] = format_timestamp(segment["end"])

        return result

    def get_model_info(self) -> Dict[str, Any]:
        """Get information about the loaded model."""
        return {
            "model_size": self.model_size,
            "device": self.device,
            "model_loaded": self.model is not None,
            "cache_dir": self.cache_dir,
            "cuda_available": torch.cuda.is_available()
        }


class BatchSpeechRecognizer:
    """Batch processing for multiple audio files."""

    def __init__(self, recognizer: SpeechRecognizer):
        """
        Initialize the batch processor.

        Args:
            recognizer: SpeechRecognizer instance
        """
        self.recognizer = recognizer
        self.logger = logging.getLogger(__name__)

    def transcribe_batch(
        self,
        audio_files: list,
        language: Optional[str] = None,
        output_dir: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Transcribe multiple audio files.

        Args:
            audio_files: List of audio file paths
            language: Source language (optional)
            output_dir: Directory to save results (optional)

        Returns:
            Dictionary with batch processing results
        """
        results = {}
        failed_files = []

        self.logger.info(f"Starting batch transcription of {len(audio_files)} files")

        for i, audio_file in enumerate(audio_files, 1):
            try:
                self.logger.info(f"Processing file {i}/{len(audio_files)}: {audio_file}")
                result = self.recognizer.transcribe(audio_file, language=language)
                results[audio_file] = result

                # Save the individual result if an output directory was specified
                if output_dir:
                    self._save_result(result, audio_file, output_dir)

            except Exception as e:
                self.logger.error(f"Failed to process {audio_file}: {e}")
                failed_files.append({"file": audio_file, "error": str(e)})

        batch_result = {
            "total_files": len(audio_files),
            "successful": len(results),
            "failed": len(failed_files),
            "results": results,
            "failed_files": failed_files
        }

        self.logger.info(
            f"Batch processing completed. "
            f"Success: {batch_result['successful']}, "
            f"Failed: {batch_result['failed']}"
        )
        return batch_result

    def _save_result(self, result: Dict[str, Any], audio_file: str, output_dir: str) -> None:
        """Save an individual transcription result to a JSON file."""
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        # Derive the output filename from the audio filename
        audio_name = Path(audio_file).stem
        result_file = output_path / f"{audio_name}_transcription.json"

        with open(result_file, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

        self.logger.debug(f"Saved result to: {result_file}")


# Utility functions

def create_speech_recognizer(
    model_size: str = WHISPER_MODEL_SIZE,
    device: str = WHISPER_DEVICE
) -> SpeechRecognizer:
    """Create and initialize a speech recognizer."""
    recognizer = SpeechRecognizer(model_size=model_size, device=device)
    recognizer.load_model()
    return recognizer


def quick_transcribe(audio_path: str, language: Optional[str] = None) -> str:
    """Quick transcription function for simple use cases."""
    recognizer = create_speech_recognizer()
    result = recognizer.transcribe(audio_path, language=language)
    return result["text"]
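# Usage sketch (illustrative only; the import path and file names are
# placeholders, since the package name is defined by the surrounding project):
#
#   from myproject.speech.recognizer import (
#       create_speech_recognizer, BatchSpeechRecognizer
#   )
#
#   recognizer = create_speech_recognizer(model_size="base", device="auto")
#   result = recognizer.transcribe_with_timestamps("meeting.wav", language="en")
#   for seg in result["segments"]:
#       print(f"[{seg['start_time']} -> {seg['end_time']}] {seg['text']}")
#
#   batch = BatchSpeechRecognizer(recognizer)
#   summary = batch.transcribe_batch(["a.wav", "b.wav"], output_dir="transcripts")
#   print(summary["successful"], "succeeded,", summary["failed"], "failed")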