"""
Speech Recognition Module using OpenAI Whisper
This module provides speech-to-text functionality with support for multiple languages
and automatic language detection.
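
Typical use (the audio path below is an illustrative placeholder, not a
bundled file):

    recognizer = SpeechRecognizer(model_size="base", device="auto")
    result = recognizer.transcribe("meeting.wav")
    print(result["text"])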
"""
import json
import logging
from typing import Optional, Dict, Any, Union
from pathlib import Path
import whisper
import torch
from whisper.utils import format_timestamp
from ..config import WHISPER_MODEL_SIZE, WHISPER_DEVICE
from ..audio_processing.processor import AudioProcessor
class SpeechRecognizer:
"""Speech recognition using OpenAI Whisper model."""
def __init__(
self,
model_size: str = WHISPER_MODEL_SIZE,
device: str = WHISPER_DEVICE,
cache_dir: Optional[str] = None
):
"""
Initialize the speech recognizer.
Args:
model_size: Whisper model size (tiny, base, small, medium, large)
device: Device to run the model on (auto, cpu, cuda)
cache_dir: Directory to cache downloaded models
"""
        self.model_size = model_size
        self.cache_dir = cache_dir
        self.model = None
        self.audio_processor = AudioProcessor()
        self.logger = logging.getLogger(__name__)
        # Resolve the device only after the logger exists: _setup_device
        # may log a warning when falling back from CUDA to CPU.
        self.device = self._setup_device(device)
        self.logger.info(f"Initializing SpeechRecognizer with model={model_size}, device={self.device}")
def _setup_device(self, device: str) -> str:
"""Setup and validate device configuration."""
if device == "auto":
return "cuda" if torch.cuda.is_available() else "cpu"
elif device == "cuda" and not torch.cuda.is_available():
self.logger.warning("CUDA requested but not available, falling back to CPU")
return "cpu"
return device
    def load_model(self) -> None:
        """Load the Whisper model."""
        try:
            self.logger.info(f"Loading Whisper model: {self.model_size}")
            # whisper.load_model() accepts the cache location directly via
            # its download_root argument; it does not consult an
            # environment variable.
            self.model = whisper.load_model(
                self.model_size,
                device=self.device,
                download_root=self.cache_dir
            )
            self.logger.info("Whisper model loaded successfully")
        except Exception as e:
            self.logger.error(f"Failed to load Whisper model: {e}")
            raise RuntimeError(f"Model loading failed: {e}") from e
def transcribe(
self,
audio_path: Union[str, Path],
language: Optional[str] = None,
task: str = "transcribe",
**kwargs
) -> Dict[str, Any]:
"""
Transcribe audio file to text.
Args:
audio_path: Path to audio file
language: Source language code (optional, auto-detected if None)
task: Task type ('transcribe' or 'translate')
**kwargs: Additional arguments for whisper.transcribe()
Returns:
Dictionary containing transcription results
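
        Example (the path is a placeholder, not a bundled file):

            recognizer = SpeechRecognizer()
            result = recognizer.transcribe("talk.mp3", language="en")
            print(result["text"], result["confidence"])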
"""
if self.model is None:
self.load_model()
try:
# Preprocess audio
audio_path = Path(audio_path)
if not audio_path.exists():
raise FileNotFoundError(f"Audio file not found: {audio_path}")
self.logger.info(f"Transcribing audio: {audio_path}")
# Load and preprocess audio
audio_data = self.audio_processor.load_audio(str(audio_path))
# Prepare transcription options
options = {
"language": language,
"task": task,
"fp16": self.device == "cuda",
**kwargs
}
# Remove None values
options = {k: v for k, v in options.items() if v is not None}
# Transcribe
result = self.model.transcribe(audio_data, **options)
# Process results
processed_result = self._process_result(result, audio_path)
self.logger.info(f"Transcription completed. Detected language: {processed_result['language']}")
return processed_result
        except Exception as e:
            self.logger.error(f"Transcription failed: {e}")
            raise RuntimeError(f"Transcription failed: {e}") from e
def _process_result(self, result: Dict[str, Any], audio_path: Path) -> Dict[str, Any]:
"""Process and format transcription results."""
# Extract segments with timestamps
segments = []
for segment in result.get("segments", []):
segments.append({
"id": segment["id"],
"start": segment["start"],
"end": segment["end"],
"text": segment["text"].strip(),
"confidence": segment.get("avg_logprob", 0.0)
})
# Calculate confidence score
confidence = self._calculate_confidence(result.get("segments", []))
processed_result = {
"text": result["text"].strip(),
"language": result["language"],
"segments": segments,
"confidence": confidence,
"audio_path": str(audio_path),
"model_size": self.model_size,
"processing_info": {
"device": self.device,
"num_segments": len(segments),
"total_duration": segments[-1]["end"] if segments else 0.0
}
}
return processed_result
def _calculate_confidence(self, segments: list) -> float:
"""Calculate overall confidence score from segments."""
if not segments:
return 0.0
total_confidence = sum(
segment.get("avg_logprob", 0.0)
for segment in segments
)
        # Convert the mean log-probability to a rough 0-1 confidence score.
        # avg_logprob typically falls in [-1, 0] for reasonable output, so
        # shifting by 1 and clamping maps it onto [0, 1]
        # (e.g. avg_logprob = -0.35 -> confidence = 0.65).
        avg_logprob = total_confidence / len(segments)
        confidence = max(0.0, min(1.0, avg_logprob + 1.0))
return confidence
def detect_language(self, audio_path: Union[str, Path]) -> Dict[str, Any]:
"""
Detect the language of the audio file.
Args:
audio_path: Path to audio file
Returns:
Dictionary with language detection results
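
        Example (placeholder path):

            info = recognizer.detect_language("clip.wav")
            print(info["detected_language"], info["confidence"])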
"""
if self.model is None:
self.load_model()
try:
            audio_path = Path(audio_path)
            if not audio_path.exists():
                raise FileNotFoundError(f"Audio file not found: {audio_path}")
            self.logger.info(f"Detecting language for: {audio_path}")
# Load audio
audio_data = self.audio_processor.load_audio(str(audio_path))
            # Detect language using Whisper's built-in detection.
            # The model expects exactly 30 seconds of audio: pad_or_trim
            # pads shorter clips with silence and trims longer ones, so a
            # manual slice would fail on clips shorter than 30 s.
            audio_segment = whisper.pad_or_trim(audio_data)
            mel = whisper.log_mel_spectrogram(audio_segment).to(self.model.device)
            _, probs = self.model.detect_language(mel)
# Get top 3 language predictions
top_languages = sorted(probs.items(), key=lambda x: x[1], reverse=True)[:3]
result = {
"detected_language": top_languages[0][0],
"confidence": top_languages[0][1],
"top_languages": [
{"language": lang, "confidence": conf}
for lang, conf in top_languages
],
"audio_path": str(audio_path)
}
self.logger.info(f"Detected language: {result['detected_language']} "
f"(confidence: {result['confidence']:.3f})")
return result
        except Exception as e:
            self.logger.error(f"Language detection failed: {e}")
            raise RuntimeError(f"Language detection failed: {e}") from e
def transcribe_with_timestamps(
self,
audio_path: Union[str, Path],
language: Optional[str] = None
) -> Dict[str, Any]:
"""
Transcribe audio with detailed timestamp information.
Args:
audio_path: Path to audio file
language: Source language code (optional)
Returns:
Dictionary with transcription and timestamp data
"""
result = self.transcribe(
audio_path,
language=language,
word_timestamps=True,
verbose=True
)
# Add formatted timestamps
for segment in result["segments"]:
segment["start_time"] = format_timestamp(segment["start"])
segment["end_time"] = format_timestamp(segment["end"])
return result
def get_model_info(self) -> Dict[str, Any]:
"""Get information about the loaded model."""
return {
"model_size": self.model_size,
"device": self.device,
"model_loaded": self.model is not None,
"cache_dir": self.cache_dir,
"cuda_available": torch.cuda.is_available()
}
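
# A minimal sketch of single-file use (the .wav path is illustrative,
# not a file shipped with this package):
#
#     recognizer = SpeechRecognizer(model_size="base")
#     info = recognizer.detect_language("interview.wav")
#     result = recognizer.transcribe_with_timestamps(
#         "interview.wav", language=info["detected_language"]
#     )
#     for seg in result["segments"]:
#         print(f"[{seg['start_time']} -> {seg['end_time']}] {seg['text']}")
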
class BatchSpeechRecognizer:
"""Batch processing for multiple audio files."""
def __init__(self, recognizer: SpeechRecognizer):
"""
Initialize batch processor.
Args:
recognizer: SpeechRecognizer instance
"""
self.recognizer = recognizer
self.logger = logging.getLogger(__name__)
def transcribe_batch(
self,
audio_files: list,
language: Optional[str] = None,
output_dir: Optional[str] = None
) -> Dict[str, Any]:
"""
Transcribe multiple audio files.
Args:
audio_files: List of audio file paths
language: Source language (optional)
output_dir: Directory to save results (optional)
Returns:
Dictionary with batch processing results
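
        Example (placeholder paths):

            batch = BatchSpeechRecognizer(SpeechRecognizer())
            summary = batch.transcribe_batch(
                ["a.wav", "b.wav"], output_dir="out"
            )
            print(summary["successful"], summary["failed"])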
"""
results = {}
failed_files = []
self.logger.info(f"Starting batch transcription of {len(audio_files)} files")
for i, audio_file in enumerate(audio_files, 1):
try:
self.logger.info(f"Processing file {i}/{len(audio_files)}: {audio_file}")
result = self.recognizer.transcribe(audio_file, language=language)
results[audio_file] = result
# Save individual result if output directory specified
if output_dir:
self._save_result(result, audio_file, output_dir)
except Exception as e:
self.logger.error(f"Failed to process {audio_file}: {str(e)}")
failed_files.append({"file": audio_file, "error": str(e)})
batch_result = {
"total_files": len(audio_files),
"successful": len(results),
"failed": len(failed_files),
"results": results,
"failed_files": failed_files
}
self.logger.info(f"Batch processing completed. "
f"Success: {batch_result['successful']}, "
f"Failed: {batch_result['failed']}")
return batch_result
def _save_result(self, result: Dict[str, Any], audio_file: str, output_dir: str) -> None:
"""Save individual transcription result to file."""
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)
# Create output filename
audio_name = Path(audio_file).stem
result_file = output_path / f"{audio_name}_transcription.json"
with open(result_file, 'w', encoding='utf-8') as f:
json.dump(result, f, indent=2, ensure_ascii=False)
self.logger.debug(f"Saved result to: {result_file}")
# Utility functions
def create_speech_recognizer(
model_size: str = WHISPER_MODEL_SIZE,
device: str = WHISPER_DEVICE
) -> SpeechRecognizer:
"""Create and initialize a speech recognizer."""
recognizer = SpeechRecognizer(model_size=model_size, device=device)
recognizer.load_model()
return recognizer
def quick_transcribe(audio_path: str, language: Optional[str] = None) -> str:
"""Quick transcription function for simple use cases."""
recognizer = create_speech_recognizer()
result = recognizer.transcribe(audio_path, language=language)
return result["text"]