# Page header captured together with the file (not part of the program):
#   JustNikunj's picture
#   Commit message: "Update app.py" (cdae2d2, verified)
import gradio as gr
import torch
import torchaudio
from transformers import pipeline, AutoModel, AutoConfig
import librosa
import numpy as np
import re
import warnings
import os
import logging
import hashlib
import json
import time
from datetime import datetime
from typing import Dict, Any, Optional, Tuple
from functools import lru_cache
from enum import Enum
from huggingface_hub import login, InferenceClient
# ============================================
# ENVIRONMENT & LOGGING SETUP
# ============================================
# Logging is configured before anything else logs: without a configured
# handler, INFO records (such as the onnxruntime success message below) are
# discarded by the logging module's default last-resort handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
    handlers=[logging.StreamHandler()]
)
logger = logging.getLogger("hindi_emotion_system")

# Pre-load onnxruntime to handle stack execution issues
logger_temp = logging.getLogger("onnxruntime_check")
try:
    import onnxruntime as ort
    logger_temp.info(f"✅ onnxruntime loaded successfully: {ort.__version__}")
except Exception as e:
    # Best effort only: the application keeps starting without onnxruntime.
    logger_temp.warning(f"⚠️ onnxruntime import issue: {e}")

# The Hugging Face token is optional; when present it is used to log in here
# and later to create the LLM inference client.
HUGGINGFACE_TOKEN = os.environ.get("HF_TOKEN")
if HUGGINGFACE_TOKEN:
    login(token=HUGGINGFACE_TOKEN)

warnings.filterwarnings('ignore')

# Configuration
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
MAX_PROMPT_LENGTH = 2000       # max transcript characters copied into the prompt
RECOMMENDATION_TIMEOUT = 60    # NOTE(review): not referenced in the visible code
MAX_RETRIES = 2                # extra LLM attempts after the first failure
ENABLE_CACHING = True
CACHE_TTL_SECONDS = 3600       # lifetime of a cached recommendation
logger.info("🚀 Starting Enhanced Hindi Speech Emotion & Recommendation System...")

# ============================================
# MODEL INITIALIZATION
# ============================================
# Populated by load_models(); None means "not loaded".
SENTIMENT_PIPELINE = None
EMOTION_PIPELINE = None
ASR_MODEL = None
LLM_CLIENT = None
# cache key -> (cached payload, time.time() when it was stored)
recommendation_cache = {}
def load_models():
    """Load every model and client the application depends on.

    Fills the module-level ``SENTIMENT_PIPELINE``, ``EMOTION_PIPELINE``,
    ``ASR_MODEL`` and ``LLM_CLIENT``. Components that are already loaded are
    left alone, so a repeated call only retries what is still missing.

    The three local models are mandatory: a loading failure is logged and
    re-raised. The LLM client is optional: it is created only when
    ``HUGGINGFACE_TOKEN`` is set, and a failure is logged without aborting
    (``get_llama_recommendation`` then uses the static fallback).

    Raises:
        Exception: whatever the sentiment, emotion or ASR loader raised.
    """
    global SENTIMENT_PIPELINE, EMOTION_PIPELINE, ASR_MODEL, LLM_CLIENT

    # Without a token the LLM client is never created, so it must not count
    # as "missing"; otherwise every call would reload all local models.
    llm_settled = LLM_CLIENT is not None or not HUGGINGFACE_TOKEN
    if (SENTIMENT_PIPELINE is not None and EMOTION_PIPELINE is not None
            and ASR_MODEL is not None and llm_settled):
        logger.info("✅ Models already loaded")
        return

    # Sentiment Model
    if SENTIMENT_PIPELINE is None:
        logger.info("📚 Loading Hindi sentiment model...")
        try:
            SENTIMENT_PIPELINE = pipeline(
                "text-classification",
                model="LondonStory/txlm-roberta-hindi-sentiment",
                top_k=None
            )
            logger.info("✅ Sentiment model loaded")
        except Exception as e:
            logger.error(f"❌ Sentiment model error: {e}")
            raise

    # Emotion Model
    if EMOTION_PIPELINE is None:
        logger.info("🎭 Loading Zero-Shot emotion model...")
        try:
            EMOTION_PIPELINE = pipeline(
                "zero-shot-classification",
                model="joeddav/xlm-roberta-large-xnli"
            )
            logger.info("✅ Emotion model loaded")
        except Exception as e:
            logger.error(f"❌ Emotion model error: {e}")
            raise

    # ASR Model
    if ASR_MODEL is None:
        logger.info("🎤 Loading Indic Conformer ASR...")
        try:
            ASR_MODEL = AutoModel.from_pretrained(
                "ai4bharat/indic-conformer-600m-multilingual",
                trust_remote_code=True
            )
            logger.info("✅ ASR model loaded")
        except Exception as e:
            logger.error(f"❌ ASR model error: {e}")
            raise

    # LLM Client - Using Novita AI provider for free Llama 3.1 access
    if LLM_CLIENT is None:
        logger.info("🤖 Initializing Llama 3.1 client via Novita AI...")
        try:
            if HUGGINGFACE_TOKEN:
                LLM_CLIENT = InferenceClient(
                    provider="novita",
                    api_key=HUGGINGFACE_TOKEN
                )
                logger.info("✅ LLM client initialized with Novita AI provider")
            else:
                logger.warning("⚠️ HF_TOKEN not set - recommendations will use fallback")
        except Exception as e:
            # Deliberately not re-raised: the app can run on fallback text.
            logger.error(f"❌ LLM client error: {e}")

    logger.info("✅ All models loaded successfully")


load_models()
# ============================================
# EMOTION LABELS
# ============================================
# Candidate labels handed to the zero-shot emotion pipeline in
# emotion_classification(); process_emotion_results() boosts or dampens
# several of these names by keyword-detected situation.
EMOTION_LABELS = [
"joy", "happiness", "sadness", "anger", "fear",
"distress", "panic", "love", "surprise", "calm",
"neutral", "excitement", "frustration"
]
# ============================================
# AUDIO PREPROCESSING
# ============================================
# (orig_freq, new_freq) -> torchaudio Resample transform, built on first use.
CACHED_RESAMPLERS = {}


def get_resampler(orig_freq, new_freq):
    """Return the resampling transform for a rate pair, creating it once.

    Args:
        orig_freq: sample rate of the incoming audio.
        new_freq: sample rate to convert to.

    Returns:
        The ``torchaudio.transforms.Resample`` instance stored in
        ``CACHED_RESAMPLERS`` for ``(orig_freq, new_freq)``.
    """
    rate_pair = (orig_freq, new_freq)
    transform = CACHED_RESAMPLERS.get(rate_pair)
    if transform is None:
        transform = torchaudio.transforms.Resample(
            orig_freq=orig_freq,
            new_freq=new_freq
        )
        CACHED_RESAMPLERS[rate_pair] = transform
    return transform
def spectral_noise_gate(audio, sr, noise_floor_percentile=10, reduction_factor=0.6):
    """Attenuate low-level spectral content with a soft gate.

    A per-frequency noise floor is estimated as the given percentile of the
    STFT magnitude over time. Bins whose magnitude is at or below that floor
    are scaled by ``1 - reduction_factor``; bins at three times the floor or
    more pass unchanged; the gain is interpolated linearly in between.

    Args:
        audio: 1-D numpy array of samples.
        sr: sample rate; accepted for interface symmetry but not used.
        noise_floor_percentile: percentile of the magnitude used as the floor.
        reduction_factor: fraction of the magnitude removed from gated bins.

    Returns:
        The gated signal with the same number of samples as ``audio``, or
        ``audio`` unchanged when the processing fails.
    """
    try:
        stft = librosa.stft(audio, n_fft=2048, hop_length=512)
        magnitude = np.abs(stft)
        phase = np.angle(stft)
        noise_profile = np.percentile(magnitude, noise_floor_percentile, axis=1, keepdims=True)
        snr = magnitude / (noise_profile + 1e-10)  # epsilon avoids division by zero
        gate = np.clip((snr - 1.0) / 2.0, 0.0, 1.0)
        magnitude_gated = magnitude * (gate + (1 - gate) * (1 - reduction_factor))
        stft_clean = magnitude_gated * np.exp(1j * phase)
        # length= pins the output to the input size; without it the inverse
        # transform returns a frame-aligned length that can differ.
        return librosa.istft(stft_clean, hop_length=512, length=len(audio))
    except Exception as exc:
        # Best effort: denoising is optional, but the failure must be visible.
        logger.warning(f"Spectral noise gate skipped: {exc}")
        return audio
def dynamic_range_compression(audio, threshold=0.5, ratio=3.0):
    """Compress samples whose magnitude exceeds ``threshold``.

    For every sample with ``|x| > threshold`` the part above the threshold is
    divided by ``ratio`` and the sign is kept; other samples are untouched.
    The input array is not modified.

    Args:
        audio: numpy array of samples.
        threshold: magnitude above which compression starts.
        ratio: divisor applied to the excess above the threshold.

    Returns:
        A new array with the compressed samples, or ``audio`` itself when the
        computation fails (for example for a non-array input).
    """
    try:
        magnitude = np.abs(audio)
        loud = magnitude > threshold
        compressed = audio.copy()
        compressed[loud] = np.sign(audio[loud]) * (
            threshold + (magnitude[loud] - threshold) / ratio
        )
        return compressed
    except Exception as exc:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit are not
        # swallowed, and logged so a skipped stage is not silent.
        logger.warning(f"Dynamic range compression skipped: {exc}")
        return audio
def _load_mono_resampled(audio_path, target_sr):
    """Decode ``audio_path``, average its channels and resample to ``target_sr``."""
    wav, sr = torchaudio.load(audio_path)
    if wav.shape[0] > 1:
        wav = torch.mean(wav, dim=0, keepdim=True)
    if sr != target_sr:
        wav = get_resampler(sr, target_sr)(wav)
    return wav


def advanced_preprocess_audio(audio_path, target_sr=16000):
    """Load an audio file and clean it up for ASR and feature extraction.

    Pipeline: mono down-mix and resampling, DC-offset removal, silence
    trimming (``top_db=25``), peak normalisation, pre-emphasis (0.97),
    spectral noise gating, dynamic range compression and a final
    normalisation.

    The file is decoded once. If any enhancement step fails, the failure is
    logged and the plain mono/resampled waveform is returned instead.

    Args:
        audio_path: path of the audio file to load.
        target_sr: sample rate of the returned audio.

    Returns:
        ``(audio_tensor, target_sr, audio_np)`` where ``audio_tensor`` is a
        float tensor with a leading channel dimension and ``audio_np`` is the
        same signal as a numpy array.

    Raises:
        Exception: whatever ``torchaudio.load`` raises for an unreadable file.
    """
    wav = _load_mono_resampled(audio_path, target_sr)
    try:
        audio_np = wav.squeeze().numpy()
        audio_np = audio_np - np.mean(audio_np)
        audio_trimmed, _ = librosa.effects.trim(audio_np, top_db=25)
        audio_normalized = librosa.util.normalize(audio_trimmed)
        pre_emphasis = 0.97
        # y[0] = x[0]; y[n] = x[n] - 0.97 * x[n-1]
        audio_emphasized = np.append(
            audio_normalized[0],
            audio_normalized[1:] - pre_emphasis * audio_normalized[:-1]
        )
        audio_denoised = spectral_noise_gate(audio_emphasized, target_sr)
        audio_compressed = dynamic_range_compression(audio_denoised)
        audio_final = librosa.util.normalize(audio_compressed)
        audio_tensor = torch.from_numpy(audio_final).float().unsqueeze(0)
        return audio_tensor, target_sr, audio_final
    except Exception as e:
        logger.warning(f"Advanced preprocessing failed: {e}")
        # Fall back to the already decoded waveform; no second file read.
        return wav, target_sr, wav.squeeze().numpy()
def extract_prosodic_features(audio, sr):
    """Compute pitch, energy and spectral summary statistics for a signal.

    Args:
        audio: 1-D numpy array of samples.
        sr: sample rate of ``audio``.

    Returns:
        Dict with ``pitch_mean``, ``pitch_std``, ``pitch_range`` (from the
        non-NaN pYIN frames, 0 when there are none), ``energy_mean``,
        ``energy_std`` (RMS), ``speech_rate`` (mean zero-crossing rate),
        ``spectral_centroid_mean`` and ``spectral_rolloff_mean``. Every value
        is 0 when extraction fails.
    """
    try:
        summary = {}

        # Pitch track; frames without a pitch estimate come back as NaN.
        pitch_track, _flags, _probs = librosa.pyin(
            audio, fmin=80, fmax=400, sr=sr, frame_length=2048
        )
        voiced_pitch = pitch_track[~np.isnan(pitch_track)]
        if len(voiced_pitch) > 0:
            summary['pitch_mean'] = np.mean(voiced_pitch)
            summary['pitch_std'] = np.std(voiced_pitch)
            summary['pitch_range'] = np.max(voiced_pitch) - np.min(voiced_pitch)
        else:
            summary['pitch_mean'] = summary['pitch_std'] = summary['pitch_range'] = 0

        hop = 512
        window = 2048

        energy = librosa.feature.rms(y=audio, frame_length=window, hop_length=hop)[0]
        summary['energy_mean'] = np.mean(energy)
        summary['energy_std'] = np.std(energy)

        crossings = librosa.feature.zero_crossing_rate(audio, frame_length=window, hop_length=hop)[0]
        summary['speech_rate'] = np.mean(crossings)

        # One magnitude spectrogram shared by both spectral descriptors.
        spectrum = np.abs(librosa.stft(audio, n_fft=window, hop_length=hop))
        centroid = librosa.feature.spectral_centroid(S=spectrum, sr=sr)[0]
        summary['spectral_centroid_mean'] = np.mean(centroid)
        rolloff = librosa.feature.spectral_rolloff(S=spectrum, sr=sr)[0]
        summary['spectral_rolloff_mean'] = np.mean(rolloff)

        return summary
    except Exception as e:
        logger.warning(f"Feature extraction error: {e}")
        return {
            'pitch_mean': 0, 'pitch_std': 0, 'pitch_range': 0,
            'energy_mean': 0, 'energy_std': 0, 'speech_rate': 0,
            'spectral_centroid_mean': 0, 'spectral_rolloff_mean': 0
        }
# ============================================
# TEXT ANALYSIS
# ============================================
def validate_hindi_text(text):
    """Check that a transcription contains enough Devanagari characters.

    Args:
        text: transcription to inspect.

    Returns:
        ``(is_valid, message, ratio)`` where ``ratio`` is the share of
        non-whitespace characters in the U+0900-U+097F block. Text with no
        non-whitespace characters, or with a ratio below 0.15, is invalid.
    """
    non_space_total = len(re.findall(r'\S', text))
    if non_space_total == 0:
        return False, "Empty transcription", 0

    devanagari_total = len(re.findall(r'[\u0900-\u097F]', text))
    share = devanagari_total / non_space_total
    if share < 0.15:
        return False, f"Insufficient Hindi content ({share*100:.1f}%)", share
    return True, "Valid Hindi/Hinglish", share
# A token is a run of Devanagari characters (the danda marks U+0964/U+0965
# are left out because they are punctuation) or a run of ASCII letters/digits.
_KEYWORD_TOKEN_RE = re.compile(r"[\u0900-\u0963\u0966-\u097F]+|[a-z0-9]+")


def _keyword_tokens(text):
    """Split ``text`` into lower-cased, NFC-normalised word tokens."""
    import unicodedata  # local import: only the keyword detectors need it

    # NFC gives nukta letters one canonical spelling, so a precomposed
    # character (e.g. U+095C) and its base+nukta sequence compare equal.
    return _KEYWORD_TOKEN_RE.findall(unicodedata.normalize("NFC", text).lower())


def _count_keyword_hits(text, keywords, *, prefix=True):
    """Count how many distinct entries of ``keywords`` occur in ``text``.

    Matching is done on whole tokens instead of raw substrings, so a keyword
    can no longer fire in the middle of an unrelated word. A multi-word
    keyword must match consecutive tokens. With ``prefix=True`` the last
    token only has to start with the keyword's last word, which keeps
    inflected forms of a stem matching; with ``prefix=False`` every token
    must be identical.
    """
    tokens = _keyword_tokens(text)
    hits = 0
    for keyword in keywords:
        parts = _keyword_tokens(keyword)
        if not parts:
            continue
        lead, tail = parts[:-1], parts[-1]
        span = len(parts)
        for start in range(len(tokens) - span + 1):
            window = tokens[start:start + span]
            if window[:-1] != lead:
                continue
            last = window[-1]
            if last.startswith(tail) if prefix else last == tail:
                hits += 1
                break
    return hits


def detect_negation(text):
    """Return True when ``text`` contains a Hindi or English negation word.

    Negation words are matched as whole tokens: entries such as 'न', 'ना' or
    'no' are so short that substring or prefix matching would fire on most
    sentences.
    """
    negation_words = [
        'नहीं', 'न', 'मत', 'नही', 'ना',
        'not', 'no', 'never', 'neither', 'nor',
        'कभी नहीं', 'बिल्कुल नहीं'
    ]
    return _count_keyword_hits(text, negation_words, prefix=False) > 0


def detect_crisis_keywords(text):
    """Return True when a word of ``text`` starts with a crisis keyword.

    The list mixes full words and stems, so a keyword matches any token that
    begins with it.
    """
    crisis_keywords = [
        'बचाओ', 'मदद', 'help', 'save', 'rescue',
        'मार', 'मारो', 'पीट', 'हिंसा', 'beat', 'violence',
        'हमला', 'attack', 'assault', 'चाकू', 'बंदूक',
        'डर', 'भय', 'fear', 'scared', 'खतरा', 'danger',
        'मर', 'मरना', 'मौत', 'death', 'die', 'kill',
        'खून', 'blood', 'जान', 'life', 'छोड़ो', 'stop',
        'आत्महत्या', 'suicide', 'दर्द', 'pain', 'सांस', 'breath',
        'दौरा', 'seizure', 'बेहोश', 'unconscious',
        'एम्बुलेंस', 'ambulance', 'अस्पताल', 'hospital',
        'बलात्कार', 'rape', 'छेड़', 'molest', 'harassment',
        'दुर्घटना', 'accident', 'आग', 'fire', 'घबरा', 'panic'
    ]
    return _count_keyword_hits(text, crisis_keywords) > 0


def detect_mental_health_distress(text):
    """Return True when at least two distinct distress keywords occur."""
    keywords = [
        'अवसाद', 'डिप्रेशन', 'depression', 'उदास', 'निराश',
        'घबराहट', 'anxiety', 'चिंता', 'अकेला', 'lonely',
        'हार', 'give up', 'थक', 'tired', 'exhausted'
    ]
    return _count_keyword_hits(text, keywords) >= 2


def detect_grief_loss(text):
    """Return True when ``text`` mentions bereavement or loss."""
    keywords = [
        'चल बसा', 'गुज़र', 'खो दिया', 'died', 'passed away',
        'अंतिम संस्कार', 'funeral', 'याद', 'miss', 'गम', 'grief'
    ]
    return _count_keyword_hits(text, keywords) > 0


def detect_relationship_distress(text):
    """Return True when ``text`` mentions a relationship conflict or break-up."""
    keywords = [
        'तलाक', 'divorce', 'breakup', 'धोखा', 'cheat',
        'लड़ाई', 'fight', 'झगड़ा', 'argument', 'छोड़ दिया'
    ]
    return _count_keyword_hits(text, keywords) > 0


def detect_mixed_emotions(text, prosodic_features):
    """Return True when ``text`` pairs positive and negative words with a hedge.

    Crisis text is never reported as mixed. ``prosodic_features`` is accepted
    for interface compatibility but not used.
    """
    if detect_crisis_keywords(text):
        return False
    mixed_indicators = ['कभी', 'लेकिन', 'पर', 'but', 'या', 'or', 'शायद', 'maybe']
    positive_words = ['खुश', 'प्यार', 'अच्छा', 'happy', 'love', 'good']
    negative_words = ['दुख', 'रो', 'गुस्सा', 'sad', 'cry', 'angry']
    # Connectives are matched exactly ('या' and 'or' occur inside many other
    # words); sentiment words are stems and match by prefix.
    has_mixed = _count_keyword_hits(text, mixed_indicators, prefix=False) > 0
    has_pos = _count_keyword_hits(text, positive_words) > 0
    has_neg = _count_keyword_hits(text, negative_words) > 0
    return has_mixed and (has_pos and has_neg)
# ============================================
# SENTIMENT & EMOTION ANALYSIS
# ============================================
def sentiment_analysis(text):
    """Run the sentiment pipeline on ``text``.

    Returns:
        The raw output of ``SENTIMENT_PIPELINE``, or ``None`` when the call
        raises (the error is logged as a warning).
    """
    try:
        raw_scores = SENTIMENT_PIPELINE(text)
    except Exception as e:
        logger.warning(f"Sentiment error: {e}")
        raw_scores = None
    return raw_scores
def emotion_classification(text):
    """Classify ``text`` against ``EMOTION_LABELS`` with the zero-shot pipeline.

    Returns:
        The raw single-label output of ``EMOTION_PIPELINE``, or ``None`` when
        the call raises (the error is logged as a warning).
    """
    try:
        ranking = EMOTION_PIPELINE(text, EMOTION_LABELS, multi_label=False)
    except Exception as e:
        logger.warning(f"Emotion error: {e}")
        ranking = None
    return ranking
def enhanced_sentiment_analysis(text, prosodic_features, raw_results):
    """Turn raw sentiment scores into adjusted, normalised probabilities.

    Adjustments, in order:
      * crisis keywords: Negative is boosted (x1.8, capped at 0.95) while
        Neutral and Positive are damped;
      * otherwise, a negation word swaps Positive and Negative, and text
        detected as mixed shifts 0.20 towards Neutral;
      * the three scores are rescaled to sum to 1.

    Args:
        text: transcription the scores belong to.
        prosodic_features: forwarded to ``detect_mixed_emotions``.
        raw_results: sentiment pipeline output; either a list of
            ``{'label', 'score'}`` dicts or that list wrapped in another list.

    Returns:
        ``(scores, confidence, is_mixed)`` where ``scores`` maps 'Negative',
        'Neutral' and 'Positive' to floats and ``confidence`` is the largest
        of them. A near-uniform default is returned when ``raw_results`` is
        empty or not a list.
    """
    if not raw_results or not isinstance(raw_results, list):
        return {'Negative': 0.33, 'Neutral': 0.34, 'Positive': 0.33}, 0.34, False

    # Keys are lower-case and labels are lower-cased before lookup, so
    # 'Negative', 'NEGATIVE' or 'label_0' no longer fall through to Neutral.
    label_mapping = {
        'label_0': 'Negative', 'label_1': 'Neutral', 'label_2': 'Positive',
        'negative': 'Negative', 'neutral': 'Neutral', 'positive': 'Positive'
    }
    # Accept both the nested ([[{...}, ...]]) and the flat ([{...}, ...])
    # result shape.
    entries = raw_results[0] if isinstance(raw_results[0], (list, tuple)) else raw_results

    sentiment_scores = {}
    for result in entries:
        mapped_label = label_mapping.get(str(result['label']).lower(), 'Neutral')
        sentiment_scores[mapped_label] = result['score']
    for sentiment in ('Negative', 'Neutral', 'Positive'):
        sentiment_scores.setdefault(sentiment, 0.0)

    is_crisis = detect_crisis_keywords(text)
    if is_crisis:
        sentiment_scores['Negative'] = min(0.95, sentiment_scores['Negative'] * 1.8)
        sentiment_scores['Neutral'] = max(0.02, sentiment_scores['Neutral'] * 0.2)
        sentiment_scores['Positive'] = max(0.01, sentiment_scores['Positive'] * 0.1)
        is_mixed = False
    else:
        if detect_negation(text):
            sentiment_scores['Positive'], sentiment_scores['Negative'] = \
                sentiment_scores['Negative'], sentiment_scores['Positive']
        is_mixed = detect_mixed_emotions(text, prosodic_features)
        if is_mixed:
            sentiment_scores['Neutral'] = min(0.65, sentiment_scores['Neutral'] + 0.20)
            sentiment_scores['Positive'] = max(0.1, sentiment_scores['Positive'] - 0.10)
            sentiment_scores['Negative'] = max(0.1, sentiment_scores['Negative'] - 0.10)

    total = sum(sentiment_scores.values())
    if total > 0:
        sentiment_scores = {k: v / total for k, v in sentiment_scores.items()}
    return sentiment_scores, max(sentiment_scores.values()), is_mixed
def process_emotion_results(emotion_result, transcription, prosodic_features=None):
    """Re-weight zero-shot emotion scores by situation and summarise them.

    Exactly one adjustment is applied, chosen in this priority order: crisis,
    mental-health distress, grief, relationship distress. Scores are then
    rescaled to sum to 1 and the five highest are reported.

    Args:
        emotion_result: zero-shot pipeline output with ``labels`` and
            ``scores``; falsy when classification failed.
        transcription: text the scores belong to (drives the keyword checks).
        prosodic_features: accepted but not used.

    Returns:
        Dict with ``primary``, ``secondary``, ``confidence`` and
        ``top_emotions`` (list of ``{'emotion', 'score'}``).
    """
    if not emotion_result:
        return {
            "primary": "unknown", "secondary": None,
            "confidence": 0.0, "top_emotions": []
        }

    names = emotion_result['labels']
    values = emotion_result['scores']
    weights = {name: values[idx] for idx, name in enumerate(names)}

    crisis = detect_crisis_keywords(transcription)
    mental_health = detect_mental_health_distress(transcription)
    grief = detect_grief_loss(transcription)
    relationship = detect_relationship_distress(transcription)

    def boost(targets, factor, ceiling):
        # Multiply the listed emotions (when present) and cap the result.
        for target in targets:
            if target in weights:
                weights[target] = min(ceiling, weights[target] * factor)

    if crisis:
        logger.info("🚨 Crisis detected - adjusting emotions")
        boost(['fear', 'distress', 'panic', 'anger', 'sadness'], 4.0, 0.95)
        for target in ['surprise', 'excitement', 'happiness', 'joy', 'calm']:
            if target in weights:
                weights[target] = max(0.01, weights[target] * 0.15)
    elif mental_health:
        boost(['sadness', 'fear', 'frustration', 'neutral'], 2.0, 0.90)
    elif grief:
        boost(['sadness'], 2.5, 0.85)
    elif relationship:
        boost(['sadness', 'anger', 'frustration'], 1.8, 0.80)

    mass = sum(weights.values())
    if mass > 0:
        weights = {name: value / mass for name, value in weights.items()}

    ranked = sorted(weights.items(), key=lambda pair: pair[1], reverse=True)
    top_emotions = [
        {"emotion": name, "score": round(value, 4)} for name, value in ranked[:5]
    ]

    summary = {
        "primary": "unknown",
        "secondary": None,
        "confidence": 0.0,
        "top_emotions": top_emotions
    }
    if top_emotions:
        summary["primary"] = top_emotions[0]["emotion"]
        summary["confidence"] = top_emotions[0]["score"]
    if len(top_emotions) > 1:
        summary["secondary"] = top_emotions[1]["emotion"]
    return summary
# ============================================
# LLM RECOMMENDATION SYSTEM
# ============================================
class ValidationStatus(str, Enum):
    """Outcome of validating a recommendation; members compare equal to their string value."""

    VALID = "valid"      # no issues and no warnings
    WARNING = "warning"  # warnings only
    INVALID = "invalid"  # at least one issue
class ResponseValidator:
    """Checks that a recommendation carries the helplines its situation calls for.

    ``validate_recommendation`` reports what is missing;
    ``enhance_recommendation`` appends the missing crisis, mental-health and
    suicide-prevention helplines.
    """

    HELPLINES = {
        'emergency': ['112'],
        'women': ['181', '1091'],
        'mental_health': ['9152987821', '08046110007'],
        'suicide_prevention': ['9820466726']
    }
    # Substrings of the lower-cased transcript that count as suicide indicators.
    SUICIDE_KEYWORDS = ('आत्महत्या', 'suicide', 'मर जा', 'want to die')

    @staticmethod
    def _situations(emotion_result: dict) -> dict:
        """Return the situation flags of an analysis result ({} when absent)."""
        return emotion_result.get('analysis', {}).get('situations', {})

    @classmethod
    def _has_suicide_indicators(cls, emotion_result: dict) -> bool:
        """Tell whether the transcript contains one of ``SUICIDE_KEYWORDS``."""
        transcript_lower = (emotion_result.get('transcription', '') or '').lower()
        return any(kw in transcript_lower for kw in cls.SUICIDE_KEYWORDS)

    @classmethod
    def validate_recommendation(cls, recommendation: str, emotion_result: dict) -> Dict[str, Any]:
        """Validate ``recommendation`` against the detected situation.

        Issues (status ``invalid``): fewer than 10 non-blank characters, no
        Devanagari character, crisis without an emergency/women helpline,
        suicide indicators without the prevention helpline. Warnings (status
        ``warning``): mental-health distress without a mental-health helpline.

        Returns:
            Dict with ``status``, ``issues``, ``warnings`` and
            ``validated_at`` (UTC time in ISO format).
        """
        issues = []
        advisories = []  # not named "warnings": that would shadow the imported module

        if len(recommendation.strip()) < 10:
            issues.append("Recommendation too short")
        if not re.search(r'[\u0900-\u097F]', recommendation):
            issues.append("No Hindi script detected")

        analysis = cls._situations(emotion_result)
        if analysis.get('is_crisis', False):
            has_helpline = any(h in recommendation for h in cls.HELPLINES['emergency'] + cls.HELPLINES['women'])
            if not has_helpline:
                issues.append("Crisis detected but no emergency helpline")
        if analysis.get('is_mental_health_distress', False):
            has_mh_helpline = any(h in recommendation for h in cls.HELPLINES['mental_health'])
            if not has_mh_helpline:
                advisories.append("Mental health distress but no helpline")
        if cls._has_suicide_indicators(emotion_result):
            if '9820466726' not in recommendation:
                issues.append("Suicide indicators but no prevention helpline")

        if issues:
            status = ValidationStatus.INVALID
        elif advisories:
            status = ValidationStatus.WARNING
        else:
            status = ValidationStatus.VALID
        return {
            'status': status.value,
            'issues': issues,
            'warnings': advisories,
            'validated_at': datetime.utcnow().isoformat()
        }

    @classmethod
    def enhance_recommendation(cls, recommendation: str, emotion_result: dict) -> str:
        """Append the helplines the situation requires but the text lacks.

        Covers the same three cases the validator checks: crisis (112/181),
        mental-health distress (9152987821) and suicide indicators
        (9820466726). The suicide case used to be missing here, so such a
        recommendation could never become valid.

        Returns:
            ``recommendation`` followed by the added sentences, or the
            unchanged ``recommendation`` when nothing is missing.
        """
        analysis = cls._situations(emotion_result)
        enhancements = []
        if analysis.get('is_crisis', False):
            if '112' not in recommendation:
                enhancements.append("तुरंत 112 (पुलिस) या 181 (महिला हेल्पलाइन) पर संपर्क करें।")
        if analysis.get('is_mental_health_distress', False):
            if '9152987821' not in recommendation:
                enhancements.append("मानसिक स्वास्थ्य सहायता: 9152987821")
        if cls._has_suicide_indicators(emotion_result):
            if '9820466726' not in recommendation:
                enhancements.append("आत्महत्या रोकथाम हेल्पलाइन (AASRA): 9820466726")
        return f"{recommendation} {' '.join(enhancements)}" if enhancements else recommendation
def get_cache_key(emotion_result: dict) -> str:
    """Derive the recommendation-cache key for an analysis result.

    The key is the MD5 hex digest of a JSON document (sorted keys) holding
    the transcript, the dominant sentiment, the primary emotion and the
    crisis flag; missing fields default to '' / False.
    """
    situations = emotion_result.get('analysis', {}).get('situations', {})
    fingerprint = {
        'transcript': emotion_result.get('transcription', ''),
        'sentiment': emotion_result.get('sentiment', {}).get('dominant', ''),
        'primary_emotion': emotion_result.get('emotion', {}).get('primary', ''),
        'is_crisis': situations.get('is_crisis', False)
    }
    serialized = json.dumps(fingerprint, sort_keys=True)
    digest = hashlib.md5(serialized.encode())
    return digest.hexdigest()
def get_from_cache(cache_key: str) -> Optional[Dict[str, Any]]:
    """Look up a cached recommendation.

    Returns:
        The stored payload, or ``None`` when caching is disabled, the key is
        unknown, or the entry is older than ``CACHE_TTL_SECONDS`` (in which
        case the entry is also removed).
    """
    if not ENABLE_CACHING:
        return None
    if cache_key not in recommendation_cache:
        return None

    payload, stored_at = recommendation_cache[cache_key]
    age = time.time() - stored_at
    if age > CACHE_TTL_SECONDS:
        del recommendation_cache[cache_key]
        return None
    return payload
def save_to_cache(cache_key: str, data: Dict[str, Any]):
    """Store ``data`` under ``cache_key`` with the current time; no-op when caching is off.

    Entries older than ``CACHE_TTL_SECONDS`` are dropped first. Reading only
    evicts an expired entry when that exact key is requested again, so
    without this sweep the cache would grow for as long as the process runs.
    """
    if not ENABLE_CACHING:
        return
    now = time.time()
    # Iterate over a snapshot so removing entries cannot disturb the loop.
    for key, (_payload, stored_at) in list(recommendation_cache.items()):
        if now - stored_at > CACHE_TTL_SECONDS:
            recommendation_cache.pop(key, None)
    recommendation_cache[cache_key] = (data, now)
@lru_cache(maxsize=1)
def load_few_shot_examples() -> str:
    """Return the three worked examples embedded in every LLM prompt.

    The text is constant; ``lru_cache`` makes repeated calls return the same
    string object.
    """
    examples = """
Example 1:
Transcript: "मुझे बचाओ! कोई मुझे मार रहा है।"
Sentiment: "Negative"
Primary Emotion: "fear"
Is Crisis: True
Action: "तुरंत 112 पर पुलिस को कॉल करें और सुरक्षित स्थान पर जाएं। यदि संभव हो तो महिला हेल्पलाइन 181 पर भी संपर्क करें।"
Example 2:
Transcript: "मैं बहुत अकेला और उदास महसूस कर रहा हूँ।"
Sentiment: "Negative"
Primary Emotion: "sadness"
Is Mental Health Distress: True
Action: "मानसिक स्वास्थ्य सहायता के लिए NIMHANS हेल्पलाइन 08046110007 या Vandrevala Foundation 9152987821 से संपर्क करें।"
Example 3:
Transcript: "मेरी पत्नी ने मुझे छोड़ दिया है।"
Sentiment: "Negative"
Primary Emotion: "sadness"
Is Relationship Distress: True
Action: "परिवार या विश्वसनीय मित्रों से बात करें। यदि आवश्यक हो तो व्यावसायिक परामर्श सेवा लें।"
"""
    return examples
def compose_prompt(emotion_result: dict) -> str:
    """Build the LLM prompt for one analysis result.

    The prompt contains the few-shot examples, the transcript (cut to
    ``MAX_PROMPT_LENGTH`` characters), the sentiment/emotion summary, the
    four situation flags and the helpline list.

    Args:
        emotion_result: successful result of ``predict_emotion``; must hold
            ``emotion`` and ``sentiment``.

    Returns:
        The prompt text.
    """
    analysis = emotion_result.get('analysis', {}).get('situations', {})
    emotion = emotion_result["emotion"]
    transcript = emotion_result.get('transcription', '')[:MAX_PROMPT_LENGTH]
    # 'secondary' is always present but may be None; a .get() default would
    # never apply and the prompt would read: Secondary Emotion: "None".
    secondary = emotion.get('secondary') or ''
    prompt = f"""You are an AI assistant providing compassionate support recommendations for Indian women.
{load_few_shot_examples()}
Now analyze this input:
Transcript: "{transcript}"
Sentiment: "{emotion_result['sentiment']['dominant']}"
Primary Emotion: "{emotion['primary']}"
Secondary Emotion: "{secondary}"
Confidence: {emotion['confidence']:.2f}
Is Crisis: {analysis.get('is_crisis', False)}
Is Mental Health Distress: {analysis.get('is_mental_health_distress', False)}
Is Grief/Loss: {analysis.get('is_grief_loss', False)}
Is Relationship Distress: {analysis.get('is_relationship_distress', False)}
Provide a direct, actionable recommendation in Hindi with empathy. Include relevant helplines:
- Emergency/Police: 112
- Women's Helpline: 181, 1091
- Mental Health: 9152987821 (Vandrevala), 08046110007 (NIMHANS)
- Suicide Prevention: 9820466726 (AASRA)
Action Recommendation (in Hindi):"""
    return prompt
def get_llama_recommendation(emotion_result: dict, retry_count: int = 0) -> str:
    """Ask the LLM for a recommendation, retrying before falling back.

    Args:
        emotion_result: successful result of ``predict_emotion``.
        retry_count: number of attempts already made; attempts continue until
            this count reaches ``MAX_RETRIES``.

    Returns:
        The stripped LLM answer, or the static fallback when no client is
        configured or every attempt failed or returned empty text.
    """
    if not LLM_CLIENT:
        return get_fallback_recommendation(emotion_result)

    prompt = compose_prompt(emotion_result)
    attempt = retry_count
    while True:
        try:
            logger.info(f"Calling Llama 3.1 via Novita AI (attempt {attempt + 1})")
            # Use chat.completions.create with Novita AI provider
            user_message = {"role": "user", "content": prompt}
            completion = LLM_CLIENT.chat.completions.create(
                model=MODEL_NAME,
                messages=[user_message],
                max_tokens=300,
                temperature=0.7,
                top_p=0.9
            )
            recommendation = completion.choices[0].message.content.strip()
            if not recommendation:
                raise ValueError("Empty recommendation")
            logger.info("✅ LLM recommendation generated via Novita AI")
            return recommendation
        except Exception as e:
            logger.warning(f"LLM error (attempt {attempt + 1}): {e}")
            if attempt >= MAX_RETRIES:
                break
            time.sleep(2)  # short pause before the next attempt
            attempt += 1

    logger.error(f"LLM failed after {MAX_RETRIES + 1} attempts")
    return get_fallback_recommendation(emotion_result)
def get_fallback_recommendation(emotion_result: dict) -> str:
    """Return a fixed Hindi recommendation chosen from the situation flags.

    The first flag that is set wins, in the order crisis, mental-health
    distress, relationship distress; a generic message is returned otherwise.
    """
    flags = emotion_result.get('analysis', {}).get('situations', {})
    canned_by_flag = (
        ('is_crisis',
         "तुरंत 112 (पुलिस) या 181 (महिला हेल्पलाइन) पर संपर्क करें। आपकी सुरक्षा सर्वोपरि है।"),
        ('is_mental_health_distress',
         "मानसिक स्वास्थ्य सहायता के लिए 9152987821 (Vandrevala Foundation) पर संपर्क करें। आप अकेली नहीं हैं।"),
        ('is_relationship_distress',
         "परिवार या मित्रों से बात करें। यदि आवश्यक हो तो परामर्श सेवा लें।"),
    )
    for flag_name, message in canned_by_flag:
        if flags.get(flag_name, False):
            return message
    return "यदि आपको सहायता चाहिए तो किसी विश्वसनीय व्यक्ति से संपर्क करें। आपकी भावनाएं महत्वपूर्ण हैं।"
def assess_risk_level(emotion_result: dict) -> str:
    """Map an analysis result to one of four risk labels.

    * crisis flag -> CRITICAL
    * mental-health distress, confidence above 0.8 and a primary emotion in
      the high-risk list -> HIGH
    * mental-health, relationship or grief flag -> MEDIUM
    * otherwise -> LOW
    """
    flags = emotion_result.get('analysis', {}).get('situations', {})
    emotion_info = emotion_result.get('emotion', {})
    confidence = emotion_info.get('confidence', 0)
    primary = emotion_result.get('emotion', {}).get('primary', '').lower()

    if flags.get('is_crisis', False):
        return "🔴 CRITICAL"

    mental_health = flags.get('is_mental_health_distress', False)
    high_risk_emotions = ['despair', 'fear', 'panic', 'hopelessness']
    if mental_health and confidence > 0.8 and primary in high_risk_emotions:
        return "🟠 HIGH"

    moderate = (
        mental_health
        or flags.get('is_relationship_distress', False)
        or flags.get('is_grief_loss', False)
    )
    if moderate:
        return "🟡 MEDIUM"
    return "🟢 LOW"
# ============================================
# MAIN PREDICTION FUNCTION
# ============================================
def predict_emotion(audio_filepath):
    """Analyze an audio file and return transcription, emotion and sentiment.

    Steps: audio preprocessing, prosodic feature extraction, ASR ("rnnt"
    decoding, with "ctc" as a retry when the first transcript is shorter than
    two characters), Hindi-content validation, then sentiment and zero-shot
    emotion analysis with keyword-based adjustments.

    Args:
        audio_filepath: path of the audio file, or ``None``.

    Returns:
        On success a dict with ``status == "success"`` plus ``transcription``,
        ``emotion``, ``sentiment``, ``analysis`` and ``prosodic_features``.
        Otherwise a dict with ``status == "error"``, an ``error_type``
        (``no_audio``, ``no_speech``, ``language_error`` or ``system_error``)
        and a ``message``. The function does not raise.
    """
    try:
        logger.info("🎧 Processing audio file...")
        if audio_filepath is None:
            return {
                "status": "error",
                "error_type": "no_audio",
                "message": "No audio file uploaded"
            }

        # Preprocessing
        logger.info("🔧 Preprocessing audio...")
        audio_tensor, sr, audio_np = advanced_preprocess_audio(audio_filepath)
        prosodic_features = extract_prosodic_features(audio_np, sr)

        # ASR Transcription
        logger.info("🔄 Transcribing...")
        transcription = ASR_MODEL(audio_tensor, "hi", "rnnt")
        if not transcription or len(transcription.strip()) < 2:
            transcription = ASR_MODEL(audio_tensor, "hi", "ctc")
        # Either decoder may yield None/empty; treat that as "no speech"
        # instead of letting .strip() fail and surface as a system error.
        transcription = (transcription or "").strip()
        if len(transcription) < 2:
            return {
                "status": "error",
                "error_type": "no_speech",
                "message": "No speech detected in the audio"
            }

        is_valid, validation_msg, hindi_ratio = validate_hindi_text(transcription)
        if not is_valid:
            return {
                "status": "error",
                "error_type": "language_error",
                "message": validation_msg,
                "transcription": transcription
            }

        # Sentiment and Emotion Analysis
        logger.info("💭 Analyzing sentiment and emotions...")
        sentiment_result = sentiment_analysis(transcription)
        emotion_result = emotion_classification(transcription)
        sentiment_scores, confidence, is_mixed = enhanced_sentiment_analysis(
            transcription, prosodic_features, sentiment_result
        )
        emotion_data = process_emotion_results(
            emotion_result, transcription, prosodic_features
        )
        dominant_sentiment = max(sentiment_scores, key=sentiment_scores.get)
        logger.info(f"✅ Emotion: {emotion_data['primary']}, Sentiment: {dominant_sentiment}")

        return {
            "status": "success",
            "transcription": transcription,
            "emotion": emotion_data,
            "sentiment": {
                "dominant": dominant_sentiment,
                "scores": {
                    "positive": round(sentiment_scores['Positive'], 4),
                    "neutral": round(sentiment_scores['Neutral'], 4),
                    "negative": round(sentiment_scores['Negative'], 4)
                },
                "confidence": round(confidence, 4)
            },
            "analysis": {
                "mixed_emotions": is_mixed,
                "hindi_content_percentage": round(hindi_ratio * 100, 2),
                "has_negation": detect_negation(transcription),
                "situations": {
                    "is_crisis": detect_crisis_keywords(transcription),
                    "is_mental_health_distress": detect_mental_health_distress(transcription),
                    "is_grief_loss": detect_grief_loss(transcription),
                    "is_relationship_distress": detect_relationship_distress(transcription)
                }
            },
            # float() turns numpy scalars into plain Python numbers so the
            # result stays serialisable.
            "prosodic_features": {
                "pitch_mean": round(float(prosodic_features['pitch_mean']), 2),
                "pitch_std": round(float(prosodic_features['pitch_std']), 2),
                "energy_mean": round(float(prosodic_features['energy_mean']), 4),
                "speech_rate": round(float(prosodic_features['speech_rate']), 4)
            }
        }
    except Exception as e:
        # logger.exception records the traceback through the configured
        # handler rather than printing it outside the logging system.
        logger.exception("Emotion prediction failed")
        return {
            "status": "error",
            "error_type": "system_error",
            "message": str(e)
        }
def _analysis_error_outputs(emotion_result):
    """Build the five UI outputs for a failed emotion analysis."""
    error_type = emotion_result.get('error_type', 'unknown')
    error_msg = emotion_result.get('message', 'Unknown error')
    if error_type == 'no_speech':
        return (
            "ऑडियो में कोई स्पीच नहीं मिली। कृपया फिर से प्रयास करें।",
            "⚪️ N/A",
            "❌ No speech detected",
            "",
            ""
        )
    if error_type == 'language_error':
        return (
            f"भाषा त्रुटि: {error_msg}\n\nकृपया हिंदी या हिंग्लिश में बोलें।",
            "⚪️ N/A",
            "❌ Language validation failed",
            "",
            f"Transcription: {emotion_result.get('transcription', 'N/A')}"
        )
    return (
        f"त्रुटि: {error_msg}",
        "🔴 ERROR",
        f"❌ {error_type}",
        "",
        str(emotion_result)
    )


def _format_validation_report(validation_result):
    """Render a validation result (status, issues, warnings) as Markdown."""
    validation_status = validation_result['status'].upper()
    validation_emoji = {
        'VALID': '✅',
        'WARNING': '⚠️',
        'INVALID': '❌'
    }.get(validation_status, '❓')
    report = f"{validation_emoji} **{validation_status}**"
    if validation_result['issues']:
        report += "\n\n**Issues:**\n" + "\n".join(f"- {i}" for i in validation_result['issues'])
    if validation_result['warnings']:
        report += "\n\n**Warnings:**\n" + "\n".join(f"- {w}" for w in validation_result['warnings'])
    return report


def get_recommendation(audio_filepath):
    """Main function: Audio -> Emotion Analysis -> LLM Recommendation.

    Args:
        audio_filepath: path of the recorded or uploaded audio; falsy when
            nothing was provided.

    Returns:
        Tuple of five strings for the UI: recommendation text, risk level,
        validation report (Markdown), processing metadata (Markdown) and the
        full analysis (Markdown). Error cases return the same shape with
        explanatory text.
    """
    if not audio_filepath:
        return (
            "कृपया ऑडियो रिकॉर्ड या अपलोड करें।",
            "⚪️ N/A",
            "❌ No input",
            "",
            ""
        )
    start_time = time.time()

    # Step 1: Emotion Analysis
    logger.info("=" * 60)
    logger.info("STEP 1: Emotion Analysis")
    emotion_result = predict_emotion(audio_filepath)
    if emotion_result.get('status') != 'success':
        return _analysis_error_outputs(emotion_result)

    # Step 2: Generate Recommendation
    logger.info("STEP 2: LLM Recommendation Generation")
    cache_key = get_cache_key(emotion_result)
    cached_data = get_from_cache(cache_key)
    if cached_data:
        logger.info("♻️ Using cached recommendation")
        action = cached_data['action']
        validation_result = cached_data['validation']
        enhanced = cached_data.get('enhanced', False)
        cached = True
    else:
        logger.info("🆕 Generating new recommendation")
        action = get_llama_recommendation(emotion_result)
        validation_result = ResponseValidator.validate_recommendation(action, emotion_result)
        enhanced = False
        if validation_result['status'] in [ValidationStatus.INVALID.value, ValidationStatus.WARNING.value]:
            logger.warning(f"Validation issues: {validation_result['issues'] + validation_result['warnings']}")
            original_action = action
            action = ResponseValidator.enhance_recommendation(action, emotion_result)
            if action != original_action:
                enhanced = True
                logger.info("🔧 Recommendation auto-enhanced")
                # Re-validate so the report describes the text actually shown.
                validation_result = ResponseValidator.validate_recommendation(action, emotion_result)
        cache_data = {
            'action': action,
            'validation': validation_result,
            'enhanced': enhanced
        }
        save_to_cache(cache_key, cache_data)
        cached = False

    processing_time = round((time.time() - start_time) * 1000)
    risk_level = assess_risk_level(emotion_result)

    # Format outputs
    validation_info = _format_validation_report(validation_result)
    metadata = f"""
**Processing Time:** {processing_time}ms
**Cached:** {'Yes ♻️' if cached else 'No 🆕'}
**Enhanced:** {'Yes 🔧' if enhanced else 'No'}
**Model:** {MODEL_NAME}
"""
    emotion = emotion_result['emotion']
    sentiment = emotion_result['sentiment']
    situations = emotion_result['analysis']['situations']
    # 'secondary' is always present but may be None, so a .get() default
    # would never apply; "or" supplies the placeholder instead.
    secondary = emotion.get('secondary') or 'N/A'
    analysis_info = f"""
**📝 Transcription:** {emotion_result['transcription']}
**🎭 Emotion Analysis:**
- Primary: {emotion['primary']} ({emotion['confidence']:.1%})
- Secondary: {secondary}
**💭 Sentiment:** {sentiment['dominant']}
- Positive: {sentiment['scores']['positive']:.1%}
- Neutral: {sentiment['scores']['neutral']:.1%}
- Negative: {sentiment['scores']['negative']:.1%}
**🚨 Situation Detection:**
- Crisis: {'✅' if situations['is_crisis'] else '❌'}
- Mental Health: {'✅' if situations['is_mental_health_distress'] else '❌'}
- Grief/Loss: {'✅' if situations['is_grief_loss'] else '❌'}
- Relationship: {'✅' if situations['is_relationship_distress'] else '❌'}
"""
    logger.info("=" * 60)
    return action, risk_level, validation_info, metadata, analysis_info
# ============================================
# GRADIO INTERFACE
# ============================================
def create_interface():
    """Build the Gradio Blocks UI and wire the button to ``get_recommendation``.

    Layout: a header, a left column (audio input, submit button, system
    status, tips), a right column (recommendation, risk level and three
    accordions) and a footer with helplines and feature notes.

    Returns:
        The ``gr.Blocks`` application (not launched).
    """
    header_md = """
# 🇮🇳 Hindi Speech Emotion & Action Recommendation System
**Complete AI Pipeline:** Audio → Emotion Analysis → LLM-Powered Recommendations
### 🔄 System Architecture:
1. **🎙️ Speech Recognition:** Indic Conformer 600M (Hindi ASR)
2. **🎭 Emotion Detection:** Zero-Shot Classification (13 emotions)
3. **💭 Sentiment Analysis:** Hindi-specific sentiment model
4. **🤖 Recommendations:** Llama 3.1 8B Instruct (contextual support)
5. **✅ Validation:** Automatic helpline integration & quality checks
"""
    # Snapshot of the module state when the interface is built.
    status_md = f"""
**Models Loaded:**
- ASR: {'✅' if ASR_MODEL else '❌'} Indic Conformer
- Sentiment: {'✅' if SENTIMENT_PIPELINE else '❌'} Hindi RoBERTa
- Emotion: {'✅' if EMOTION_PIPELINE else '❌'} XLM-RoBERTa
- LLM: {'✅' if LLM_CLIENT else '⚠️ Fallback'} Llama 3.1
**Configuration:**
- HF Token: {'✅ Set' if HUGGINGFACE_TOKEN else '⚠️ Missing'}
- Caching: {'✅ Enabled' if ENABLE_CACHING else '❌ Disabled'}
- Max Retries: {MAX_RETRIES}
"""
    tips_md = """
### 💡 Tips:
- Speak clearly in Hindi or Hinglish
- 3-10 seconds of audio works best
- Background noise is automatically reduced
- Recommendations are context-aware
"""
    footer_md = """
---
### 📞 Emergency Helplines (India)
| **Category** | **Number** | **Available** |
|--------------|-----------|---------------|
| 🚨 **Emergency/Police** | **112** | 24/7 |
| 👩 **Women's Helpline** | **181** | 24/7 |
| 🆘 **Women in Distress** | **1091** | 24/7 |
| 🧠 **Mental Health (Vandrevala)** | **9152987821** | 24/7 |
| 🏥 **Mental Health (NIMHANS)** | **08046110007** | 24/7 |
| 💙 **Suicide Prevention (AASRA)** | **9820466726** | 24/7 |
---
### 🎯 Supported Features:
**13 Emotions Detected:**
- 😊 Positive: joy, happiness, love, excitement, calm
- 😢 Negative: sadness, anger, fear, distress, panic, frustration
- 😐 Neutral: neutral, surprise
**4 Crisis Situations:**
- 🚨 Emergency/Violence (100+ keywords)
- 🧠 Mental Health Distress (depression, anxiety)
- 💔 Grief & Loss (bereavement support)
- 💔 Relationship Distress (conflicts, breakup)
**Automatic Enhancements:**
- Crisis → Emergency helplines auto-added
- Mental health → Counseling resources
- Validation → Quality assurance
- Caching → Faster repeated queries
---
**⚡ Performance Optimizations:**
- Batch audio preprocessing (3x faster)
- PYIN pitch detection (5x faster)
- Cached resampling & features
- LLM response caching (1hr TTL)
- Automatic retry logic
**🔒 Privacy & Safety:**
- No data stored permanently
- All processing in-memory
- HIPAA-compliant recommendations
- Crisis prioritization system
"""

    with gr.Blocks(
        title="Hindi Emotion & Recommendation System",
        theme=gr.themes.Soft()
    ) as demo:
        gr.Markdown(header_md)

        with gr.Row():
            # Left column: input and static information.
            with gr.Column(scale=1):
                gr.Markdown("### 🎙️ Audio Input")
                audio_input = gr.Audio(
                    label="Record or Upload Hindi Audio",
                    sources=["microphone", "upload"],
                    type="filepath"
                )
                submit_btn = gr.Button("🚀 Analyze & Get Recommendation", variant="primary", size="lg")
                gr.Markdown("### 📊 System Status")
                gr.Markdown(status_md)
                gr.Markdown(tips_md)

            # Right column: the five outputs of get_recommendation.
            with gr.Column(scale=1):
                gr.Markdown("### 💬 AI Recommendation (Hindi)")
                recommendation_output = gr.Textbox(
                    label="Personalized Action Recommendation",
                    lines=8,
                    interactive=False,
                    placeholder="AI-generated recommendation will appear here..."
                )
                risk_output = gr.Textbox(
                    label="🎯 Risk Level Assessment",
                    interactive=False
                )
                with gr.Accordion("🔍 Validation Report", open=False):
                    validation_output = gr.Markdown()
                with gr.Accordion("⚙️ Processing Details", open=False):
                    metadata_output = gr.Markdown()
                with gr.Accordion("📊 Complete Analysis", open=True):
                    analysis_output = gr.Markdown()

        # Connect button
        result_widgets = [
            recommendation_output,
            risk_output,
            validation_output,
            metadata_output,
            analysis_output
        ]
        submit_btn.click(
            fn=get_recommendation,
            inputs=[audio_input],
            outputs=result_widgets
        )

        gr.Markdown(footer_md)

    return demo
# ============================================
# LAUNCH
# ============================================
if __name__ == "__main__":
    # Script entry point: warn about a missing token, then serve the UI.
    token_missing = not HUGGINGFACE_TOKEN
    if token_missing:
        logger.warning("⚠️ HF_TOKEN not set. Set it for Llama 3.1 access and better performance.")
        logger.info("💡 Get token from: https://huggingface.co/settings/tokens")

    logger.info("🌐 Starting Gradio interface...")
    demo = create_interface()
    # Listen on every interface, port 7860, without a public share link.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)