import gradio as gr |
import torch |
from transformers import WhisperProcessor, WhisperForConditionalGeneration |
from transformers import AutoModelForSequenceClassification, AutoTokenizer |
import librosa |
import numpy as np |
import plotly.graph_objects as go |
import warnings |
import os |
from scipy.stats import kurtosis, skew |
from anthropic import Anthropic |
from dotenv import load_dotenv |
# Read settings from a .env file into the process environment before the
# credentials below are looked up.
load_dotenv()

# API credentials; a placeholder string is used when a variable is unset.
ANTHROPIC_API_KEY = os.environ.get('ANTHROPIC_API_KEY', 'your_anthropic_api_key')
HUGGINGFACE_TOKEN = os.environ.get('HUGGINGFACE_TOKEN', 'your_huggingface_api_token')

# Suppress all warnings for the lifetime of the process.
warnings.filterwarnings('ignore')

# Shared singletons: the four model slots are filled by load_models(); the
# analyzer slot is filled at startup or on first use inside analyze_audio().
processor = whisper_model = emotion_tokenizer = emotion_model = None
clinical_analyzer = None
def load_models():
    """Initialize and load the speech-recognition and emotion models.

    Populates the module-level ``processor``, ``whisper_model``,
    ``emotion_tokenizer`` and ``emotion_model`` globals and moves both models
    to the CPU.

    Returns:
        bool: ``True`` when every model loaded, ``False`` if any step raised
        (the error is printed instead of being propagated).
    """
    global processor, whisper_model, emotion_tokenizer, emotion_model
    whisper_checkpoint = "openai/whisper-tiny"
    emotion_checkpoint = "j-hartmann/emotion-english-distilroberta-base"
    try:
        # Authenticate only when a real token is configured. Sending the
        # placeholder default as a credential makes the Hub reject the
        # request even though both checkpoints are public.
        hub_token = HUGGINGFACE_TOKEN
        if hub_token and hub_token != 'your_huggingface_api_token':
            auth_kwargs = {"token": hub_token}
        else:
            auth_kwargs = {}

        print("Loading Whisper model...")
        processor = WhisperProcessor.from_pretrained(
            whisper_checkpoint, **auth_kwargs
        )
        whisper_model = WhisperForConditionalGeneration.from_pretrained(
            whisper_checkpoint, **auth_kwargs
        )

        print("Loading emotion model...")
        emotion_tokenizer = AutoTokenizer.from_pretrained(
            emotion_checkpoint, **auth_kwargs
        )
        emotion_model = AutoModelForSequenceClassification.from_pretrained(
            emotion_checkpoint, **auth_kwargs
        )

        device = "cpu"
        whisper_model.to(device)
        emotion_model.to(device)
        print("Models loaded successfully!")
        return True
    except Exception as e:
        # Best-effort contract: report the failure and let the caller decide.
        print(f"Error loading models: {str(e)}")
        return False
def extract_prosodic_features(waveform, sr):
    """Extract pitch, energy and tempo statistics from an audio signal.

    The three feature families are computed independently. If one of them
    fails, its error is printed and fixed fallback values are stored so the
    remaining features are still returned.

    Args:
        waveform: Audio samples, passed to librosa as ``y``.
        sr: Sampling rate of ``waveform`` in Hz.

    Returns:
        dict | None: Mapping with the float-valued keys ``pitch_mean``,
        ``pitch_std``, ``pitch_range``, ``energy_mean``, ``energy_std``,
        ``energy_range`` and ``tempo``; ``None`` when ``waveform`` is
        ``None`` or empty, or when an unexpected error occurs.
    """
    fmin, fmax = 50, 2000
    frame_length, hop_length = 2048, 512
    try:
        if waveform is None or len(waveform) == 0:
            return None
        features = {}

        # --- Pitch -----------------------------------------------------
        try:
            # piptrack takes an FFT size, not ``n_mels``; passing ``n_mels``
            # raised TypeError and forced the fallback values every time.
            pitches, magnitudes = librosa.piptrack(
                y=waveform,
                sr=sr,
                fmin=fmin,
                fmax=fmax,
                n_fft=frame_length,
                hop_length=hop_length,
                win_length=frame_length,
            )
            # For every frame take the pitch of the strongest bin, then keep
            # only the frames whose pitch lies inside the search band.
            strongest_bin = magnitudes.argmax(axis=0)
            f0_contour = pitches[strongest_bin, np.arange(pitches.shape[1])]
            voiced = f0_contour[(f0_contour >= fmin) & (f0_contour <= fmax)]
            if voiced.size:
                features['pitch_mean'] = float(np.mean(voiced))
                features['pitch_std'] = float(np.std(voiced))
                features['pitch_range'] = float(np.ptp(voiced))
            else:
                features['pitch_mean'] = 160.0
                features['pitch_std'] = 0.0
                features['pitch_range'] = 0.0
        except Exception as e:
            print(f"Pitch extraction error: {e}")
            features.update({'pitch_mean': 160.0, 'pitch_std': 0.0, 'pitch_range': 0.0})

        # --- Energy ----------------------------------------------------
        try:
            rms = librosa.feature.rms(
                y=waveform,
                frame_length=frame_length,
                hop_length=hop_length,
                center=True,
            )[0]
            features.update({
                'energy_mean': float(np.mean(rms)),
                'energy_std': float(np.std(rms)),
                'energy_range': float(np.ptp(rms)),
            })
        except Exception as e:
            print(f"Energy extraction error: {e}")
            features.update({'energy_mean': 0.02, 'energy_std': 0.0, 'energy_range': 0.0})

        # --- Tempo -----------------------------------------------------
        try:
            onset_env = librosa.onset.onset_strength(
                y=waveform,
                sr=sr,
                hop_length=hop_length,
                aggregate=np.median,
            )
            # Use the default aggregation so a single global estimate is
            # returned; ``aggregate=None`` yields one value per frame and
            # indexing [0] would only report the first frame.
            tempo_estimate = librosa.beat.tempo(
                onset_envelope=onset_env,
                sr=sr,
                hop_length=hop_length,
            )
            tempo = float(np.atleast_1d(tempo_estimate)[0])
            # Discard implausible estimates in favour of a neutral default.
            features['tempo'] = tempo if 40 <= tempo <= 240 else 120.0
        except Exception as e:
            print(f"Rhythm extraction error: {e}")
            features['tempo'] = 120.0

        return features
    except Exception as e:
        print(f"Feature extraction failed: {e}")
        return None
class ClinicalVoiceAnalyzer:
    """Analyze voice characteristics for psychological indicators.

    Uses the Anthropic Messages API when a usable key is configured and
    otherwise falls back to a rule-based summary derived from fixed
    reference ranges.
    """

    def __init__(self):
        """Set the reference ranges and try to create the Anthropic client.

        Never raises: if the key is missing (or is the placeholder default)
        or the client cannot be created, the error is printed and
        ``self.anthropic`` stays ``None`` so the rule-based fallback is used.
        """
        # Assigned before the client is attempted so the fallback analysis
        # always has its thresholds, even when no API client is available.
        self.anthropic = None
        self.model = "claude-3-opus-20240229"
        self.reference_ranges = {
            'pitch': {'min': 150, 'max': 400},
            'tempo': {'min': 90, 'max': 130},
            'energy': {'min': 0.01, 'max': 0.05}
        }
        try:
            # The placeholder default is not a credential; treat it as unset.
            if not ANTHROPIC_API_KEY or ANTHROPIC_API_KEY == 'your_anthropic_api_key':
                raise ValueError("ANTHROPIC_API_KEY not found in environment variables")
            self.anthropic = Anthropic(api_key=ANTHROPIC_API_KEY)
            print("Clinical analyzer ready")
        except Exception as e:
            print(f"Error initializing clinical analyzer: {e}")
            self.anthropic = None

    def analyze_voice_metrics(self, features, emotions, transcription):
        """Generate clinical insights from voice and emotion data.

        Args:
            features: Mapping with at least ``pitch_mean``, ``pitch_std``,
                ``tempo`` and ``energy_mean``.
            emotions: Mapping of emotion label to score.
            transcription: Recognised speech text.

        Returns:
            str: The formatted API assessment, or the rule-based backup
            analysis when no client exists, the response is empty, or the
            request fails.
        """
        try:
            if not self.anthropic:
                return self._generate_backup_analysis(features, emotions)
            prompt = self._create_clinical_prompt(features, emotions, transcription)
            print("Sending analysis request to Anthropic API...")
            response = self.anthropic.messages.create(
                model=self.model,
                max_tokens=1000,
                messages=[{
                    "role": "user",
                    "content": prompt
                }],
                temperature=0.7
            )
            content = getattr(response, 'content', None)
            if content:
                print("Received response from Anthropic API")
                return self._format_analysis(content)
            print("No valid response from API")
            return self._generate_backup_analysis(features, emotions)
        except Exception as e:
            print(f"Clinical analysis error: {e}")
            return self._generate_backup_analysis(features, emotions)

    def _create_clinical_prompt(self, features, emotions, transcription):
        """Create the detailed prompt sent to the model for clinical analysis."""
        pitch_ref = self.reference_ranges['pitch']
        tempo_ref = self.reference_ranges['tempo']
        energy_ref = self.reference_ranges['energy']
        emotion_summary = ', '.join(
            f'{emotion}: {score:.1%}' for emotion, score in emotions.items()
        )
        prompt = "\n".join([
            "As a clinical voice analysis expert, provide a detailed psychological assessment based on the following data:",
            "Voice Characteristics Analysis:",
            f"- Pitch: {features['pitch_mean']:.2f} Hz (Normal range: {pitch_ref['min']}-{pitch_ref['max']} Hz)",
            f"- Pitch Variation: {features['pitch_std']:.2f} Hz",
            f"- Speech Rate: {features['tempo']:.2f} BPM (Normal range: {tempo_ref['min']}-{tempo_ref['max']} BPM)",
            f"- Voice Energy Level: {features['energy_mean']:.4f} (Normal range: {energy_ref['min']}-{energy_ref['max']})",
            "Emotional Analysis:",
            emotion_summary,
            "Speech Content:",
            f'"{transcription}"',
            "Please provide a comprehensive assessment including:",
            "1. Detailed voice characteristic analysis and what it indicates about mental state",
            "2. Assessment of emotional state based on both voice features and detected emotions",
            "3. Potential indicators of anxiety, depression, or other mental health concerns",
            "4. Evaluation of stress levels and emotional stability",
            "5. Specific recommendations for mental health professionals or further assessment if needed",
            "Base your analysis on established clinical research connecting voice biomarkers to psychological states.",
        ])
        print(f"Generated prompt length: {len(prompt)} characters")
        return prompt

    def _format_analysis(self, analysis):
        """Format the clinical analysis output.

        Args:
            analysis: Either plain text or an iterable of response content
                blocks; for blocks, the ``text`` attribute of each one is
                concatenated (blocks without ``text`` contribute nothing).

        Returns:
            str: The assessment text under a "Clinical Assessment" heading.
        """
        if isinstance(analysis, str):
            text = analysis
        else:
            # The Messages API returns a list of content blocks, not a
            # string; interpolating the list would print its repr.
            text = "".join(getattr(block, 'text', '') for block in analysis)
        return f"\nClinical Assessment:\n{text}"

    @staticmethod
    def _classify(value, bounds, above, below):
        """Return ``above``/``below``/"normal" for ``value`` against ``bounds``."""
        if value > bounds['max']:
            return above
        if value < bounds['min']:
            return below
        return "normal"

    def _generate_backup_analysis(self, features, emotions):
        """Generate a rule-based analysis when the API is unavailable.

        Returns:
            str: The formatted summary, or a generic error message if the
            inputs are unusable (for example an empty ``emotions`` mapping
            or a missing feature key).
        """
        try:
            dominant_emotion = max(emotions.items(), key=lambda x: x[1])
            ranges = self.reference_ranges
            pitch_status = self._classify(
                features['pitch_mean'], ranges['pitch'], "elevated", "reduced"
            )
            tempo_status = self._classify(
                features['tempo'], ranges['tempo'], "rapid", "slow"
            )
            energy_status = self._classify(
                features['energy_mean'], ranges['energy'], "high", "low"
            )
            report_lines = [
                "Detailed Voice Analysis:",
                f"- Pitch Status: {pitch_status} ({features['pitch_mean']:.2f} Hz)",
                f"- Speech Rate: {features['tempo']:.2f} BPM ({tempo_status})",
                f"- Voice Energy Level: {features['energy_mean']:.4f} ({energy_status})",
                f"- Primary Emotion: {dominant_emotion[0]} ({dominant_emotion[1]:.1%} confidence)",
                "Potential Indicators:",
                f"- Pitch: {self._interpret_pitch(features['pitch_mean'], pitch_status)}",
                f"- Rate: {self._interpret_tempo(features['tempo'], tempo_status)}",
                f"- Energy: {self._interpret_energy(features['energy_mean'], energy_status)}",
            ]
            return "\n" + "\n".join(report_lines) + "\n"
        except Exception as e:
            print(f"Error in backup analysis: {e}")
            return "Error generating analysis. Please try again."

    def _interpret_pitch(self, pitch, status):
        """Map a pitch status label to its interpretation sentence."""
        if status == "elevated":
            return "May indicate heightened stress or anxiety"
        elif status == "reduced":
            return "Could suggest low energy or depressed mood"
        return "Within normal range, suggesting stable emotional state"

    def _interpret_tempo(self, tempo, status):
        """Map a tempo status label to its interpretation sentence."""
        if status == "rapid":
            return "May indicate anxiety or agitation"
        elif status == "slow":
            return "Could suggest fatigue or low mood"
        return "Normal pacing indicates balanced emotional state"

    def _interpret_energy(self, energy, status):
        """Map an energy status label to its interpretation sentence."""
        if status == "high":
            return "May indicate heightened emotional state or agitation"
        elif status == "low":
            return "Could suggest reduced emotional expression or fatigue"
        return "Appropriate energy level suggests emotional stability"
def create_feature_plots(features):
    """Build one Plotly figure covering pitch, energy and tempo features.

    Args:
        features: Mapping providing ``pitch_mean``, ``pitch_std``,
            ``pitch_range``, ``energy_mean``, ``energy_std``,
            ``energy_range`` and ``tempo``.

    Returns:
        str | None: The figure rendered to HTML, or ``None`` if anything
        fails (the error is printed).
    """
    try:
        figure = go.Figure()

        # Pitch statistics as a blue bar group.
        stat_labels = ['Mean', 'Std Dev', 'Range']
        pitch_values = [
            features['pitch_mean'],
            features['pitch_std'],
            features['pitch_range'],
        ]
        pitch_bars = go.Bar(
            name='Pitch Features (Hz)',
            x=stat_labels,
            y=pitch_values,
            marker_color='blue',
        )
        figure.add_trace(pitch_bars)

        # Energy statistics as a red bar group with prefixed labels.
        energy_values = [
            features['energy_mean'],
            features['energy_std'],
            features['energy_range'],
        ]
        energy_bars = go.Bar(
            name='Energy Features',
            x=['Energy ' + label for label in stat_labels],
            y=energy_values,
            marker_color='red',
        )
        figure.add_trace(energy_bars)

        # Tempo is a single value, drawn as one large green marker.
        tempo_marker = go.Scatter(
            name='Speech Rate (BPM)',
            x=['Tempo'],
            y=[features['tempo']],
            mode='markers',
            marker={'size': 15, 'color': 'green'},
        )
        figure.add_trace(tempo_marker)

        layout_options = {
            'title': 'Voice Feature Analysis',
            'showlegend': True,
            'height': 600,
            'barmode': 'group',
            'xaxis_title': 'Feature Type',
            'yaxis_title': 'Value',
            'template': 'plotly_white',
        }
        figure.update_layout(**layout_options)
        return figure.to_html(include_plotlyjs=True)
    except Exception as e:
        print(f"Plot creation error: {e}")
        return None
def create_emotion_plot(emotions):
    """Create a bar-chart visualization for the emotion scores.

    Args:
        emotions: Mapping of emotion label to confidence score.

    Returns:
        str | None: The figure rendered to HTML, or ``None`` if anything
        fails (the error is printed).
    """
    palette = ['#FF9999', '#66B2FF', '#99FF99',
               '#FFCC99', '#FF99CC', '#99FFFF']
    try:
        # Cycle the palette so every bar gets an explicit colour no matter
        # how many labels the classifier exposes; six labels produce exactly
        # the original colour list.
        bar_colors = [palette[i % len(palette)] for i in range(len(emotions))]
        fig = go.Figure(data=[
            go.Bar(
                x=list(emotions.keys()),
                y=list(emotions.values()),
                marker_color=bar_colors
            )
        ])
        fig.update_layout(
            title='Emotion Analysis',
            xaxis_title='Emotion',
            yaxis_title='Confidence Score',
            yaxis_range=[0, 1],
            template='plotly_white',
            height=400
        )
        return fig.to_html(include_plotlyjs=True)
    except Exception as e:
        print(f"Emotion plot error: {e}")
        return None
def analyze_audio(audio_input):
    """Run the full analysis pipeline on one recording.

    Loads at most 30 seconds of audio resampled to 16 kHz, extracts prosodic
    features, transcribes the speech with Whisper, classifies the emotion of
    the transcription and asks the clinical analyzer for an interpretation.

    Args:
        audio_input: Path to the audio file; when a tuple is given its first
            element is used. ``None`` means no audio was provided.

    Returns:
        tuple: ``(summary_text, emotion_plot_html, feature_plot_html)``. On
        any failure or rejected input the first element is a message and the
        two plot entries are ``None``.
    """
    global clinical_analyzer
    try:
        if audio_input is None:
            return "Please provide an audio input", None, None
        audio_path = audio_input[0] if isinstance(audio_input, tuple) else audio_input
        waveform, sr = librosa.load(audio_path, sr=16000, duration=30)
        duration = len(waveform) / sr
        if duration < 0.5:
            return "Audio too short (minimum 0.5 seconds needed)", None, None

        features = extract_prosodic_features(waveform, sr)
        if features is None:
            return "Feature extraction failed", None, None
        feature_viz = create_feature_plots(features)

        # Speech-to-text.
        inputs = processor(waveform, sampling_rate=sr, return_tensors="pt").input_features
        with torch.no_grad():
            predicted_ids = whisper_model.generate(inputs)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        # Text-based emotion classification.
        emotion_inputs = emotion_tokenizer(
            transcription,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        )
        with torch.no_grad():
            emotion_outputs = emotion_model(**emotion_inputs)
        probabilities = torch.nn.functional.softmax(emotion_outputs.logits, dim=-1)[0]
        # Take the labels from the model config instead of a hand-written
        # list: a list shorter than, or ordered differently from, the model's
        # classes silently attaches scores to the wrong emotions.
        id2label = emotion_model.config.id2label
        emotion_scores = {
            id2label[index]: float(score)
            for index, score in enumerate(probabilities.cpu().numpy())
        }
        emotion_viz = create_emotion_plot(emotion_scores)
        dominant_label, dominant_score = max(
            emotion_scores.items(), key=lambda x: x[1]
        )

        # Create the analyzer lazily in case startup did not provide one.
        if clinical_analyzer is None:
            clinical_analyzer = ClinicalVoiceAnalyzer()
        print("Initiating clinical analysis...")
        clinical_analysis = clinical_analyzer.analyze_voice_metrics(
            features, emotion_scores, transcription
        )
        print("Clinical analysis completed")

        summary_lines = [
            "Voice Analysis Summary:",
            "Speech Content:",
            f"{transcription}",
            "Voice Characteristics:",
            f"- Average Pitch: {features['pitch_mean']:.2f} Hz",
            f"- Pitch Variation: {features['pitch_std']:.2f} Hz",
            f"- Speech Rate (Tempo): {features['tempo']:.2f} BPM",
            f"- Voice Energy: {features['energy_mean']:.4f}",
            f"Dominant Emotion: {dominant_label}",
            f"Emotion Confidence: {dominant_score:.2%}",
            f"Recording Duration: {duration:.2f} seconds",
            f"{clinical_analysis}",
        ]
        summary = "\n".join(summary_lines) + "\n"
        return summary, emotion_viz, feature_viz
    except Exception as e:
        error_msg = f"Analysis failed: {str(e)}"
        print(error_msg)
        return error_msg, None, None
# Application bootstrap: load the models, create the analyzer and build the
# Gradio interface at import time; the server is only launched when the file
# is executed as a script. Any startup failure is printed and re-raised.
try:
    print("===== Application Startup =====")
    models_ready = load_models()
    if not models_ready:
        raise RuntimeError("Model loading failed")
    clinical_analyzer = ClinicalVoiceAnalyzer()
    print("Clinical analyzer initialized")

    # Text shown above the interface.
    description = "\n".join([
        "This application provides comprehensive voice analysis with clinical insights:",
        "1. Voice Features:",
        "- Pitch analysis (fundamental frequency and variation)",
        "- Energy patterns (volume and intensity)",
        "- Speech rate (words per minute)",
        "- Voice quality metrics",
        "2. Clinical Analysis:",
        "- Mental health indicators",
        "- Emotional state evaluation",
        "- Risk assessment",
        "- Clinical recommendations",
        "3. Emotional Content:",
        "- Emotion detection (6 basic emotions)",
        "- Emotional intensity analysis",
        "For optimal results:",
        "- Record in a quiet environment",
        "- Speak clearly and naturally",
        "- Keep recordings between 1-5 seconds",
        "- Maintain consistent volume",
        "Upload an audio file or record directly through your microphone.",
    ])

    # Text shown below the interface, including the medical disclaimer.
    article_text = "\n".join([
        "This system uses advanced AI models to analyze voice patterns and provide mental health insights.",
        "The analysis combines speech recognition, emotion detection, and clinical interpretation to offer",
        "a comprehensive understanding of psychological indicators present in voice characteristics.",
        "Note: This tool is for informational purposes only and should not be used as a substitute for",
        "professional medical advice, diagnosis, or treatment.",
    ])

    # The audio component hands analyze_audio a file path.
    audio_component = gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",
        label="Audio Input (Recommended: 1-5 seconds of clear speech)",
    )
    # One output per element of the tuple returned by analyze_audio.
    result_components = [
        gr.Textbox(label="Analysis Summary", lines=15),
        gr.HTML(label="Emotion Analysis"),
        gr.HTML(label="Voice Feature Analysis"),
    ]
    demo = gr.Interface(
        fn=analyze_audio,
        inputs=audio_component,
        outputs=result_components,
        title="Voice Analysis System with Clinical Interpretation",
        description=description,
        article=article_text,
        examples=None,
        cache_examples=False,
        theme="default",
    )

    if __name__ == "__main__":
        launch_options = {
            'server_name': "",
            'server_port': 7860,
            'share': False,
            'debug': False,
        }
        demo.launch(**launch_options)
except Exception as e:
    print(f"Error during application startup: {e}")
    raise