import gradio as gr
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import librosa
import numpy as np
import plotly.graph_objects as go
import warnings
import os
from scipy.stats import kurtosis, skew
from anthropic import Anthropic
from dotenv import load_dotenv

load_dotenv()

# Read credentials from the environment. Defaulting to None (rather than a
# placeholder string) lets the missing-key checks below detect an unset key.
ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY')
HUGGINGFACE_TOKEN = os.getenv('HUGGINGFACE_TOKEN')

warnings.filterwarnings('ignore')

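# Model handles live in module-level globals so the Gradio callback can reuse
# them across requests; load_models() populates them once at startup.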
processor = None
whisper_model = None
emotion_tokenizer = None
emotion_model = None
clinical_analyzer = None

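# Load Whisper (speech-to-text) and the DistilRoBERTa emotion classifier from
# the Hugging Face Hub; both models are small enough to run on CPU.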
def load_models():
    """Initialize and load required ML models."""
    global processor, whisper_model, emotion_tokenizer, emotion_model

    try:
        print("Loading Whisper model...")
        processor = WhisperProcessor.from_pretrained(
            "openai/whisper-tiny",
            use_auth_token=HUGGINGFACE_TOKEN
        )
        whisper_model = WhisperForConditionalGeneration.from_pretrained(
            "openai/whisper-tiny",
            use_auth_token=HUGGINGFACE_TOKEN
        )

        print("Loading emotion model...")
        emotion_tokenizer = AutoTokenizer.from_pretrained(
            "j-hartmann/emotion-english-distilroberta-base",
            use_auth_token=HUGGINGFACE_TOKEN
        )
        emotion_model = AutoModelForSequenceClassification.from_pretrained(
            "j-hartmann/emotion-english-distilroberta-base",
            use_auth_token=HUGGINGFACE_TOKEN
        )

        device = "cpu"
        whisper_model.to(device)
        emotion_model.to(device)

        print("Models loaded successfully!")
        return True
    except Exception as e:
        print(f"Error loading models: {str(e)}")
        return False

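# Prosodic feature extraction: pitch statistics from librosa's piptrack, RMS
# energy, and a global tempo estimate from the onset-strength envelope. Each
# stage has a neutral fallback value so one failed stage does not abort the
# whole analysis.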
def extract_prosodic_features(waveform, sr):
    """Extract voice features from audio data."""
    try:
        if waveform is None or len(waveform) == 0:
            return None

        features = {}

        # Pitch (fundamental frequency) statistics
        try:
            pitches, magnitudes = librosa.piptrack(
                y=waveform,
                sr=sr,
                fmin=50,
                fmax=2000,
                hop_length=512,
                win_length=2048
            )

            # Keep, per frame, the pitch of the strongest bin, discarding
            # estimates outside the plausible voice range.
            f0_contour = [
                pitches[magnitudes[:, t].argmax(), t]
                for t in range(pitches.shape[1])
                if 50 <= pitches[magnitudes[:, t].argmax(), t] <= 2000
            ]

            if f0_contour:
                features['pitch_mean'] = float(np.mean(f0_contour))
                features['pitch_std'] = float(np.std(f0_contour))
                features['pitch_range'] = float(np.ptp(f0_contour))
            else:
                features['pitch_mean'] = 160.0
                features['pitch_std'] = 0.0
                features['pitch_range'] = 0.0

        except Exception as e:
            print(f"Pitch extraction error: {e}")
            features.update({'pitch_mean': 160.0, 'pitch_std': 0.0, 'pitch_range': 0.0})

        # Energy (RMS) statistics
        try:
            rms = librosa.feature.rms(
                y=waveform,
                frame_length=2048,
                hop_length=512,
                center=True
            )[0]

            features.update({
                'energy_mean': float(np.mean(rms)),
                'energy_std': float(np.std(rms)),
                'energy_range': float(np.ptp(rms))
            })
        except Exception as e:
            print(f"Energy extraction error: {e}")
            features.update({'energy_mean': 0.02, 'energy_std': 0.0, 'energy_range': 0.0})

        # Speech rate (tempo) from the onset-strength envelope
        try:
            onset_env = librosa.onset.onset_strength(
                y=waveform,
                sr=sr,
                hop_length=512,
                aggregate=np.median
            )

            tempo = librosa.beat.tempo(
                onset_envelope=onset_env,
                sr=sr,
                hop_length=512,
                aggregate=None
            )[0]

            # Clamp implausible estimates to a neutral 120 BPM.
            features['tempo'] = float(tempo) if 40 <= tempo <= 240 else 120.0

        except Exception as e:
            print(f"Rhythm extraction error: {e}")
            features['tempo'] = 120.0

        return features
    except Exception as e:
        print(f"Feature extraction failed: {e}")
        return None

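# ClinicalVoiceAnalyzer wraps the Anthropic Messages API: it turns the prosodic
# features, emotion scores, and transcription into a prompt for Claude and
# falls back to a rule-based summary when the API is unavailable.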
class ClinicalVoiceAnalyzer:
    """Analyze voice characteristics for psychological indicators."""

    def __init__(self):
        """Initialize analyzer with API client and reference ranges."""
        # Reference ranges are needed by the backup analysis even when the API
        # client cannot be created, so set them before touching the API key.
        self.reference_ranges = {
            'pitch': {'min': 150, 'max': 400},
            'tempo': {'min': 90, 'max': 130},
            'energy': {'min': 0.01, 'max': 0.05}
        }
        self.model = "claude-3-opus-20240229"

        try:
            if not ANTHROPIC_API_KEY:
                raise ValueError("ANTHROPIC_API_KEY not found in environment variables")

            self.anthropic = Anthropic(api_key=ANTHROPIC_API_KEY)
            print("Clinical analyzer ready")
        except Exception as e:
            print(f"Error initializing clinical analyzer: {e}")
            self.anthropic = None

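    # Primary path: ask Claude for a clinical interpretation of the metrics.
    # Any API failure degrades gracefully to _generate_backup_analysis().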
    def analyze_voice_metrics(self, features, emotions, transcription):
        """Generate clinical insights from voice and emotion data."""
        try:
            if not self.anthropic:
                return self._generate_backup_analysis(features, emotions)

            prompt = self._create_clinical_prompt(features, emotions, transcription)
            print("Sending analysis request to Anthropic API...")

            response = self.anthropic.messages.create(
                model=self.model,
                max_tokens=1000,
                messages=[{
                    "role": "user",
                    "content": prompt
                }],
                temperature=0.7
            )

            if response and response.content:
                print("Received response from Anthropic API")
                # The Messages API returns a list of content blocks; use the
                # text of the first block rather than the raw list object.
                return self._format_analysis(response.content[0].text)
            else:
                print("No valid response from API")
                return self._generate_backup_analysis(features, emotions)

        except Exception as e:
            print(f"Clinical analysis error: {e}")
            return self._generate_backup_analysis(features, emotions)

    def _create_clinical_prompt(self, features, emotions, transcription):
        """Create detailed prompt for clinical analysis."""
        prompt = f"""As a clinical voice analysis expert, provide a detailed psychological assessment based on the following data:

Voice Characteristics Analysis:
- Pitch: {features['pitch_mean']:.2f} Hz (Normal range: {self.reference_ranges['pitch']['min']}-{self.reference_ranges['pitch']['max']} Hz)
- Pitch Variation: {features['pitch_std']:.2f} Hz
- Speech Rate: {features['tempo']:.2f} BPM (Normal range: {self.reference_ranges['tempo']['min']}-{self.reference_ranges['tempo']['max']} BPM)
- Voice Energy Level: {features['energy_mean']:.4f} (Normal range: {self.reference_ranges['energy']['min']}-{self.reference_ranges['energy']['max']})

Emotional Analysis:
{', '.join(f'{emotion}: {score:.1%}' for emotion, score in emotions.items())}

Speech Content:
"{transcription}"

Please provide a comprehensive assessment including:
1. Detailed voice characteristic analysis and what it indicates about mental state
2. Assessment of emotional state based on both voice features and detected emotions
3. Potential indicators of anxiety, depression, or other mental health concerns
4. Evaluation of stress levels and emotional stability
5. Specific recommendations for mental health professionals or further assessment if needed

Base your analysis on established clinical research connecting voice biomarkers to psychological states."""

        print(f"Generated prompt length: {len(prompt)} characters")
        return prompt

    def _format_analysis(self, analysis):
        """Format the clinical analysis output."""
        return f"\nClinical Assessment:\n{analysis}"

    def _generate_backup_analysis(self, features, emotions):
        """Generate basic analysis when API is unavailable."""
        try:
            dominant_emotion = max(emotions.items(), key=lambda x: x[1])
            pitch_status = (
                "elevated" if features['pitch_mean'] > self.reference_ranges['pitch']['max']
                else "reduced" if features['pitch_mean'] < self.reference_ranges['pitch']['min']
                else "normal"
            )

            tempo_status = (
                "rapid" if features['tempo'] > self.reference_ranges['tempo']['max']
                else "slow" if features['tempo'] < self.reference_ranges['tempo']['min']
                else "normal"
            )

            energy_status = (
                "high" if features['energy_mean'] > self.reference_ranges['energy']['max']
                else "low" if features['energy_mean'] < self.reference_ranges['energy']['min']
                else "normal"
            )

            return f"""
Detailed Voice Analysis:
- Pitch Status: {pitch_status} ({features['pitch_mean']:.2f} Hz)
- Speech Rate: {features['tempo']:.2f} BPM ({tempo_status})
- Voice Energy Level: {features['energy_mean']:.4f} ({energy_status})
- Primary Emotion: {dominant_emotion[0]} ({dominant_emotion[1]:.1%} confidence)

Potential Indicators:
- Pitch: {self._interpret_pitch(features['pitch_mean'], pitch_status)}
- Rate: {self._interpret_tempo(features['tempo'], tempo_status)}
- Energy: {self._interpret_energy(features['energy_mean'], energy_status)}
"""
        except Exception as e:
            print(f"Error in backup analysis: {e}")
            return "Error generating analysis. Please try again."

    def _interpret_pitch(self, pitch, status):
        if status == "elevated":
            return "May indicate heightened stress or anxiety"
        elif status == "reduced":
            return "Could suggest low energy or depressed mood"
        return "Within normal range, suggesting stable emotional state"

    def _interpret_tempo(self, tempo, status):
        if status == "rapid":
            return "May indicate anxiety or agitation"
        elif status == "slow":
            return "Could suggest fatigue or low mood"
        return "Normal pacing indicates balanced emotional state"

    def _interpret_energy(self, energy, status):
        if status == "high":
            return "May indicate heightened emotional state or agitation"
        elif status == "low":
            return "Could suggest reduced emotional expression or fatigue"
        return "Appropriate energy level suggests emotional stability"

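# Plot helpers return standalone HTML (Plotly JS inlined via include_plotlyjs)
# so the figures can be rendered directly in the gr.HTML output components.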
def create_feature_plots(features):
    """Create visualizations for voice features."""
    try:
        fig = go.Figure()

        # Pitch statistics (Hz)
        pitch_data = {
            'Mean': features['pitch_mean'],
            'Std Dev': features['pitch_std'],
            'Range': features['pitch_range']
        }
        fig.add_trace(go.Bar(
            name='Pitch Features (Hz)',
            x=list(pitch_data.keys()),
            y=list(pitch_data.values()),
            marker_color='blue'
        ))

        # Energy (RMS) statistics
        energy_data = {
            'Mean': features['energy_mean'],
            'Std Dev': features['energy_std'],
            'Range': features['energy_range']
        }
        fig.add_trace(go.Bar(
            name='Energy Features',
            x=[f"Energy {k}" for k in energy_data.keys()],
            y=list(energy_data.values()),
            marker_color='red'
        ))

        # Tempo shown as a single marker
        fig.add_trace(go.Scatter(
            name='Speech Rate (BPM)',
            x=['Tempo'],
            y=[features['tempo']],
            mode='markers',
            marker=dict(size=15, color='green')
        ))

        fig.update_layout(
            title='Voice Feature Analysis',
            showlegend=True,
            height=600,
            barmode='group',
            xaxis_title='Feature Type',
            yaxis_title='Value',
            template='plotly_white'
        )

        return fig.to_html(include_plotlyjs=True)
    except Exception as e:
        print(f"Plot creation error: {e}")
        return None

def create_emotion_plot(emotions):
    """Create visualization for emotion analysis."""
    try:
        fig = go.Figure(data=[
            go.Bar(
                x=list(emotions.keys()),
                y=list(emotions.values()),
                # One color per emotion class (the classifier outputs seven).
                marker_color=['#FF9999', '#66B2FF', '#99FF99', '#FFCC99',
                              '#FF99CC', '#99FFFF', '#CCCCFF']
            )
        ])

        fig.update_layout(
            title='Emotion Analysis',
            xaxis_title='Emotion',
            yaxis_title='Confidence Score',
            yaxis_range=[0, 1],
            template='plotly_white',
            height=400
        )

        return fig.to_html(include_plotlyjs=True)
    except Exception as e:
        print(f"Emotion plot error: {e}")
        return None

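# End-to-end pipeline for one recording: load audio, extract prosodic features,
# transcribe with Whisper, classify emotions from the transcription, build the
# plots, and request the clinical interpretation.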
def analyze_audio(audio_input):
    """Main function for audio analysis."""
    try:
        if audio_input is None:
            return "Please provide an audio input", None, None

        audio_path = audio_input[0] if isinstance(audio_input, tuple) else audio_input
        # Resample to 16 kHz (Whisper's expected rate) and cap at 30 seconds.
        waveform, sr = librosa.load(audio_path, sr=16000, duration=30)

        duration = len(waveform) / sr
        if duration < 0.5:
            return "Audio too short (minimum 0.5 seconds needed)", None, None

        features = extract_prosodic_features(waveform, sr)
        if features is None:
            return "Feature extraction failed", None, None

        feature_viz = create_feature_plots(features)

        # Speech-to-text with Whisper
        inputs = processor(waveform, sampling_rate=sr, return_tensors="pt").input_features
        with torch.no_grad():
            predicted_ids = whisper_model.generate(inputs)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        # Text-based emotion classification of the transcription
        emotion_inputs = emotion_tokenizer(
            transcription,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        )

        with torch.no_grad():
            emotion_outputs = emotion_model(**emotion_inputs)
            emotions = torch.nn.functional.softmax(emotion_outputs.logits, dim=-1)

        # Take the label order from the model config; this checkpoint has seven
        # classes (anger, disgust, fear, joy, neutral, sadness, surprise).
        emotion_labels = [
            emotion_model.config.id2label[i]
            for i in range(emotion_model.config.num_labels)
        ]
        emotion_scores = {
            label: float(score)
            for label, score in zip(emotion_labels, emotions[0].cpu().numpy())
        }

        emotion_viz = create_emotion_plot(emotion_scores)

        global clinical_analyzer
        if clinical_analyzer is None:
            clinical_analyzer = ClinicalVoiceAnalyzer()

        print("Initiating clinical analysis...")
        clinical_analysis = clinical_analyzer.analyze_voice_metrics(
            features, emotion_scores, transcription
        )
        print("Clinical analysis completed")

        summary = f"""Voice Analysis Summary:

Speech Content:
{transcription}

Voice Characteristics:
- Average Pitch: {features['pitch_mean']:.2f} Hz
- Pitch Variation: {features['pitch_std']:.2f} Hz
- Speech Rate (Tempo): {features['tempo']:.2f} BPM
- Voice Energy: {features['energy_mean']:.4f}

Dominant Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
Emotion Confidence: {max(emotion_scores.values()):.2%}

Recording Duration: {duration:.2f} seconds

{clinical_analysis}
"""
        return summary, emotion_viz, feature_viz

    except Exception as e:
        error_msg = f"Analysis failed: {str(e)}"
        print(error_msg)
        return error_msg, None, None

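# Startup runs at import time: models and the clinical analyzer are initialized
# and the Gradio interface is built whether or not the script is executed
# directly (useful when a hosting runtime, e.g. a Hugging Face Space, imports
# the module and serves `demo`); launch() itself only runs under __main__.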
try:
    print("===== Application Startup =====")

    if not load_models():
        raise RuntimeError("Model loading failed")

    clinical_analyzer = ClinicalVoiceAnalyzer()
    print("Clinical analyzer initialized")

    description = """This application provides comprehensive voice analysis with clinical insights:

1. Voice Features:
   - Pitch analysis (fundamental frequency and variation)
   - Energy patterns (volume and intensity)
   - Speech rate (estimated tempo)
   - Voice quality metrics

2. Clinical Analysis:
   - Mental health indicators
   - Emotional state evaluation
   - Risk assessment
   - Clinical recommendations

3. Emotional Content:
   - Emotion detection (six basic emotions plus neutral)
   - Emotional intensity analysis

For optimal results:
- Record in a quiet environment
- Speak clearly and naturally
- Keep recordings between 1-5 seconds
- Maintain consistent volume

Upload an audio file or record directly through your microphone."""

    demo = gr.Interface(
        fn=analyze_audio,
        inputs=gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="Audio Input (Recommended: 1-5 seconds of clear speech)"
        ),
        outputs=[
            gr.Textbox(label="Analysis Summary", lines=15),
            gr.HTML(label="Emotion Analysis"),
            gr.HTML(label="Voice Feature Analysis")
        ],
        title="Voice Analysis System with Clinical Interpretation",
        description=description,
        article="""This system uses advanced AI models to analyze voice patterns and provide mental health insights.
The analysis combines speech recognition, emotion detection, and clinical interpretation to offer
a comprehensive understanding of psychological indicators present in voice characteristics.

Note: This tool is for informational purposes only and should not be used as a substitute for
professional medical advice, diagnosis, or treatment.""",
        examples=None,
        cache_examples=False,
        theme="default"
    )

    if __name__ == "__main__":
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,
            debug=False
        )

except Exception as e:
    print(f"Error during application startup: {str(e)}")
    raise