Spaces:

harveysamson
/

wav2vec2-speech-emotion-recognition

Runtime error

harveysamson committed on Mar 28, 2022

Commit

cd87e9f

1 Parent(s): 5d47fc0

added comments

Files changed (3) hide show

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import torch
 import torch.nn.functional as F
 from transformers import AutoConfig, Wav2Vec2FeatureExtractor
-from src.models import Wav2Vec2ForSpeechClassification
 import gradio as gr
 import librosa
@@ -12,6 +12,7 @@ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
 sampling_rate = feature_extractor.sampling_rate
 model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path)
 def load_data(path):
     speech, sampling_rate = librosa.load(path)
     if len(speech.shape) > 1:
@@ -20,6 +21,7 @@ def load_data(path):
         speech = librosa.resample(speech, sampling_rate,16000)
     return speech
 def inference(path):
     speech = load_data(path)
     inputs = feature_extractor(speech, return_tensors="pt").input_values
@@ -32,7 +34,7 @@ def inference(path):
 inputs = gr.inputs.Audio(label="Input Audio", type="filepath", source="upload")
 outputs = gr.outputs.Label(type="confidences", label = "Output Scores")
 title = "Wav2Vec2 Speech Emotion Recognition"
-description = "This is a demo of the Wav2Vec2 Speech Emotion Recognition model. Upload a .wav file (preferably small) and the top emotions predicted will be displayed."
 examples = ['data/heart.wav', 'data/happy26.wav', 'data/jm24.wav', 'data/newton.wav', 'data/speeding.wav']
 article = "<a href = 'https://github.com/m3hrdadfi/soxan'> Wav2Vec2 Speech Classification Github Repository"

 import torch
 import torch.nn.functional as F
 from transformers import AutoConfig, Wav2Vec2FeatureExtractor
+from src.models import Wav2Vec2ForSpeechClassification  #imported from https://github.com/m3hrdadfi/soxan
 import gradio as gr
 import librosa
 sampling_rate = feature_extractor.sampling_rate
 model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path)
+#load input file and resample to 16kHz
 def load_data(path):
     speech, sampling_rate = librosa.load(path)
     if len(speech.shape) > 1:
         speech = librosa.resample(speech, sampling_rate,16000)
     return speech
+#modified version of predict function from https://github.com/m3hrdadfi/soxan
 def inference(path):
     speech = load_data(path)
     inputs = feature_extractor(speech, return_tensors="pt").input_values
 inputs = gr.inputs.Audio(label="Input Audio", type="filepath", source="upload")
 outputs = gr.outputs.Label(type="confidences", label = "Output Scores")
 title = "Wav2Vec2 Speech Emotion Recognition"
+description = "This is a demo of the Wav2Vec2 Speech Emotion Recognition model. Upload an audio file and the top emotions predicted will be displayed."
 examples = ['data/heart.wav', 'data/happy26.wav', 'data/jm24.wav', 'data/newton.wav', 'data/speeding.wav']
 article = "<a href = 'https://github.com/m3hrdadfi/soxan'> Wav2Vec2 Speech Classification Github Repository"

src/modeling_outputs.py CHANGED Viewed

@@ -1,3 +1,5 @@
 from dataclasses import dataclass
 from typing import Optional, Tuple
 import torch

+#imported from https://github.com/m3hrdadfi/soxan to implement Wav2Vec2 for speech classification
 from dataclasses import dataclass
 from typing import Optional, Tuple
 import torch

src/models.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import torch
 import torch.nn as nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

+#imported from https://github.com/m3hrdadfi/soxan to implement Wav2Vec2 for speech classification
 import torch
 import torch.nn as nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss