File size: 4,421 Bytes
cef05ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dff819c
0067ac9
 
cef05ee
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import os
import sys
import logging
import gradio as gr
import shutil
from demucs_handler import DemucsProcessor, check_dependencies, configure_model
from whisper_handler import WhisperTranscriber
import tempfile
import torch
import torchaudio
import soundfile as sf
import librosa
import numpy as np

# Configure the root logger: INFO and above, one "time - level - message" line per record
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

def validate_environment():
    """Check that the audio/ML packages can be imported and log their versions.

    Imports ``torch``, ``torchaudio`` and ``demucs`` in that order, then logs
    the PyTorch version, the Torchaudio version and whether CUDA is available.

    If an ``ImportError`` is raised, the failure is logged and the process is
    terminated through ``sys.exit(1)``.
    """
    try:
        import torch as torch_pkg
        import torchaudio as torchaudio_pkg
        import demucs  # imported only to prove it is installed; the name is unused

        report = logging.info
        report(f"PyTorch version: {torch_pkg.__version__}")
        report(f"Torchaudio version: {torchaudio_pkg.__version__}")
        report(f"CUDA available: {torch_pkg.cuda.is_available()}")
    except ImportError as missing:
        logging.error(f"Environment validation failed: {missing}")
        sys.exit(1)

def create_interface():
    """Build the Gradio interface that isolates vocals and transcribes lyrics.

    Runs ``validate_environment()`` first (which may terminate the process),
    then creates one ``DemucsProcessor`` and one ``WhisperTranscriber`` that
    the request handler below closes over.

    Returns:
        The configured ``gr.Interface``. It is not launched here.
    """
    validate_environment()
    processor = DemucsProcessor()
    transcriber = WhisperTranscriber()

    def process_audio(audio_file, whisper_model="base", progress=gr.Progress()):
        """Separate the vocals of an uploaded file and transcribe them.

        Args:
            audio_file: Path of the uploaded audio (the input component is
                declared with ``type="filepath"``), or ``None`` when nothing
                was uploaded.
            whisper_model: Value selected in the model-size dropdown.
                NOTE(review): this value is never forwarded to
                ``transcriber``, so the dropdown currently has no effect.
                Confirm how ``WhisperTranscriber`` selects its model before
                wiring it in.
            progress: Progress tracker, called with a fraction and a
                description at each processing stage.

        Returns:
            ``((sample_rate, samples), lyrics)`` on success, or
            ``(None, message)`` when no file was given or processing failed.
        """
        if audio_file is None:
            return None, "Please upload an audio file."

        try:
            progress(0, desc="Starting processing")
            logging.info(f"Processing file: {audio_file}")

            # Every intermediate file is created inside this directory, which
            # the context manager deletes on exit (normal return, early
            # return or exception), so no separate cleanup pass is needed.
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_audio_path = os.path.join(temp_dir, "input.wav")
                vocals_output_path = os.path.join(temp_dir, "vocals.wav")

                # Re-encode the upload as a 44.1 kHz WAV before separation.
                audio, sr = librosa.load(audio_file, sr=44100)
                sf.write(temp_audio_path, audio, samplerate=sr)

                progress(0.1, desc="Separating vocals")
                try:
                    vocals_path = processor.separate_vocals(temp_audio_path)
                    # Keep a copy of the separated vocals next to the input.
                    shutil.copy2(vocals_path, vocals_output_path)
                except RuntimeError as e:
                    # logging.exception records the traceback as well.
                    logging.exception(f"Vocal separation failed: {str(e)}")
                    return None, f"Vocal separation failed: {str(e)}"

                # Read the vocals into memory at their native sample rate:
                # the file itself disappears with the temporary directory,
                # so the samples are returned instead of a path.
                vocals_audio, vocals_sr = librosa.load(vocals_output_path, sr=None)

                progress(0.75, desc="Transcribing")
                lyrics = transcriber.transcribe(vocals_output_path)
                progress(1.0, desc="Processing complete")

                return (vocals_sr, vocals_audio), lyrics

        except Exception as e:
            # Top-level boundary for the UI callback: report the failure to
            # the user and keep the full traceback in the log.
            error_message = f"Processing error: {str(e)}"
            logging.exception(error_message)
            return None, error_message

    interface = gr.Interface(
        fn=process_audio,
        inputs=[
            gr.Audio(label="Upload Audio File", type="filepath"),
            gr.Dropdown(
                choices=["tiny", "base", "small", "medium", "large-v2"],
                value="medium",
                label="Whisper Model Size",
            ),
        ],
        outputs=[
            gr.Audio(label="Isolated Vocals", type="numpy"),
            gr.Textbox(label="Transcribed Lyrics", lines=10, max_lines=20),
        ],
        title="Audio Lyrics Extractor",
        description=(
            "Upload an audio file to extract vocals and transcribe lyrics\n"
            " Created by Ever Olivares - Looking for Summer 2025 Internship Opportunities\n"
            " Connect with me: [LinkedIn](https://www.linkedin.com/in/everolivares/)"
            " Currently not working as intended on HF tested on LightningAI with T4 running largeV2"
        ),
        analytics_enabled=False,
    )
    return interface

if __name__ == "__main__":
    # Refuse to start the UI when the dependency check fails.
    if not check_dependencies():
        print("Please install missing dependencies")
        # Use sys.exit rather than the bare exit(): the latter is a helper
        # added by the site module for interactive sessions and is not
        # guaranteed to be defined (e.g. with `python -S`).
        sys.exit(1)
    interface = create_interface()
    interface.launch()