Hev832
/

hex-rvc

Voice Conversion

Model card Files Files and versions Community

hex-rvc / lib /infer_libs /audio.py

Hev832's picture

Upload folder using huggingface_hub

736c789 verified 7 months ago

2.91 kB

	import numpy as np
	import av
	import ffmpeg
	import os
	import traceback
	import sys
	import subprocess

	platform_stft_mapping = {
	'linux': os.path.join(os.getcwd(), 'stftpitchshift'),
	'darwin': os.path.join(os.getcwd(), 'stftpitchshift'),
	'win32': os.path.join(os.getcwd(), 'stftpitchshift.exe'),
	}

	stft = platform_stft_mapping.get(sys.platform)

	def wav2(i, o, format):
	inp = av.open(i, 'rb')
	if format == "m4a": format = "mp4"
	out = av.open(o, 'wb', format=format)
	if format == "ogg": format = "libvorbis"
	if format == "mp4": format = "aac"

	ostream = out.add_stream(format)

	for frame in inp.decode(audio=0):
	for p in ostream.encode(frame): out.mux(p)

	for p in ostream.encode(None): out.mux(p)

	out.close()
	inp.close()

	def load_audio(file, sr, DoFormant=False, Quefrency=1.0, Timbre=1.0):
	formanted = False
	file = file.strip(' \n"')
	if not os.path.exists(file):
	raise RuntimeError(
	"Wrong audio path, that does not exist."
	)

	try:
	if DoFormant:
	print("Starting formant shift. Please wait as this process takes a while.")
	formanted_file = f"{os.path.splitext(os.path.basename(file))[0]}_formanted{os.path.splitext(os.path.basename(file))[1]}"
	command = (
	f'{stft} -i "{file}" -q "{Quefrency}" '
	f'-t "{Timbre}" -o "{formanted_file}"'
	)
	subprocess.run(command, shell=True)
	file = formanted_file
	print(f"Formanted {file}\n")

	# https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
	# This launches a subprocess to decode audio while down-mixing and resampling as necessary.
	# Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
	file = (
	file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
	) # Prevent small white copy path head and tail with spaces and " and return
	out, _ = (
	ffmpeg.input(file, threads=0)
	.output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
	.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
	)

	return np.frombuffer(out, np.float32).flatten()

	except Exception as e:
	raise RuntimeError(f"Failed to load audio: {e}")

	def check_audio_duration(file):
	try:
	file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")

	probe = ffmpeg.probe(file)

	duration = float(probe['streams'][0]['duration'])

	if duration < 0.76:
	print(
	f"Audio file, {file.split('/')[-1]}, under ~0.76s detected - file is too short. Target at least 1-2s for best results."
	)
	return False

	return True
	except Exception as e:
	raise RuntimeError(f"Failed to check audio duration: {e}")