Spaces:

naonauno
/

dialogs-factory

Paused

App Files Files Community

dialogs-factory / app.py

naonauno

Update app.py

49d166b verified 23 days ago

raw

history blame contribute delete

16.3 kB

	import os
	import gradio as gr
	import threading
	import discord
	from discord import app_commands
	from typing import List
	from elevenlabs import set_api_key, voices, generate, Voice, VoiceSettings, User
	import tempfile
	import io
	import speech_recognition as sr
	from pydub import AudioSegment
	import logging
	import google.generativeai as genai
	import asyncio

	# Set up logging
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Configure Gemini AI.
	# `model` is bound unconditionally so that later references can never raise
	# NameError; it stays None when no API key is configured, in which case accent
	# modification is disabled.
	GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
	model = None
	if GEMINI_API_KEY:
	    genai.configure(api_key=GEMINI_API_KEY)
	    model = genai.GenerativeModel('gemini-pro')
	else:
	    logger.warning("GEMINI_API_KEY not found! Accent modification will be disabled.")

	# Set your ElevenLabs API key
	# NOTE(review): os.getenv returns None when ELEVENLABS_API_KEY is unset and that
	# value is passed straight to set_api_key - confirm how the SDK handles None.
	ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
	set_api_key(ELEVENLABS_API_KEY)

	# Role configurations
	# Discord role names that has_permission accepts (administrators are accepted
	# regardless of their roles).
	ALLOWED_ROLES = ["+mechanic", "+trusted"]
	# Maximum prompt length in characters, keyed by role name. The "Administrator"
	# entry is looked up for members with the administrator permission and is
	# unbounded.
	CHAR_LIMITS = {
	"+mechanic": 500,
	"+trusted": 3000,
	"Administrator": float('inf')
	}

	# Accent configurations
	# Accent names offered as choices on the /create command.
	ACCENT_OPTIONS = ["American", "Arabian", "Russian"]
	# Instruction prompts sent to Gemini, keyed by accent name. modify_accent
	# appends the user's text to the selected prompt.
	# NOTE: modify_accent returns early for "American", so that entry is currently
	# never sent to the model.
	ACCENT_PROMPTS = {
	"Russian": """Modify this text to sound like someone speaking with a strong Russian accent. Rules:
	1. Use simple Russian words like 'da', 'nyet', 'tovarisch' occasionally
	2. Drop articles ('the', 'a') sometimes and insist on the 'r' sometimes
	3. DO NOT add any annotations, asterisks, or explanations
	4. DO NOT add formatting or parentheses
	5. Return ONLY the modified text
	Example input: "We will work tomorrow my friend"
	Example output: Ve vill vork tomorrow, tovarisch""",
	"Arabian": """Modify this text to sound like someone speaking with an Arabian accent. Rules:
	1. Use simple Arabic words like 'habibi', 'yalla', 'wallah' occasionally
	2. Modify 'th' sounds to 'z' sometimes
	3. DO NOT add any annotations, asterisks, or explanations
	4. DO NOT add formatting or parentheses
	5. Return ONLY the modified text
	Example input: "Hello my friend, how are you today?"
	Example output: Habibi, how are you zis beautiful day?""",
	"American": """Modify this text to sound like someone speaking with a strong American accent. Rules:
	1. Use American colloquialisms
	2. Add casual American phrases
	3. DO NOT add any annotations, asterisks, or explanations
	4. DO NOT add formatting or parentheses
	5. Return ONLY the modified text
	Example input: "Hello there, how are you?"
	Example output: Howdy partner, how ya doin'?"""
	}

	def get_available_voices():
	    """Return a name -> voice_id mapping of every voice whose category is not "premade"."""
	    custom_voices = {}
	    for entry in voices():
	        # Premade (stock) voices are skipped; everything else is kept.
	        if entry.category == "premade":
	            continue
	        custom_voices[entry.name] = entry.voice_id
	    return custom_voices

	def get_remaining_credits():
	    """Fetch the subscription's character count and character limit from ElevenLabs.

	    Returns a dict with the keys "character_count" and "character_limit".
	    """
	    plan = User.from_api().subscription
	    count = plan.character_count
	    limit = plan.character_limit
	    return {"character_count": count, "character_limit": limit}

	def format_credits_message(credits_info):
	    """Render the credit figures as a one-line "count / limit" status string."""
	    count = credits_info['character_count']
	    limit = credits_info['character_limit']
	    return f"Credits Status: {count} / {limit}"

	def has_permission(member: discord.Member) -> tuple[bool, int]:
	    """Check if member has permission and return their character limit.

	    Members with the administrator permission are always allowed and receive
	    the "Administrator" limit. Any other member needs at least one role listed
	    in ALLOWED_ROLES. When a member holds several allowed roles the most
	    generous limit applies; previously the first match in ALLOWED_ROLES order
	    won, so holding both "+mechanic" and "+trusted" yielded the smaller limit.

	    Returns:
	        (allowed, char_limit). (False, 0) when the member has no allowed role.
	        The administrator limit is float('inf') rather than an int.
	    """
	    if member.guild_permissions.administrator:
	        return True, CHAR_LIMITS["Administrator"]

	    held_roles = {role.name for role in member.roles}
	    limits = [
	        CHAR_LIMITS.get(role_name, 0)
	        for role_name in ALLOWED_ROLES
	        if role_name in held_roles
	    ]
	    if not limits:
	        return False, 0
	    return True, max(limits)

	async def modify_accent(text: str, accent: str, enhance: bool = False) -> str:
	    """Modify text based on selected accent using Gemini AI.

	    Args:
	        text: The text to rewrite.
	        accent: Key into ACCENT_PROMPTS. "American", an empty value, or an
	            unknown accent leaves the text untouched.
	        enhance: When true, extra instructions asking for more cultural
	            elements are appended to the prompt.

	    Returns:
	        The first line of the model's answer with '*', '(' and ')' removed, or
	        the original text when Gemini is not configured, the answer is empty
	        or looks like it still contains a label, or the request fails.
	    """
	    if not GEMINI_API_KEY or not accent or accent == "American":
	        return text

	    # An unknown accent used to raise KeyError here, outside the try block
	    # below, and so escaped this function's "fall back to the input" contract.
	    base_prompt = ACCENT_PROMPTS.get(accent)
	    if base_prompt is None:
	        logger.warning(f"Unknown accent requested: {accent}")
	        return text

	    if enhance:
	        base_prompt += """\n\nAdd more authentic elements:
	1. Include more cultural phrases
	2. Use more native words (but keep text mostly understandable)
	3. Adjust speech patterns
	BUT REMEMBER:
	- DO NOT add any annotations or explanations
	- DO NOT use asterisks or parentheses
	- Return ONLY the modified text"""

	    prompt = f"{base_prompt}\n\nInput text: {text}\nModified text:"

	    try:
	        response = await model.generate_content_async(prompt)
	        # Get content from the first part of the first candidate
	        parts = response.candidates[0].content.parts
	        if not parts:
	            return text

	        modified_text = parts[0].text.strip()

	        # Clean up any remaining annotations or formatting, then keep only the
	        # first line of the answer.
	        modified_text = modified_text.replace('*', '').replace('(', '').replace(')', '')
	        modified_text = modified_text.split('\n')[0]

	        # Asterisks were stripped above, so only the empty case and a leaked
	        # label remain to be rejected.
	        if not modified_text or 'Enhanced Text:' in modified_text:
	            return text

	        return modified_text
	    except Exception as e:
	        # Best effort: any API or parsing failure falls back to the input text.
	        logger.error(f"Error modifying accent: {str(e)}")
	        return text

	# Get available voices early
	# Fetched once while the module loads and reused by the Discord commands and
	# the Gradio dropdowns; nothing visible in this file refreshes it afterwards.
	VOICE_LIST = get_available_voices()

	# Discord bot setup
	class VoiceBot(discord.Client):
	    """Discord client that owns the slash-command tree for the voice commands."""

	    def __init__(self):
	        super().__init__(intents=discord.Intents.default())
	        self.tree = app_commands.CommandTree(self)
	        # Guild the commands are synced to; 0 when DISCORD_GUILD_ID is unset.
	        self.guild_id = int(os.getenv('DISCORD_GUILD_ID', '0'))
	        # The name previously contained "\|", an invalid escape sequence that
	        # put a literal backslash into the displayed status.
	        self.activity = discord.Activity(
	            type=discord.ActivityType.watching,
	            name="voice creation | /create /list"
	        )

	    async def setup_hook(self):
	        """This is called when the bot starts up; syncs commands to the guild."""
	        guild = discord.Object(id=self.guild_id)
	        self.tree.copy_global_to(guild=guild)
	        await self.tree.sync(guild=guild)

	# Single bot instance; `tree` is the alias used by the @tree.command decorators.
	client = VoiceBot()
	tree = client.tree

	@tree.command(name="list", description="List all available voices", guild=discord.Object(id=int(os.getenv('DISCORD_GUILD_ID', '0'))))
	async def voice_list(interaction: discord.Interaction):
	    """Reply with an embed listing the known voices and the credit status."""
	    await interaction.response.defer()
	    try:
	        voice_names = "\n".join(f"• {name}" for name in VOICE_LIST)
	        # get_remaining_credits is a synchronous call; run it in a worker
	        # thread so the bot's event loop is not blocked while it completes.
	        credits_info = await asyncio.to_thread(get_remaining_credits)
	        credits_msg = format_credits_message(credits_info)

	        embed = discord.Embed(
	            title="Available Voices",
	            description=f"{voice_names}\n\n{credits_msg}",
	            color=0x2B2D31
	        )
	        await interaction.followup.send(embed=embed)
	    except Exception as e:
	        # The interaction is already deferred, so always answer it; otherwise
	        # the user is left with a response that never arrives.
	        logger.error(f"Error listing voices: {str(e)}")
	        await interaction.followup.send(f"Error listing voices: {str(e)}")

	async def voice_autocomplete(interaction: discord.Interaction, current: str) -> List[app_commands.Choice[str]]:
	    """Suggest up to 25 voices whose name contains the typed text, ignoring case."""
	    needle = current.lower()
	    suggestions = []
	    for candidate in VOICE_LIST.keys():
	        if needle in candidate.lower():
	            suggestions.append(app_commands.Choice(name=candidate, value=candidate))
	    # Only the first 25 matches are returned.
	    return suggestions[:25]

	def _render_wav(text, voice_id, stability, clarity, style):
	    """Synthesize speech and write it to a temporary WAV file; return its path.

	    This function is synchronous and does the synthesis and transcoding, so
	    async callers should run it via asyncio.to_thread. The caller owns the
	    returned file and is responsible for deleting it.
	    """
	    audio = generate(
	        text=text,
	        voice=Voice(
	            voice_id=voice_id,
	            settings=VoiceSettings(
	                stability=stability,
	                similarity_boost=clarity,
	                style=style,
	                use_speaker_boost=True
	            )
	        )
	    )
	    # Decode before creating the temp file so a decode failure leaves nothing behind.
	    segment = AudioSegment.from_file(io.BytesIO(audio), format="mp3")
	    segment = segment.set_frame_rate(22050).set_channels(1).set_sample_width(2)

	    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
	        wav_path = temp_file.name
	    try:
	        segment.export(wav_path, format='wav')
	    except Exception:
	        os.unlink(wav_path)
	        raise
	    return wav_path

	@tree.command(name="create", description="Create a voice message", guild=discord.Object(id=int(os.getenv('DISCORD_GUILD_ID', '0'))))
	@app_commands.describe(
	    text="Text to convert to speech",
	    voice_name="Select a voice to use",
	    stability="Voice stability (0-1)",
	    clarity="Voice clarity (0-1)",
	    style="Speaking style (0-1)",
	    accent="Select an accent style (optional)",
	    accent_enhance="Add cultural elements to enhance the accent"
	)
	@app_commands.choices(accent=[
	    app_commands.Choice(name=accent, value=accent)
	    for accent in ACCENT_OPTIONS
	])
	@app_commands.autocomplete(voice_name=voice_autocomplete)
	async def voice_create(
	    interaction: discord.Interaction,
	    text: str,
	    voice_name: str,
	    stability: float = 0.5,
	    clarity: float = 0.75,
	    style: float = 0.5,
	    accent: str = None,
	    accent_enhance: bool = False
	):
	    """Generate a voice message from text and post it with a summary embed.

	    Checks the caller's permission and character limit, validates the voice,
	    optionally rewrites the text in the chosen accent, synthesizes the audio
	    and sends it as a WAV attachment. Every failure is reported back through
	    the already-deferred interaction.
	    """
	    await interaction.response.defer()

	    # Check permissions
	    has_perm, char_limit = has_permission(interaction.user)
	    if not has_perm:
	        embed = discord.Embed(
	            title="Permission Denied",
	            description="You need to be an administrator or have the +mechanic/+trusted role to use this command.",
	            color=0xFF0000
	        )
	        await interaction.followup.send(embed=embed)
	        return

	    # Check character limit
	    if len(text) > char_limit:
	        embed = discord.Embed(
	            title="Character Limit Exceeded",
	            description=f"Your message exceeds your character limit of {char_limit}. Current length: {len(text)}",
	            color=0xFF0000
	        )
	        await interaction.followup.send(embed=embed)
	        return

	    # Validate the voice before the accent rewrite so an invalid voice name
	    # does not cost a Gemini request first.
	    if voice_name not in VOICE_LIST:
	        embed = discord.Embed(
	            title="Voice Not Found",
	            description=f"The voice '{voice_name}' was not found. Use `/list` to see available voices.",
	            color=0x2B2D31
	        )
	        await interaction.followup.send(embed=embed)
	        return

	    # Process accent if specified
	    if accent:
	        text = await modify_accent(text, accent, accent_enhance)

	    temp_path = None
	    try:
	        # Synthesis and the credits lookup are synchronous; run them in worker
	        # threads so the bot's event loop stays responsive.
	        temp_path = await asyncio.to_thread(
	            _render_wav, text, VOICE_LIST[voice_name], stability, clarity, style
	        )
	        credits_info = await asyncio.to_thread(get_remaining_credits)
	        credits_msg = format_credits_message(credits_info)

	        accent_info = f"\nAccent: {accent}" if accent else ""
	        accent_enhance_info = f"\nAccent Enhancement: {'On' if accent_enhance else 'Off'}" if accent else ""

	        embed = discord.Embed(
	            title="Voice Generated",
	            description=f"Prompt: {text}\nVoice: {voice_name}\nStability: {stability}\nClarity: {clarity}\nStyle: {style}{accent_info}{accent_enhance_info}\n\n{credits_msg}",
	            color=0x57F287
	        )
	        await interaction.followup.send(
	            embed=embed,
	            file=discord.File(temp_path)
	        )
	    except Exception as e:
	        logger.error(f"Error generating audio: {str(e)}")
	        await interaction.followup.send(f"Error generating audio: {str(e)}")
	    finally:
	        # Remove the temp file on every path; it used to leak whenever anything
	        # after its creation raised.
	        if temp_path is not None:
	            try:
	                os.unlink(temp_path)
	            except OSError as cleanup_error:
	                logger.warning(f"Could not remove temp file {temp_path}: {cleanup_error}")

	@client.event
	async def on_ready():
	    """Log the logged-in user and apply the activity stored on the client."""
	    ready_message = f"Bot is ready and logged in as {client.user}"
	    logger.info(ready_message)
	    presence = client.activity
	    await client.change_presence(activity=presence)

	# Gradio interface functions
	def text_to_speech(text, voice_name, stability, clarity, style):
	    """Synthesize text with the chosen voice and settings.

	    Returns a (path, credits_message) tuple where path points at a temporary
	    .mp3 file holding the generated audio. The file is not deleted here.
	    """
	    settings = VoiceSettings(
	        stability=stability,
	        similarity_boost=clarity,
	        style=style,
	        use_speaker_boost=True
	    )
	    selected_voice = Voice(voice_id=VOICE_LIST[voice_name], settings=settings)
	    audio = generate(text=text, voice=selected_voice)

	    credits_message = format_credits_message(get_remaining_credits())

	    # delete=False keeps the file on disk after it is closed so the returned
	    # path stays usable.
	    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
	    with temp_file:
	        temp_file.write(audio)
	    return temp_file.name, credits_message

	def speech_to_text(audio_file):
	    """Convert speech to text using speech recognition.

	    The input is transcoded to a temporary WAV file, which is then read and
	    passed to recognize_google.

	    Returns:
	        The recognized text, "Could not understand audio" when the recognizer
	        raises UnknownValueError, or "Error in speech recognition service"
	        when it raises RequestError.
	    """
	    recognizer = sr.Recognizer()
	    audio = AudioSegment.from_file(audio_file)

	    # mkstemp creates the file atomically. tempfile.mktemp only returns a name
	    # and is deprecated because another process can claim that name first.
	    fd, wav_path = tempfile.mkstemp(suffix=".wav")
	    os.close(fd)
	    try:
	        audio.export(wav_path, format="wav")
	        # Close the AudioFile before recognition so the temp file is no longer
	        # open when the finally clause removes it.
	        with sr.AudioFile(wav_path) as source:
	            audio_data = recognizer.record(source)
	        try:
	            return recognizer.recognize_google(audio_data)
	        except sr.UnknownValueError:
	            return "Could not understand audio"
	        except sr.RequestError:
	            return "Error in speech recognition service"
	    finally:
	        # Covers export and record failures as well, which used to leak the file.
	        os.unlink(wav_path)

	def speech_to_speech(audio_file, voice_name, stability, clarity, style):
	    """Transcribe the input audio, then re-synthesize the text with the chosen voice.

	    Returns (audio_path, recognized_text, credits_message). When transcription
	    yields one of its failure messages, returns (None, message, "") instead.
	    """
	    recognized = speech_to_text(audio_file)
	    # Failure messages from speech_to_text start with one of these prefixes.
	    if recognized.startswith(("Error", "Could not")):
	        return None, recognized, ""

	    generated_path, credits_message = text_to_speech(
	        recognized, voice_name, stability, clarity, style
	    )
	    return generated_path, recognized, credits_message

	# Create Gradio interface
	# Two tabs share the same layout: inputs and sliders in the left column, results
	# in the right column, and a Convert button wired to the matching function.
	with gr.Blocks() as demo:
	    gr.Markdown("# ElevenLabs Voice Generation")

	    # Credit status as of interface construction; each conversion reports a
	    # fresh value through its own credits output.
	    credits_info = get_remaining_credits()
	    credits_display = gr.Markdown(format_credits_message(credits_info))

	    with gr.Tab("Text to Speech"):
	        with gr.Row():
	            with gr.Column():
	                text_input = gr.Textbox(label="Text to convert", lines=5)
	                voice_dropdown = gr.Dropdown(choices=list(VOICE_LIST.keys()), label="Select Voice")

	                with gr.Row():
	                    stability = gr.Slider(minimum=0, maximum=1, value=0.5, label="Stability")
	                    clarity = gr.Slider(minimum=0, maximum=1, value=0.75, label="Clarity/Similarity Boost")
	                    style = gr.Slider(minimum=0, maximum=1, value=0.5, label="Style")

	                convert_btn = gr.Button("Convert")

	            with gr.Column():
	                audio_output = gr.Audio(label="Generated Audio")
	                credits_output = gr.Markdown()

	        convert_btn.click(
	            fn=text_to_speech,
	            inputs=[text_input, voice_dropdown, stability, clarity, style],
	            outputs=[audio_output, credits_output]
	        )

	    with gr.Tab("Speech to Speech"):
	        with gr.Row():
	            with gr.Column():
	                # type="filepath": speech_to_text hands the value to
	                # AudioSegment.from_file, which needs a path rather than the
	                # component's default (sample_rate, array) tuple.
	                audio_input = gr.Audio(label="Input Audio", sources=["microphone", "upload"], type="filepath")
	                voice_dropdown_s2s = gr.Dropdown(choices=list(VOICE_LIST.keys()), label="Select Voice")

	                with gr.Row():
	                    stability_s2s = gr.Slider(minimum=0, maximum=1, value=0.5, label="Stability")
	                    clarity_s2s = gr.Slider(minimum=0, maximum=1, value=0.75, label="Clarity/Similarity Boost")
	                    style_s2s = gr.Slider(minimum=0, maximum=1, value=0.5, label="Style")

	                convert_btn_s2s = gr.Button("Convert")

	            with gr.Column():
	                text_output = gr.Textbox(label="Recognized Text", lines=3)
	                audio_output_s2s = gr.Audio(label="Generated Audio")
	                credits_output_s2s = gr.Markdown()

	        # Uses this tab's own stability slider; it was previously wired to the
	        # Text to Speech tab's `stability`, so the slider shown here was ignored.
	        convert_btn_s2s.click(
	            fn=speech_to_speech,
	            inputs=[audio_input, voice_dropdown_s2s, stability_s2s, clarity_s2s, style_s2s],
	            outputs=[audio_output_s2s, text_output, credits_output_s2s]
	        )

	def start_discord_bot():
	    """Run the Discord client with the token from DISCORD_BOT_TOKEN.

	    Logs an error and returns when the token is missing. Any exception raised
	    while starting or running the client is logged instead of propagated.
	    """
	    token = os.getenv("DISCORD_BOT_TOKEN")
	    if token:
	        logger.info("Starting Discord bot...")
	        try:
	            # Install a fresh event loop for the calling thread before running.
	            fresh_loop = asyncio.new_event_loop()
	            asyncio.set_event_loop(fresh_loop)
	            client.run(token)
	        except Exception as e:
	            logger.error(f"Failed to start Discord bot: {str(e)}")
	    else:
	        logger.error("DISCORD_BOT_TOKEN not found!")

	# Start Discord bot in a separate thread
	# The thread is started before demo.launch below. daemon=True means it does
	# not keep the process alive on its own.
	discord_thread = threading.Thread(target=start_discord_bot, daemon=True)
	discord_thread.start()

	# Launch Gradio interface
	# Binds to all interfaces (0.0.0.0) and does not request a public share link.
	demo.launch(server_name="0.0.0.0", share=False)