!pip install gradio transformers torch gtts

import gradio as gr
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    MarianMTModel,
    MarianTokenizer,
    BlipProcessor,
    BlipForConditionalGeneration,
)
from gtts import gTTS
import torch
import logging
import traceback
from PIL import Image

# Log errors to a file so failures in the UI can be diagnosed later.
logging.basicConfig(filename="error_log.txt", level=logging.ERROR, format="%(asctime)s - %(message)s")

# Conversational model for chat replies.
chatbot_model_name = "microsoft/DialoGPT-medium"
tokenizer = AutoTokenizer.from_pretrained(chatbot_model_name)
chatbot_model = AutoModelForCausalLM.from_pretrained(chatbot_model_name)

# BLIP model for image captioning.
blip_model_name = "Salesforce/blip-image-captioning-base"
blip_processor = BlipProcessor.from_pretrained(blip_model_name)
blip_model = BlipForConditionalGeneration.from_pretrained(blip_model_name)


def get_translation_model(src_lang, tgt_lang):
    """Load a MarianMT model for the given language pair.

    Note: not every pair is published under Helsinki-NLP/opus-mt-*;
    a missing pair raises here and is caught (and logged) by the caller.
    """
    model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
    model = MarianMTModel.from_pretrained(model_name)
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    return model, tokenizer


chat_history_ids = None
MAX_LENGTH = 1024     # DialoGPT's context window, in tokens
MAX_NEW_TOKENS = 128  # per-turn budget for the generated reply


def generate_image_caption(image_path):
    try:
        # Convert to RGB so RGBA or grayscale uploads don't break the processor.
        image = Image.open(image_path).convert("RGB")
        pixel_values = blip_processor(images=image, return_tensors="pt").pixel_values
        with torch.no_grad():
            caption_ids = blip_model.generate(pixel_values, max_length=50, num_beams=5)
        return blip_processor.decode(caption_ids[0], skip_special_tokens=True)
    except Exception as e:
        logging.error(f"Error in BLIP image captioning: {str(e)}\n{traceback.format_exc()}")
        return "Error processing image."


def chatbot_with_image(message, language, image_path=None, reset=False):
    global chat_history_ids
    if reset:
        chat_history_ids = None
        return "Chat history reset.", None
    if not message.strip() and not image_path:
        return "Please enter a message or upload an image.", None

    bot_response = ""
    try:
        if message.strip():
            new_user_input_ids = tokenizer.encode(message + tokenizer.eos_token, return_tensors="pt")
            if chat_history_ids is not None:
                chat_history_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)
            else:
                chat_history_ids = new_user_input_ids
            # Trim the rolling history so prompt plus reply fit the context window.
            max_history = MAX_LENGTH - MAX_NEW_TOKENS
            if chat_history_ids.shape[-1] > max_history:
                chat_history_ids = chat_history_ids[:, -max_history:]
            bot_input_ids = chat_history_ids
            chat_history_ids = chatbot_model.generate(
                bot_input_ids, max_new_tokens=MAX_NEW_TOKENS, pad_token_id=tokenizer.eos_token_id
            )
            # Decode only the newly generated tokens, not the echoed prompt.
            bot_response = tokenizer.decode(
                chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True
            )
    except Exception as e:
        bot_response = f"Error processing message: {str(e)}"
        logging.error(f"Error in chatbot response generation: {str(e)}\n{traceback.format_exc()}")

    if image_path:
        try:
            image_caption = generate_image_caption(image_path)
            bot_response += f" The image shows: {image_caption}."
        except Exception as e:
            bot_response += f" Error processing image: {str(e)}"
            logging.error(f"Error in image processing: {str(e)}\n{traceback.format_exc()}")

    # Translate the English response when another UI language is selected.
    try:
        if language != "en":
            translation_model, translation_tokenizer = get_translation_model("en", language)
            inputs = translation_tokenizer(bot_response, return_tensors="pt", padding=True, truncation=True)
            translated = translation_model.generate(**inputs)
            bot_response = translation_tokenizer.decode(translated[0], skip_special_tokens=True)
    except Exception as e:
        bot_response += f" Error in translation: {str(e)}"
        logging.error(f"Error in translation: {str(e)}\n{traceback.format_exc()}")

    # Synthesize speech for the (possibly translated) response.
    try:
        tts = gTTS(bot_response, lang=language)
        audio_path = "response.mp3"
        tts.save(audio_path)
    except Exception as e:
        bot_response += f" Error generating TTS: {str(e)}"
        logging.error(f"Error in TTS generation: {str(e)}\n{traceback.format_exc()}")
        audio_path = None

    return bot_response, audio_path


with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("### Chatbot with Image Understanding and Language Support")
    with gr.Row():
        output_audio = gr.Audio(label="Generated Speech", type="filepath")
        output_text = gr.Textbox(label="Bot Response")
    # These codes are shared by MarianMT and gTTS; unsupported combinations
    # surface as logged errors rather than crashes.
    language_dropdown = gr.Dropdown(
        choices=["en", "es", "fr", "de", "it", "zh", "pl"],
        label="Select Language",
        value="en",
    )
    image_input = gr.Image(label="Upload Image", type="filepath")
    text_input = gr.Textbox(label="Your Message", placeholder="Type your message here...")
    with gr.Row():
        send_button = gr.Button("Send")
        reset_button = gr.Button("Reset Chat")

    send_button.click(
        chatbot_with_image,
        inputs=[text_input, language_dropdown, image_input, gr.State(False)],
        outputs=[output_text, output_audio],
    )
    # Route the reset through chatbot_with_image so the global history is
    # actually cleared, not just the displayed text.
    reset_button.click(
        fn=lambda: chatbot_with_image("", "en", None, reset=True),
        inputs=[],
        outputs=[output_text, output_audio],
    )

if __name__ == "__main__":
    demo.launch(share=True)
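
# Optional smoke test (a minimal sketch, not part of the app): call the handler
# directly to verify the pipeline before wiring it into the UI. It assumes the
# models above have finished downloading and that gTTS has network access.
#
#     text, audio = chatbot_with_image("Hello, how are you?", "en")
#     print(text)   # DialoGPT reply
#     print(audio)  # "response.mp3" on success, None if TTS failed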