import logging
import traceback

import gradio as gr
import torch
from gtts import gTTS
from PIL import Image
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BlipForConditionalGeneration,
    BlipProcessor,
    MarianMTModel,
    MarianTokenizer,
)

# Log full tracebacks to a file so UI errors can be diagnosed later.
logging.basicConfig(
    filename="error_log.txt",
    level=logging.ERROR,
    format="%(asctime)s - %(message)s",
)

# Conversational model for chat replies.
chatbot_model_name = "microsoft/DialoGPT-medium"
tokenizer = AutoTokenizer.from_pretrained(chatbot_model_name)
chatbot_model = AutoModelForCausalLM.from_pretrained(chatbot_model_name)

# BLIP model for image captioning.
blip_model_name = "Salesforce/blip-image-captioning-base"
blip_processor = BlipProcessor.from_pretrained(blip_model_name)
blip_model = BlipForConditionalGeneration.from_pretrained(blip_model_name)


def get_translation_model(src_lang, tgt_lang):
    """Load a Helsinki-NLP MarianMT model for the given language pair.

    Note: this loads the weights on every call; see the optional cached
    variant sketched below.
    """
    model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
    model = MarianMTModel.from_pretrained(model_name)
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    return model, tokenizer
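# Optional: get_translation_model reloads weights on every request. The helper
# below is a sketch of a cached variant (an addition, not part of the original
# script, and not wired into the handlers): functools.lru_cache keyed on the
# language pair keeps each loaded pair in memory. Swap it in for
# get_translation_model if repeated loads become a bottleneck.
from functools import lru_cache


@lru_cache(maxsize=None)
def get_translation_model_cached(src_lang, tgt_lang):
    # Same return value as get_translation_model, loaded at most once per pair.
    return get_translation_model(src_lang, tgt_lang)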
# Conversation state: the running token history for DialoGPT.
chat_history_ids = None
MAX_LENGTH = 1024      # DialoGPT's maximum context window
MAX_NEW_TOKENS = 200   # tokens reserved for each generated reply


def generate_image_caption(image_path):
    """Caption an image with BLIP; returns an error string on failure."""
    try:
        image = Image.open(image_path)
        pixel_values = blip_processor(images=image, return_tensors="pt").pixel_values
        with torch.no_grad():
            caption = blip_model.generate(pixel_values, max_length=50, num_beams=5)
        return blip_processor.decode(caption[0], skip_special_tokens=True)
    except Exception as e:
        logging.error(f"Error in BLIP image captioning: {str(e)}\n{traceback.format_exc()}")
        return "Error processing image."


def chatbot_with_image(message, language, image_path=None, reset=False):
    global chat_history_ids

    if reset:
        chat_history_ids = None
        return "Chat history reset.", None

    if not message.strip() and not image_path:
        return "Please enter a message or upload an image.", None

    bot_response = ""
    try:
        if message.strip():
            new_user_input_ids = tokenizer.encode(
                message + tokenizer.eos_token, return_tensors="pt"
            )
            if chat_history_ids is not None:
                chat_history_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)
            else:
                chat_history_ids = new_user_input_ids

            # Keep the history within the model's context window, leaving
            # room for the reply.
            if chat_history_ids.shape[-1] > MAX_LENGTH - MAX_NEW_TOKENS:
                chat_history_ids = chat_history_ids[:, -(MAX_LENGTH - MAX_NEW_TOKENS):]

            bot_input_ids = chat_history_ids
            chat_history_ids = chatbot_model.generate(
                bot_input_ids,
                max_new_tokens=MAX_NEW_TOKENS,
                pad_token_id=tokenizer.eos_token_id,
            )
            # Decode only the newly generated tokens, not the whole history.
            bot_response = tokenizer.decode(
                chat_history_ids[:, bot_input_ids.shape[-1]:][0],
                skip_special_tokens=True,
            )
    except Exception as e:
        bot_response = f"Error processing message: {str(e)}"
        logging.error(f"Error in chatbot response generation: {str(e)}\n{traceback.format_exc()}")

    if image_path:
        try:
            image_caption = generate_image_caption(image_path)
            bot_response = f"{bot_response} The image shows: {image_caption}.".strip()
        except Exception as e:
            bot_response += f" Error processing image: {str(e)}"
            logging.error(f"Error in image processing: {str(e)}\n{traceback.format_exc()}")

    try:
        if language != "en":
            translation_model, translation_tokenizer = get_translation_model("en", language)
            inputs = translation_tokenizer(
                bot_response, return_tensors="pt", padding=True, truncation=True
            )
            translated = translation_model.generate(**inputs)
            bot_response = translation_tokenizer.decode(translated[0], skip_special_tokens=True)
    except Exception as e:
        bot_response += f" Error in translation: {str(e)}"
        logging.error(f"Error in translation: {str(e)}\n{traceback.format_exc()}")

    try:
        # Map bare "zh" to "zh-CN", a region-qualified code gTTS reliably supports.
        tts_lang = "zh-CN" if language == "zh" else language
        tts = gTTS(bot_response, lang=tts_lang)
        audio_path = "response.mp3"
        tts.save(audio_path)
    except Exception as e:
        bot_response += f" Error generating TTS: {str(e)}"
        logging.error(f"Error in TTS generation: {str(e)}\n{traceback.format_exc()}")
        audio_path = None

    return bot_response, audio_path


with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("### Chatbot with Image Understanding and Language Support")
    with gr.Row():
        output_audio = gr.Audio(label="Generated Speech", type="filepath")
        output_text = gr.Textbox(label="Bot Response")
    language_dropdown = gr.Dropdown(
        choices=["en", "es", "fr", "de", "it", "zh", "pl"],
        label="Select Language",
        value="en",
    )
    image_input = gr.Image(label="Upload Image", type="filepath")
    text_input = gr.Textbox(label="Your Message", placeholder="Type your message here...")
    with gr.Row():
        send_button = gr.Button("Send")
        reset_button = gr.Button("Reset Chat")

    # reset defaults to False, so no extra state input is needed here.
    send_button.click(
        chatbot_with_image,
        inputs=[text_input, language_dropdown, image_input],
        outputs=[output_text, output_audio],
    )
    # Route the reset through chatbot_with_image so the global chat history
    # is actually cleared, not just the displayed text.
    reset_button.click(
        fn=lambda: chatbot_with_image("", "en", None, reset=True),
        inputs=[],
        outputs=[output_text, output_audio],
    )

if __name__ == "__main__":
    demo.launch(share=True)
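# Dependency sketch (an assumption, not pinned by this script): the imports
# above need roughly
#   pip install gradio transformers torch gtts pillow sentencepiece
# sentencepiece is required by MarianTokenizer for the Helsinki-NLP models,
# and the first run downloads all model weights from the Hugging Face Hub.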