import logging
import traceback

import gradio as gr
import torch
from gtts import gTTS
from PIL import Image
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BlipForConditionalGeneration,
    BlipProcessor,
    MarianMTModel,
    MarianTokenizer,
)

# Log full tracebacks to a file so UI errors can be diagnosed later.
logging.basicConfig(
    filename="error_log.txt",
    level=logging.ERROR,
    format="%(asctime)s - %(message)s",
)

# Conversational model for chat replies.
chatbot_model_name = "microsoft/DialoGPT-medium"
tokenizer = AutoTokenizer.from_pretrained(chatbot_model_name)
chatbot_model = AutoModelForCausalLM.from_pretrained(chatbot_model_name)

# BLIP model for image captioning.
blip_model_name = "Salesforce/blip-image-captioning-base"
blip_processor = BlipProcessor.from_pretrained(blip_model_name)
blip_model = BlipForConditionalGeneration.from_pretrained(blip_model_name)


def get_translation_model(src_lang, tgt_lang):
    """Load a Helsinki-NLP MarianMT model for the given language pair.

    Note: this loads the weights on every call; see the optional cached
    variant sketched below.
    """
    model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
    model = MarianMTModel.from_pretrained(model_name)
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    return model, tokenizer
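# Optional: get_translation_model reloads weights on every request. The helper
# below is a sketch of a cached variant (an addition, not part of the original
# script, and not wired into the handlers): functools.lru_cache keyed on the
# language pair keeps each loaded pair in memory. Swap it in for
# get_translation_model if repeated loads become a bottleneck.
from functools import lru_cache


@lru_cache(maxsize=None)
def get_translation_model_cached(src_lang, tgt_lang):
    # Same return value as get_translation_model, loaded at most once per pair.
    return get_translation_model(src_lang, tgt_lang)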
# Conversation state: the running token history for DialoGPT.
chat_history_ids = None
MAX_LENGTH = 1024      # DialoGPT's maximum context window
MAX_NEW_TOKENS = 200   # tokens reserved for each generated reply


def generate_image_caption(image_path):
    """Caption an image with BLIP; returns an error string on failure."""
    try:
        image = Image.open(image_path)
        pixel_values = blip_processor(images=image, return_tensors="pt").pixel_values
        with torch.no_grad():
            caption = blip_model.generate(pixel_values, max_length=50, num_beams=5)
        return blip_processor.decode(caption[0], skip_special_tokens=True)
    except Exception as e:
        logging.error(f"Error in BLIP image captioning: {str(e)}\n{traceback.format_exc()}")
        return "Error processing image."


def chatbot_with_image(message, language, image_path=None, reset=False):
    global chat_history_ids

    if reset:
        chat_history_ids = None
        return "Chat history reset.", None

    if not message.strip() and not image_path:
        return "Please enter a message or upload an image.", None

    bot_response = ""
    try:
        if message.strip():
            new_user_input_ids = tokenizer.encode(
                message + tokenizer.eos_token, return_tensors="pt"
            )
            if chat_history_ids is not None:
                chat_history_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)
            else:
                chat_history_ids = new_user_input_ids

            # Keep the history within the model's context window, leaving
            # room for the reply.
            if chat_history_ids.shape[-1] > MAX_LENGTH - MAX_NEW_TOKENS:
                chat_history_ids = chat_history_ids[:, -(MAX_LENGTH - MAX_NEW_TOKENS):]

            bot_input_ids = chat_history_ids
            chat_history_ids = chatbot_model.generate(
                bot_input_ids,
                max_new_tokens=MAX_NEW_TOKENS,
                pad_token_id=tokenizer.eos_token_id,
            )
            # Decode only the newly generated tokens, not the whole history.
            bot_response = tokenizer.decode(
                chat_history_ids[:, bot_input_ids.shape[-1]:][0],
                skip_special_tokens=True,
            )
    except Exception as e:
        bot_response = f"Error processing message: {str(e)}"
        logging.error(f"Error in chatbot response generation: {str(e)}\n{traceback.format_exc()}")

    if image_path:
        try:
            image_caption = generate_image_caption(image_path)
            bot_response = f"{bot_response} The image shows: {image_caption}.".strip()
        except Exception as e:
            bot_response += f" Error processing image: {str(e)}"
            logging.error(f"Error in image processing: {str(e)}\n{traceback.format_exc()}")

    try:
        if language != "en":
            translation_model, translation_tokenizer = get_translation_model("en", language)
            inputs = translation_tokenizer(
                bot_response, return_tensors="pt", padding=True, truncation=True
            )
            translated = translation_model.generate(**inputs)
            bot_response = translation_tokenizer.decode(translated[0], skip_special_tokens=True)
    except Exception as e:
        bot_response += f" Error in translation: {str(e)}"
        logging.error(f"Error in translation: {str(e)}\n{traceback.format_exc()}")

    try:
        # Map bare "zh" to "zh-CN", a region-qualified code gTTS reliably supports.
        tts_lang = "zh-CN" if language == "zh" else language
        tts = gTTS(bot_response, lang=tts_lang)
        audio_path = "response.mp3"
        tts.save(audio_path)
    except Exception as e:
        bot_response += f" Error generating TTS: {str(e)}"
        logging.error(f"Error in TTS generation: {str(e)}\n{traceback.format_exc()}")
        audio_path = None

    return bot_response, audio_path


with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("### Chatbot with Image Understanding and Language Support")
    with gr.Row():
        output_audio = gr.Audio(label="Generated Speech", type="filepath")
        output_text = gr.Textbox(label="Bot Response")
    language_dropdown = gr.Dropdown(
        choices=["en", "es", "fr", "de", "it", "zh", "pl"],
        label="Select Language",
        value="en",
    )
    image_input = gr.Image(label="Upload Image", type="filepath")
    text_input = gr.Textbox(label="Your Message", placeholder="Type your message here...")
    with gr.Row():
        send_button = gr.Button("Send")
        reset_button = gr.Button("Reset Chat")

    # reset defaults to False, so no extra state input is needed here.
    send_button.click(
        chatbot_with_image,
        inputs=[text_input, language_dropdown, image_input],
        outputs=[output_text, output_audio],
    )
    # Route the reset through chatbot_with_image so the global chat history
    # is actually cleared, not just the displayed text.
    reset_button.click(
        fn=lambda: chatbot_with_image("", "en", None, reset=True),
        inputs=[],
        outputs=[output_text, output_audio],
    )

if __name__ == "__main__":
    demo.launch(share=True)
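# Dependency sketch (an assumption, not pinned by this script): the imports
# above need roughly
#   pip install gradio transformers torch gtts pillow sentencepiece
# sentencepiece is required by MarianTokenizer for the Helsinki-NLP models,
# and the first run downloads all model weights from the Hugging Face Hub.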