Spaces:
Sleeping
Sleeping
import os | |
import logging | |
import tempfile | |
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer | |
import gradio as gr | |
import fitz # PyMuPDF | |
import requests | |
from PIL import Image | |
import pytesseract | |
from langid import langid | |
from deep_translator import GoogleTranslator | |
import torch # Add this import | |
# Configure logging once; a second basicConfig call would be a no-op anyway.
logging.basicConfig(level=logging.INFO)

# Use the first CUDA device when one is available, otherwise -1 (CPU).
device = 0 if torch.cuda.is_available() else -1

# Initialize multilingual QA pipeline
model_name = "mrm8488/bert-multi-cased-finetuned-xquadv1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer, device=device)
# Two-letter language code -> display name for every supported language.
# Insertion order matters: it is the order of the output-language dropdown.
INDIAN_LANGUAGES = {
    "hi": "Hindi",
    "pa": "Punjabi",
    "bn": "Bengali",
    "gu": "Gujarati",
    "mr": "Marathi",
    "ta": "Tamil",
    "te": "Telugu",
    "kn": "Kannada",
    "ml": "Malayalam",
    "en": "English",
}
def download_pdf_from_url(url, timeout=30):
    """Download the document at *url* into a temporary ``.pdf`` file.

    Args:
        url: HTTP(S) address of the PDF to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path of the temporary file holding the response body, or
        ``None`` when *url* is empty or the download fails. The file is
        created with ``delete=False``, so the caller is responsible for
        removing it once it is no longer needed.
    """
    if not url:
        logging.error("Error downloading PDF: no URL provided")
        return None
    try:
        # Without a timeout a stalled server would block this call forever.
        response = requests.get(url, timeout=timeout)
        # Fail on 4xx/5xx instead of saving an error page as if it were a PDF.
        response.raise_for_status()
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
            temp_pdf.write(response.content)
        return temp_pdf.name
    except Exception as e:
        # Boundary handler: callers rely on None (never an exception) on failure.
        logging.error(f"Error downloading PDF: {e}")
        return None
def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF, falling back to OCR when it has no text layer.

    Every page's embedded text is collected first. If that yields nothing but
    whitespace, each page is rendered to an image and passed to Tesseract with
    the English and Indian-language models enabled.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text. Errors are logged rather than raised, so the
        result is whatever was collected before the failure (possibly ``""``).
    """
    ocr_langs = '+'.join(['eng', 'hin', 'pan', 'ben', 'guj', 'mar', 'tam', 'tel', 'kan', 'mal'])
    parts = []
    try:
        # The context manager closes the document even when extraction fails.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                parts.append(page.get_text("text") or "")

            if not any(part.strip() for part in parts):
                # No usable text layer: OCR one rendered page at a time so
                # only a single page image is held in memory.
                for page in doc:
                    pix = page.get_pixmap()
                    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                    parts.append(pytesseract.image_to_string(img, lang=ocr_langs))
    except Exception as e:
        # Boundary handler: keep the partial text instead of propagating.
        logging.error(f"Error extracting text: {e}")
    return "".join(parts)
def detect_language(text):
    """Return the supported language code detected for *text*.

    Falls back to ``'en'`` when *text* is blank, when the detected language
    is not a key of ``INDIAN_LANGUAGES``, or when classification fails.
    """
    # Nothing to classify in blank input.
    if not text.strip():
        return 'en'

    try:
        detected, _score = langid.classify(text)
        return detected if detected in INDIAN_LANGUAGES else 'en'
    except Exception as e:
        logging.error(f"Language detection error: {e}")
        return 'en'
def process_qa(question, context, output_lang):
    """Answer *question* from *context* and return the answer in *output_lang*.

    Args:
        question: The question text.
        context: The passage the answer is looked up in.
        output_lang: Target language code; ``'en'`` skips translation.

    Returns:
        The answer string, or ``""`` when the question or context is blank,
        when no answer is produced, or when QA/translation fails. Failures
        are logged; returning ``""`` keeps exception text out of the answer.
    """
    if not question or not question.strip() or not context or not context.strip():
        return ""
    try:
        result = qa_pipeline(question=question, context=context)
        answer = result['answer']
        # Translate answer to the specified output language. The answer comes
        # from the document, which need not be English, so the translator
        # detects the source language instead of assuming 'en'. Blank answers
        # are skipped because there is nothing to translate.
        # NOTE(review): with output_lang == 'en' a non-English answer is still
        # returned untranslated - confirm whether that is intended.
        if answer and answer.strip() and output_lang != 'en':
            answer = GoogleTranslator(source='auto', target=output_lang).translate(answer)
        return answer or ""
    except Exception as e:
        logging.error(f"QA processing error: {e}")
        return ""
def analyze_input(input_source, question, output_lang):
    """Answer *question* about a PDF and format the result for display.

    Args:
        input_source: An ``http(s)://`` URL, a local file path string, or an
            uploaded-file object exposing the path as ``.name``.
        question: The question to answer from the document.
        output_lang: Language code the answer should be returned in.

    Returns:
        ``"Answer (<language>): <text>"`` on success, otherwise a message
        describing why no answer could be produced. Never raises.
    """
    downloaded_path = None  # set only for files this call downloaded
    try:
        if isinstance(input_source, str):
            if input_source.startswith(('http://', 'https://')):
                downloaded_path = download_pdf_from_url(input_source)
                pdf_path = downloaded_path
            else:
                # A plain string that is not a URL is treated as a local path.
                pdf_path = input_source
        else:
            # Covers None (nothing uploaded) as well as upload objects.
            pdf_path = getattr(input_source, 'name', None)
        if not pdf_path:
            return "Error: Invalid input source"
        if not question or not question.strip():
            return "Error: No question provided"

        text = extract_text_from_pdf(pdf_path)
        if not text.strip():
            return "No text extracted from document"

        question_lang = detect_language(question)
        logging.info(f"Detected question language: {question_lang}")

        # The document is queried in fixed 1000-character windows and the
        # non-empty per-window answers are joined with spaces.
        chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
        answers = [process_qa(question, chunk, output_lang) for chunk in chunks if chunk.strip()]
        final_answer = " ".join(filter(None, answers))
        return f"Answer ({INDIAN_LANGUAGES.get(output_lang, 'English')}): {final_answer}"
    except Exception as e:
        logging.error(f"Analysis error: {e}")
        return f"Error: {str(e)}"
    finally:
        # The download helper creates its temp file with delete=False, so it
        # must be removed here or it accumulates on disk with every request.
        if downloaded_path:
            try:
                os.remove(downloaded_path)
            except OSError as cleanup_error:
                logging.warning(f"Could not remove temporary file: {cleanup_error}")
# Gradio Interface
def create_interface():
    """Build the Gradio interface that feeds its three inputs to ``analyze_input``.

    The inputs are, in order: a file component, a question text box, and a
    dropdown of the ``INDIAN_LANGUAGES`` codes that defaults to ``'en'``.
    The output is plain text.
    """
    language_codes = list(INDIAN_LANGUAGES)

    pdf_input = gr.File(label="Upload PDF or Enter PDF URL")
    question_input = gr.Textbox(label="Enter your question")
    language_input = gr.Dropdown(
        choices=language_codes,
        label="Select Output Language",
        value='en',
    )

    return gr.Interface(
        fn=analyze_input,
        inputs=[pdf_input, question_input, language_input],
        outputs="text",
        title="Indian Languages PDF QA System",
        description="Support for Hindi, Punjabi, Bengali, Gujarati, Marathi, Tamil, Telugu, Kannada, Malayalam, and English",
    )
if __name__ == "__main__":
    # Build the interface and launch it when the file is run as a script.
    create_interface().launch()