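"""Gradio Space for document question answering.

Upload a txt, csv, pdf, or docx file; it is chunked and indexed, and a
question is answered with a hybrid BM25 + vector retriever over the chosen
Hugging Face LLM.
"""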
import csv

import fitz  # PyMuPDF
import gradio as gr
import torch
from docx import Document as DocxDocument
from llama_index.core import Document, Settings, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.retrievers.bm25 import BM25Retriever

from ChatEngine import ChatEngine
from HybridRetriever import HybridRetriever
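# Hugging Face model IDs offered in the dropdown (key and value are identical).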
lm_list = {
    "google/gemma-2-9b-it": "google/gemma-2-9b-it",
    "mistralai/Mistral-7B-Instruct-v0.3": "mistralai/Mistral-7B-Instruct-v0.3",
}
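# Extract plain text from the uploaded file and wrap it in a single
# llama_index Document.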
def process_file(file):
    file_extension = file.name.split(".")[-1].lower()
    if file_extension == 'txt':
        with open(file.name, 'r', encoding='utf-8') as f:
            text = f.read()
    elif file_extension == 'csv':
        with open(file.name, 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            text = '\n'.join(','.join(row) for row in reader)
    elif file_extension == 'pdf':
        pdf_document = fitz.open(file.name)
        text = ""
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            text += page.get_text("text")
        pdf_document.close()
    elif file_extension == 'docx':
        docx_document = DocxDocument(file.name)
        text = '\n'.join(paragraph.text for paragraph in docx_document.paragraphs)
    else:
        raise gr.Error(f"Unsupported file type: .{file_extension}")
    return [Document(text=text)]
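# End-to-end pipeline for one request: load the models, chunk and index the
# document, then answer via hybrid retrieval. HybridRetriever and ChatEngine
# are local modules defined elsewhere in this Space.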
def handle_file_upload(file, llm_name, question):
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # free VRAM left over from a previous model
    # Use the selected model's own tokenizer; HuggingFaceLLM otherwise falls
    # back to a default tokenizer from a different model.
    llm = HuggingFaceLLM(model_name=llm_name, tokenizer_name=llm_name)
    Settings.llm = llm
    documents = process_file(file)
    text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=10)
    Settings.embed_model = HuggingFaceEmbedding(
        model_name="nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True
    )
    Settings.text_splitter = text_splitter
    # Chunk once and build both retrievers over the same nodes: BM25 operates
    # on parsed nodes, not raw documents.
    nodes = text_splitter.get_nodes_from_documents(documents)
    index = VectorStoreIndex(nodes, embed_model=Settings.embed_model)
    bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=2)
    vector_retriever = VectorIndexRetriever(index=index, similarity_top_k=2)
    hybrid_retriever = HybridRetriever(bm25_retriever=bm25_retriever, vector_retriever=vector_retriever)
    chat_engine = ChatEngine(hybrid_retriever)
    return chat_engine.ask_question(question, llm)
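# Gradio callback; note the upload is re-indexed on every question.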
def document_qa(file_upload, llm_choice, question_input):
    return handle_file_upload(file_upload, llm_choice, question_input)
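# Assemble the UI; flagging is disabled.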
llm_choice = gr.Dropdown(choices=list(lm_list.values()), label="Choose LLM")
file_upload = gr.File(label="Upload Document")
question_input = gr.Textbox(label="Enter your question")

gr.Interface(
    fn=document_qa,
    inputs=[file_upload, llm_choice, question_input],
    outputs=gr.Textbox(label="Answer"),
    title="Document Question Answering",
    description="Upload a document and choose a language model to get answers.",
    allow_flagging="never",
).launch()