# hydererankrag/app.py
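"""Streamlit app: RAG over a parsed PDF with HyDE query rewriting and reranking.

Pipeline: LlamaParse extracts the PDF as markdown, the text is chunked and
indexed with LlamaIndex, HyDE expands each question into a hypothetical
answer for retrieval, an NVIDIA reranker picks the best chunk, and an
NVIDIA-hosted Llama 3.1 8B model generates the final answer.
"""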
# Import modules and classes
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings, NVIDIARerank
from llama_index.core.indices.query.query_transform import HyDEQueryTransform
from llama_index.core.query_engine import TransformQueryEngine
from langchain_core.documents import Document as LangDocument
from llama_index.core import Document as LlamaDocument
from llama_index.core import Settings
from llama_parse import LlamaParse
import streamlit as st
import os
# Read API keys from environment variables (NVIDIA_KEY and PARSE_KEY must be set)
nvidia_api_key = os.getenv("NVIDIA_KEY")
llamaparse_api_key = os.getenv("PARSE_KEY")
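# Minimal sanity check (an added guard, not in the original; adapt to your setup)
if not nvidia_api_key or not llamaparse_api_key:
    raise EnvironmentError("Set NVIDIA_KEY and PARSE_KEY before running this app.")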
# Initialize ChatNVIDIA, NVIDIARerank, and NVIDIAEmbeddings
client = ChatNVIDIA(
    model="meta/llama-3.1-8b-instruct",
    api_key=nvidia_api_key,
    temperature=0.2,
    top_p=0.7,
    max_tokens=1024
)
embed_model = NVIDIAEmbeddings(
    model="nvidia/nv-embedqa-e5-v5",
    api_key=nvidia_api_key,
    truncate="NONE"
)
reranker = NVIDIARerank(
    model="nvidia/nv-rerankqa-mistral-4b-v3",
    api_key=nvidia_api_key,
)
# Set the NVIDIA models globally
Settings.embed_model = embed_model
Settings.llm = client
# Parse the local PDF document
parser = LlamaParse(
    api_key=llamaparse_api_key,
    result_type="markdown",
    verbose=True
)
documents = parser.load_data("C:\\Users\\user\\Documents\\Jan 2024\\Projects\\RAGs\\Files\\PhilDataset.pdf")
print("Document Parsed")
# Split parsed text into chunks sized for the embedding model.
# Note: max_tokens is approximated by character count, not true tokens.
def split_text(text, max_tokens=512):
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        word_length = len(word)
        if current_length + word_length + 1 > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = word_length + 1
        else:
            current_chunk.append(word)
            current_length += word_length + 1
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
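# Example of the character-based budget: split_text("a b c", max_tokens=4) -> ["a b", "c"]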
# Chunk each parsed document for indexing; VectorStoreIndex computes the
# embeddings itself via the configured embed_model, so there is no need to
# precompute them here
all_documents = []
for doc in documents:
    text_chunks = split_text(doc.text)
    for chunk in text_chunks:
        all_documents.append(LlamaDocument(text=chunk))
print("Chunks prepared")
# Create and persist the index with NVIDIAEmbeddings
index = VectorStoreIndex.from_documents(all_documents, embed_model=embed_model)
index.set_index_id("vector_index")
index.storage_context.persist("./storage")
print("Index created")
# Load index from storage
storage_context = StorageContext.from_defaults(persist_dir="storage")
index = load_index_from_storage(storage_context, index_id="vector_index")
print("Index loaded")
# Initialize HyDEQueryTransform and TransformQueryEngine
hyde = HyDEQueryTransform(include_original=True)
query_engine = index.as_query_engine()
hyde_query_engine = TransformQueryEngine(query_engine, hyde)
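# HyDE generates a hypothetical answer to the question and retrieves against
# that text instead of the raw query; include_original=True keeps the original
# question as an additional query string.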
# Query the index with HyDE and use the output as LLM context
def query_model_with_context(question):
    # Generate a hypothetical document using HyDE
    hyde_response = hyde_query_engine.query(question)
    print(f"HyDE Response: {hyde_response}")
    hyde_query = hyde_response if isinstance(hyde_response, str) else hyde_response.response
    # Use the hypothetical document to retrieve relevant documents
    retriever = index.as_retriever(similarity_top_k=3)
    nodes = retriever.retrieve(hyde_query)
    for node in nodes:
        print(node)
    # Rerank the retrieved documents against the original question
    ranked_documents = reranker.compress_documents(
        query=question,
        documents=[LangDocument(page_content=node.text) for node in nodes]
    )
    if not ranked_documents:
        return "No relevant context was retrieved for this question."
    # Print the most relevant node
    print(f"Most relevant node: {ranked_documents[0].page_content}")
    # Use the most relevant node as context
    context = ranked_documents[0].page_content
    # Send the context and question to the client (NVIDIA Llama 3.1 8B model)
    messages = [
        {"role": "system", "content": context},
        {"role": "user", "content": str(question)}
    ]
    completion = client.stream(messages)
    # Accumulate the streamed response
    response_text = ""
    for chunk in completion:
        if chunk.content is not None:
            response_text += chunk.content
    return response_text
# Streamlit UI
st.title("Chat with HyDE + Rerank RAG")
question = st.text_input("Enter your question:")
if st.button("Submit"):
    if question:
        st.write("**RAG Response:**")
        response = query_model_with_context(question)
        st.write(response)
    else:
        st.warning("Please enter a question.")
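# Launch the app with: streamlit run app.py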