rakeshkumar1812
committed on
Upload three files for URL RAG

- app.py +67 -0
- requirements.txt +20 -0
- utils.py +63 -0
app.py
ADDED
@@ -0,0 +1,67 @@
import streamlit as st
import utils


# https://github.com/serkanyasr/RAG-with-LangChain-URL-PDF/blob/main/utils.py


st.set_page_config(layout="wide")
st.markdown("<h1 style='font-size:24px;'>RAG with LangChain & GenAI: Any URL</h1>", unsafe_allow_html=True)
# st.title("RAG with LangChain & GenAI: Any URL")

# URL text box for user input
url_input = st.text_input("Enter a URL to be queried:", "")

# Input text box for user input
user_input = st.text_input("Enter your question below:", "")

# Display the user input
# st.write("You entered:", user_input)
# st.write("URL entered:", url_input)
submit_btn = st.button(label="Submit", key="url_btn")

if submit_btn:
    with st.spinner("Processing..."):
        st.success("Response: Answering with RAG...")
        response = utils.rag_with_url(url_input, user_input)
        st.markdown(response)


# st.title("Retrieval-Augmented Generation (RAG) with LangChain: PDF")
# st.divider()

# col_input, col_rag, col_normal = st.columns([3, 5, 5])
# with col_input:
#     selected_file = st.file_uploader("PDF File", type=["pdf"])
#     st.divider()
#     prompt = st.text_input("Prompt", key="pdf_prompt")
#     st.divider()
#     submit_btn = st.button(label="Submit", key="pdf_btn")

# if submit_btn:
#     with col_rag:
#         with st.spinner("Processing..."):
#             st.success("Response: Answering with RAG...")
#             response, relevant_documents = utils.rag_with_pdf(file_path=f"./data/{selected_file.name}",
#                                                               prompt=prompt)
#             st.markdown(response)
#             st.divider()
#             st.info("Documents")
#             for doc in relevant_documents:
#                 st.caption(doc.page_content)
#                 st.markdown(f"Source: {doc.metadata}")
#             st.divider()

#     with col_normal:
#         with st.spinner("Processing..."):
#             st.info("Response: Answering without RAG...")
#             response = utils.ask_gemini(prompt)
#             st.markdown(response)
#             st.divider()
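A note on the submit flow: the button handler calls utils.rag_with_url even when one of the text boxes is empty, and st.success("Response: Answering with RAG...") fires before the answer exists. A minimal guard, as a sketch (the warning text is my own wording; the app itself is launched with `streamlit run app.py`):

if submit_btn:
    if not url_input.strip() or not user_input.strip():
        st.warning("Please enter both a URL and a question.")
    else:
        with st.spinner("Processing..."):
            # Only report success once the RAG answer has been produced.
            response = utils.rag_with_url(url_input, user_input)
            st.success("Response: Answering with RAG...")
            st.markdown(response)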
requirements.txt
ADDED
@@ -0,0 +1,20 @@
langchain
python-dotenv
langchain-openai
langchain-cohere
langchain-google-genai
openai
streamlit
python-dotenv
bs4
cohere
faiss-cpu
pypdf
huggingface_hub
langchain_community

unstructured
tiktoken
libmagic
python-magic
python-magic-bin
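For completeness: utils.py loads its credentials with python-dotenv, so a .env file providing GOOGLE_AI_API_KEY and HF_API_KEY is expected next to the app. A small sanity check, as a sketch (the variable names come from utils.py; the script itself is mine):

import os
from dotenv import load_dotenv

load_dotenv()
for key in ("GOOGLE_AI_API_KEY", "HF_API_KEY"):
    # Fail fast if a key utils.py reads at import time is missing.
    if not os.getenv(key):
        raise RuntimeError(f"Missing {key} in .env")
print("Both API keys found.")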
utils.py
ADDED
@@ -0,0 +1,63 @@
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import OpenAIEmbeddings
from langchain_cohere import CohereEmbeddings
from langchain_openai import OpenAI
from langchain_community.document_loaders.web_base import WebBaseLoader
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_community.vectorstores.faiss import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.huggingface import HuggingFaceInferenceAPIEmbeddings
import os

from dotenv import load_dotenv

load_dotenv()

GEMINI_API_KEY = os.getenv("GOOGLE_AI_API_KEY")
HF_API_KEY = os.getenv("HF_API_KEY")

llm_gemini = ChatGoogleGenerativeAI(google_api_key=GEMINI_API_KEY, model="gemini-pro")
embeddings_hf = HuggingFaceInferenceAPIEmbeddings(api_key=HF_API_KEY, model_name="sentence-transformers/all-MiniLM-L6-v2")

# OPEN_AI_API_KEY = os.getenv("OPEN_AI_API_KEY")
# COHERE_API_KEY = os.getenv("COHERE_API_KEY")
# llm_openai = OpenAI(api_key=OPEN_AI_API_KEY, model="gpt-3.5-turbo")
# embeddings_open_ai = OpenAIEmbeddings(api_key=OPEN_AI_API_KEY)  # OPEN_AI
# embeddings_cohere = CohereEmbeddings(api_key=COHERE_API_KEY, model="embed-multilingual-v3.0")  # embed-english-v3.0


def ask_gemini(prompt):
    # Plain LLM call with no retrieval context.
    AI_Response = llm_gemini.invoke(prompt)
    return AI_Response.content


def rag_with_url(target_url, prompt):
    # Load the page, split it into overlapping chunks, and index them in FAISS.
    loader = WebBaseLoader(target_url)
    raw_document = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)
    split_documents = text_splitter.split_documents(raw_document)
    vector_store = FAISS.from_documents(split_documents, embeddings_hf)
    # Retrieve the chunks most relevant to the question and append them to the prompt.
    retriever = vector_store.as_retriever()
    relevant_documents = retriever.get_relevant_documents(prompt)
    final_prompt = prompt + " " + " ".join([doc.page_content for doc in relevant_documents])
    AI_Response = llm_gemini.invoke(final_prompt)
    return AI_Response.content


# def rag_with_pdf(file_path, prompt):
#     loader = PyPDFLoader(file_path)
#     raw_document = loader.load()
#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)
#     split_documents = text_splitter.split_documents(raw_document)
#     vector_store = FAISS.from_documents(split_documents, embeddings_hf)
#     retriever = vector_store.as_retriever()
#     relevant_documents = retriever.get_relevant_documents(prompt)
#     final_prompt = prompt + " " + " ".join([doc.page_content for doc in relevant_documents])
#     AI_Response = llm_gemini.invoke(final_prompt)
#     return AI_Response.content, relevant_documents
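A quick way to exercise the helper outside Streamlit, assuming the .env keys above are set (the URL and question are placeholders of mine):

import utils

# Smoke test: index one page and ask a question against it.
answer = utils.rag_with_url(
    "https://en.wikipedia.org/wiki/Retrieval-augmented_generation",  # placeholder URL
    "What problem does retrieval-augmented generation solve?",
)
print(answer)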