Spaces:
Sleeping
Sleeping
lakshmivairamani
committed on
Upload 16 files
Browse files- config/__pycache__/settings.cpython-310.pyc +0 -0
- config/__pycache__/settings.cpython-311.pyc +0 -0
- config/__pycache__/settings.cpython-312.pyc +0 -0
- config/settings.py +4 -0
- logs/redmindgen.log +0 -0
- services/__pycache__/chat_service.cpython-310.pyc +0 -0
- services/__pycache__/chat_service.cpython-311.pyc +0 -0
- services/__pycache__/chat_service.cpython-312.pyc +0 -0
- services/__pycache__/file_upload_service.cpython-310.pyc +0 -0
- services/__pycache__/file_upload_service.cpython-312.pyc +0 -0
- services/__pycache__/multidoc_files_upload.cpython-310.pyc +0 -0
- services/__pycache__/multidoc_files_upload.cpython-311.pyc +0 -0
- services/chat_service.py +137 -0
- services/file_upload_service.py +141 -0
- static/img/AI.jpg +0 -0
- static/img/redmindlogo3.jpg +0 -0
config/__pycache__/settings.cpython-310.pyc
ADDED
Binary file (424 Bytes). View file
|
|
config/__pycache__/settings.cpython-311.pyc
ADDED
Binary file (511 Bytes). View file
|
|
config/__pycache__/settings.cpython-312.pyc
ADDED
Binary file (444 Bytes). View file
|
|
config/settings.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
class Settings:
    """Application-wide configuration constants.

    Attributes are read once, when this module is imported.
    """

    # Database connection URI. The ``DB_URI`` environment variable, when set,
    # takes precedence so credentials do not have to live in source control;
    # otherwise the original literal is used unchanged.
    # NOTE(review): the password/host part of the fallback literal looks
    # garbled ("[email protected]") -- confirm the real value, and consider
    # removing the committed credential entirely.
    DB_URI = os.getenv(
        "DB_URI",
        "mysql+mysqlconnector://redmindgen:51([email protected]:3306/collegedb",
    )
|
logs/redmindgen.log
ADDED
The diff for this file is too large to render.
See raw diff
|
|
services/__pycache__/chat_service.cpython-310.pyc
ADDED
Binary file (6.05 kB). View file
|
|
services/__pycache__/chat_service.cpython-311.pyc
ADDED
Binary file (9.56 kB). View file
|
|
services/__pycache__/chat_service.cpython-312.pyc
ADDED
Binary file (8.75 kB). View file
|
|
services/__pycache__/file_upload_service.cpython-310.pyc
ADDED
Binary file (5.25 kB). View file
|
|
services/__pycache__/file_upload_service.cpython-312.pyc
ADDED
Binary file (8.64 kB). View file
|
|
services/__pycache__/multidoc_files_upload.cpython-310.pyc
ADDED
Binary file (4.42 kB). View file
|
|
services/__pycache__/multidoc_files_upload.cpython-311.pyc
ADDED
Binary file (8.14 kB). View file
|
|
services/chat_service.py
ADDED
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import logging
|
3 |
+
from dotenv import load_dotenv
|
4 |
+
from langchain.memory import ConversationSummaryMemory
|
5 |
+
from langchain_core.prompts import ChatPromptTemplate
|
6 |
+
from langchain_community.utilities import SQLDatabase
|
7 |
+
from langchain_core.output_parsers import StrOutputParser
|
8 |
+
from langchain_core.runnables import RunnablePassthrough
|
9 |
+
from langchain_openai import ChatOpenAI
|
10 |
+
from langchain_openai import OpenAIEmbeddings
|
11 |
+
from langchain.agents import create_tool_calling_agent, AgentExecutor, Tool
|
12 |
+
from langchain_community.vectorstores import FAISS
|
13 |
+
from config.settings import Settings
|
14 |
+
|
15 |
+
# Load variables from a .env file (if one is present) into the process environment.
|
16 |
+
load_dotenv()
|
17 |
+
# OpenAI API key used by ChatOpenAI and OpenAIEmbeddings in this module;
# os.getenv returns None when OPENAI_API_KEY is not set.
open_api_key_token = os.getenv('OPENAI_API_KEY')
|
18 |
+
#db_uri = os.getenv('POST_DB_URI')
|
19 |
+
# Database connection URI, taken from Settings in place of the
# commented-out POST_DB_URI lookup above.
db_uri = Settings.DB_URI
|
20 |
+
|
21 |
+
class ChatAgentService:
    """Answer user questions through a tool-calling LangChain agent.

    The agent is given two tools:

    * ``DatabaseQuery`` -- asks the LLM to write SQL for the question and
      runs it against the database at ``db_uri``.
    * ``DocumentData`` -- runs a similarity search over every FAISS index
      found under the directory named by ``VECTOR_DB_PATH``.
    """

    def __init__(self):
        """Connect to the database and build the LLM, memory, tools and agent."""
        # Database setup
        self.db = SQLDatabase.from_uri(db_uri)
        self.llm = ChatOpenAI(
            model="gpt-3.5-turbo-0125",
            api_key=open_api_key_token,
            max_tokens=150,
            temperature=0.2,
        )
        self.memory = ConversationSummaryMemory(llm=self.llm, return_messages=True)

        # Tools setup
        # NOTE(review): ``tool_choice`` is forwarded to ``Tool`` exactly as in
        # the original code; confirm that ``Tool`` actually honours it.
        self.tools = [
            Tool(
                name="DatabaseQuery",
                func=self.database_tool,
                description="Queries the SQL database using dynamically generated SQL queries based on user questions. Aimed to retrieve structured data like counts, specific records, or summaries from predefined schemas.",
                tool_choice="required",
            ),
            Tool(
                name="DocumentData",
                func=self.document_data_tool,
                description="Searches through indexed documents to find relevant information based on user queries. Handles unstructured data from various document formats like PDF, DOCX, or TXT files.",
                tool_choice="required",
            ),
        ]

        # Agent setup
        prompt_template = self.setup_prompt()
        # NOTE(review): binding ``memory`` onto the LLM is preserved from the
        # original; verify it is needed given the executor also gets memory.
        self.agent = create_tool_calling_agent(
            self.llm.bind(memory=self.memory), self.tools, prompt_template
        )
        self.agent_executor = AgentExecutor(
            agent=self.agent, tools=self.tools, memory=self.memory, verbose=True
        )

    def setup_prompt(self):
        """Return the agent prompt with ``input`` and ``agent_scratchpad`` slots."""
        # Plain (non-f) string: the braces below are template placeholders
        # filled in by ChatPromptTemplate, not Python interpolation.
        prompt_template = (
            "\n"
            "You are an assistant that helps with database queries and document retrieval.\n"
            "Please base your responses strictly on available data and avoid assumptions.\n"
            "If the question pertains to numerical data or structured queries, use the DatabaseQuery tool.\n"
            "If the question relates to content within various documents, use the DocumentData tool.\n"
            "Question: {input}\n"
            "{agent_scratchpad}\n"
        )
        return ChatPromptTemplate.from_template(prompt_template)

    def database_tool(self, question):
        """Generate SQL for ``question`` and return the result of running it."""
        sql_query = self.generate_sql_query(question)
        return self.run_query(sql_query)

    def get_schema(self, _):
        """Return the database's table info; the argument is ignored.

        The unused parameter lets this method be passed directly to
        ``RunnablePassthrough.assign``, which calls it with the chain input.
        """
        return self.db.get_table_info()

    def generate_sql_query(self, question):
        """Ask the LLM for a SQL query that answers ``question``.

        Returns the model output as a string.
        """
        template_query_generation = (
            "Generate a SQL query to answer the user's question based on the available database schema.\n"
            "{schema}\n"
            "Question: {question}\n"
            "SQL Query:"
        )
        prompt_query_generation = ChatPromptTemplate.from_template(template_query_generation)

        # The schema is injected by the chain itself (via get_schema), so it
        # is not fetched separately here.
        sql_chain = (
            RunnablePassthrough.assign(schema=self.get_schema)
            | prompt_query_generation
            | self.llm.bind(stop="\nSQL Result:")
            | StrOutputParser()
        )
        return sql_chain.invoke({'question': question})

    def run_query(self, query):
        """Run ``query`` against the database.

        Returns the value of ``self.db.run(query)``, or ``None`` if running
        the query raised; the failure is logged rather than propagated.
        """
        try:
            logging.info(f"Executing SQL query: {query}")
            result = self.db.run(query)
            logging.info(f"Query successful: {result}")
            return result
        except Exception as e:
            logging.error(f"Error executing query: {query}, Error: {str(e)}")
            return None

    def document_data_tool(self, query):
        """Search every discovered FAISS index for ``query``.

        Returns the per-index results joined with newlines, or a fixed error
        message if anything raised along the way.
        """
        try:
            logging.info(f"Searching documents for query: {query}")
            embeddings = OpenAIEmbeddings(api_key=open_api_key_token)
            responses = []
            for index_path in self.find_index_for_document(query):
                # SECURITY NOTE: allow_dangerous_deserialization=True makes
                # FAISS unpickle the stored index; only load indexes this
                # application wrote itself.
                vector_store = FAISS.load_local(
                    index_path, embeddings, allow_dangerous_deserialization=True
                )
                responses.append(self.query_vector_store(vector_store, query))
            logging.info(f"Document search results: {responses}")
            return "\n".join(responses)
        except Exception as e:
            logging.error(f"Error in document data tool for query: {query}, Error: {str(e)}")
            return "Error processing document query."

    def find_index_for_document(self, query):
        """Return every directory under ``VECTOR_DB_PATH`` holding ``index.faiss``.

        ``query`` is currently unused; all indexes are returned. Each path
        ends with a trailing separator. Returns an empty list when
        ``VECTOR_DB_PATH`` is not set.
        """
        base_path = os.getenv('VECTOR_DB_PATH')
        if not base_path:
            # Without this guard ``None`` would be handed to os.walk.
            logging.warning("VECTOR_DB_PATH is not set; no document indexes to search")
            return []

        index_paths = []
        for root, dirs, _files in os.walk(base_path):
            for dir_name in dirs:
                candidate = os.path.join(root, dir_name)
                if 'index.faiss' in os.listdir(candidate):
                    index_paths.append(os.path.join(candidate, ''))
        return index_paths

    def query_vector_store(self, vector_store, query):
        """Return the page contents of the matches for ``query``, blank-line separated."""
        docs = vector_store.similarity_search(query)
        return '\n\n'.join(doc.page_content for doc in docs)

    def answer_question(self, user_question):
        """Run the agent on ``user_question`` and return its text answer.

        Returns the agent's ``output`` value (or a fallback message if it is
        missing). Any exception is logged and reported back as a string
        instead of being raised.
        """
        try:
            logging.info(f"Received question: {user_question}")
            response = self.agent_executor.invoke({"input": user_question})
            output_response = response.get("output", "No valid response generated.")
            logging.info(f"Response generated: {output_response}")
            return output_response
        except Exception as e:
            logging.error(f"Error processing question: {user_question}, Error: {str(e)}")
            return f"An error occurred: {str(e)}"
|
137 |
+
|
services/file_upload_service.py
ADDED
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import io
|
2 |
+
import os
|
3 |
+
import tempfile
|
4 |
+
import hashlib
|
5 |
+
import json
|
6 |
+
import logging
|
7 |
+
import pandas as pd
|
8 |
+
from datetime import datetime
|
9 |
+
from dotenv import load_dotenv
|
10 |
+
from langchain_community.vectorstores import FAISS
|
11 |
+
from langchain_openai import OpenAIEmbeddings
|
12 |
+
from langchain.text_splitter import CharacterTextSplitter
|
13 |
+
from PyPDF2 import PdfReader
|
14 |
+
from docx import Document
|
15 |
+
# from transformers import pipeline
|
16 |
+
|
17 |
+
# Load variables from a .env file (if one is present) into the process environment.
|
18 |
+
load_dotenv()
|
19 |
+
# OpenAI API key used by OpenAIEmbeddings in FileHandler;
# os.getenv returns None when OPENAI_API_KEY is not set.
open_api_key_token = os.getenv('OPENAI_API_KEY')
|
20 |
+
|
21 |
+
|
22 |
+
class FileHandler:
    """Convert uploaded documents into FAISS vector stores on disk.

    Each upload is keyed by its filename plus the MD5 of its bytes. The
    vector store and a JSON metadata file are written under
    ``vector_db_path``; re-uploading identical bytes under the same name is
    detected and skipped.
    """

    def __init__(self, vector_db_path):
        """Remember the storage directory and create the embeddings client."""
        self.vector_db_path = vector_db_path
        self.embeddings = OpenAIEmbeddings(api_key=open_api_key_token)
        # No summarizer is configured here; summarize_text() returns its
        # input unchanged unless a ``summarizer`` attribute is assigned.

    def prepare_metadata_string(self, document_name, document_description, department, version, last_updated):
        """Format the document metadata as the text block appended to content."""
        metadata_string = (
            f"\nDocument Name: {document_name}"
            f"\nDocument Description: {document_description}"
            f"\nDepartment: {department}"
            f"\nVersion: {version}"
            f"\nLast Updated: {last_updated}"
        )
        return metadata_string

    async def handle_file_upload(self, file, document_name, document_description, department, version, last_updated):
        """Index an uploaded file and persist its vector store and metadata.

        ``file`` must provide an awaitable ``read()`` plus ``filename`` and
        ``content_type`` attributes. Returns a dict describing either the
        newly created store or, for a duplicate, the existing one with
        ``status`` set to ``'skipped - duplicate'``.

        Raises:
            ValueError: if the file is neither a table (.csv/.xlsx) nor a
                supported text type (.pdf/.docx/.txt).
        """
        content = await file.read()
        # MD5 serves only as a content fingerprint for de-duplication.
        file_hash = hashlib.md5(content).hexdigest()
        # The client controls the filename: drop any directory components so
        # the derived paths stay inside vector_db_path.
        safe_name = os.path.basename(file.filename)
        file_key = f"{safe_name}_{file_hash}"
        vector_store_path = os.path.join(self.vector_db_path, f"{file_key}.vectorstore")
        metadata_path = os.path.join(self.vector_db_path, f"{file_key}.metadata.json")

        metadata_string = self.prepare_metadata_string(
            document_name, document_description, department, version, last_updated
        )

        if os.path.exists(vector_store_path) and os.path.exists(metadata_path):
            with open(metadata_path, 'r') as md_file:
                metadata = json.load(md_file)
            return {'path': vector_store_path, 'metadata': metadata, 'status': 'skipped - duplicate'}

        # Extension matching is case-insensitive so "REPORT.CSV" is accepted.
        if file.filename.lower().endswith(('.csv', '.xlsx')):
            texts = self.load_and_split_table(content, file.filename, metadata_string)
        else:
            texts = await self.load_and_split_text(content, file.filename, metadata_string)

        vector_store = self.create_vector_store(texts)
        vector_store.save_local(vector_store_path)

        metadata = {
            'filename': file.filename,
            'document_name': document_name,
            'document_description': document_description,
            'department': department,
            'version': version,
            'last_updated': last_updated,
            'hash': file_hash,
            'upload_date': datetime.now().isoformat(),
            'file_path': vector_store_path,
            'file_size': len(content),
            'content_type': file.content_type,
        }

        with open(metadata_path, 'w') as md_file:
            json.dump(metadata, md_file)

        return {"message": "File processed and vector store created successfully", "file_metadata": metadata}

    def summarize_text(self, text):
        """Return a summary of ``text``, or ``text`` itself if none can be made.

        The original text is returned when no ``summarizer`` attribute has
        been set on the instance or when the summarizer raises.
        """
        summarizer = getattr(self, "summarizer", None)
        if summarizer is None:
            return text
        try:
            summary = summarizer(text, max_length=150, min_length=10, do_sample=False)
            logging.info("Text summarization successful")
            return summary[0]['summary_text']
        except Exception as e:
            logging.error(f"Error in summarization: {str(e)}")
            return text  # Return original text if summarization is not possible

    def load_and_split_table(self, content, filename, metadata_string):
        """Render a CSV or Excel upload as text and split it into chunks.

        Names ending in ``.csv`` (any case) are decoded as UTF-8 CSV; anything
        else is read as Excel.
        """
        if filename.lower().endswith('.csv'):
            df = pd.read_csv(io.StringIO(content.decode('utf-8')))
        else:  # Excel
            df = pd.read_excel(io.BytesIO(content))
        text = df.to_string(index=False) + metadata_string
        return self.split_text(text)

    async def load_and_split_text(self, content, filename, metadata_string):
        """Extract and chunk the text of a PDF, DOCX or TXT upload.

        The bytes are written to a temporary file for the format-specific
        loader; that file is removed even if the loader fails.

        Raises:
            ValueError: if ``filename`` does not end in .pdf, .docx or .txt.
        """
        loaders = {
            '.pdf': self.load_and_split_pdf,
            '.docx': self.load_and_split_docx,
            '.txt': self.load_and_split_txt,
        }
        lowered = filename.lower()
        extension = next((ext for ext in loaders if lowered.endswith(ext)), None)
        if extension is None:
            # Reject before touching the filesystem so nothing is left behind.
            raise ValueError(f"Unsupported file type: {filename!r}")

        # delete=False: the loader reopens the file by path after it is closed.
        with tempfile.NamedTemporaryFile(delete=False, mode='w+b', suffix=extension) as temp_file:
            temp_file.write(content)
            temp_file_path = temp_file.name

        try:
            return await loaders[extension](temp_file_path, metadata_string)
        finally:
            os.unlink(temp_file_path)  # Always remove the temporary file

    async def load_and_split_pdf(self, pdf_path, metadata_string):
        """Concatenate the text of every PDF page, add metadata, and chunk it."""
        reader = PdfReader(pdf_path)
        # extract_text() may yield nothing for a page; treat that as empty.
        text = ''.join(page.extract_text() or "" for page in reader.pages)
        text += metadata_string
        return self.split_text(text)

    async def load_and_split_docx(self, docx_path, metadata_string):
        """Join the non-empty paragraphs of a DOCX, add metadata, and chunk it."""
        doc = Document(docx_path)
        text = '\n'.join(paragraph.text for paragraph in doc.paragraphs if paragraph.text)
        text += metadata_string
        return self.split_text(text)

    async def load_and_split_txt(self, txt_path, metadata_string):
        """Read a UTF-8 text file, add metadata, and chunk it."""
        with open(txt_path, 'r', encoding='utf-8') as file:
            text = file.read()
        text += metadata_string
        return self.split_text(text)

    def split_text(self, text):
        """Split ``text`` into chunks of up to 1000 characters, overlapping by 200."""
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        return text_splitter.split_text(text)

    def create_vector_store(self, texts):
        """Embed ``texts`` and return a FAISS vector store built from them."""
        return FAISS.from_texts(texts, self.embeddings)
|
static/img/AI.jpg
ADDED
static/img/redmindlogo3.jpg
ADDED