lakshmivairamani committed on
Commit
8bac072
·
verified ·
1 Parent(s): b534452

Upload 16 files

Browse files
config/__pycache__/settings.cpython-310.pyc ADDED
Binary file (424 Bytes). View file
 
config/__pycache__/settings.cpython-311.pyc ADDED
Binary file (511 Bytes). View file
 
config/__pycache__/settings.cpython-312.pyc ADDED
Binary file (444 Bytes). View file
 
config/settings.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ import os
2
+
3
class Settings:
    """Application-wide configuration values."""

    # SQLAlchemy connection URI for the application database.
    # The DB_URI environment variable takes precedence so that deployments
    # can supply their own connection string without editing source.
    # NOTE(review): the fallback literal embeds credentials in source control;
    # it is kept only for backward compatibility -- rotate the password and
    # drop the fallback once every deployment sets DB_URI.
    DB_URI = os.getenv(
        "DB_URI",
        "mysql+mysqlconnector://redmindgen:51([email protected]:3306/collegedb",
    )
logs/redmindgen.log ADDED
The diff for this file is too large to render. See raw diff
 
services/__pycache__/chat_service.cpython-310.pyc ADDED
Binary file (6.05 kB). View file
 
services/__pycache__/chat_service.cpython-311.pyc ADDED
Binary file (9.56 kB). View file
 
services/__pycache__/chat_service.cpython-312.pyc ADDED
Binary file (8.75 kB). View file
 
services/__pycache__/file_upload_service.cpython-310.pyc ADDED
Binary file (5.25 kB). View file
 
services/__pycache__/file_upload_service.cpython-312.pyc ADDED
Binary file (8.64 kB). View file
 
services/__pycache__/multidoc_files_upload.cpython-310.pyc ADDED
Binary file (4.42 kB). View file
 
services/__pycache__/multidoc_files_upload.cpython-311.pyc ADDED
Binary file (8.14 kB). View file
 
services/chat_service.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ from dotenv import load_dotenv
4
+ from langchain.memory import ConversationSummaryMemory
5
+ from langchain_core.prompts import ChatPromptTemplate
6
+ from langchain_community.utilities import SQLDatabase
7
+ from langchain_core.output_parsers import StrOutputParser
8
+ from langchain_core.runnables import RunnablePassthrough
9
+ from langchain_openai import ChatOpenAI
10
+ from langchain_openai import OpenAIEmbeddings
11
+ from langchain.agents import create_tool_calling_agent, AgentExecutor, Tool
12
+ from langchain_community.vectorstores import FAISS
13
+ from config.settings import Settings
14
+
15
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# OpenAI API key shared by the chat model and the embeddings client.
open_api_key_token = os.environ.get('OPENAI_API_KEY')

# The database URI comes from config.settings
# (an earlier revision read it from the POST_DB_URI environment variable).
db_uri = Settings.DB_URI
20
+
21
class ChatAgentService:
    """Question-answering agent backed by a SQL database and FAISS document indexes.

    The agent is given two tools:

    * ``DatabaseQuery`` -- asks the LLM to write a SQL query for the question
      and runs it against the database at ``db_uri``.
    * ``DocumentData`` -- runs a similarity search over every FAISS index
      found under the directory named by the ``VECTOR_DB_PATH`` environment
      variable.
    """

    def __init__(self):
        """Connect to the database and build the LLM, memory, tools and agent."""
        # Database setup
        self.db = SQLDatabase.from_uri(db_uri)
        self.llm = ChatOpenAI(
            model="gpt-3.5-turbo-0125",
            api_key=open_api_key_token,
            max_tokens=150,
            temperature=0.2,
        )
        self.memory = ConversationSummaryMemory(llm=self.llm, return_messages=True)

        # Tools setup
        self.tools = [
            Tool(
                name="DatabaseQuery",
                func=self.database_tool,
                description="Queries the SQL database using dynamically generated SQL queries based on user questions. Aimed to retrieve structured data like counts, specific records, or summaries from predefined schemas.",
                tool_choice="required",
            ),
            Tool(
                name="DocumentData",
                func=self.document_data_tool,
                description="Searches through indexed documents to find relevant information based on user queries. Handles unstructured data from various document formats like PDF, DOCX, or TXT files.",
                tool_choice="required",
            ),
        ]

        # Agent setup
        prompt_template = self.setup_prompt()
        self.agent = create_tool_calling_agent(
            self.llm.bind(memory=self.memory), self.tools, prompt_template
        )
        self.agent_executor = AgentExecutor(
            agent=self.agent, tools=self.tools, memory=self.memory, verbose=True
        )

    def setup_prompt(self):
        """Return the agent prompt with ``input`` and ``agent_scratchpad`` placeholders."""
        prompt_template = """
        You are an assistant that helps with database queries and document retrieval.
        Please base your responses strictly on available data and avoid assumptions.
        If the question pertains to numerical data or structured queries, use the DatabaseQuery tool.
        If the question relates to content within various documents, use the DocumentData tool.
        Question: {input}
        {agent_scratchpad}
        """
        return ChatPromptTemplate.from_template(prompt_template)

    def database_tool(self, question):
        """Answer *question* by generating a SQL query and executing it.

        Returns the query result, or an error message string when execution
        failed, so that the agent always receives a textual observation
        (mirroring ``document_data_tool``).
        """
        sql_query = self.generate_sql_query(question)
        result = self.run_query(sql_query)
        if result is None:
            return "Error processing database query."
        return result

    def get_schema(self, _):
        """Return the database table info; the argument is ignored.

        The unused parameter lets this method be passed directly to
        ``RunnablePassthrough.assign``, which calls it with the chain input.
        """
        return self.db.get_table_info()

    def generate_sql_query(self, question):
        """Ask the LLM for a SQL query that answers *question* given the schema."""
        template_query_generation = """Generate a SQL query to answer the user's question based on the available database schema.
        {schema}
        Question: {question}
        SQL Query:"""

        prompt_query_generation = ChatPromptTemplate.from_template(template_query_generation)
        # The schema is filled in by the chain itself (via get_schema), so
        # only the question has to be supplied here.
        input_data = {'question': question}
        sql_chain = (
            RunnablePassthrough.assign(schema=self.get_schema)
            | prompt_query_generation
            | self.llm.bind(stop="\nSQL Result:")
            | StrOutputParser()
        )
        return sql_chain.invoke(input_data)

    def run_query(self, query):
        """Execute *query* against the database.

        Returns the result of ``self.db.run(query)``, or ``None`` when
        execution raised (the error is logged).
        """
        # NOTE(review): the SQL text is produced by the LLM from user input and
        # executed as-is -- consider connecting with a read-only DB account.
        try:
            logging.info("Executing SQL query: %s", query)
            result = self.db.run(query)
            logging.info("Query successful: %s", result)
            return result
        except Exception as e:
            logging.error("Error executing query: %s, Error: %s", query, str(e))
            return None

    def document_data_tool(self, query):
        """Search every discovered FAISS index for *query*.

        Returns the per-index results joined by newlines, or a fixed error
        message when anything goes wrong.
        """
        try:
            logging.info("Searching documents for query: %s", query)
            embeddings = OpenAIEmbeddings(api_key=open_api_key_token)
            index_paths = self.find_index_for_document(query)
            responses = []
            for index_path in index_paths:
                # NOTE(review): allow_dangerous_deserialization opts in to
                # pickle-based loading; only load indexes this application
                # wrote itself.
                vector_store = FAISS.load_local(
                    index_path, embeddings, allow_dangerous_deserialization=True
                )
                responses.append(self.query_vector_store(vector_store, query))
            logging.info("Document search results: %s", responses)
            return "\n".join(responses)
        except Exception as e:
            logging.error("Error in document data tool for query: %s, Error: %s", query, str(e))
            return "Error processing document query."

    def find_index_for_document(self, query):
        """Return the directories below ``VECTOR_DB_PATH`` that contain ``index.faiss``.

        *query* is currently unused. Each returned path ends with a path
        separator. An empty list is returned when ``VECTOR_DB_PATH`` is unset.
        """
        base_path = os.getenv('VECTOR_DB_PATH')
        if not base_path:
            # os.walk(None) would raise; treat a missing setting as "no indexes".
            logging.warning("VECTOR_DB_PATH is not set; no document indexes to search")
            return []

        index_paths = []
        for root, dirs, _files in os.walk(base_path):
            for dir_name in dirs:
                candidate = os.path.join(root, dir_name)
                if 'index.faiss' in os.listdir(candidate):
                    index_paths.append(os.path.join(candidate, ''))
        return index_paths

    def query_vector_store(self, vector_store, query):
        """Return the text of the documents most similar to *query*, blank-line separated."""
        docs = vector_store.similarity_search(query)
        return '\n\n'.join(doc.page_content for doc in docs)

    def answer_question(self, user_question):
        """Run the agent on *user_question* and return its textual answer.

        Errors are logged and reported back as an ``An error occurred: ...``
        string instead of being raised.
        """
        try:
            logging.info("Received question: %s", user_question)
            response = self.agent_executor.invoke({"input": user_question})
            output_response = response.get("output", "No valid response generated.")
            logging.info("Response generated: %s", output_response)
            return output_response
        except Exception as e:
            logging.error("Error processing question: %s, Error: %s", user_question, str(e))
            return f"An error occurred: {str(e)}"
137
+
services/file_upload_service.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import os
3
+ import tempfile
4
+ import hashlib
5
+ import json
6
+ import logging
7
+ import pandas as pd
8
+ from datetime import datetime
9
+ from dotenv import load_dotenv
10
+ from langchain_community.vectorstores import FAISS
11
+ from langchain_openai import OpenAIEmbeddings
12
+ from langchain.text_splitter import CharacterTextSplitter
13
+ from PyPDF2 import PdfReader
14
+ from docx import Document
15
+ # from transformers import pipeline
16
+
17
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# OpenAI API key used by the embeddings client.
open_api_key_token = os.environ.get('OPENAI_API_KEY')
20
+
21
+
22
class FileHandler:
    """Turn uploaded documents into FAISS vector stores saved under ``vector_db_path``.

    Supported inputs are CSV/XLSX tables and PDF/DOCX/TXT documents. Each
    upload is keyed by file name plus an MD5 of its content, so re-uploading
    identical content under the same name is detected and skipped.
    """

    def __init__(self, vector_db_path):
        """Remember the storage directory and create the embeddings client."""
        self.vector_db_path = vector_db_path
        self.embeddings = OpenAIEmbeddings(api_key=open_api_key_token)
        # No summarization pipeline is configured; summarize_text() returns
        # its input unchanged until one is assigned here.
        self.summarizer = None

    def prepare_metadata_string(self, document_name, document_description, department, version, last_updated):
        """Format the document metadata as a text trailer appended to the content."""
        return (
            f"\nDocument Name: {document_name}"
            f"\nDocument Description: {document_description}"
            f"\nDepartment: {department}"
            f"\nVersion: {version}"
            f"\nLast Updated: {last_updated}"
        )

    async def handle_file_upload(self, file, document_name, document_description, department, version, last_updated):
        """Index an uploaded file and persist its vector store and metadata.

        ``file`` must provide ``filename``, ``content_type`` and an awaitable
        ``read()`` (presumably a FastAPI ``UploadFile`` -- confirm with caller).

        Returns a dict describing either the newly created store or, when the
        same content was already indexed, the existing one with status
        ``'skipped - duplicate'``.
        """
        content = await file.read()
        file_hash = hashlib.md5(content).hexdigest()
        # The file name is client-supplied: keep only its final component so
        # it cannot redirect the store outside vector_db_path.
        safe_name = os.path.basename(file.filename)
        file_key = f"{safe_name}_{file_hash}"
        vector_store_path = os.path.join(self.vector_db_path, f"{file_key}.vectorstore")
        metadata_path = os.path.join(self.vector_db_path, f"{file_key}.metadata.json")

        metadata_string = self.prepare_metadata_string(
            document_name, document_description, department, version, last_updated
        )

        if os.path.exists(vector_store_path) and os.path.exists(metadata_path):
            with open(metadata_path, 'r') as md_file:
                metadata = json.load(md_file)
            return {'path': vector_store_path, 'metadata': metadata, 'status': 'skipped - duplicate'}

        if file.filename.lower().endswith(('.csv', '.xlsx')):
            texts = self.load_and_split_table(content, file.filename, metadata_string)
        else:
            texts = await self.load_and_split_text(content, file.filename, metadata_string)

        vector_store = self.create_vector_store(texts)
        vector_store.save_local(vector_store_path)

        metadata = {
            'filename': file.filename,
            'document_name': document_name,
            'document_description': document_description,
            'department': department,
            'version': version,
            'last_updated': last_updated,
            'hash': file_hash,
            'upload_date': datetime.now().isoformat(),
            'file_path': vector_store_path,
            'file_size': len(content),
            'content_type': file.content_type,
        }

        with open(metadata_path, 'w') as md_file:
            json.dump(metadata, md_file)

        return {"message": "File processed and vector store created successfully", "file_metadata": metadata}

    def summarize_text(self, text):
        """Return a summary of *text*, or *text* itself when summarization is unavailable or fails."""
        summarizer = getattr(self, 'summarizer', None)
        if summarizer is None:
            return text
        try:
            summary = summarizer(text, max_length=150, min_length=10, do_sample=False)
            logging.info("Text summarization successful")
            return summary[0]['summary_text']
        except Exception as e:
            logging.error(f"Error in summarization: {str(e)}")
            return text  # Return original text if summarization is not possible

    def load_and_split_table(self, content, filename, metadata_string):
        """Render a CSV or Excel upload as text, append the metadata and split it into chunks."""
        if filename.lower().endswith('.csv'):
            df = pd.read_csv(io.StringIO(content.decode('utf-8')))
        else:  # Excel
            df = pd.read_excel(io.BytesIO(content))
        text = df.to_string(index=False)  # Convert DataFrame to string
        text += metadata_string  # Append metadata to the text
        return self.split_text(text)

    async def load_and_split_text(self, content, filename, metadata_string):
        """Extract and chunk the text of a PDF, DOCX or TXT upload.

        The content is written to a temporary file for the format-specific
        loader; the file is always removed afterwards.

        Raises:
            ValueError: if *filename* does not end in .pdf, .docx or .txt.
        """
        loaders = (
            ('.pdf', self.load_and_split_pdf),
            ('.docx', self.load_and_split_docx),
            ('.txt', self.load_and_split_txt),
        )
        lowered = filename.lower()
        loader = next((fn for suffix, fn in loaders if lowered.endswith(suffix)), None)
        if loader is None:
            raise ValueError(f"Unsupported file type: {filename}")

        # Only the extension goes into the temp-file suffix: the full
        # client-supplied name could contain path separators.
        suffix = os.path.splitext(os.path.basename(filename))[1]
        with tempfile.NamedTemporaryFile(delete=False, mode='w+b', suffix=suffix) as temp_file:
            temp_file.write(content)
            temp_file_path = temp_file.name

        # The temp file is closed at this point, so the loader can reopen it.
        try:
            return await loader(temp_file_path, metadata_string)
        finally:
            os.unlink(temp_file_path)  # Remove the temp file even if parsing failed

    async def load_and_split_pdf(self, pdf_path, metadata_string):
        """Concatenate the text of every PDF page, append the metadata and split into chunks."""
        reader = PdfReader(pdf_path)
        # extract_text() may yield nothing for a page; treat that as empty text.
        text = ''.join(page.extract_text() or "" for page in reader.pages)
        text += metadata_string  # Append metadata to the text
        return self.split_text(text)

    async def load_and_split_docx(self, docx_path, metadata_string):
        """Join the non-empty DOCX paragraphs, append the metadata and split into chunks."""
        doc = Document(docx_path)
        text = '\n'.join(paragraph.text for paragraph in doc.paragraphs if paragraph.text)
        text += metadata_string  # Append metadata to the text
        return self.split_text(text)

    async def load_and_split_txt(self, txt_path, metadata_string):
        """Read a UTF-8 text file, append the metadata and split into chunks."""
        with open(txt_path, 'r', encoding='utf-8') as file:
            text = file.read()
        text += metadata_string  # Append metadata to the text
        return self.split_text(text)

    def split_text(self, text):
        """Split *text* with ``CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)``."""
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        return text_splitter.split_text(text)

    def create_vector_store(self, texts):
        """Embed *texts* and return a new FAISS vector store built from them."""
        return FAISS.from_texts(texts, self.embeddings)
static/img/AI.jpg ADDED
static/img/redmindlogo3.jpg ADDED