alexkueck commited on
Commit
b042dde
·
verified ·
1 Parent(s): 3e7acc0

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +366 -10
utils.py CHANGED
@@ -15,6 +15,9 @@ import sys
15
  import gc
16
  from pygments.lexers import guess_lexer, ClassNotFound
17
  import time
 
 
 
18
 
19
  import gradio as gr
20
  from pypinyin import lazy_pinyin
@@ -37,9 +40,22 @@ from langchain.llms import HuggingFaceTextGenInference
37
  from langchain.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings, HuggingFaceInferenceAPIEmbeddings
38
  from langchain.tools import DuckDuckGoSearchRun
39
  from langchain.retrievers.tavily_search_api import TavilySearchAPIRetriever
40
-
 
41
  from langchain.embeddings.openai import OpenAIEmbeddings
42
  from langchain.prompts import PromptTemplate
 
 
 
 
 
 
 
 
 
 
 
 
43
  from langchain.text_splitter import RecursiveCharacterTextSplitter
44
  from langchain.vectorstores import Chroma
45
  from chromadb.errors import InvalidDimensionException
@@ -121,6 +137,13 @@ WEB_URL = "https://openai.com/research/gpt-4"
121
  YOUTUBE_URL_1 = "https://www.youtube.com/watch?v=--khbXchTeE"
122
  YOUTUBE_URL_2 = "https://www.youtube.com/watch?v=hdhZwyf24mE"
123
  #YOUTUBE_URL_3 = "https://www.youtube.com/watch?v=vw-KWfKwvTQ"
 
 
 
 
 
 
 
124
 
125
 
126
  #################################################
@@ -202,14 +225,24 @@ def document_loading_splitting():
202
  pdf_loader = create_directory_loader('.pdf', CHROMA_PDF)
203
  word_loader = create_directory_loader('.word', CHROMA_WORD)
204
 
205
- # Load the files
206
  pdf_documents = pdf_loader.load()
207
  word_documents = word_loader.load()
208
 
209
- #alle zusammen in docs...
210
- docs.extend(pdf_documents)
211
- docs.extend(word_documents)
212
 
 
 
 
 
 
 
 
 
 
 
213
 
214
  #andere loader...
215
  # Load PDF
@@ -223,20 +256,29 @@ def document_loading_splitting():
223
  #docs.extend(loader.load())
224
  ################################
225
  # Document splitting
226
- text_splitter = RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
227
- splits = text_splitter.split_documents(docs)
 
 
228
 
229
  return splits
230
 
231
  ###########################################
232
  #Chroma DB die splits ablegen - vektorisiert...
233
  def document_storage_chroma(splits):
 
 
 
 
234
  #OpenAi embeddings----------------------------------
235
- Chroma.from_documents(documents = splits, embedding = OpenAIEmbeddings(disallowed_special = ()), persist_directory = PATH_WORK + CHROMA_DIR)
236
 
237
  #HF embeddings--------------------------------------
238
  #Chroma.from_documents(documents = splits, embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", model_kwargs={"device": "cpu"}, encode_kwargs={'normalize_embeddings': False}), persist_directory = PATH_WORK + CHROMA_DIR)
239
-
 
 
 
240
  #Mongo DB die splits ablegen - vektorisiert...
241
  def document_storage_mongodb(splits):
242
  MongoDBAtlasVectorSearch.from_documents(documents = splits,
@@ -288,6 +330,9 @@ def document_retrieval_mongodb(llm, prompt):
288
  OpenAIEmbeddings(disallowed_special = ()),
289
  index_name = MONGODB_INDEX_NAME)
290
  return db
 
 
 
291
 
292
  ###############################################
293
  #Langchain anlegen
@@ -1005,4 +1050,315 @@ class CustomDocTemplate(SimpleDocTemplate):
1005
  current_date = datetime.now().strftime("%Y-%m-%d")
1006
  # Passen Sie hier die Positionierung an Ihre Bedürfnisse an
1007
  self.canv.drawRightString(550, 800, current_date) # Position anpassen
1008
- self.canv.restoreState()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  import gc
16
  from pygments.lexers import guess_lexer, ClassNotFound
17
  import time
18
+ import json
19
+ import operator
20
+ from typing import Annotated, Sequence, TypedDict
21
 
22
  import gradio as gr
23
  from pypinyin import lazy_pinyin
 
40
  from langchain.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings, HuggingFaceInferenceAPIEmbeddings
41
  from langchain.tools import DuckDuckGoSearchRun
42
  from langchain.retrievers.tavily_search_api import TavilySearchAPIRetriever
43
+ from typing import Dict, TypedDict
44
+ from langchain_core.messages import BaseMessage
45
  from langchain.embeddings.openai import OpenAIEmbeddings
46
  from langchain.prompts import PromptTemplate
47
+
48
+ from langchain import hub
49
+ from langchain.output_parsers.openai_tools import PydanticToolsParser
50
+ from langchain.prompts import PromptTemplate
51
+ from langchain.schema import Document
52
+ from langchain_community.tools.tavily_search import TavilySearchResults
53
+ from langchain_community.vectorstores import Chroma
54
+ from langchain_core.messages import BaseMessage, FunctionMessage
55
+ from langchain_core.output_parsers import StrOutputParser
56
+ from langchain_core.pydantic_v1 import BaseModel, Field
57
+ from langchain_core.runnables import RunnablePassthrough
58
+ from langchain_core.utils.function_calling import convert_to_openai_tool
59
  from langchain.text_splitter import RecursiveCharacterTextSplitter
60
  from langchain.vectorstores import Chroma
61
  from chromadb.errors import InvalidDimensionException
 
137
  YOUTUBE_URL_1 = "https://www.youtube.com/watch?v=--khbXchTeE"
138
  YOUTUBE_URL_2 = "https://www.youtube.com/watch?v=hdhZwyf24mE"
139
  #YOUTUBE_URL_3 = "https://www.youtube.com/watch?v=vw-KWfKwvTQ"
140
# Specific web pages that are loaded as an additional knowledge base.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated by Python into a single (invalid) URL.
urls = [
    "https://kkg.hamburg.de/unser-leitbild/",
    "https://kkg.hamburg.de/unsere-schulcharta/",
    "https://kkg.hamburg.de/koordination-unterrichtsentwicklung/",
    "https://kkg.hamburg.de/konzept-medien-und-it-am-kkg/",
]
147
 
148
 
149
  #################################################
 
225
  pdf_loader = create_directory_loader('.pdf', CHROMA_PDF)
226
  word_loader = create_directory_loader('.word', CHROMA_WORD)
227
 
228
+ # Load the files - pdf und word
229
  pdf_documents = pdf_loader.load()
230
  word_documents = word_loader.load()
231
 
232
+ #urls -zum Thema passend
233
+ docs_web = [WebBaseLoader(url).load() for url in urls]
234
+ docs_list = [item for sublist in docs_web for item in sublist]
235
 
236
+ #alle zusammen in docs...
237
+ #pdf_docs als Liste umschreiben, um es mit den anderen Materialien in der docs_list zusammenzubringen
238
+ pdf_list = [pdf_documents]
239
+ word_list = [word_documents]
240
+ #die neuen Dokeumente der Gesamt-Liste von material hinzufügen
241
+ #alle zusammen in docs...
242
+ for doc in pdf_list:
243
+ docs_list.extend(doc)
244
+ for doc in word_list:
245
+ docs_list.extend(doc)
246
 
247
  #andere loader...
248
  # Load PDF
 
256
  #docs.extend(loader.load())
257
  ################################
258
  # Document splitting
259
+ text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1500, chunk_overlap=250)
260
+ doc_splits = text_splitter.split_documents(docs_list)
261
+ #text_splitter = RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
262
+ #splits = text_splitter.split_documents(docs)
263
 
264
  return splits
265
 
266
  ###########################################
267
  #Chroma DB die splits ablegen - vektorisiert...
268
  def document_storage_chroma(splits):
269
+ # Add to vectorDB
270
+ vectorstore = Chroma.from_documents(documents=splits,collection_name="rag-chroma",embedding=OpenAIEmbeddings(disallowed_special = ()), persist_directory = PATH_WORK + CHROMA_DIR)
271
+ retriever = vectorstore.as_retriever(search_kwargs = {"k": 4})
272
+
273
  #OpenAi embeddings----------------------------------
274
+ #Chroma.from_documents(documents = splits, embedding = OpenAIEmbeddings(disallowed_special = ()), persist_directory = PATH_WORK + CHROMA_DIR)
275
 
276
  #HF embeddings--------------------------------------
277
  #Chroma.from_documents(documents = splits, embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", model_kwargs={"device": "cpu"}, encode_kwargs={'normalize_embeddings': False}), persist_directory = PATH_WORK + CHROMA_DIR)
278
+ return vectorstore, retriever
279
+
280
+
281
+ """
282
  #Mongo DB die splits ablegen - vektorisiert...
283
  def document_storage_mongodb(splits):
284
  MongoDBAtlasVectorSearch.from_documents(documents = splits,
 
330
  OpenAIEmbeddings(disallowed_special = ()),
331
  index_name = MONGODB_INDEX_NAME)
332
  return db
333
+ """
334
+
335
+
336
 
337
  ###############################################
338
  #Langchain anlegen
 
1050
  current_date = datetime.now().strftime("%Y-%m-%d")
1051
  # Passen Sie hier die Positionierung an Ihre Bedürfnisse an
1052
  self.canv.drawRightString(550, 800, current_date) # Position anpassen
1053
+ self.canv.restoreState()
1054
+
1055
+
1056
+ ######################################################################
1057
+ #Zustandsgraph für Langgraph, um RAG zu implementieren mit verschiedenen Zuständen
1058
+ #die durchlaufen werden:
1059
+ #1. Dokumente aus vektorstore bekommen
1060
+ #2. die Relevanz der Dokumente einschätzen
1061
+ #3. wenn zu wenig relevante infos: Frage neu formulieren
1062
+ #4. nochmal 1. und 2.
1063
+ #5. wenn nun genug relevante Dokumente: Anfrage an Modell mit den Doks
1064
+ #6. wenn nicht genug Dokumente relevant: Anfrage an Modell ohne Doks
1065
+ #####################################################################
1066
+
1067
+ # Zustandsgraph als Datenstruktur zum Umsetzen
1068
class GraphState(TypedDict):
    """
    Represents the state of our graph.

    Attributes:
        keys: A dictionary where each key is a string. The node functions in this
            file read and write entries such as "question", "documents",
            "generation", "second_trial" and "search_again" in it.
    """
    # `object` instead of the builtin function `any`, which is not a type.
    keys: Dict[str, object]
1076
+
1077
+
1078
+ #Methoden, um den Graph und die Zustände umzusetzen
1079
+ ### Nodes ###
1080
+ # die Knoten des Graphen definieren, die der Reihe nach (bzw. je nach Outcome des Vorgänger-Knotens) durchlaufen werden
1081
def retrieve(state):
    """
    Retrieve documents for the current question.

    Args:
        state (dict): The current graph state; state["keys"] must contain "question".
    Returns:
        dict: New state whose "keys" hold the retrieved "documents", the unchanged
        "question" and "second_trial", which is "ja" if the incoming state already
        carried a "second_trial" key and "nein" otherwise.
    """
    print("---RETRIEVE ---")
    state_dict = state["keys"]
    question = state_dict["question"]
    # NOTE(review): `retriever` is not defined inside this function; it has to exist
    # as a module-level name before the graph runs - confirm where it is assigned.
    documents = retriever.get_relevant_documents(question)
    # Both branches assign the flag explicitly (the former "second_trail" typo only
    # worked because the flag had been pre-initialised with "ja").
    if "second_trial" in state_dict:
        print("second time")
        second_trial = "ja"
    else:
        print("first time")
        second_trial = "nein"
    return {"keys": {"documents": documents, "second_trial": second_trial, "question": question}}
1101
+
1102
+
1103
def retrieve_redirect(state):
    """
    Pass-through node used after the question was transformed and retrieval
    is about to run again.

    Args:
        state (dict): The current graph state; state["keys"] must contain
            "question" and "documents".
    Returns:
        dict: New state with the same "documents" and "question" and with
        "second_trial" set to "ja".
    """
    print("---RETRIEVE REDIRECT---")
    current = state["keys"]
    carried_question = current["question"]
    carried_documents = current["documents"]
    new_keys = {
        "documents": carried_documents,
        "second_trial": "ja",
        "question": carried_question,
    }
    return {"keys": new_keys}
1117
+
1118
+
1119
+
1120
def generate(state):
    """
    Generate an answer from the question and the retrieved documents.

    Args:
        state (dict): The current graph state; state["keys"] must contain
            "question" and "documents".
    Returns:
        dict: New state whose "keys" hold the unchanged "documents" and "question"
        plus "generation", the string produced by the chain.
    """
    print("---GENERATE---")
    state_dict = state["keys"]
    question = state_dict["question"]
    documents = state_dict["documents"]

    # Prompt
    prompt = hub.pull("rlm/rag-prompt")

    # LLM
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.3, streaming=True)

    # Chain
    rag_chain = prompt | llm | StrOutputParser()

    # Only the text of the documents goes into the prompt; passing the raw list
    # would insert the repr of every Document object (including its metadata).
    context = "\n\n".join(doc.page_content for doc in documents)

    # Run
    generation = rag_chain.invoke({"context": context, "question": question})
    return {
        "keys": {"documents": documents, "question": question, "generation": generation}
    }
1151
+
1152
def generate_ohne(state):
    """
    Generate an answer from the question alone, without attaching documents.

    Args:
        state (dict): The current graph state; state["keys"] must contain "question".
    Returns:
        dict: New state whose "keys" hold the unchanged "question" plus
        "generation", the string produced by the chain.
    """
    print("---GENERATE OHNE---")
    state_dict = state["keys"]
    question = state_dict["question"]

    # Prompt. The template used to start with "\A", an invalid escape sequence that
    # put a literal backslash in front of the instruction text.
    prompt = PromptTemplate(
        template="""Antworte in deutsch, wenn es nicht explizit anders gefordert wird. Wenn du die Antwort nicht kennst, antworte direkt, dass du es nicht weißt.
        Versuche nicht es zu umschreiben. Versuche nicht, die Antwort zu erfinden oder aufzumocken. Halte die Antwort kurz aber ausführlich genug und exakt. \n\n
        Hier ist die Useranfrage: {question} """,
        input_variables=["question"])

    # LLM
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.3, streaming=True)

    # Chain
    llm_chain = prompt | llm | StrOutputParser()

    # Run
    generation = llm_chain.invoke({"question": question})
    return {
        "keys": {"question": question, "generation": generation}
    }
1187
+
1188
+
1189
def grade_documents(state):
    """
    Determine whether the retrieved documents are relevant to the question.

    Every document is graded individually by an LLM that is forced to call the
    `grade` tool. Documents graded "ja" are kept; all others are dropped.

    Args:
        state (dict): The current graph state; state["keys"] must contain
            "question", "documents" and "second_trial".
    Returns:
        dict: New state whose "keys" hold the filtered "documents", the unchanged
        "question" and "second_trial", and "search_again": "nein" when at least
        half of the incoming documents were graded relevant, otherwise "ja".
    """

    print("---CHECK RELEVANCE---")
    state_dict = state["keys"]
    question = state_dict["question"]
    documents = state_dict["documents"]
    second_trial = state_dict["second_trial"]

    # Data model (the class name is also the tool name enforced via tool_choice below)
    class grade(BaseModel):
        """Binary score for relevance check."""
        binary_score: str = Field(description="Relevanz Bewertung 'ja' oder 'nein'")

    # LLM
    model = ChatOpenAI(temperature=0.3, model="gpt-4-0125-preview", streaming=True)

    # Tool schema - converted exactly once
    grade_tool_oai = convert_to_openai_tool(grade)

    # LLM with tool and enforce invocation
    llm_with_tool = model.bind(
        tools=[grade_tool_oai],
        tool_choice={"type": "function", "function": {"name": "grade"}},
    )

    # Parser
    parser_tool = PydanticToolsParser(tools=[grade])

    # Prompt
    prompt = PromptTemplate(
        template="""Du bist ein Bewerter, der die Relevanz von einem erhaltenen Dokument zu einer Nutzeranfrage bewerten soll. \n
        Hier ist das erhaltene Dokument: \n\n {context} \n\n
        Hier ist die Nutzeranfrage: {question} \n
        Wenn das erhaltene Dokument Keywörter oder semantische Bedeutung in Bezug auf die Nutzeranfrage hat, bewerte es als relevant. \n
        Gib eine binäre Bewertung von 'ja' oder 'nein' Bewertung, um anzuzeigen ob das Dokuemnt relevant ist zur Nutzeranfrage oder nicht.""",
        input_variables=["context", "question"],
    )

    # Chain
    chain = prompt | llm_with_tool | parser_tool

    # Score every document; the verdict gets its own name so that the `grade`
    # model class above is not shadowed.
    filtered_docs = []
    for d in documents:
        score = chain.invoke({"question": question, "context": d.page_content})
        verdict = score[0].binary_score
        if verdict == "ja":
            print("---Bewertung: Dokument ist relevant---")
            filtered_docs.append(d)
        else:
            print("---Bewertung: Dokument irrelevant---")

    # If at least half of the documents are relevant, generation starts with them;
    # otherwise the question should be re-phrased.
    # NOTE(review): an empty `documents` list also yields "nein" (0 >= 0) - confirm
    # that generating with no documents is the intended outcome in that case.
    search = "nein" if len(filtered_docs) >= len(documents) / 2 else "ja"
    print("second trial grade_docs:.....................")
    print(second_trial)
    return {
        "keys": {
            "documents": filtered_docs,
            "question": question,
            "search_again": search,
            "second_trial": second_trial
        }
    }
1266
+
1267
+
1268
def transform_query(state):
    """
    Re-phrase the question so that it is better suited for document retrieval.

    Args:
        state (dict): The current graph state; state["keys"] must contain
            "question" and "documents".
    Returns:
        dict: New state whose "keys" hold the unchanged "documents", the
        re-phrased "question" and "second_trial" set to "ja".
    """

    print("---TRANSFORM QUERY---")
    current = state["keys"]
    original_question = current["question"]
    carried_documents = current["documents"]

    # Prompt that asks the model for an improved version of the question
    rewrite_prompt = PromptTemplate(
        template="""Du generierst Fragen, die optimiert sind für das Retrieval von Dokumenten. \n
        Schaue auf den input und versuche die zugrundeliegende Absicht / Bedeutung zu bewerten. \n
        Hier ist die ursprüngliche Frage:
        \n ------- \n
        {question}
        \n ------- \n
        Formuliere eine verbesserte Frage: """,
        input_variables=["question"],
    )

    # Model that does the re-phrasing
    rewriter = ChatOpenAI(temperature=0, model="gpt-4-0125-preview", streaming=True)

    # Prompt -> model -> plain string
    rewrite_chain = rewrite_prompt | rewriter | StrOutputParser()
    improved_question = rewrite_chain.invoke({"question": original_question})

    new_keys = {
        "documents": carried_documents,
        "question": improved_question,
        "second_trial": "ja",
    }
    return {"keys": new_keys}
1303
+
1304
+ #websuche zur Zeit nicht in gebrauch
1305
def web_search(state):
    """
    Web search based on the question using the Tavily search tool.

    Args:
        state (dict): The current graph state; state["keys"] must contain
            "question" and "documents".
    Returns:
        dict: New state whose "keys" hold the unchanged "question" and "documents"
        extended by one Document that contains the joined "content" fields of
        the search results.
    """

    print("---WEB Suche---")
    state_dict = state["keys"]
    question = state_dict["question"]
    documents = state_dict["documents"]

    tool = TavilySearchResults()
    # assumes the tool returns a list of dicts that each carry a "content" key - TODO confirm
    docs = tool.invoke({"query": question})
    web_results = Document(page_content="\n".join(d["content"] for d in docs))

    # Build a new list instead of appending, so the list held by the incoming
    # state is not mutated in place.
    return {"keys": {"documents": [*documents, web_results], "question": question}}
1326
+
1327
+
1328
+ ### Edges
1329
+
1330
+
1331
def decide_to_generate(state):
    """
    Decide which node runs next after the documents were graded.

    Args:
        state (dict): The current graph state; state["keys"] must contain
            "search_again" and "second_trial".
    Returns:
        str: Name of the next node to call:
            "generate"        - "search_again" is not "ja"
            "transform_query" - "search_again" is "ja" and "second_trial" is not "ja"
            "generate_ohne"   - "search_again" is "ja" and "second_trial" is "ja"
    """

    print("---ENTSCHEIDE ZU GENERIEREN---")
    print("current state")
    print(state["keys"])
    print("-------------------------------")
    state_dict = state["keys"]
    search_again = state_dict["search_again"]
    second_trial = state_dict["second_trial"]

    if search_again != "ja":
        # Enough relevant documents, so generate the answer with them
        print("---ENTSCHEIDUNG: GENERIERE---")
        return "generate"

    if second_trial != "ja":
        # Too few relevant documents on the first attempt: re-phrase the question
        print("---ENTSCHEIDUNG: VERÄNDERE DIE FRAGE ---")
        return "transform_query"

    # Already the second attempt: no new question, generate without documents
    print("---ENTSCHEIDUNG: Generiere ohne Dokumente---")
    return "generate_ohne"