Update utils.py
Browse files
utils.py
CHANGED
@@ -15,6 +15,9 @@ import sys
|
|
15 |
import gc
|
16 |
from pygments.lexers import guess_lexer, ClassNotFound
|
17 |
import time
|
|
|
|
|
|
|
18 |
|
19 |
import gradio as gr
|
20 |
from pypinyin import lazy_pinyin
|
@@ -37,9 +40,22 @@ from langchain.llms import HuggingFaceTextGenInference
|
|
37 |
from langchain.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings, HuggingFaceInferenceAPIEmbeddings
|
38 |
from langchain.tools import DuckDuckGoSearchRun
|
39 |
from langchain.retrievers.tavily_search_api import TavilySearchAPIRetriever
|
40 |
-
|
|
|
41 |
from langchain.embeddings.openai import OpenAIEmbeddings
|
42 |
from langchain.prompts import PromptTemplate
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
44 |
from langchain.vectorstores import Chroma
|
45 |
from chromadb.errors import InvalidDimensionException
|
@@ -121,6 +137,13 @@ WEB_URL = "https://openai.com/research/gpt-4"
|
|
121 |
YOUTUBE_URL_1 = "https://www.youtube.com/watch?v=--khbXchTeE"
|
122 |
YOUTUBE_URL_2 = "https://www.youtube.com/watch?v=hdhZwyf24mE"
|
123 |
#YOUTUBE_URL_3 = "https://www.youtube.com/watch?v=vw-KWfKwvTQ"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
|
125 |
|
126 |
#################################################
|
@@ -202,14 +225,24 @@ def document_loading_splitting():
|
|
202 |
pdf_loader = create_directory_loader('.pdf', CHROMA_PDF)
|
203 |
word_loader = create_directory_loader('.word', CHROMA_WORD)
|
204 |
|
205 |
-
# Load the files
|
206 |
pdf_documents = pdf_loader.load()
|
207 |
word_documents = word_loader.load()
|
208 |
|
209 |
-
#
|
210 |
-
|
211 |
-
|
212 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
213 |
|
214 |
#andere loader...
|
215 |
# Load PDF
|
@@ -223,20 +256,29 @@ def document_loading_splitting():
|
|
223 |
#docs.extend(loader.load())
|
224 |
################################
|
225 |
# Document splitting
|
226 |
-
text_splitter = RecursiveCharacterTextSplitter(
|
227 |
-
|
|
|
|
|
228 |
|
229 |
return splits
|
230 |
|
231 |
###########################################
|
232 |
#Chroma DB die splits ablegen - vektorisiert...
|
233 |
def document_storage_chroma(splits):
    """
    Store the document splits, vectorised, in the Chroma DB.

    Args:
        splits: The document chunks to embed and persist.
    Returns:
        None. The store is created for its persistence side effect only.
    """
    # OpenAI embeddings; the store is persisted under PATH_WORK + CHROMA_DIR.
    embedding_fn = OpenAIEmbeddings(disallowed_special=())
    Chroma.from_documents(
        documents=splits,
        embedding=embedding_fn,
        persist_directory=PATH_WORK + CHROMA_DIR,
    )
    # Alternative kept from the original as a disabled option: HuggingFaceEmbeddings
    # with model_name="sentence-transformers/all-mpnet-base-v2" on the CPU and
    # normalize_embeddings=False.
|
239 |
-
|
|
|
|
|
|
|
240 |
#Mongo DB die splits ablegen - vektorisiert...
|
241 |
def document_storage_mongodb(splits):
|
242 |
MongoDBAtlasVectorSearch.from_documents(documents = splits,
|
@@ -288,6 +330,9 @@ def document_retrieval_mongodb(llm, prompt):
|
|
288 |
OpenAIEmbeddings(disallowed_special = ()),
|
289 |
index_name = MONGODB_INDEX_NAME)
|
290 |
return db
|
|
|
|
|
|
|
291 |
|
292 |
###############################################
|
293 |
#Langchain anlegen
|
@@ -1005,4 +1050,315 @@ class CustomDocTemplate(SimpleDocTemplate):
|
|
1005 |
current_date = datetime.now().strftime("%Y-%m-%d")
|
1006 |
# Passen Sie hier die Positionierung an Ihre Bedürfnisse an
|
1007 |
self.canv.drawRightString(550, 800, current_date) # Position anpassen
|
1008 |
-
self.canv.restoreState()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
import gc
|
16 |
from pygments.lexers import guess_lexer, ClassNotFound
|
17 |
import time
|
18 |
+
import json
|
19 |
+
import operator
|
20 |
+
from typing import Annotated, Any, Sequence, TypedDict
|
21 |
|
22 |
import gradio as gr
|
23 |
from pypinyin import lazy_pinyin
|
|
|
40 |
from langchain.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings, HuggingFaceInferenceAPIEmbeddings
|
41 |
from langchain.tools import DuckDuckGoSearchRun
|
42 |
from langchain.retrievers.tavily_search_api import TavilySearchAPIRetriever
|
43 |
+
from typing import Dict, TypedDict
|
44 |
+
from langchain_core.messages import BaseMessage
|
45 |
from langchain.embeddings.openai import OpenAIEmbeddings
|
46 |
from langchain.prompts import PromptTemplate
|
47 |
+
|
48 |
+
from langchain import hub
|
49 |
+
from langchain.output_parsers.openai_tools import PydanticToolsParser
|
50 |
+
from langchain.prompts import PromptTemplate
|
51 |
+
from langchain.schema import Document
|
52 |
+
from langchain_community.tools.tavily_search import TavilySearchResults
|
53 |
+
from langchain_community.vectorstores import Chroma
|
54 |
+
from langchain_core.messages import BaseMessage, FunctionMessage
|
55 |
+
from langchain_core.output_parsers import StrOutputParser
|
56 |
+
from langchain_core.pydantic_v1 import BaseModel, Field
|
57 |
+
from langchain_core.runnables import RunnablePassthrough
|
58 |
+
from langchain_core.utils.function_calling import convert_to_openai_tool
|
59 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
60 |
from langchain.vectorstores import Chroma
|
61 |
from chromadb.errors import InvalidDimensionException
|
|
|
137 |
YOUTUBE_URL_1 = "https://www.youtube.com/watch?v=--khbXchTeE"
YOUTUBE_URL_2 = "https://www.youtube.com/watch?v=hdhZwyf24mE"
#YOUTUBE_URL_3 = "https://www.youtube.com/watch?v=vw-KWfKwvTQ"

# Specific web pages that are loaded as an additional data basis.
# Every entry needs its own trailing comma: without it Python concatenates
# adjacent string literals into a single (invalid) URL.
urls = [
    "https://kkg.hamburg.de/unser-leitbild/",
    "https://kkg.hamburg.de/unsere-schulcharta/",
    "https://kkg.hamburg.de/koordination-unterrichtsentwicklung/",
    "https://kkg.hamburg.de/konzept-medien-und-it-am-kkg/",
]
|
147 |
|
148 |
|
149 |
#################################################
|
|
|
225 |
pdf_loader = create_directory_loader('.pdf', CHROMA_PDF)
|
226 |
word_loader = create_directory_loader('.word', CHROMA_WORD)
|
227 |
|
228 |
+
# Load the files - pdf und word
|
229 |
pdf_documents = pdf_loader.load()
|
230 |
word_documents = word_loader.load()
|
231 |
|
232 |
+
#urls -zum Thema passend
|
233 |
+
docs_web = [WebBaseLoader(url).load() for url in urls]
|
234 |
+
docs_list = [item for sublist in docs_web for item in sublist]
|
235 |
|
236 |
+
#alle zusammen in docs...
|
237 |
+
#pdf_docs als Liste umschreiben, um es mit den anderen Materialien in der docs_list zusammenzubringen
|
238 |
+
pdf_list = [pdf_documents]
|
239 |
+
word_list = [word_documents]
|
240 |
+
#die neuen Dokeumente der Gesamt-Liste von material hinzufügen
|
241 |
+
#alle zusammen in docs...
|
242 |
+
for doc in pdf_list:
|
243 |
+
docs_list.extend(doc)
|
244 |
+
for doc in word_list:
|
245 |
+
docs_list.extend(doc)
|
246 |
|
247 |
#andere loader...
|
248 |
# Load PDF
|
|
|
256 |
#docs.extend(loader.load())
|
257 |
################################
|
258 |
# Document splitting
|
259 |
+
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1500, chunk_overlap=250)
|
260 |
+
doc_splits = text_splitter.split_documents(docs_list)
|
261 |
+
#text_splitter = RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
|
262 |
+
#splits = text_splitter.split_documents(docs)
|
263 |
|
264 |
return splits
|
265 |
|
266 |
###########################################
|
267 |
#Chroma DB die splits ablegen - vektorisiert...
|
268 |
def document_storage_chroma(splits):
    """
    Store the document splits, vectorised, in the Chroma DB and build a retriever.

    Args:
        splits: The document chunks to embed and persist.
    Returns:
        tuple: (vectorstore, retriever) - the Chroma store for collection
        "rag-chroma" and a retriever on it configured with k=4.
    """
    # OpenAI embeddings; the store is persisted under PATH_WORK + CHROMA_DIR.
    embedding_fn = OpenAIEmbeddings(disallowed_special=())
    store = Chroma.from_documents(
        documents=splits,
        collection_name="rag-chroma",
        embedding=embedding_fn,
        persist_directory=PATH_WORK + CHROMA_DIR,
    )
    # Alternative kept from the original as a disabled option: HuggingFaceEmbeddings
    # with model_name="sentence-transformers/all-mpnet-base-v2" on the CPU and
    # normalize_embeddings=False.
    return store, store.as_retriever(search_kwargs={"k": 4})
|
279 |
+
|
280 |
+
|
281 |
+
"""
|
282 |
#Mongo DB die splits ablegen - vektorisiert...
|
283 |
def document_storage_mongodb(splits):
|
284 |
MongoDBAtlasVectorSearch.from_documents(documents = splits,
|
|
|
330 |
OpenAIEmbeddings(disallowed_special = ()),
|
331 |
index_name = MONGODB_INDEX_NAME)
|
332 |
return db
|
333 |
+
"""
|
334 |
+
|
335 |
+
|
336 |
|
337 |
###############################################
|
338 |
#Langchain anlegen
|
|
|
1050 |
current_date = datetime.now().strftime("%Y-%m-%d")
|
1051 |
# Passen Sie hier die Positionierung an Ihre Bedürfnisse an
|
1052 |
self.canv.drawRightString(550, 800, current_date) # Position anpassen
|
1053 |
+
self.canv.restoreState()
|
1054 |
+
|
1055 |
+
|
1056 |
+
######################################################################
|
1057 |
+
#Zustandsgraph für Langgraph, um RAG zu implementieren mit verschiedenen Zuständen
|
1058 |
+
#die durchlaufen werden:
|
1059 |
+
#1. Dokumente aus vektorstore bekommen
|
1060 |
+
#2. die Relevanz der Dokumente einschätzen
|
1061 |
+
#3. wenn zu wenig relevante infos: Frage neu formulieren
|
1062 |
+
#4. nochmal 1. und 2.
|
1063 |
+
#5. wenn nun genug relevante Dokumente: Anfrage an Modell mit den Doks
|
1064 |
+
#6. wenn nicht genug Dokumente relevant: Anfrage an Modell ohne Doks
|
1065 |
+
#####################################################################
|
1066 |
+
|
1067 |
+
# Zustandsgraph als Datenstruktur zum Umsetzen
|
1068 |
+
class GraphState(TypedDict):
    """
    Represents the state of our graph.

    Attributes:
        keys: A dictionary where each key is a string and the value may be
            of any type (question, documents, flags, generation, ...).
    """
    # `Any` (typing), not the builtin function `any`, is the correct value type.
    keys: Dict[str, Any]
|
1076 |
+
|
1077 |
+
|
1078 |
+
#Methoden, um den Graph und die Zustände umzusetzen
|
1079 |
+
### Nodes ###
|
1080 |
+
# die Knoten des Graphen definieren, die der Reihe nach (bzw. je nach Outcome des Vorgänger-Knotens) durchlaufen werden
|
1081 |
+
def retrieve(state):
    """
    Retrieve documents for the current question.

    Args:
        state (dict): The current graph state; state["keys"] must contain "question".
    Returns:
        state (dict): New "keys" dict with "documents" (the retrieved documents),
        "second_trial" ("ja" if the incoming state already carried a
        "second_trial" key, otherwise "nein") and "question".
    """
    print("---RETRIEVE ---")
    state_dict = state["keys"]
    question = state_dict["question"]
    # NOTE(review): `retriever` is not defined in this function; it is expected
    # to be bound at module level elsewhere - confirm against the caller.
    documents = retriever.get_relevant_documents(question)
    # The flag is now set explicitly in both branches. The original relied on a
    # pre-initialised "ja" because the in-branch assignment was misspelled.
    if "second_trial" in state_dict:
        print("second time")
        second_trial = "ja"
    else:
        print("first time")
        second_trial = "nein"
    return {"keys": {"documents": documents, "second_trial": second_trial, "question": question}}
|
1101 |
+
|
1102 |
+
|
1103 |
+
def retrieve_redirect(state):
    """
    Pass-through node used after the question was transformed and retrieval runs again.

    Args:
        state (dict): The current graph state; state["keys"] must contain
            "question" and "documents".
    Returns:
        state (dict): Same question and documents, with "second_trial" set to "ja".
    """
    print("---RETRIEVE REDIRECT---")
    keys = state["keys"]
    current_question = keys["question"]
    current_documents = keys["documents"]
    new_keys = {
        "documents": current_documents,
        "second_trial": "ja",  # marks that a second retrieval attempt is under way
        "question": current_question,
    }
    return {"keys": new_keys}
|
1117 |
+
|
1118 |
+
|
1119 |
+
|
1120 |
+
def generate(state):
    """
    Generate an answer from the question and the documents held in the state.

    Args:
        state (dict): The current graph state; state["keys"] must contain
            "question" and "documents".
    Returns:
        state (dict): "keys" dict with "documents", "question" and the new
        "generation" entry holding the LLM output as a string.
    """
    print("---GENERATE---")
    keys = state["keys"]
    user_question = keys["question"]
    context_docs = keys["documents"]

    # RAG prompt pulled from the LangChain hub, then the chat model.
    rag_prompt = hub.pull("rlm/rag-prompt")
    chat_model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.3, streaming=True)

    # prompt -> model -> plain string; the documents are passed as context
    # unchanged (no page_content joining is applied).
    pipeline = rag_prompt | chat_model | StrOutputParser()
    answer = pipeline.invoke({"context": context_docs, "question": user_question})

    return {
        "keys": {
            "documents": context_docs,
            "question": user_question,
            "generation": answer,
        }
    }
|
1151 |
+
|
1152 |
+
def generate_ohne(state):
    """
    Generate an answer from the question alone, without attaching any documents.

    Args:
        state (dict): The current graph state; state["keys"] must contain "question".
    Returns:
        state (dict): "keys" dict with "question" and the new "generation"
        entry holding the LLM output as a string.
    """
    print("---GENERATE OHNE---")
    state_dict = state["keys"]
    question = state_dict["question"]

    # Prompt. The stray backslash that used to follow the opening quotes was
    # removed: "\A" is an invalid escape sequence and put a literal backslash
    # in front of the instruction text.
    prompt = PromptTemplate(
        template="""Antworte in deutsch, wenn es nicht explizit anders gefordert wird. Wenn du die Antwort nicht kennst, antworte direkt, dass du es nicht weißt.
Versuche nicht es zu umschreiben. Versuche nicht, die Antwort zu erfinden oder aufzumocken. Halte die Antwort kurz aber ausführlich genug und exakt. \n\n
Hier ist die Useranfrage: {question} """,
        input_variables=["question"])

    # LLM
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.3, streaming=True)

    # Chain: prompt -> model -> plain string
    llm_chain = prompt | llm | StrOutputParser()

    # Run
    generation = llm_chain.invoke({"question": question})
    return {
        "keys": {"question": question, "generation": generation}
    }
|
1187 |
+
|
1188 |
+
|
1189 |
+
def grade_documents(state):
    """
    Determines whether the retrieved documents are relevant to the question.

    Each document is graded individually by the LLM with a forced tool call
    that yields a binary 'ja'/'nein' score. Only documents graded 'ja' are kept.

    Args:
        state (dict): The current graph state; state["keys"] must contain
            "question", "documents" and "second_trial".
    Returns:
        state (dict): "keys" dict with "documents" (only the relevant ones),
        "question", "search_again" ("ja" when fewer than half of the documents
        were relevant, else "nein") and the unchanged "second_trial".
    """

    print("---CHECK RELEVANCE---")
    state_dict = state["keys"]
    question = state_dict["question"]
    documents = state_dict["documents"]
    second_trial = state_dict["second_trial"]

    # Data model. The class name is also the tool name forced through
    # tool_choice below, so it has to stay "grade".
    class grade(BaseModel):
        """Binary score for relevance check."""
        binary_score: str = Field(description="Relevanz Bewertung 'ja' oder 'nein'")

    # LLM
    model = ChatOpenAI(temperature=0.3, model="gpt-4-0125-preview", streaming=True)

    # Tool schema; converted exactly once (the original converted it twice).
    grade_tool_oai = convert_to_openai_tool(grade)

    # LLM with tool and enforce invocation
    llm_with_tool = model.bind(
        tools=[grade_tool_oai],
        tool_choice={"type": "function", "function": {"name": "grade"}},
    )

    # Parser
    parser_tool = PydanticToolsParser(tools=[grade])

    # Prompt
    prompt = PromptTemplate(
        template="""Du bist ein Bewerter, der die Relevanz von einem erhaltenen Dokument zu einer Nutzeranfrage bewerten soll. \n
Hier ist das erhaltene Dokument: \n\n {context} \n\n
Hier ist die Nutzeranfrage: {question} \n
Wenn das erhaltene Dokument Keywörter oder semantische Bedeutung in Bezug auf die Nutzeranfrage hat, bewerte es als relevant. \n
Gib eine binäre Bewertung von 'ja' oder 'nein' Bewertung, um anzuzeigen ob das Dokuemnt relevant ist zur Nutzeranfrage oder nicht.""",
        input_variables=["context", "question"],
    )

    # Chain
    chain = prompt | llm_with_tool | parser_tool

    # Score every document; keep only the relevant ones.
    filtered_docs = []
    search = "nein"  # Default: do not opt for re-questioning to supplement retrieval
    for d in documents:
        score = chain.invoke({"question": question, "context": d.page_content})
        # A separate name is used so the `grade` model class is not shadowed;
        # the answer is normalised so that "Ja" / " ja " also count as relevant.
        verdict = score[0].binary_score
        if verdict.strip().lower() == "ja":
            print("---Bewertung: Dokument ist relevant---")
            filtered_docs.append(d)
        else:
            print("---Bewertung: Dokument irrelevant---")
            search = "ja"  # at least one irrelevant document -> rephrase the question

    # If at least half of the documents are relevant, generate with them.
    # For an empty document list this comparison is 0 >= 0, so the result is
    # "nein" as well (same as the default).
    if len(filtered_docs) >= len(documents) / 2:
        search = "nein"
    print("second trial grade_docs:.....................")
    print(second_trial)
    return {
        "keys": {
            "documents": filtered_docs,
            "question": question,
            "search_again": search,
            "second_trial": second_trial
        }
    }
|
1266 |
+
|
1267 |
+
|
1268 |
+
def transform_query(state):
    """
    Re-phrase the question so that it is better suited for document retrieval.

    Args:
        state (dict): The current graph state; state["keys"] must contain
            "question" and "documents".
    Returns:
        state (dict): "keys" dict with the unchanged "documents", the
        re-phrased "question" and "second_trial" set to "ja".
    """

    print("---TRANSFORM QUERY---")
    keys = state["keys"]
    original_question = keys["question"]
    current_documents = keys["documents"]

    # Prompt asking the model for an improved, retrieval-optimised question.
    rewrite_prompt = PromptTemplate(
        template="""Du generierst Fragen, die optimiert sind für das Retrieval von Dokumenten. \n
Schaue auf den input und versuche die zugrundeliegende Absicht / Bedeutung zu bewerten. \n
Hier ist die ursprüngliche Frage:
\n ------- \n
{question}
\n ------- \n
Formuliere eine verbesserte Frage: """,
        input_variables=["question"],
    )

    # Model used for the rewrite.
    rewriter = ChatOpenAI(temperature=0, model="gpt-4-0125-preview", streaming=True)

    # prompt -> model -> plain string
    pipeline = rewrite_prompt | rewriter | StrOutputParser()
    improved_question = pipeline.invoke({"question": original_question})

    return {
        "keys": {
            "documents": current_documents,
            "question": improved_question,
            "second_trial": "ja",
        }
    }
|
1303 |
+
|
1304 |
+
#websuche zur Zeit nicht in gebrauch
|
1305 |
+
def web_search(state):
    """
    Run a Tavily web search for the question and append the result to the documents.

    Args:
        state (dict): The current graph state; state["keys"] must contain
            "question" and "documents".
    Returns:
        state (dict): "keys" dict with "documents" (the same list object,
        extended in place by one Document holding all web results) and "question".
    """

    print("---WEB Suche---")
    keys = state["keys"]
    user_question = keys["question"]
    doc_list = keys["documents"]

    # Each hit's "content" field is joined, newline-separated, into one Document.
    hits = TavilySearchResults().invoke({"query": user_question})
    combined_text = "\n".join(hit["content"] for hit in hits)
    doc_list.append(Document(page_content=combined_text))

    return {"keys": {"documents": doc_list, "question": user_question}}
|
1326 |
+
|
1327 |
+
|
1328 |
+
### Edges
|
1329 |
+
|
1330 |
+
|
1331 |
+
def decide_to_generate(state):
    """
    Choose the next node: answer with documents, re-phrase the question, or answer without documents.

    Args:
        state (dict): The current state of the agent; state["keys"] must contain
            "question", "documents", "search_again" and "second_trial".
    Returns:
        str: Name of the next node to call - "generate", "transform_query"
        or "generate_ohne".
    """

    print("---ENTSCHEIDE ZU GENERIEREN---")
    print("current state")
    print(state["keys"])
    print("-------------------------------")
    keys = state["keys"]
    # All four keys are read, as in the original, so a missing key still raises KeyError.
    _question = keys["question"]
    _filtered_documents = keys["documents"]
    wants_new_search = keys["search_again"] == "ja"
    already_second_trial = keys["second_trial"] == "ja"

    if not wants_new_search:
        # We have relevant documents, so generate the answer with them.
        print("---ENTSCHEIDUNG: GENERIERE---")
        return "generate"

    if already_second_trial:
        # No further re-phrasing: generate without attaching documents.
        print("---ENTSCHEIDUNG: Generiere ohne Dokumente---")
        return "generate_ohne"

    # First attempt was not good enough: re-generate a new query.
    print("---ENTSCHEIDUNG: VERÄNDERE DIE FRAGE ---")
    return "transform_query"
|