Update utils.py
utils.py
CHANGED
@@ -15,6 +15,10 @@ import sys
 import gc
 from pygments.lexers import guess_lexer, ClassNotFound
 import time
+import json
+import operator
+from typing import Annotated, Sequence, TypedDict
+import pprint
 
 import gradio as gr
 from pypinyin import lazy_pinyin
@@ -26,22 +30,37 @@ from pygments.lexers import guess_lexer,get_lexer_by_name
 from pygments.formatters import HtmlFormatter
 
 from langchain.chains import LLMChain, RetrievalQA
-from
-from
+from langgraph.graph import END, StateGraph
+from langchain_openai import ChatOpenAI
+from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader, UnstructuredWordDocumentLoader, DirectoryLoader
 from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
 from langchain.document_loaders.generic import GenericLoader
 from langchain.document_loaders.parsers import OpenAIWhisperParser
 from langchain.schema import AIMessage, HumanMessage
-from
-from
-from
-from
+from langchain_community.llms import HuggingFaceHub
+from langchain_community.llms import HuggingFaceTextGenInference
+from langchain_community.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings, HuggingFaceInferenceAPIEmbeddings
+from langchain_community.tools import DuckDuckGoSearchRun
 from langchain.retrievers.tavily_search_api import TavilySearchAPIRetriever
+from typing import Dict, TypedDict
+from langchain_core.messages import BaseMessage
+from langchain_openai import OpenAIEmbeddings
+from langchain.prompts import PromptTemplate
 
-
+
+from langchain import hub
+from langchain.output_parsers.openai_tools import PydanticToolsParser
 from langchain.prompts import PromptTemplate
+from langchain.schema import Document
+from langchain_community.tools.tavily_search import TavilySearchResults
+from langchain_community.vectorstores import Chroma
+from langchain_core.messages import BaseMessage, FunctionMessage
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.pydantic_v1 import BaseModel, Field
+from langchain_core.runnables import RunnablePassthrough
+from langchain_core.utils.function_calling import convert_to_openai_tool
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from
+from langchain_community.vectorstores import Chroma
 from chromadb.errors import InvalidDimensionException
 import io
 from PIL import Image, ImageDraw, ImageOps, ImageFont
@@ -63,9 +82,7 @@ from reportlab.platypus import SimpleDocTemplate, Frame, Spacer
 from reportlab.lib import colors
 from reportlab.lib.units import mm
 from reportlab.platypus import Paragraph, SimpleDocTemplate, Frame, Image, Table, ListFlowable, ListItem
-from reportlab.lib.styles import
-from reportlab.lib.units import cm
-
+from reportlab.lib.styles import get
 
 logging.basicConfig(
     level=logging.INFO,
@@ -121,7 +138,13 @@ WEB_URL = "https://openai.com/research/gpt-4"
 YOUTUBE_URL_1 = "https://www.youtube.com/watch?v=--khbXchTeE"
 YOUTUBE_URL_2 = "https://www.youtube.com/watch?v=hdhZwyf24mE"
 #YOUTUBE_URL_3 = "https://www.youtube.com/watch?v=vw-KWfKwvTQ"
-
+#load specific web pages as the data basis
+urls = [
+    "https://kkg.hamburg.de/unser-leitbild/",
+    "https://kkg.hamburg.de/unsere-schulcharta/",
+    "https://kkg.hamburg.de/koordination-unterrichtsentwicklung/",
+    "https://kkg.hamburg.de/konzept-medien-und-it-am-kkg/",
+]
 
 #################################################
 # Retrieval function to compare the AI answer with predefined answers
@@ -206,10 +229,20 @@ def document_loading_splitting():
     pdf_documents = pdf_loader.load()
     word_documents = word_loader.load()
 
-    #
-
-
+    #urls - matching the topic
+    docs_web = [WebBaseLoader(url).load() for url in urls]
+    docs_list = [item for sublist in docs_web for item in sublist]
 
+    #everything together in docs...
+    #rewrite pdf_docs as a list so it can be merged with the other material in docs_list
+    pdf_list = [pdf_documents]
+    word_list = [word_documents]
+    #add the new documents to the overall list of material
+    #everything together in docs...
+    for doc in pdf_list:
+        docs_list.extend(doc)
+    for doc in word_list:
+        docs_list.extend(doc)
 
     #other loaders...
     # Load PDF
@@ -223,26 +256,154 @@ def document_loading_splitting():
     #docs.extend(loader.load())
     ################################
     # Document splitting
-    text_splitter = RecursiveCharacterTextSplitter(
-
+    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1500, chunk_overlap=250)
+    doc_splits = text_splitter.split_documents(docs_list)
+    #text_splitter = RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
+    #splits = text_splitter.split_documents(docs)
 
-    return
+    return doc_splits
 
 ###########################################
 #store the splits in the Chroma DB - vectorized...
 def document_storage_chroma(splits):
-    #OpenAi
-    Chroma.from_documents(documents = splits, embedding = OpenAIEmbeddings(disallowed_special = ()), persist_directory = PATH_WORK + CHROMA_DIR)
+    # Add to vectorDB - with embeddings from OpenAI - the embeddings are needed to place the documents in the vector space
+    vectorstore = Chroma.from_documents(documents = splits, embedding = OpenAIEmbeddings(disallowed_special = ()), persist_directory = PATH_WORK + CHROMA_DIR)
+    retriever = vectorstore.as_retriever(search_kwargs = {"k": 5})
 
     #HF embeddings--------------------------------------
     #Chroma.from_documents(documents = splits, embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", model_kwargs={"device": "cpu"}, encode_kwargs={'normalize_embeddings': False}), persist_directory = PATH_WORK + CHROMA_DIR)
+    return vectorstore, retriever
+
+
+#check the documents returned by the retriever for relevance
+def grade_documents_direct(prompt, documents):
+    print("---CHECK RELEVANCE---")
+
+    # Data model
+    class grade(BaseModel):
+        #Binary score for relevance check.
+        binary_score: str = Field(description="Relevanz Bewertung 'ja' oder 'nein'")
+
+    # LLM
+    model = ChatOpenAI(temperature=0.3, model="gpt-3.5-turbo-1106", streaming=True)
+
+    # Tool
+    grade_tool_oai = convert_to_openai_tool(grade)
+
+    # LLM with tool and enforce invocation
+    llm_with_tool = model.bind(
+        tools=[convert_to_openai_tool(grade_tool_oai)],
+        tool_choice={"type": "function", "function": {"name": "grade"}},
+    )
+
+    # Parser
+    parser_tool = PydanticToolsParser(tools=[grade])
+
+    # Prompt
+    prompt_gesamt = PromptTemplate(
+        template="""Du bist ein Bewerter, der die Relevanz von einem erhaltenen Dokument zu einer Nutzeranfrage bewerten soll. \n
+            Hier ist das erhaltene Dokument: \n\n {context} \n\n
+            Hier ist die Nutzeranfrage: {question} \n
+            Wenn das erhaltene Dokument Keywörter oder semantische Bedeutung in Bezug auf die Nutzeranfrage hat, bewerte es als relevant. \n
+            Gib eine binäre Bewertung von 'ja' oder 'nein' Bewertung, um anzuzeigen ob das Dokument relevant ist zur Nutzeranfrage oder nicht.""",
+        input_variables=["context", "question"],
+    )
+
+    # Chain
+    chain = prompt_gesamt | llm_with_tool | parser_tool
+
+    # Score
+    filtered_docs = []
+    for d in documents:
+        #print(d.page_content)
+        score = chain.invoke({"question": prompt, "context": d.page_content})
+        grade = score[0].binary_score
+        if grade == "ja":
+            print("---Bewertung: Dokument ist relevant---")
+            filtered_docs.append(d)
+        else:
+            print("---Bewertung: Dokument irrelevant---")
+            continue
+
+    return filtered_docs
+
+#rework the prompt if RAG is switched on but the relevance check did not return enough positive results
+#ChatGPT itself formulates the new prompt
+def transform_query_direct(question):
+    print("---TRANSFORM QUERY---")
+
+    # Create a prompt template with format instructions and the query
+    prompt = PromptTemplate(
+        template="""Du generierst Fragen, die optimiert sind für das Retrieval von Dokumenten. \n
+            Schaue auf den input und versuche die zugrundeliegende Absicht / Bedeutung zu bewerten. \n
+            Hier ist die ursprüngliche Frage:
+            \n ------- \n
+            {question}
+            \n ------- \n
+            Formuliere eine verbesserte Frage: """,
+        input_variables=["question"],
+    )
+
+    # Grader
+    model = ChatOpenAI(temperature=0, model="gpt-4-0125-preview", streaming=True)
+
+    # Prompt
+    chain = prompt | model | StrOutputParser()
+    better_question = chain.invoke({"question": question})
+
+    return better_question
+
+
+###############################################
+#set up the langchain
+###############################################
+#use langchain to route the prompt to the LLM - llm and prompt are interchangeable
+def llm_chain(llm, prompt):
+    llm_chain = LLMChain(llm = llm, prompt = LLM_CHAIN_PROMPT)
+    result = llm_chain.run({"question": prompt})
+    return result
+
+#only for HF - to generate a short description of the chat history
+def llm_chain2(llm, prompt):
+    llm_chain = LLMChain(llm = llm, prompt = LLM_CHAIN_PROMPT2)
+    result = llm_chain.run({"question": prompt})
+    return result
+
+
+#############################################
+#use langchain to route the prompt to the llm, but first search the vector DB to add matching splits to the prompt
+#the documents returned by the retriever go through a relevance check before they are passed to ChatGPT for answering
+#the relevance check is also done by another AI model; if too few relevant documents are found, the original question is reformulated - and a second attempt takes place
+def rag_chain(llm, prompt, retriever):
+    #use langgraph for a bit more intelligence when searching for documents
+    relevant_docs=[]
+    filtered_docs=[]
+    relevant_docs = retriever.get_relevant_documents(prompt)
+    print("relevant docs1......................")
+    print(relevant_docs)
+    if (len(relevant_docs)>0):
+        filtered_docs = grade_documents_direct(prompt, relevant_docs)
+
+    neu_prompt=prompt
+    if (len(filtered_docs)<2): #reformulate the question
+        relevant_docs=[]
+        neu_prompt = transform_query_direct(prompt)
+        relevant_docs = retriever.get_relevant_documents(neu_prompt)
+        if (len(relevant_docs)>0):
+            print("relevant docs2......................")
+            print(relevant_docs)
+            filtered_docs = grade_documents_direct(neu_prompt, relevant_docs)
+
+    if (len(filtered_docs)>0):
+        llm_chain = LLMChain(llm = llm, prompt = RAG_CHAIN_PROMPT)
+        result = llm_chain.run({"context": filtered_docs, "question": neu_prompt})
+    else:
+        #normal query, since no relevant documents were found
+        llm_chain = LLMChain(llm = llm, prompt = LLM_CHAIN_PROMPT)
+        result = llm_chain.run({"question": neu_prompt})
+    return result
+
 
-#store the splits in the Mongo DB - vectorized...
-def document_storage_mongodb(splits):
-    MongoDBAtlasVectorSearch.from_documents(documents = splits,
-                                            embedding = OpenAIEmbeddings(disallowed_special = ()),
-                                            collection = MONGODB_COLLECTION,
-                                            index_name = MONGODB_INDEX_NAME)
 ############################################
 #prepare the db so that documents can be stored vectorized in the chroma db
 def document_retrieval_chroma(llm, prompt):
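The relevance check in grade_documents_direct relies on the forced tool-call pattern: a Pydantic schema is converted to an OpenAI tool, bound with tool_choice so the model must call it, and the reply is parsed back into the schema. A standalone sketch of that pattern, with an illustrative model name and example strings, assuming an OPENAI_API_KEY is set:

from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.utils.function_calling import convert_to_openai_tool
from langchain.output_parsers.openai_tools import PydanticToolsParser
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

class Grade(BaseModel):
    # binary relevance verdict, mirrors the 'grade' model in the diff
    binary_score: str = Field(description="Relevanz Bewertung 'ja' oder 'nein'")

llm = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0)
prompt = PromptTemplate.from_template(
    "Dokument:\n{context}\n\nFrage: {question}\nIst das Dokument relevant? Antworte mit 'ja' oder 'nein'."
)
# force the model to call the Grade tool so the answer is always machine-readable
grader = (
    prompt
    | llm.bind(tools=[convert_to_openai_tool(Grade)],
               tool_choice={"type": "function", "function": {"name": "Grade"}})
    | PydanticToolsParser(tools=[Grade])
)
verdict = grader.invoke({"context": "Das Leitbild der Schule ...", "question": "Wie lautet das Leitbild?"})
print(verdict[0].binary_score)   # 'ja' or 'nein'

Forcing the tool call is what makes the branch on score[0].binary_score in the diff reliable, since the model cannot answer in free text.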
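And a sketch of the query path on top of the index built above; rag_chain and the retriever come from this commit, LLM_CHAIN_PROMPT and RAG_CHAIN_PROMPT are expected to be defined elsewhere in utils.py, and the model choice and question here are only examples:

from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0.3)
frage = "Was steht in der Schulcharta des KKG?"
antwort = rag_chain(llm, frage, retriever)   # grade the hits, rewrite the query once if too few survive, then answer
print(antwort)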