#########################################################################################
# Title:  Gradio AI-Interface with Memory-RAG
# Author: Andreas Fischer
# Date:   October 15th, 2023
# Last update: May 27th, 2024
##########################################################################################

# Build hints for llama-cpp-python with CLBlast:
#   https://github.com/abetlen/llama-cpp-python/issues/306
#   sudo apt install libclblast-dev
#   CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir -v

# Prepare resources
#-------------------
import codecs
import gc
import json
import os
import re
import subprocess
from datetime import datetime

import chromadb
import gradio as gr
import requests
import torch
from chromadb.utils import embedding_functions

torch.cuda.empty_cache()
gc.collect()

# Where to store the history as a JSON file; a file left over from an earlier
# run on the same day is removed at start-up.
filename = f"./{datetime.now().strftime('%Y%m%d')}_history.json"
if os.path.exists(filename):
    os.remove(filename)


# Chroma-DB
#-----------
dbPath = "/home/af/Schreibtisch/Code/gradio/Chroma/db"
onPrem = os.path.exists(dbPath)  # the local DB folder doubles as the "on premises" marker
if not onPrem:
    dbPath = "/home/user/app/db"
#onPrem=True # uncomment to override automatic detection
print(dbPath)

path = dbPath
client = chromadb.PersistentClient(path=path)
print(client.heartbeat())
print(client.get_version())
print(client.list_collections())

default_ef = embedding_functions.DefaultEmbeddingFunction()
embeddingModel = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="T-Systems-onsite/cross-en-de-roberta-sentence-transformer",
    device="cuda" if onPrem else "cpu",
)
print(str(client.list_collections()))

dbName = "historicalChromaDB1"
# An existing collection of that name is dropped first, so every start begins
# with an empty memory.
if "name=" + dbName in str(client.list_collections()):
    client.delete_collection(name=dbName)

if "name=" + dbName in str(client.list_collections()):
    print(dbName + " found!")
    collection = client.get_collection(name=dbName, embedding_function=embeddingModel)
else:
    print(dbName + " created!")
    collection = client.create_collection(
        dbName,
        embedding_function=embeddingModel,
        metadata={"hnsw:space": "cosine"},
    )

print("Database ready!")
print(collection.count())

# Seed an empty collection with one dummy dialog and run one query against it.
seed_ids = collection.get(include=[])["ids"]
if len(seed_ids) == 0:
    seed_message = "Ich bin der User."
    seed_response = "Hallo User, wie kann ich dienen?"
    collection.add(
        documents=[seed_message, seed_response],
        metadatas=[
            {"source": "ICH", "dialog": f"ICH: {seed_message}\nDU: {seed_response}"},
            {"source": "DU", "dialog": f"ICH: {seed_message}\nDU: {seed_response}"},
        ],
        ids=[str(len(seed_ids) + 1), str(len(seed_ids) + 2)],
    )
    RAGResults = collection.query(
        query_texts=[seed_message],
        n_results=1,
    )
    RAGResults["metadatas"][0][0]["dialog"]  # fails early if the result lacks the expected keys


# Model
#-------
def _download_model(url, target, chunk_size=1024 * 1024):
    """Stream the file at *url* to *target* in chunks.

    The data is written to ``target + ".part"`` first and renamed once the
    download is complete, so an interrupted download never leaves a truncated
    file under the final name.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    partial = target + ".part"
    with requests.get(url, stream=True) as reply:
        reply.raise_for_status()
        with open(partial, mode="wb") as file:
            for chunk in reply.iter_content(chunk_size=chunk_size):
                file.write(chunk)
    os.replace(partial, target)


if not onPrem:
    modelPath = "mistralai/Mixtral-8x7B-Instruct-v0.1"
    from huggingface_hub import InferenceClient  # only needed when running on the HF hub

    # NOTE: from here on `client` is the inference client, not the Chroma client.
    client = InferenceClient(modelPath)
else:
    modelPath = "/home/af/gguf/models/Mixtral-8x7b-instruct-v0.1.Q4_0.gguf"
    if not os.path.exists(modelPath):
        url = "https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/resolve/main/mixtral-8x7b-instruct-v0.1.Q4_0.gguf?download=true"
        modelPath = "./Mixtral-8x7b-instruct.gguf"
        if not os.path.exists(modelPath):  # reuse a copy fetched by an earlier run
            _download_model(url, modelPath)
            print("Model downloaded")
    print(modelPath)
    n = "20"
    if "Mixtral-8x7b-instruct" in modelPath:
        n = "0"  # mixtral seems to cause problems here...
    command = [
        "python3", "-m", "llama_cpp.server",
        "--model", modelPath,
        "--host", "0.0.0.0",
        "--port", "2600",
        "--n_threads", "8",
        "--n_gpu_layers", n,
    ]
    subprocess.Popen(command)  # the server keeps running in the background
    print("Server ready!")


# Gradio-GUI
#------------
def extend_prompt(message="", history=None, system=None, RAGAddon=None, system2=None, zeichenlimit=None, historylimit=4, removeHTML=True):
    """Assemble the full text prompt for the model selected by ``modelPath``.

    Args:
        message: current user message; skipped when ``None``.
        history: list of ``(user_message, bot_response)`` pairs; ``None``
            entries inside a pair are treated as empty strings.
        system: system prompt; ``None`` means no system section unless
            ``RAGAddon`` is given.
        RAGAddon: text appended to the system prompt (retrieved memory).
        system2: text appended verbatim after the last user turn, i.e. the
            first words put into the model's mouth.
        zeichenlimit: maximum number of characters taken from each message
            ("Zeichenlimit" = character limit); ``None`` means practically
            unlimited.
        historylimit: the slice ``history[-historylimit:]`` is used.
        removeHTML: replace ``<...>`` tags in bot responses by newlines.

    Returns:
        The prompt string.
    """
    startOfString = ""
    if zeichenlimit is None:
        zeichenlimit = 1000000000

    # Default templates; each matching check below overrides them, so the last
    # matching model name wins.
    template0 = " [INST]{system}\n [/INST] "
    template1 = " [INST] {message} [/INST]"
    template2 = " {response}"
    if "command-r" in modelPath:  # https://huggingface.co/CohereForAI/c4ai-command-r-v01
        template0 = "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|> {system}<|END_OF_TURN_TOKEN|>"
        template1 = "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{message}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
        template2 = "{response}<|END_OF_TURN_TOKEN|>"
    if "Gemma-" in modelPath:
        # NOTE(review): these templates contain no turn markers - they look
        # incomplete; confirm against the Gemma model card.
        template0 = "user{system}"
        template1 = "user{message}model"
        template2 = "{response}"
    if "Mixtral-8x22B-Instruct" in modelPath:  # AutoTokenizer: [INST] U1[/INST] A1[INST] U2[/INST] A2
        startOfString = ""
        template0 = "[INST]{system}\n [/INST] "
        template1 = "[INST] {message}[/INST]"
        template2 = " {response}"
    if "Mixtral-8x7b-instruct" in modelPath:  # https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
        startOfString = ""  # AutoTokenizer: [INST] U1 [/INST]A1 [INST] U2 [/INST]A2
        template0 = " [INST]{system}\n [/INST] "
        template1 = " [INST] {message} [/INST]"
        template2 = " {response}"
    if "Mistral-7B-Instruct" in modelPath:  # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2
        startOfString = ""
        template0 = "[INST]{system}\n [/INST]"
        template1 = "[INST] {message} [/INST]"
        template2 = " {response}"
    if "Openchat-3.5" in modelPath:  # https://huggingface.co/TheBloke/openchat-3.5-0106-GGUF
        template0 = "GPT4 Correct User: {system}<|end_of_turn|>GPT4 Correct Assistant: Okay.<|end_of_turn|>"
        template1 = "GPT4 Correct User: {message}<|end_of_turn|>GPT4 Correct Assistant: "
        template2 = "{response}<|end_of_turn|>"
    if ("Discolm_german_7b" in modelPath) or ("SauerkrautLM-7b-HerO" in modelPath):  # https://huggingface.co/VAGOsolutions/SauerkrautLM-7b-HerO
        template0 = "<|im_start|>system\n{system}<|im_end|>\n"
        template1 = "<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
        template2 = "{response}<|im_end|>\n"
    if "Llama-3-SauerkrautLM-8b-Instruct" in modelPath:
        template0 = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|>"
        template1 = "<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
        template2 = "{response}<|eot_id|>\n"
    if "WizardLM-13B-V1.2" in modelPath:  # https://huggingface.co/WizardLM/WizardLM-13B-V1.2
        template0 = "{system} "
        template1 = "USER: {message} ASSISTANT: "
        template2 = "{response}"
    if "Phi-2" in modelPath:  # https://huggingface.co/TheBloke/phi-2-GGUF
        template0 = "Instruct: {system}\nOutput: Okay.\n"
        template1 = "Instruct: {message}\nOutput:"
        template2 = "{response}\n"

    prompt = ""
    if RAGAddon is not None:
        # A missing system prompt must not break the RAG addon (None + str).
        system = (system or "") + RAGAddon
    if system is not None:
        prompt += template0.format(system=system)
    if history is not None:
        # NOTE(review): with historylimit=0 the slice [-0:] is the WHOLE
        # history, not an empty one - confirm this is intended.
        for user_message, bot_response in history[-historylimit:]:
            if user_message is None:
                user_message = ""
            if bot_response is None:
                bot_response = ""
            # Remove RAG components. NOTE(review): as written, the lazy ".*?"
            # matches nothing, so only the "\n\n" itself is removed.
            bot_response = re.sub("\n\n.*?", "", bot_response, flags=re.DOTALL)
            if removeHTML:
                # remove HTML components in general (may cause bugs with markdown rendering)
                bot_response = re.sub("<(.*?)>", "\n", bot_response)
            prompt += template1.format(message=user_message[:zeichenlimit])
            prompt += template2.format(response=bot_response[:zeichenlimit])
    if message is not None:
        prompt += template1.format(message=message[:zeichenlimit])
    if system2 is not None:
        prompt += system2
    return startOfString + prompt


def _store_dialog(message, answer, history):
    """Add the finished exchange to the Chroma collection and dump *history* to ``filename``."""
    known_ids = collection.get(include=[])["ids"]
    dialog = f"ICH: {message.strip()}\n DU: {answer.strip()}"
    collection.add(
        documents=[message, answer],
        metadatas=[
            {"source": "ICH", "dialog": dialog, "type": "episode"},
            {"source": "DU", "dialog": dialog, "type": "episode"},
        ],
        ids=[str(len(known_ids) + 1), str(len(known_ids) + 2)],
    )
    with open(filename, "w", encoding="utf-8") as file:
        json.dump(history, file, ensure_ascii=False)


def response(message, history, customSysPrompt, settings):
    """Generate the (streamed) answer to *message* for the Gradio chat interface.

    Args:
        message: text entered by the user.
        history: list of previous ``(user, bot)`` turns as passed by Gradio.
        customSysPrompt: content of the "System Prompt" textbox.
        settings: ``"Memory On"`` or ``"Memory Off"``; with memory on, the
            history file is loaded for an empty chat and every finished
            exchange is written to the Chroma collection and the history file.

    Yields:
        The answer accumulated so far, once per received chunk.
    """
    removeHTML = True
    system = customSysPrompt

    # Strip prompt-control tokens from the user input.
    message = message.replace("[INST]", "")
    message = message.replace("[/INST]", "")
    message = message.replace("", "")  # NOTE(review): no-op as written; presumably a token literal was lost here
    message = re.sub("<[|](im_start|im_end|end_of_turn)[|]>", '', message)

    if settings == "Memory On":
        if len(history) == 0 and os.path.isfile(filename):
            with open(filename, 'r', encoding="utf-8") as file:
                history = json.load(file)  # retrieve history (if available)

    known_ids = collection.get(include=[])["ids"]
    rag = None  # RAG is turned off until history gets too long
    historylimit = 0  #4
    if len(known_ids) > (historylimit * 2):
        # turn on RAG when the database contains entries that are not shown within historylimit
        RAGResults = collection.query(
            query_texts=[message],
            n_results=1,
        )
        bestMatch = str(RAGResults["metadatas"][0][0]["dialog"])
        rag = "\n\n"
        rag += "Mit Blick auf den aktuellen Stand des Dialogs erinnerst du dich insb. an folgende Episode:\n"
        rag += bestMatch
        rag += "\n\nIm Folgenden siehst du den aktuellen Stand des Dialogs."
    system2 = None  # system2 can be used as fictive first words of the AI, which are not displayed or stored

    prompt = extend_prompt(
        message,                    # current message of the user
        history,                    # complete history
        system,                     # system prompt
        rag,                        # RAG component added to the system prompt
        system2,                    # fictive first words of the AI (neither displayed nor stored)
        historylimit=historylimit,  # number of past messages to consider for response to current message
        removeHTML=removeHTML,      # remove HTML components from history (to prevent bugs with markdown)
    )

    ## Request response from model
    #------------------------------
    print("AI running on prem!" if onPrem else "AI running HFHub!")

    if not onPrem:
        generate_kwargs = dict(
            temperature=0.9,
            max_new_tokens=500,
            top_p=0.95,
            repetition_penalty=1.0,
            do_sample=True,
            seed=42,
        )
        stream = client.text_generation(prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
        answer = ""
        for token_event in stream:
            answer += token_event.token.text
            if removeHTML:
                # remove HTML components in general (may cause bugs with markdown rendering)
                answer = re.sub("<(.*?)>", "\n", answer)
            yield answer

        # Store current state in DB if memory is turned on
        if settings == "Memory On":
            _store_dialog(message, answer, history)
    else:
        url = "http://0.0.0.0:2600/v1/completions"
        body = {"prompt": prompt, "max_tokens": None, "echo": "False", "stream": "True"}  # e.g. Mixtral-Instruct
        if "Discolm_german_7b" in modelPath:
            body.update({"stop": ["<|im_end|>"]})  # fix stop-token of DiscoLM
        if "Gemma-" in modelPath:
            # fix stop-token of Gemma. NOTE(review): the second entry is an
            # empty string - presumably a lost token literal; confirm.
            body.update({"stop": ["<|im_end|>", ""]})

        answer = ""
        buffer = ""
        delimiter = '"finish_reason": null}]}'  # marks the end of one streamed, unfinished event
        # Raw chunks may end in the middle of a multi-byte character, so they
        # are decoded incrementally instead of one by one.
        decoder = codecs.getincrementaldecoder("utf-8")()
        for chunk in requests.post(url, json=body, stream=True):
            buffer = "".join(buffer)  # merge pieces left over from the previous round
            text = decoder.decode(chunk)
            if not text.startswith(": ping -") and len(text.strip("\n\r")) > 0:
                buffer = buffer + text
            buffer = buffer.split(delimiter)
            if len(buffer) == 1:
                buffer = "".join(buffer)  # event still incomplete, keep collecting
            elif len(buffer) == 2:
                part = buffer[0] + delimiter
                if part.lstrip('\n\r').startswith("data: "):
                    part = part.lstrip('\n\r').replace("data: ", "")
                try:
                    part = str(json.loads(part)["choices"][0]["text"])
                    answer = answer + part
                    buffer = ""  # reset buffer
                except (ValueError, KeyError, IndexError, TypeError) as e:
                    # best effort: an unparsable event is reported and skipped
                    print("Exception:" + str(e))
            if removeHTML:
                # remove HTML components in general (may cause bugs with markdown rendering)
                answer = re.sub("<(.*?)>", "\n", answer)
            yield answer

        history.append((message, answer))  # add current dialog to history

        # Store current state in DB if memory is turned on
        if settings == "Memory On":
            _store_dialog(message, answer, history)


gr.ChatInterface(
    response,
    chatbot=gr.Chatbot(
        value=[[
            None,
            "Herzlich willkommen! Ich bin ein KI-basiertes Assistenzsystem, mit dem du zu unterschiedlichen Themen sprechen kannst.\n"
            "Wenn du mich lokal bzw. \"on premises\" betreibst, kann ich unseren Dialog auf Wunsch auch speichern und bei meinen Antworten auch lange zurück liegende Episoden berücksichtigen.\n"
            "Was ist dein Anliegen?",
        ]],
        render_markdown=True,
    ),
    title="AI-Interface (on prem)" if onPrem else "AI-Interface (HFHub)",
    additional_inputs=[
        gr.Textbox(value=None, label="System Prompt"),
        gr.Dropdown(["Memory On", "Memory Off"], value="Memory Off", label="Memory"),
    ],
).queue().launch(share=True)  #False, server_name="0.0.0.0", server_port=7864)
print("Interface up and running!")