import ast

import numpy as np
import pandas as pd
from openai import OpenAI
from groq import Groq

from config import openai_api, groq_api, models

# Pick the LLM provider; the Groq SDK mirrors OpenAI's chat.completions interface.
provider = "openai"
if provider == "openai":
    client = OpenAI(api_key=openai_api)
else:
    client = Groq(api_key=groq_api)
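# NOTE (assumption): _get_embedding below calls client.embeddings.create, which to
# our knowledge the Groq SDK does not expose, so embedding generation implicitly
# assumes the OpenAI provider even when chat completions are routed to Groq.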


def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
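# Sanity check with toy vectors (hypothetical values): identical directions score 1.0,
# orthogonal directions score 0.0.
# >>> cosine_similarity([1.0, 0.0], [1.0, 0.0])
# 1.0
# >>> cosine_similarity([1.0, 0.0], [0.0, 1.0])
# 0.0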


def _get_embedding(text, model="text-embedding-3-large"):
    # Newlines tend to degrade embedding quality; flatten them when the input is a string.
    if isinstance(text, str):
        text = text.replace("\n", " ")
    return client.embeddings.create(input=[text], model=model).data[0].embedding
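# A quick sketch of the return shape (assuming OpenAI's text-embedding-3-large,
# which produces 3072-dimensional vectors; the input text is hypothetical):
# >>> vec = _get_embedding("Acme Corp revenue grew 12% year over year")
# >>> len(vec)
# 3072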


def get_answer(df, nb_in_context=10, task="Your task is to estimate the revenue evolution of the considered company."):
    # query = str(query_preprocessor_augment(task))
    embedding_query = _get_embedding(task, model="text-embedding-3-large")
    try:
        df['similarity'] = df.embeddings.apply(lambda x: cosine_similarity(x, embedding_query))
    except (TypeError, ValueError):
        # Embeddings round-tripped through CSV come back as strings; parse them
        # safely with ast.literal_eval instead of eval-ing file contents.
        df['similarity'] = df.embeddings.apply(lambda x: cosine_similarity(ast.literal_eval(x), embedding_query))
    res = df.sort_values('similarity', ascending=False).head(nb_in_context).content.values
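    # res now holds the nb_in_context chunks most similar to the task embedding;
    # they are passed verbatim as grounding context to the chat model below.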
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": f"""You will be given a vast amount of data, each with its source. {task}
Your answer should be crisp, sharp, and pinpoint an exact source from the context (if the references of the sources are not easy for humans to read, feel free to adjust them so they are readable - no links though; refer to them as what they are).
You write in Bain's style, as your answer will be read by private equity professionals, and if asked, you refer to yourself as Catalyst (pronoun "we"), a startup doing AI for private equity.
""",
                    }
                ],
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"Context:\n{str(res)}",
                    }
                ],
            },
        ],
        temperature=1,
        max_tokens=1665,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    ).choices[0].message.content
    return response


def generate_content(input, data_dumpster, chunked_raw_content, custom_query=(False, "")):
    data_locker_folder = f"./{data_dumpster}/{input.replace(' ', '-')}"
    try:
        # Reuse cached embeddings if this company's chunks were already vectorized.
        df = pd.read_csv(f"{data_locker_folder}/vectorized_data_dumpster.csv", sep=";")
    except FileNotFoundError:
        df = pd.DataFrame()
        df["content"] = chunked_raw_content
        df["embeddings"] = [_get_embedding(chunk) for chunk in chunked_raw_content]
        df.to_csv(f"{data_locker_folder}/vectorized_data_dumpster.csv", sep=";", index=False)
    finance = "Your task is to estimate the revenue evolution of the considered company."
    product = "Your task is to give an overview of the line of products of the considered company."
    customer = "Your task is to give the probable customer segmentation of the considered company."
    in_context = 15

    if custom_query[0]:
        print("Generating custom chat output")
        custom_answer = get_answer(df, in_context, custom_query[1])
        return custom_answer

    print("Generating financials content")
    finance_content = get_answer(df, in_context, finance)
    print("Generating product content")
    product_content = get_answer(df, in_context, product)
    print("Generating customer segmentation content")
    customer_content = get_answer(df, in_context, customer)
    print("Done!")

    rag_content = {
        "finance": finance_content,
        "product": product_content,
        "customer_segmentation": customer_content,
    }
    return rag_content
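

# A minimal usage sketch (hypothetical company name, folder, and chunks; assumes
# ./<data_dumpster>/<company-name>/ already exists so the CSV cache can be written):
if __name__ == "__main__":
    chunks = [
        "Acme Corp reported EUR 12M revenue in FY2023, up from EUR 9M in FY2022. (annual report)",
        "Acme Corp sells three product lines: sensors, gateways, and a SaaS dashboard. (company website)",
    ]
    content = generate_content("Acme Corp", "data_dumpster", chunks)
    print(content["finance"])
    # One-off question instead of the three standard sections:
    # generate_content("Acme Corp", "data_dumpster", chunks, custom_query=(True, "Who are Acme's main competitors?"))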