import utils
import os
import numpy as np
import nest_asyncio
import openai
import chromadb
# import everything from llama_index.core so the index, storage context and
# Settings all come from the same (non-legacy) API
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    Document,
    Settings
)
from llama_index.vector_stores.chroma.base import ChromaVectorStore
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.huggingface.base import HuggingFaceEmbedding
from trulens_eval import Tru
from utils import get_prebuilt_trulens_recorder
import time
nest_asyncio.apply()
openai.api_key = utils.get_openai_api_key()
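

# Build a query engine over the persisted Chroma index (fine-tuned embeddings),
# run every question in eval_questions.txt through it, and log the run with TruLens.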
def main():
    if not os.path.exists("./default.sqlite"):
        start_time = time.time()

        llm = OpenAI(model="gpt-3.5-turbo-1106", temperature=0.0)
        fine_tuned_path = "local:./models/fine-tuned-embeddings"

        Settings.llm = llm
        Settings.embed_model = fine_tuned_path

        db = chromadb.PersistentClient(path="./models/chroma_db")
        chroma_collection = db.get_or_create_collection("quickstart")

        # assign chroma as the vector_store to the context
        vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)

        # create your index
        index = VectorStoreIndex.from_vector_store(
            vector_store=vector_store,
            storage_context=storage_context
        )
        query_engine = index.as_query_engine()
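
        # eval_questions.txt is expected to hold one question per block, separated by blank lines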
        separator = "\n\n"
        eval_questions = []
        with open('./raw_documents/eval_questions.txt', 'r') as file:
            content = file.read()

        for question in content.split(separator):
            print(question)
            print(separator)
            eval_questions.append(question.strip())

        response = query_engine.query(eval_questions[0])
        print(str(response))
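
        # record every question/response pair with TruLens so the feedback results can be exported below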
        tru = Tru(database_file="./models/trulens_eval.sqlite")
        tru_recorder = get_prebuilt_trulens_recorder(query_engine,
                                                     app_id="Direct Query Engine")

        print("Sending each question to llm ..")
        with tru_recorder as recording:
            for question in eval_questions:
                response = query_engine.query(question)

        records, feedback = tru.get_records_and_feedback(app_ids=[])

        os.makedirs("./results", exist_ok=True)
        records.to_csv("./results/records.csv", index=False)

        print(tru.db.engine.url.render_as_string(hide_password=False))
        end_time = time.time()
        time_spent_mins = (end_time - start_time) / 60
        with open("./results/time_cost.txt", "w") as fp:
            fp.write(f"Takes {int(time_spent_mins)} mins to create llm evaluation.")
if __name__ == "__main__":
    # main()

    if False:
        start_time = time.time()

        llm = OpenAI(model="gpt-3.5-turbo-1106", temperature=0.0)
        fine_tuned_path = "local:./models/fine-tuned-embeddings"

        Settings.llm = llm
        Settings.embed_model = fine_tuned_path

        db = chromadb.PersistentClient(path="./models/chroma_db")
        chroma_collection = db.get_or_create_collection("quickstart")

        # assign chroma as the vector_store to the context
        vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)

        # create your index
        index = VectorStoreIndex.from_vector_store(
            vector_store=vector_store,
            storage_context=storage_context
        )
        query_engine = index.as_query_engine()

        separator = "\n\n"
        eval_questions = []
        with open('./raw_documents/eval_questions.txt', 'r') as file:
            content = file.read()

        for question in content.split(separator):
            print(question)
            print(separator)
            eval_questions.append(question.strip())

        response = query_engine.query(eval_questions[0])
        print(str(response))
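
    # exploratory TruLens setup: build a fresh index over qna.txt and define the
    # RAG triad of feedback metrics (groundedness, answer relevance, context relevance)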
    from trulens_eval import Tru
    tru = Tru()

    documents = SimpleDirectoryReader(
        input_files=["./raw_documents/qna.txt"]
    ).load_data()
    index = VectorStoreIndex.from_documents(documents)
    query_engine = index.as_query_engine()

    response = query_engine.query("Which is not a government healthcare philosophy?")
    print(response)

    from trulens_eval.feedback.provider.openai import OpenAI
    # note: this OpenAI is the TruLens feedback provider; it shadows the openai
    # module and the llama_index OpenAI LLM imported at the top of the file
    openai = OpenAI()

    # select context to be used in feedback. the location of context is app specific.
    from trulens_eval.app import App
    context = App.select_context(query_engine)
    from trulens_eval import Feedback

    # Define a groundedness feedback function
    from trulens_eval.feedback import Groundedness
    grounded = Groundedness(groundedness_provider=OpenAI())
    f_groundedness = (
        Feedback(grounded.groundedness_measure_with_cot_reasons)
        .on(context.collect())  # collect context chunks into a list
        .on_output()
        .aggregate(grounded.grounded_statements_aggregator)
    )

    # Question/answer relevance between overall question and answer.
    f_qa_relevance = Feedback(openai.relevance).on_input_output()

    # Question/statement relevance between question and each context chunk.
    f_qs_relevance = (
        Feedback(openai.qs_relevance)
        .on_input()
        .on(context)
        .aggregate(np.mean)
    )
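
    # wrap the query engine in a TruLlama recorder with all three feedback functions attached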
    from trulens_eval import TruLlama
    tru_query_engine_recorder = TruLlama(query_engine,
                                         app_id='LlamaIndex_App1',
                                         feedbacks=[f_groundedness, f_qa_relevance, f_qs_relevance])

    if False:
        # or as context manager
        with tru_query_engine_recorder as recording:
            query_engine.query("Which of the following is TRUE on the similarity of Means Testing and Casemix?")