Spaces:

lingyit1108
/

ragtest-sakimilo

Running

File size: 15,743 Bytes

69e20d0

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac0cc1aa-e68d-432d-b316-52e272c43207",
   "metadata": {},
   "outputs": [],
   "source": [
    "import streamlit as st\n",
    "from streamlit_feedback import streamlit_feedback\n",
    "\n",
    "import os\n",
    "import pandas as pd\n",
    "import base64\n",
    "from io import BytesIO\n",
    "import sys\n",
    "sys.path.insert(0, \"../\")\n",
    "\n",
    "import chromadb\n",
    "from llama_index.core import (\n",
    "            VectorStoreIndex, \n",
    "            SimpleDirectoryReader,\n",
    "            StorageContext,\n",
    "            Document\n",
    ")\n",
    "from llama_index.vector_stores.chroma.base import ChromaVectorStore\n",
    "from llama_index.embeddings.huggingface.base import HuggingFaceEmbedding\n",
    "from llama_index.llms.openai import OpenAI\n",
    "from llama_index.core.memory import ChatMemoryBuffer\n",
    "from llama_index.core.tools import QueryEngineTool\n",
    "from llama_index.agent.openai import OpenAIAgent\n",
    "from llama_index.core import Settings\n",
    "\n",
    "from vision_api import get_transcribed_text\n",
    "from qna_prompting import get_qna_question_tool, evaluate_qna_answer_tool\n",
    "\n",
    "import nest_asyncio\n",
    "nest_asyncio.apply()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b05cb9b-869a-409c-8d4f-aafae703c558",
   "metadata": {},
   "outputs": [],
   "source": [
    "@st.cache_resource\n",
    "def get_document_object(input_files):\n",
    "    \"\"\"Read every path in `input_files` and merge the texts into one Document.\n",
    "\n",
    "    The text of each loaded document is joined with a blank line in between.\n",
    "    \"\"\"\n",
    "    reader = SimpleDirectoryReader(input_files=input_files)\n",
    "    page_texts = [loaded.text for loaded in reader.load_data()]\n",
    "    return Document(text=\"\\n\\n\".join(page_texts))\n",
    "\n",
    "@st.cache_resource\n",
    "def get_llm_object(selected_model, temperature):\n",
    "    \"\"\"Return an `OpenAI` LLM built from the given model name and temperature.\"\"\"\n",
    "    return OpenAI(model=selected_model, temperature=temperature)\n",
    "\n",
    "@st.cache_resource\n",
    "def get_embedding_model(model_name, fine_tuned_path=None):\n",
    "    \"\"\"Pick the embedding model to register with llama_index.\n",
    "\n",
    "    Args:\n",
    "        model_name: Hugging Face model id, used only when no fine-tuned path is given.\n",
    "        fine_tuned_path: Optional locator of a fine-tuned model. When provided it is\n",
    "            returned unchanged (no model object is built here).\n",
    "            NOTE(review): presumably llama_index resolves this value when it is\n",
    "            assigned to `Settings.embed_model` - confirm.\n",
    "\n",
    "    Returns:\n",
    "        A `HuggingFaceEmbedding` instance, or `fine_tuned_path` itself.\n",
    "    \"\"\"\n",
    "    if fine_tuned_path is not None:\n",
    "        print(f\"loading from local `{fine_tuned_path}`\")\n",
    "        return fine_tuned_path\n",
    "\n",
    "    print(f\"loading from `{model_name}` from huggingface\")\n",
    "    return HuggingFaceEmbedding(model_name=model_name)\n",
    "\n",
    "@st.cache_resource\n",
    "def get_query_engine(input_files, llm_model, temperature,\n",
    "                     embedding_model, fine_tuned_path,\n",
    "                     system_content, persisted_vector_db,\n",
    "                     agent_system_prompt=None):\n",
    "    \"\"\"Build the chat agent that answers from a Chroma-backed vector index.\n",
    "\n",
    "    Side effect: overwrites `Settings.llm`, `Settings.chunk_size` and\n",
    "    `Settings.embed_model`.\n",
    "\n",
    "    Args:\n",
    "        input_files: Paths ingested only when `persisted_vector_db` does not exist yet.\n",
    "        llm_model: Model name forwarded to `get_llm_object`.\n",
    "        temperature: Temperature forwarded to `get_llm_object`.\n",
    "        embedding_model: Model name forwarded to `get_embedding_model`.\n",
    "        fine_tuned_path: Fine-tuned embedding locator forwarded to `get_embedding_model`.\n",
    "        system_content: System prompt handed to the textbook query engine.\n",
    "        persisted_vector_db: Directory used by the persistent Chroma client.\n",
    "        agent_system_prompt: System prompt of the agent. When None, the module-level\n",
    "            `textbook_content` is used, which matches the previous behavior.\n",
    "\n",
    "    Returns:\n",
    "        The `OpenAIAgent` wired to the textbook query tool and the two Q&A tools.\n",
    "    \"\"\"\n",
    "    if agent_system_prompt is None:\n",
    "        # Fallback keeps existing callers working; pass the prompt explicitly so it\n",
    "        # becomes part of the `st.cache_resource` key instead of hidden global state.\n",
    "        agent_system_prompt = textbook_content\n",
    "\n",
    "    llm = get_llm_object(llm_model, temperature)\n",
    "    embedded_model = get_embedding_model(\n",
    "                        model_name=embedding_model, \n",
    "                        fine_tuned_path=fine_tuned_path\n",
    "    )\n",
    "    Settings.llm = llm\n",
    "    Settings.chunk_size = 1024\n",
    "    Settings.embed_model = embedded_model\n",
    "\n",
    "    if os.path.exists(persisted_vector_db):\n",
    "        # A database directory already exists: attach to it without re-ingesting files.\n",
    "        print(\"loading from vector database - chroma\")\n",
    "        db = chromadb.PersistentClient(path=persisted_vector_db)\n",
    "        chroma_collection = db.get_or_create_collection(\"quickstart\")\n",
    "        vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
    "        storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
    "\n",
    "        index = VectorStoreIndex.from_vector_store(\n",
    "            vector_store=vector_store,\n",
    "            storage_context=storage_context\n",
    "        )\n",
    "    else:\n",
    "        # First run: read the input files before the database directory is created.\n",
    "        print(\"create new chroma vector database..\")\n",
    "        documents = SimpleDirectoryReader(input_files=input_files).load_data()\n",
    "\n",
    "        db = chromadb.PersistentClient(path=persisted_vector_db)\n",
    "        chroma_collection = db.get_or_create_collection(\"quickstart\")\n",
    "        vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
    "\n",
    "        nodes = Settings.node_parser.get_nodes_from_documents(documents)\n",
    "        storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
    "        storage_context.docstore.add_documents(nodes)\n",
    "\n",
    "        index = VectorStoreIndex(nodes, storage_context=storage_context)\n",
    "\n",
    "    # NOTE(review): `memory` is passed as a query-engine keyword; confirm the engine\n",
    "    # actually consumes it.\n",
    "    memory = ChatMemoryBuffer.from_defaults(token_limit=15000)\n",
    "    hi_content_engine = index.as_query_engine(\n",
    "                            memory=memory,\n",
    "                            system_prompt=system_content,\n",
    "                            similarity_top_k=20,\n",
    "                            streaming=True\n",
    "    )\n",
    "    hi_textbook_query_description = \"\"\"\n",
    "        Use this tool to extract content from textbook `Health Insurance 7th Edition`,\n",
    "        that has 15 chapters in total. When user wants to learn more about a \n",
    "        particular chapter, this tool will help to assist user to get better\n",
    "        understanding of the content of the textbook.\n",
    "    \"\"\"\n",
    "\n",
    "    hi_query_tool = QueryEngineTool.from_defaults(\n",
    "                        query_engine=hi_content_engine,\n",
    "                        name=\"health_insurance_textbook_query_engine\",\n",
    "                        description=hi_textbook_query_description\n",
    "    )\n",
    "\n",
    "    agent = OpenAIAgent.from_tools(tools=[\n",
    "                                        hi_query_tool, \n",
    "                                        get_qna_question_tool,\n",
    "                                        evaluate_qna_answer_tool\n",
    "                                    ],\n",
    "                                   max_function_calls=1,\n",
    "                                   llm=llm, \n",
    "                                   verbose=True,\n",
    "                                   system_prompt=agent_system_prompt)\n",
    "    print(\"loaded AI agent, let's begin the chat!\")\n",
    "    print(\"=\"*50)\n",
    "    print(\"\")\n",
    "\n",
    "    return agent\n",
    "\n",
    "def generate_llm_response(prompt_input, tool_choice=\"auto\"):\n",
    "    \"\"\"Send `prompt_input` to the cached chat agent and return the streaming response.\n",
    "\n",
    "    The agent comes from `get_query_engine`, called with the notebook-level\n",
    "    configuration (`input_files`, `llm_model`, `temperature`, `embedding_model`,\n",
    "    `fine_tuned_path`, `system_content`, `persisted_vector_db`).\n",
    "\n",
    "    Args:\n",
    "        prompt_input: The user's message.\n",
    "        tool_choice: Forwarded to `stream_chat`; defaults to \"auto\".\n",
    "\n",
    "    Returns:\n",
    "        Whatever `chat_agent.stream_chat` returns.\n",
    "    \"\"\"\n",
    "    # The config cell defines `llm_model`; the previously referenced `selected_model`\n",
    "    # is not defined anywhere in this notebook and raised NameError.\n",
    "    chat_agent = get_query_engine(input_files=input_files,\n",
    "                                  llm_model=llm_model,\n",
    "                                  temperature=temperature,\n",
    "                                  embedding_model=embedding_model,\n",
    "                                  fine_tuned_path=fine_tuned_path,\n",
    "                                  system_content=system_content,\n",
    "                                  persisted_vector_db=persisted_vector_db)\n",
    "\n",
    "    return chat_agent.stream_chat(prompt_input, tool_choice=tool_choice)\n",
    "\n",
    "def handle_feedback(user_response):\n",
    "    \"\"\"Show a confirmation toast and set `st.session_state.feedback` to False.\n",
    "\n",
    "    `user_response` is accepted but not used.\n",
    "    \"\"\"\n",
    "    st.toast(\"✔️ Feedback received!\")\n",
    "    st.session_state[\"feedback\"] = False\n",
    "\n",
    "def handle_image_upload():\n",
    "    \"\"\"Set `st.session_state.release_file` to the string 'true'.\"\"\"\n",
    "    st.session_state[\"release_file\"] = \"true\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f148426b-1634-45ed-a1fa-44e9c6ab14ac",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4461d081-d8d0-4801-ad52-dbe826cbfe59",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the OpenAI key from the environment; the value is None when the variable is unset.\n",
    "openai_api = os.environ.get(\"OPENAI_API_KEY\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a24c861-896b-4800-8478-73f8cd65e8fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Notebook configuration: model choice, input documents, storage paths and prompts.\n",
    "image_prompt = False\n",
    "# llm_model = \"gpt-3.5-turbo-0125\"\n",
    "llm_model = \"gpt-4-0125-preview\"\n",
    "temperature = 0\n",
    "\n",
    "input_files = [\"./raw_documents/HI Chapter Summary Version 1.3.pdf\",\n",
    "               \"./raw_documents/qna.txt\"]\n",
    "embedding_model = \"BAAI/bge-small-en-v1.5\"\n",
    "persisted_vector_db = \"../models/chroma_db\"\n",
    "fine_tuned_path = \"local:../models/fine-tuned-embeddings\"\n",
    "\n",
    "# System prompt handed to the textbook query engine.\n",
    "system_content = (\n",
    "    \"You are a helpful study assistant. \"\n",
    "    \"You do not respond as 'User' or pretend to be 'User'. \"\n",
    "    \"You only respond once as 'Assistant'.\"\n",
    ")\n",
    "\n",
    "# System prompt handed to the agent. Adjacent string literals are concatenated with\n",
    "# nothing in between, so each fragment ends with an explicit newline; without it the\n",
    "# chapter list collapses into a single run-on line.\n",
    "textbook_content = (\n",
    "    \"The content of the textbook `Health Insurance 7th Edition` are as follows,\\n\"\n",
    "    \"- Chapter 1: Overview Of Healthcare Environment In Singapore\\n\"\n",
    "    \"- Chapter 2: Medical Expense Insurance\\n\"\n",
    "    \"- Chapter 3: Group Medical Expense Insurance\\n\"\n",
    "    \"- Chapter 4: Disability Income Insurance\\n\"\n",
    "    \"- Chapter 5: Long-Term Care Insurance\\n\"\n",
    "    \"- Chapter 6: Critical Illness Insurance\\n\"\n",
    "    \"- Chapter 7: Other Types Of Health Insurance\\n\"\n",
    "    \"- Chapter 8: Managed Healthcare\\n\"\n",
    "    \"- Chapter 9: Part I Healthcare Financing\\n\"\n",
    "    \"- Chapter 9: Part II Healthcare Financing\\n\"\n",
    "    \"- Chapter 10: Common Policy Provisions\\n\"\n",
    "    \"- Chapter 11: Health Insurance Pricing\\n\"\n",
    "    \"- Chapter 12: Health Insurance Underwriting\\n\"\n",
    "    \"- Chapter 13: Notice No: MAS 120 Disclosure And Advisory Process - Requirements For Accident And Health Insurance Products\\n\"\n",
    "    \"- Chapter 14: Financial Needs Analysis\\n\"\n",
    "    \"- Chapter 15: Case Studies\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d5e4b22c-1e29-4ab8-9039-6e86f566871a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Resolve the LLM and the embedding model, then register them as llama_index defaults.\n",
    "llm = get_llm_object(llm_model, temperature)\n",
    "embedded_model = get_embedding_model(model_name=embedding_model, fine_tuned_path=fine_tuned_path)\n",
    "\n",
    "Settings.llm = llm\n",
    "Settings.chunk_size = 1024\n",
    "Settings.embed_model = embedded_model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e92d21e3-8483-4f24-91cf-40a6c10d43c5",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5753c6ed-41a6-40b5-bc4f-477eb7c1d5c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open the persisted Chroma store and wrap its \"quickstart\" collection for llama_index.\n",
    "print(\"loading from vector database - chroma\")\n",
    "db = chromadb.PersistentClient(path=persisted_vector_db)\n",
    "chroma_collection = db.get_or_create_collection(\"quickstart\")\n",
    "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
    "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
    "\n",
    "# Build the index on top of the store; no documents are read in this cell.\n",
    "index = VectorStoreIndex.from_vector_store(vector_store=vector_store, storage_context=storage_context)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d91e2dda-cb74-4d85-adce-a4a72c53cc7d",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e4211bb2-aba9-4be2-b2f1-6fbd3f7e4223",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Chat memory capped at 15000 tokens, handed to the query engine below.\n",
    "memory = ChatMemoryBuffer.from_defaults(token_limit=15000)\n",
    "\n",
    "# Keyword options for the streaming query engine built from `index`.\n",
    "query_engine_options = {\n",
    "    \"memory\": memory,\n",
    "    \"system_prompt\": system_content,\n",
    "    \"similarity_top_k\": 8,\n",
    "    \"verbose\": True,\n",
    "    \"streaming\": True,\n",
    "}\n",
    "hi_content_engine = index.as_query_engine(**query_engine_options)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "007f8bf5-19c5-4462-b5f2-5f4ff30f593b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tool description passed to `QueryEngineTool.from_defaults` below.\n",
    "hi_textbook_query_description = \"\"\"\n",
    "    Use this tool to extract content from the query engine,\n",
    "    which is built by ingesting textbook content from `Health Insurance 7th Edition`,\n",
    "    that has 15 chapters in total. When user wants to learn more about a \n",
    "    particular chapter, this tool will help to assist user to get better\n",
    "    understanding of the content of the textbook.\n",
    "\"\"\"\n",
    "\n",
    "hi_query_tool = QueryEngineTool.from_defaults(\n",
    "    query_engine=hi_content_engine,\n",
    "    name=\"health_insurance_textbook_query_engine\",\n",
    "    description=hi_textbook_query_description,\n",
    ")\n",
    "\n",
    "# The textbook query tool plus the two Q&A tools imported from `qna_prompting`.\n",
    "agent_tools = [hi_query_tool, get_qna_question_tool, evaluate_qna_answer_tool]\n",
    "agent = OpenAIAgent.from_tools(\n",
    "    tools=agent_tools,\n",
    "    max_function_calls=1,\n",
    "    llm=llm,\n",
    "    verbose=True,\n",
    "    system_prompt=textbook_content,\n",
    ")\n",
    "\n",
    "print(\"loaded AI agent, let's begin the chat!\")\n",
    "print(\"=\" * 50)\n",
    "print(\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a2e42ad6-20fc-4f2e-a4ea-403e79b14ba4",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c62e817e-c7c8-4f90-9e32-217fec376565",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ask the query engine directly, without going through the agent.\n",
    "chapter_list_question = \"can you give me the list of chapters that `Health Insurance 7th Edition` covers\"\n",
    "response = hi_content_engine.query(chapter_list_question)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5902ffd2-2f66-4b89-bf7f-a05e3fdeccaa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print each streamed piece of the answer as it arrives, without line breaks in between.\n",
    "for piece in response.response_gen:\n",
    "    print(piece, end=\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0e75453b-85c7-4e1c-8683-6df45a13cacb",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0b97d90d-5c59-486f-863b-4aaa12ed0ea0",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4584aa46-b488-4535-9d69-2736c9dad170",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Send a short greeting through the agent; `tool_choice=\"auto\"` is passed along unchanged.\n",
    "greeting = \"hihi\"\n",
    "response = agent.stream_chat(greeting, tool_choice=\"auto\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eff8bb8d-a2d1-428a-9c3d-193389378288",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print each streamed piece of the agent's reply, without line breaks in between.\n",
    "for token in response.response_gen:\n",
    "    print(token, end=\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b7a504af-6499-4649-8e68-2a86d415e458",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}