from llama_index import (
    LLMPredictor,
    PromptHelper,
    StorageContext,
    ServiceContext,
    load_index_from_storage,
    SimpleDirectoryReader,
    GPTVectorStoreIndex,
)
from langchain.chat_models import ChatOpenAI
import gradio as gr
import sys
import os
import openai
from ratelimit import limits, sleep_and_retry
from langchain import HuggingFaceHub  # not used below; kept for the Hugging Face deployment notes

# fixing bugs
# 1. OpenAI key: https://stackoverflow.com/questions/76425556/tenacity-retryerror-retryerrorfuture-at-0x7f89bc35eb90-state-finished-raised
# 2. rate limit error in the default langchain version - install langchain==0.0.188. https://github.com/jerryjliu/llama_index/issues/924
# 3. set the Config variable to True in langchain: https://github.com/pydantic/pydantic/issues/3320
# 4. deploy on Hugging Face Spaces: https://huggingface.co/welcome
#    create a Hugging Face token: https://huggingface.co/settings/tokens
#    login: huggingface-cli login
#    add a requirements.txt file: https://huggingface.co/docs/hub/spaces-dependencies

# Read the API key from the "openai_key" secret/environment variable
os.environ["OPENAI_API_KEY"] = os.environ.get("openai_key")
openai.api_key = os.environ["OPENAI_API_KEY"]

# Define the rate limit for API calls (requests per second)
RATE_LIMIT = 3


# Rate-limited construction of the ServiceContext
@sleep_and_retry
@limits(calls=RATE_LIMIT, period=1)
def create_service_context():
    max_input_size = 4096
    num_outputs = 512
    max_chunk_overlap = 20  # only used by the older PromptHelper signature shown below
    chunk_size_limit = 600

    # Older signature:
    # prompt_helper = PromptHelper(max_input_size, num_outputs, max_chunk_overlap, chunk_size_limit=chunk_size_limit)
    prompt_helper = PromptHelper(
        max_input_size,
        num_outputs,
        chunk_overlap_ratio=0.1,
        chunk_size_limit=chunk_size_limit,
    )

    # LLMPredictor is a wrapper class around LangChain's LLMChain that allows easy integration into LlamaIndex
    # llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0.7, model_name="gpt-4", max_tokens=num_outputs))
    llm_predictor = LLMPredictor(
        llm=ChatOpenAI(temperature=0.5, model_name="gpt-3.5-turbo", max_tokens=num_outputs)
    )

    # constructs the service_context
    service_context = ServiceContext.from_defaults(
        llm_predictor=llm_predictor, prompt_helper=prompt_helper
    )
    return service_context


# Rate-limited data ingestion and indexing
@sleep_and_retry
@limits(calls=RATE_LIMIT, period=1)
def data_ingestion_indexing(directory_path):
    # loads data from the specified directory path
    documents = SimpleDirectoryReader(directory_path).load_data()

    # when first building the index
    index = GPTVectorStoreIndex.from_documents(
        documents, service_context=create_service_context()
    )

    # persist index to disk, default "storage" folder
    index.storage_context.persist()
    return index


def data_querying(input_text):
    # rebuild the storage context
    storage_context = StorageContext.from_defaults(persist_dir="./storage")

    # loads the index from storage
    index = load_index_from_storage(storage_context, service_context=create_service_context())

    # queries the index with the input text
    response = index.as_query_engine().query(input_text)
    return response.response


# Note: gr.components.Textbox does not take a CSS `style` keyword; output height is
# controlled with `lines` instead.
iface = gr.Interface(
    fn=data_querying,
    inputs=gr.components.Textbox(lines=20, label="Enter your question"),
    outputs=gr.components.Textbox(lines=25, label="Response"),
    title="Therapy Validation GPT 0.1 pre alpha",
)

# passes in the data directory
index = data_ingestion_indexing("book-validation")

iface.launch(inline=True)
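
# A minimal requirements.txt sketch for the Hugging Face Spaces deployment mentioned in the
# notes above. Package names follow the imports in this script and the langchain pin comes
# from bug note 2; leaving the other packages unpinned is an assumption, so pin whatever
# versions match your environment:
#
#   llama-index
#   langchain==0.0.188
#   openai
#   gradio
#   ratelimit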