# A monkey patch to use llama-index for completions
import os
import time
import gradio as gr
from functools import wraps
from threading import Lock
from typing import Union
import src.translation_agent.utils as utils
from llama_index.llms.groq import Groq
from llama_index.llms.cohere import Cohere
from llama_index.llms.openai import OpenAI
from llama_index.llms.together import TogetherLLM
from llama_index.llms.ollama import Ollama
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
from llama_index.core import Settings
from llama_index.core.llms import ChatMessage
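# Default requests-per-minute cap; model_load() overwrites this via its rpm argument.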
RPM = 60
# Add your LLMs here
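# Instantiate the LLM for the selected endpoint and store it (together with the
# context window and output budget) in llama-index's global Settings, so that
# get_completion below can pick it up.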
def model_load(
    endpoint: str,
    model: str,
    api_key: str = None,
    context_window: int = 4096,
    num_output: int = 512,
    rpm: int = RPM,
):
    if endpoint == "Groq":
        llm = Groq(
            model=model,
            api_key=api_key if api_key else os.getenv("GROQ_API_KEY"),
        )
    elif endpoint == "Cohere":
        llm = Cohere(
            model=model,
            api_key=api_key if api_key else os.getenv("COHERE_API_KEY"),
        )
    elif endpoint == "OpenAI":
        llm = OpenAI(
            model=model,
            api_key=api_key if api_key else os.getenv("OPENAI_API_KEY"),
        )
    elif endpoint == "TogetherAI":
        llm = TogetherLLM(
            model=model,
            api_key=api_key if api_key else os.getenv("TOGETHER_API_KEY"),
        )
    elif endpoint == "Ollama":
        llm = Ollama(
            model=model,
            request_timeout=120.0,
        )
    elif endpoint == "Huggingface":
        llm = HuggingFaceInferenceAPI(
            model_name=model,
            token=api_key if api_key else os.getenv("HF_TOKEN"),
            task="text-generation",
        )
    global RPM
    RPM = rpm
    Settings.llm = llm
    # maximum input size to the LLM
    Settings.context_window = context_window
    # number of tokens reserved for text generation
    Settings.num_output = num_output
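# Simple thread-safe rate limiter: a lock serializes calls, and each call sleeps just
# long enough to keep the gap between calls at 60 / max_per_minute seconds. The limit
# is read through a callable so later changes to the global RPM take effect immediately.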
def rate_limit(get_max_per_minute):
    def decorator(func):
        lock = Lock()
        last_called = [0.0]

        @wraps(func)
        def wrapper(*args, **kwargs):
            with lock:
                max_per_minute = get_max_per_minute()
                min_interval = 60.0 / max_per_minute
                elapsed = time.time() - last_called[0]
                left_to_wait = min_interval - elapsed
                if left_to_wait > 0:
                    time.sleep(left_to_wait)
                ret = func(*args, **kwargs)
                last_called[0] = time.time()
                return ret

        return wrapper

    return decorator
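# The lambda defers reading RPM until each call, so the limit set by model_load applies.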
@rate_limit(lambda: RPM)
def get_completion(
    prompt: str,
    system_message: str = "You are a helpful assistant.",
    temperature: float = 0.3,
    json_mode: bool = False,
) -> Union[str, dict]:
    """
    Generate a completion using the LLM configured in llama-index Settings.

    Args:
        prompt (str): The user's prompt or query.
        system_message (str, optional): The system message that sets the context for the assistant.
            Defaults to "You are a helpful assistant.".
        temperature (float, optional): The sampling temperature controlling the randomness of the generated text.
            Defaults to 0.3.
        json_mode (bool, optional): Whether to ask the model to respond with a JSON object.
            Defaults to False.

    Returns:
        Union[str, dict]: The generated completion. In both modes the message content of the
        chat response is returned; with json_mode=True that content is a JSON-formatted string.
    """
    llm = Settings.llm
    if llm.class_name() == "HuggingFaceInferenceAPI":
        llm.system_prompt = system_message
        messages = [
            ChatMessage(role="user", content=prompt),
        ]
        try:
            response = llm.chat(
                messages=messages,
                temperature=temperature,
            )
            return response.message.content
        except Exception as e:
            raise gr.Error(f"An unexpected error occurred: {e}")
    else:
        messages = [
            ChatMessage(role="system", content=system_message),
            ChatMessage(role="user", content=prompt),
        ]
        if json_mode:
            response = llm.chat(
                temperature=temperature,
                response_format={"type": "json_object"},
                messages=messages,
            )
            return response.message.content
        else:
            try:
                response = llm.chat(
                    temperature=temperature,
                    messages=messages,
                )
                return response.message.content
            except Exception as e:
                raise gr.Error(f"An unexpected error occurred: {e}")
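# The monkey patch itself: swap utils.get_completion for the llama-index version above,
# then re-export the translation helpers so they go through it.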
utils.get_completion = get_completion
one_chunk_initial_translation = utils.one_chunk_initial_translation
one_chunk_reflect_on_translation = utils.one_chunk_reflect_on_translation
one_chunk_improve_translation = utils.one_chunk_improve_translation
one_chunk_translate_text = utils.one_chunk_translate_text
num_tokens_in_string = utils.num_tokens_in_string
multichunk_initial_translation = utils.multichunk_initial_translation
multichunk_reflect_on_translation = utils.multichunk_reflect_on_translation
multichunk_improve_translation = utils.multichunk_improve_translation
multichunk_translation = utils.multichunk_translation
calculate_chunk_size = utils.calculate_chunk_size
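
# A minimal usage sketch (not part of the original module): the endpoint string is one of
# the branches handled by model_load, the model name is an illustrative placeholder, and
# GROQ_API_KEY is assumed to be set in the environment.
if __name__ == "__main__":
    model_load(endpoint="Groq", model="llama3-70b-8192", rpm=30)
    print(get_completion("Translate 'hello' into French.", temperature=0.3))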