|
import os |
|
import numpy as np |
|
import pandas as pd |
|
import torch |
|
from matplotlib import pyplot as plt |
|
|
|
from evaluate_params import eval_func_param_names, eval_extra_columns, input_args_list |
|
from gen import evaluate, check_locals, score_qa |
|
from prompter import Prompter |
|
from utils import clear_torch_cache, NullContext, get_kwargs, makedirs |
|
|
|
|
|
def run_eval( |
|
base_model=None, lora_weights=None, inference_server=None, |
|
regenerate_clients=None, regenerate_gradio_clients=None, validate_clients=None, fail_if_invalid_client=None, |
|
prompt_type=None, prompt_dict=None, chat_template=None, system_prompt=None, |
|
debug=None, chat=False, |
|
stream_output=None, enable_caching=None, async_output=None, num_async=None, stream_map=None, |
|
eval_filename=None, eval_prompts_only_num=None, eval_prompts_only_seed=None, eval_as_output=None, |
|
examples=None, memory_restriction_level=None, |
|
|
|
n_jobs=None, llamacpp_path=None, llamacpp_dict=None, exllama_dict=None, gptq_dict=None, attention_sinks=None, |
|
sink_dict=None, truncation_generation=None, |
|
hf_model_dict=None, |
|
force_seq2seq_type=None, force_t5_type=None, |
|
load_exllama=None, |
|
|
|
force_streaming_on_to_handle_timeouts=None, |
|
|
|
use_pymupdf=None, |
|
use_unstructured_pdf=None, |
|
use_pypdf=None, |
|
enable_pdf_ocr=None, |
|
enable_pdf_doctr=None, |
|
enable_image=None, |
|
visible_image_models=None, |
|
image_size=None, |
|
image_quality=None, |
|
image_guidance_scale=None, |
|
image_num_inference_steps=None, |
|
|
|
try_pdf_as_html=None, |
|
|
|
load_awq='', |
|
temperature=None, |
|
top_p=None, |
|
top_k=None, |
|
penalty_alpha=None, |
|
num_beams=None, |
|
max_new_tokens=None, |
|
min_new_tokens=None, |
|
early_stopping=None, |
|
max_time=None, |
|
repetition_penalty=None, |
|
num_return_sequences=None, |
|
do_sample=None, |
|
seed=None, |
|
langchain_mode=None, |
|
langchain_action=None, |
|
langchain_agents=[], |
|
top_k_docs=None, |
|
chunk=None, |
|
chunk_size=None, |
|
document_subset=None, |
|
document_choice=None, |
|
document_source_substrings=None, |
|
document_source_substrings_op=None, |
|
document_content_substrings=None, |
|
document_content_substrings_op=None, |
|
pre_prompt_query=None, prompt_query=None, |
|
pre_prompt_summary=None, prompt_summary=None, hyde_llm_prompt=None, |
|
all_docs_start_prompt=None, |
|
all_docs_finish_prompt=None, |
|
|
|
user_prompt_for_fake_system_prompt=None, |
|
json_object_prompt=None, |
|
json_object_prompt_simpler=None, |
|
json_code_prompt=None, |
|
json_code_prompt_if_no_schema=None, |
|
json_schema_instruction=None, |
|
json_preserve_system_prompt=None, |
|
json_object_post_prompt_reminder=None, |
|
json_code_post_prompt_reminder=None, |
|
json_code2_post_prompt_reminder=None, |
|
|
|
image_audio_loaders=None, |
|
pdf_loaders=None, |
|
url_loaders=None, |
|
jq_schema=None, |
|
extract_frames=None, |
|
extract_frames0=None, |
|
guided_whitespace_pattern0=None, |
|
metadata_in_context0=None, |
|
llava_prompt=None, |
|
visible_models=None, |
|
h2ogpt_key=None, |
|
add_search_to_context=None, |
|
chat_conversation=None, |
|
text_context_list=None, |
|
docs_ordering_type=None, |
|
min_max_new_tokens=None, |
|
max_input_tokens=None, |
|
max_total_input_tokens=None, |
|
docs_token_handling=None, |
|
docs_joiner=None, |
|
hyde_level=None, |
|
hyde_template=None, |
|
hyde_show_only_final=None, |
|
hyde_show_intermediate_in_accordion=None, |
|
map_reduce_show_intermediate_in_accordion=None, |
|
doc_json_mode=None, |
|
metadata_in_context=None, |
|
chatbot_role=None, |
|
speaker=None, |
|
tts_language=None, |
|
tts_speed=None, |
|
image_file=None, |
|
image_control=None, |
|
images_num_max=None, |
|
image_resolution=None, |
|
image_format=None, |
|
rotate_align_resize_image=None, |
|
video_frame_period=None, |
|
image_batch_image_prompt=None, |
|
image_batch_final_prompt=None, |
|
image_batch_stream=None, |
|
visible_vision_models=None, |
|
video_file=None, |
|
|
|
response_format=None, |
|
guided_json=None, |
|
guided_regex=None, |
|
guided_choice=None, |
|
guided_grammar=None, |
|
guided_whitespace_pattern=None, |
|
client_metadata=None, |
|
|
|
|
|
captions_model=None, |
|
caption_loader=None, |
|
doctr_loader=None, |
|
pix2struct_loader=None, |
|
llava_model=None, |
|
image_model_dict=None, |
|
|
|
asr_model=None, |
|
asr_loader=None, |
|
|
|
image_audio_loaders_options0=None, |
|
pdf_loaders_options0=None, |
|
url_loaders_options0=None, |
|
jq_schema0=None, |
|
keep_sources_in_context=None, |
|
gradio_errors_to_chatbot=None, |
|
allow_chat_system_prompt=None, |
|
src_lang=None, tgt_lang=None, concurrency_count=None, save_dir=None, sanitize_bot_response=None, |
|
model_state0=None, |
|
use_auth_token=None, |
|
trust_remote_code=None, |
|
score_model_state0=None, |
|
max_max_new_tokens=None, |
|
is_public=None, |
|
max_max_time=None, |
|
raise_generate_gpu_exceptions=None, load_db_if_exists=None, use_llm_if_no_docs=None, |
|
my_db_state0=None, selection_docs_state0=None, dbs=None, langchain_modes=None, langchain_mode_paths=None, |
|
detect_user_path_changes_every_query=None, |
|
use_openai_embedding=None, use_openai_model=None, |
|
hf_embedding_model=None, migrate_embedding_model=None, |
|
cut_distance=None, |
|
answer_with_sources=None, |
|
append_sources_to_answer=None, |
|
append_sources_to_chat=None, |
|
sources_show_text_in_accordion=None, |
|
top_k_docs_max_show=None, |
|
show_link_in_sources=None, |
|
langchain_instruct_mode=None, |
|
add_chat_history_to_context=None, |
|
context=None, iinput=None, |
|
db_type=None, first_para=None, text_limit=None, verbose=None, |
|
gradio=None, cli=None, |
|
use_cache=None, |
|
auto_reduce_chunks=None, max_chunks=None, headsize=None, |
|
model_lock=None, force_langchain_evaluate=None, |
|
model_state_none=None, |
|
): |
|
from_ui = False |
|
|
|
answer_with_sources = False |
|
show_link_in_sources = False |
|
append_sources_to_answer = False |
|
append_sources_to_chat = False |
|
|
|
check_locals(**locals().copy()) |
|
|
|
if not context: |
|
context = '' |
|
|
|
if eval_prompts_only_num > 0: |
|
np.random.seed(eval_prompts_only_seed) |
|
example1 = examples[-1] |
|
examples = [] |
|
responses = [] |
|
if eval_filename is None: |
|
|
|
eval_filename = 'ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json' |
|
if not os.path.isfile(eval_filename): |
|
os.system( |
|
'wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' % eval_filename) |
|
import json |
|
with open(eval_filename, 'r', encoding='utf-8') as f: |
|
data = json.load(f) |
|
|
|
turn_start = 0 |
|
data = [x for x in data if len(x['conversations']) > turn_start + 1 and |
|
x['conversations'][turn_start]['from'] == 'human' and |
|
x['conversations'][turn_start + 1]['from'] == 'gpt'] |
|
for i in sorted(np.random.randint(0, len(data), size=eval_prompts_only_num)): |
|
assert data[i]['conversations'][turn_start]['from'] == 'human' |
|
instruction = data[i]['conversations'][turn_start]['value'] |
|
assert data[i]['conversations'][turn_start + 1]['from'] == 'gpt' |
|
output = data[i]['conversations'][turn_start + 1]['value'] |
|
examplenew = example1.copy() |
|
assert not chat, "No gradio must use chat=False, uses nochat instruct" |
|
examplenew[eval_func_param_names.index('instruction_nochat')] = instruction |
|
examplenew[eval_func_param_names.index('iinput_nochat')] = iinput |
|
examplenew[eval_func_param_names.index('context')] = context |
|
examples.append(examplenew) |
|
responses.append(output) |
|
else: |
|
|
|
|
|
import json |
|
with open(eval_filename, 'r', encoding='utf-8') as f: |
|
data = json.load(f) |
|
for i in sorted(np.random.randint(0, len(data), size=eval_prompts_only_num)): |
|
examplenew = example1.copy() |
|
instruction = data[i]['instruction'] |
|
output = data[i].get('output', '') |
|
assert not chat, "No gradio must use chat=False, uses nochat instruct" |
|
examplenew[eval_func_param_names.index('instruction_nochat')] = instruction |
|
examplenew[eval_func_param_names.index('iinput_nochat')] = iinput |
|
examplenew[eval_func_param_names.index('context')] = context |
|
examples.append(examplenew) |
|
responses.append(output) |
|
|
|
num_examples = len(examples) |
|
scoring_path = 'scoring' |
|
|
|
scoring_path = makedirs(scoring_path, tmp_ok=True, use_base=True) |
|
if eval_as_output: |
|
used_base_model = 'gpt35' |
|
used_lora_weights = '' |
|
used_inference_server = '' |
|
else: |
|
used_base_model = str(base_model.split('/')[-1]) |
|
used_lora_weights = str(lora_weights.split('/')[-1]) |
|
used_inference_server = str(inference_server.split('/')[-1]) |
|
eval_out_filename = "df_scores_%s_%s_%s_%s_%s_%s_%s.parquet" % (num_examples, eval_prompts_only_num, |
|
eval_prompts_only_seed, |
|
eval_as_output, |
|
used_base_model, |
|
used_lora_weights, |
|
used_inference_server, |
|
) |
|
eval_out_filename = os.path.join(scoring_path, eval_out_filename) |
|
|
|
smodel = score_model_state0['model'] |
|
stokenizer = score_model_state0['tokenizer'] |
|
sdevice = score_model_state0['device'] |
|
|
|
|
|
n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0 |
|
device = 'cpu' if n_gpus == 0 else 'cuda' |
|
context_class = NullContext if n_gpus > 1 or n_gpus == 0 else torch.device |
|
|
|
with context_class(device): |
|
|
|
assert not stream_output, "stream_output=True does not make sense with example loop" |
|
import time |
|
from functools import partial |
|
|
|
if not eval_as_output: |
|
requests_state0 = {} |
|
roles_state0 = None |
|
args = (None, my_db_state0, selection_docs_state0, requests_state0, roles_state0) |
|
assert len(args) == len(input_args_list) |
|
fun = partial(evaluate, |
|
*args, |
|
**get_kwargs(evaluate, exclude_names=input_args_list + eval_func_param_names, |
|
**locals().copy())) |
|
else: |
|
assert eval_prompts_only_num > 0 |
|
|
|
def get_response(*args, exi=0): |
|
|
|
yield responses[exi] |
|
|
|
fun = get_response |
|
t0 = time.time() |
|
score_dump = [] |
|
score_avg = 0 |
|
score_median = 0 |
|
|
|
for exi, ex in enumerate(examples): |
|
clear_torch_cache(allow_skip=True) |
|
|
|
instruction = ex[eval_func_param_names.index('instruction_nochat')] |
|
iinput = ex[eval_func_param_names.index('iinput_nochat')] |
|
context = ex[eval_func_param_names.index('context')] |
|
clear_torch_cache(allow_skip=True) |
|
print("") |
|
print("START" + "=" * 100) |
|
print("Question: %s %s" % (instruction, ('input=%s' % iinput if iinput else ''))) |
|
print("-" * 105) |
|
|
|
|
|
t1 = time.time() |
|
|
|
|
|
eval_vars = ex.copy() |
|
for k in eval_func_param_names: |
|
if k in locals().copy(): |
|
eval_vars[eval_func_param_names.index(k)] = locals().copy()[k] |
|
|
|
gener = fun(*tuple(eval_vars), exi=exi) if eval_as_output else fun(*tuple(eval_vars)) |
|
for res_fun in gener: |
|
res = res_fun['response'] |
|
sources = res_fun.get('sources', 'Failure of Generation') |
|
print(res) |
|
if smodel: |
|
score_with_prompt = False |
|
if score_with_prompt: |
|
data_point = dict(instruction=instruction, input=iinput, context=context) |
|
prompter = Prompter(prompt_type, prompt_dict, |
|
debug=debug, stream_output=stream_output, base_model=base_model) |
|
prompt = prompter.generate_prompt(data_point, context_from_history=False, image_file=image_file) |
|
else: |
|
|
|
if eval_prompts_only_num > 0: |
|
|
|
assert iinput in [None, ''], iinput |
|
prompt = instruction |
|
score = score_qa(smodel, stokenizer, prompt, res, memory_restriction_level=memory_restriction_level) |
|
score_dump.append(ex + [prompt, res, score, sources]) |
|
|
|
df_scores = pd.DataFrame(score_dump, |
|
columns=eval_func_param_names + |
|
eval_extra_columns) |
|
df_scores.to_parquet(eval_out_filename, index=False) |
|
if not isinstance(score, str): |
|
|
|
plt.figure(figsize=(10, 10)) |
|
plt.hist(df_scores['score'], bins=20) |
|
score_avg = np.mean(df_scores['score']) |
|
score_median = np.median(df_scores['score']) |
|
print("SCORE %s: %s So far: AVG: %s MEDIAN: %s" % (exi, score, score_avg, score_median), |
|
flush=True) |
|
plt.title("Score avg: %s median: %s" % (score_avg, score_median)) |
|
plt.savefig(eval_out_filename.replace('.parquet', '.png')) |
|
plt.close() |
|
|
|
print("END" + "=" * 102) |
|
print("") |
|
t2 = time.time() |
|
print("Time taken for example: %s Time taken so far: %.4f about %.4g per example" % ( |
|
t2 - t1, t2 - t0, (t2 - t0) / (1 + exi))) |
|
t1 = time.time() |
|
print("Total time taken: %.4f about %.4g per example" % (t1 - t0, (t1 - t0) / num_examples)) |
|
print("Score avg: %s median: %s" % (score_avg, score_median), flush=True) |
|
return eval_out_filename |
|
|