|
from toolbox import update_ui |
|
from toolbox import CatchException, report_execption |
|
from .crazy_utils import read_and_clean_pdf_text |
|
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive |
|
# Module-level debug switch. It is not referenced by the functions visible in
# this part of the file; kept because other code may import it.
fast_debug = False
|
|
|
|
|
def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
    """Summarise one PDF fragment by fragment and publish the result as chat history.

    This is a generator and must be driven with ``yield from``. It:

    1. Reads the PDF through ``read_and_clean_pdf_text`` (full text plus the
       text of the first page) and drops characters that cannot be encoded
       as UTF-8.
    2. Splits the full text with a limit of 2500 tokens per fragment and the
       first page with a limit of 2500 // 4 tokens, counting tokens with the
       ``gpt-3.5-turbo`` tokenizer taken from ``model_info``.
    3. Takes as "paper meta" the part of the first first-page fragment that
       precedes the earliest of ``introduction`` / ``Introduction`` /
       ``INTRODUCTION``.
    4. Asks the model to recapitulate every fragment in fewer than
       ``4096 // n_fragment`` words, passing the previous answer along as
       history.
    5. Collects meta, all answers and a closing instruction, clips them with
       ``input_clipping(max_token_limit=3200)`` and hands them to
       ``update_ui`` as the new history.

    Args:
        file_name: Path handed to ``read_and_clean_pdf_text``.
        llm_kwargs: Forwarded unchanged to
            ``request_gpt_model_in_new_thread_with_ui_alive``.
        plugin_kwargs: Unused here; kept for the plugin call signature.
        chatbot: Conversation display; ``[question, answer]`` pairs are
            appended to it.
        history: Unused here; kept for the plugin call signature.
        system_prompt: Unused here; kept for the plugin call signature.

    Yields:
        Whatever ``update_ui`` and
        ``request_gpt_model_in_new_thread_with_ui_alive`` yield.
    """
    from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
    from .crazy_utils import input_clipping
    from request_llm.bridge_all import model_info

    print('begin analysis on:', file_name)

    # Step 0: extract the text and split it into token-bounded fragments.
    file_content, page_one = read_and_clean_pdf_text(file_name)
    # Round-trip through UTF-8 with errors='ignore' so that code points that
    # cannot be encoded (e.g. lone surrogates) are silently dropped.
    file_content = file_content.encode('utf-8', 'ignore').decode()
    page_one = str(page_one).encode('utf-8', 'ignore').decode()

    TOKEN_LIMIT_PER_FRAGMENT = 2500
    enc = model_info["gpt-3.5-turbo"]['tokenizer']

    def get_token_num(txt):
        # disallowed_special=() makes special-token text count as ordinary text
        # instead of raising.
        return len(enc.encode(txt, disallowed_special=()))

    paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
        txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
    page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
        txt=page_one, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT // 4)

    # Everything on the first page that precedes the introduction heading is
    # treated as the paper's meta information. Fall back to an empty string
    # if the splitter returned no fragment at all.
    paper_meta = page_one_fragments[0] if page_one_fragments else ''
    for marker in ('introduction', 'Introduction', 'INTRODUCTION'):
        paper_meta = paper_meta.split(marker)[0]

    # Step 1: tell the user what is about to happen.
    final_results = [paper_meta]
    chatbot.append(['首先你在英文语境下通读整篇论文。', "[Local Message] 收到。"])
    yield from update_ui(chatbot=chatbot, history=[])

    # Step 2: summarise fragment after fragment, feeding each answer into the
    # next request as context.
    iteration_results = []
    last_iteration_result = paper_meta
    MAX_WORD_TOTAL = 4096
    n_fragment = len(paper_fragments)
    if n_fragment >= 20:
        print('文章极长,不能达到预期效果')
    # Word budget per fragment; loop-invariant. max(..., 1) keeps the hoisted
    # division safe when there are no fragments (the loop then does not run).
    NUM_OF_WORD = MAX_WORD_TOTAL // max(n_fragment, 1)
    for i, fragment in enumerate(paper_fragments):
        i_say = f"Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {fragment}"
        # Only the first 200 characters of the fragment are shown in the UI.
        i_say_show_user = f"[{i+1}/{n_fragment}] Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {fragment[:200]}"
        gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
            i_say, i_say_show_user,
            llm_kwargs, chatbot,
            history=["The main idea of the previous section is?", last_iteration_result],
            sys_prompt="Extract the main idea of this section."
        )
        iteration_results.append(gpt_say)
        last_iteration_result = gpt_say

    # Step 3: assemble the history for follow-up questions.
    closing_instruction = '接下来,你是一名专业的学术教授,利用以上信息,使用中文回答我的问题。'
    final_results.extend(iteration_results)
    final_results.append(closing_instruction)
    chatbot.append([closing_instruction, "[Local Message] 收到。"])

    # Step 4: clip the assembled history to the token budget and publish it.
    _, final_results = input_clipping("", final_results, max_token_limit=3200)
    yield from update_ui(chatbot=chatbot, history=final_results)
|
|
|
|
|
@CatchException
def 理解PDF文档内容标准文件输入(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
    """Plugin entry point: locate a PDF from the input box and run ``解析PDF`` on it.

    This is a generator and must be driven with ``yield from``. It announces
    the plugin in the chat, checks that ``fitz`` can be imported, resolves
    ``txt`` to a PDF and delegates to ``解析PDF`` with an empty history.
    Every failure is reported through ``report_execption`` followed by a UI
    refresh, after which the generator returns.

    Args:
        txt: Path typed by the user. Either a folder that is searched
            recursively for ``*.pdf`` (the first match is used) or the path
            of a single ``.pdf`` file.
        llm_kwargs: Forwarded to ``解析PDF``.
        plugin_kwargs: Forwarded to ``解析PDF``.
        chatbot: Conversation display; messages are appended to it.
        history: Used for the first UI refreshes, then replaced by an empty
            list so earlier conversation does not leak into the analysis.
        system_prompt: Forwarded to ``解析PDF``.
        web_port: Unused here; kept for the plugin call signature.

    Yields:
        Whatever ``update_ui`` and ``解析PDF`` yield.
    """
    import glob
    import os

    # Basic information: what the plugin does and who contributed it.
    chatbot.append([
        "函数插件功能?",
        "理解PDF论文内容,并且将结合上下文内容,进行学术解答。函数插件贡献者: Hanzoe, binary-husky"])
    yield from update_ui(chatbot=chatbot, history=history)

    # Probe the optional dependency. ``except Exception`` still reports any
    # import-time failure, but unlike a bare ``except`` it does not intercept
    # GeneratorExit / KeyboardInterrupt / SystemExit.
    try:
        import fitz  # noqa: F401  (availability check only)
    except Exception:
        report_execption(chatbot, history,
                         a=f"解析项目: {txt}",
                         b="导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pymupdf```。")
        yield from update_ui(chatbot=chatbot, history=history)
        return

    # Start from a clean history.
    history = []

    # The input must point at something that exists on disk.
    if not os.path.exists(txt):
        if txt == "":
            txt = '空空如也的输入栏'
        report_execption(chatbot, history,
                         a=f"解析项目: {txt}", b=f"找不到本地项目或无权访问: {txt}")
        yield from update_ui(chatbot=chatbot, history=history)
        return
    project_folder = txt

    if os.path.isfile(project_folder) and project_folder.lower().endswith('.pdf'):
        # A direct path to a PDF is accepted as-is; globbing below a file
        # path could never match anything.
        file_manifest = [project_folder]
    else:
        # glob.escape keeps '[', ']', '*' and '?' in the user's folder name
        # from being interpreted as wildcard syntax.
        file_manifest = glob.glob(f'{glob.escape(project_folder)}/**/*.pdf', recursive=True)

    if not file_manifest:
        # Only PDFs are searched for, so the message names only that type.
        report_execption(chatbot, history,
                         a=f"解析项目: {txt}", b=f"找不到任何.pdf文件: {txt}")
        yield from update_ui(chatbot=chatbot, history=history)
        return

    # Only the first PDF found is analysed.
    txt = file_manifest[0]
    yield from 解析PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
|
|