Spaces:

firefighter
/

PdfSumGPT

Sleeping

Qifan Zhang committed on Mar 5, 2023

Commit

f89a7d8

•

1 Parent(s): d487adb

add read web, text; update truncation

Files changed (6) hide show

.gitignore CHANGED Viewed

app.py CHANGED Viewed

@@ -2,16 +2,46 @@ import gradio as gr
 from utils.chatgpt import ChatGPTAPI
 from utils.read_pdf import read_pdf
-def process(api_key: str = '', prompt: str = '', file=None) -> str:
     chatgpt = ChatGPTAPI(api_key, max_input_length=1024)
-    pdf_contents = read_pdf(file.name)
-    pdf_str = '\n'.join(pdf_contents)
-    content = prompt + '\n' + pdf_str
     response = chatgpt(content)
     return response
-gr.Interface(fn=process, inputs=["text", "text", "file"], outputs="text").launch()

 from utils.chatgpt import ChatGPTAPI
 from utils.read_pdf import read_pdf
+from utils.read_web import read_web
+from utils.truncate import truncate_string
+def file2str(filepath: str) -> str:
+    if not filepath:
+        return ''
+    if filepath.endswith('.pdf'):
+        content_list = read_pdf(filepath)
+        text = '\n'.join(content_list)
+    elif filepath.endswith('.txt'):
+        with open(filepath, 'r') as f:
+            text = f.readlines()
+    else:
+        raise Exception('File type not supported')
+    text = truncate_string(text, max_length=1024)
+    return text
+def process(api_key: str = '', prompt: str = '', file=None, url='') -> str:
     chatgpt = ChatGPTAPI(api_key, max_input_length=1024)
+    file_text = file2str(file.name) if file else ''
+    web_txt = read_web(url)
+    web_txt = truncate_string(web_txt, max_length=1024)
+    content = prompt + '\n' + file_text + '\n' + web_txt
     response = chatgpt(content)
     return response
+prompt_input = gr.components.Textbox(
+    value='用中文总结下面的文章',
+    lines=2,
+    type="text"
+)
+app = gr.Interface(
+    fn=process,
+    inputs=["text", prompt_input, "file", "text"],
+    outputs="text"
+)
+app.launch()

requirements.txt CHANGED Viewed

@@ -1,5 +1,9 @@
 openai
 gradio
 pypdf
-tiktoken

 openai
+tiktoken
 gradio
 pypdf
+requests
+bs4

utils/chatgpt.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import openai
-import tiktoken
 class ChatGPTAPI:
@@ -12,17 +11,10 @@ class ChatGPTAPI:
         openai.api_key = api_key
         self.max_input_length = max_input_length
-        self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
-    def truncate_string(self, s):
-        e = self.encoding.encode(s)[:self.max_input_length]
-        s = self.encoding.decode(e)
-        return s
     def __call__(self, content: str):
         assert isinstance(content, str), 'ChatGPT Error: content must be a string'
         content = content.strip()
-        content = self.truncate_string(content)
         messages = [{'role': 'user', 'content': content}]
         try:
             resp = openai.ChatCompletion.create(

 import openai
 class ChatGPTAPI:
         openai.api_key = api_key
         self.max_input_length = max_input_length
     def __call__(self, content: str):
         assert isinstance(content, str), 'ChatGPT Error: content must be a string'
         content = content.strip()
         messages = [{'role': 'user', 'content': content}]
         try:
             resp = openai.ChatCompletion.create(

utils/read_web.py ADDED Viewed

+import re
+import requests
+from bs4 import BeautifulSoup
+def read_web(url: str) -> str:
+    if not url:
+        return ''
+    resp = requests.get(url)
+    soup = BeautifulSoup(resp.text, 'html.parser')
+    text = soup.get_text()
+    text = re.sub('\n{3,}', '\n\n', text)
+    return text
+if __name__ == '__main__':
+    r = read_web('https://en.wikipedia.org/wiki/Wiki')
+    print(r)

utils/truncate.py ADDED Viewed

+import tiktoken
+encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
+def truncate_string(s, max_length=1024) -> str:
+    e = encoding.encode(s)[:max_length]
+    s = encoding.decode(e)
+    return s