"""Streamlit app that summarizes uploaded PDF papers and suggests topic tags.

Text is extracted locally with PyPDF2. The summary and the tags are requested
from two models hosted behind the Hugging Face Inference API.
"""

import io
import logging
import os

import PyPDF2
import requests
import streamlit as st
from streamlit.components.v1 import html
# NOTE: only referenced by a local-pipeline fallback that is currently disabled;
# summaries come exclusively from the hosted Inference API.
from transformers import pipeline  # noqa: F401

logger = logging.getLogger(__name__)

API_URL_TAGS = "https://api-inference.huggingface.co/models/fabiochiu/t5-base-tag-generation"
API_URL_SUMMARY = "https://api-inference.huggingface.co/models/yasminesarraj/flan-t5-small-samsum"

# Upper bound for a single Inference API call so that a stalled connection
# cannot block the Streamlit script run indefinitely.
REQUEST_TIMEOUT_SECONDS = 60

headers = {"Authorization": f"Bearer {st.secrets['HF_AUTH']}"}


def _extract_pages_text(pdf_stream):
    """Return the text of every page of the PDF in *pdf_stream*, space-joined."""
    pdf_reader = PyPDF2.PdfReader(pdf_stream)
    # `or ""` keeps the join safe should a page yield no text object at all.
    return " ".join(page.extract_text() or "" for page in pdf_reader.pages)


def get_pdf_text(pdf_path):
    """Extract the text of all pages of a PDF.

    Args:
        pdf_path: Path of the PDF file, or an already opened binary stream
            (any object with a ``read`` method) that holds the PDF data.

    Returns:
        The text of all pages, joined by single spaces.
    """
    if hasattr(pdf_path, "read"):
        return _extract_pages_text(pdf_path)
    # The context manager closes the file even when extraction raises.
    with open(pdf_path, "rb") as pdf_file:
        return _extract_pages_text(pdf_file)


def _post_json(api_url, payload):
    """POST *payload* as JSON to *api_url* and return the decoded JSON reply."""
    response = requests.post(
        api_url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT_SECONDS
    )
    return response.json()


def create_tags(payload):
    """Send *payload* to the tag-generation model and return its JSON reply."""
    return _post_json(API_URL_TAGS, payload)


def summarize_text(payload):
    """Send *payload* to the summarization model and return its JSON reply."""
    return _post_json(API_URL_SUMMARY, payload)


def _request_field(query, payload, *keys):
    """Call *query* with *payload* and pick a field from its first result.

    Args:
        query: ``create_tags`` or ``summarize_text``.
        payload: JSON-serializable request body passed to *query*.
        *keys: Field names to look for, in order of preference.

    Returns:
        The value of the first of *keys* present in the first result, or
        ``None`` when the request fails, the reply is not valid JSON, or the
        reply is not a non-empty list of dicts (e.g. an error object).
    """
    try:
        first_result = query(payload)[0]
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Best effort: the UI shows an "unavailable" notice instead of crashing.
        logger.exception("Inference request via %s failed", query.__name__)
        return None
    if not isinstance(first_result, dict):
        return None
    for key in keys:
        if key in first_result:
            return first_result[key]
    logger.warning(
        "Reply of %s contains none of the expected fields %s", query.__name__, keys
    )
    return None


# Start of the app code
tab_your_paper, tab_general_topics = st.tabs(["Summarize your paper(s)", "Research topics"])

with tab_your_paper:
    html("", height=10)
    st.markdown(""" ### Simply upload one or multiple PDFs and we summarize the content for you! """)
    pdf_files = st.file_uploader(
        "Upload your paper as a pdf",
        type=[".pdf"],
        accept_multiple_files=True,
        help=(
            "You can summarize one or also multiple papers at once. "
            "The file format needs to be a pdf."
        ),
    )

    if pdf_files:
        # Parse every upload straight from memory. Nothing is written to disk,
        # so user-chosen file names can neither overwrite files in the working
        # directory nor collide between sessions, and no cleanup is needed.
        pdfs_content_list = [
            get_pdf_text(io.BytesIO(pdf.getvalue())) for pdf in pdf_files
        ]
        all_text_together = " ".join(pdfs_content_list)
        logger.debug(
            "Extracted %d characters from %d PDF(s)",
            len(all_text_together),
            len(pdf_files),
        )
        several_papers = len(pdf_files) > 1

        tags = _request_field(
            create_tags, {"inputs": all_text_together}, "generated_text"
        )
        # NOTE(review): which key the summary arrives under presumably depends
        # on the pipeline type the model is served with; both are accepted
        # here -- confirm against the Inference API reply for this model.
        summary = _request_field(
            summarize_text,
            {"inputs": "Summarize: " + all_text_together},
            "summary_text",
            "generated_text",
        )

        col1, col2 = st.columns(2)

        with col1:
            if summary is not None:
                if several_papers:
                    st.markdown("#### Summary of your paper(s):")
                else:
                    st.markdown("#### Summary of your paper:")
                st.write(summary)
            else:
                st.markdown("#### Summary currently unavailable.")

        with col2:
            if tags is not None:
                if several_papers:
                    st.markdown("#### Identified topics of your paper(s):")
                else:
                    st.markdown("#### Identified topics of your paper:")
                st.write(tags)
            else:
                st.markdown("#### Topics currently unavailable")

        with st.expander("See your total text"):
            st.write(all_text_together)

with tab_general_topics:
    html("", height=10)
    st.header("See the status of a research topic through a summary of the most cited papers")
    st.selectbox("Select a research topic", ["Artificial Intelligence", "Sustainability", "Cooking"])