# AgentReview / agentreview/paper_processor.py
# Yiqiao Jin
# Update demo
# 53709ed
"""
Read a paper from a PDF file and extract its title, abstract, figure and table captions, main content, and
appendix. These functions work best with ICLR / NeurIPS papers.
"""
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
def extract_text_from_pdf(path: str) -> str:
    """Extracts text from a PDF file.

    Args:
        path (str): A string specifying the path to the PDF file.

    Returns:
        A string containing the text of all processed pages, with form feed
        characters replaced by newlines.
    """
    # Both the file handle and the in-memory buffer are context-managed so they
    # are released even when pdfminer raises while reading or processing a page.
    with open(path, 'rb') as file_handle, StringIO() as text_output:
        # The resource manager stores resources shared between pages.
        resource_manager = PDFResourceManager()

        # The converter writes the text of every processed page into text_output.
        converter = TextConverter(resource_manager, text_output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(resource_manager, converter)

            # Process each page in the PDF.
            for page in PDFPage.get_pages(file_handle, caching=True, check_extractable=True):
                interpreter.process_page(page)

            # Read the buffer before it is closed on leaving the `with` block.
            extracted_text = text_output.getvalue()
        finally:
            # Finalize the converter on success and on failure alike.
            converter.close()

    # Replace form feed characters with newlines.
    return extracted_text.replace('\x0c', '\n')
def _consume_caption(lines: list, start: int) -> tuple:
    """Collect a caption starting at ``lines[start]`` and running to the next empty line (or the end).

    Returns:
        A ``(caption, next_index)`` tuple. Every caption line is followed by a single space;
        ``next_index`` points at the terminating empty line, or equals ``len(lines)`` if the
        caption runs to the end of the text.
    """
    end = start
    while end < len(lines) and lines[end] != "":
        end += 1
    caption = "".join(line + ' ' for line in lines[start:end])
    return caption, end


def convert_text_into_dict(text: str) -> dict:
    """Converts the extracted text into a dictionary.

    The text is scanned line by line, in order:

    1. Lines starting with 'Under review', 'Published as' or 'Paper under double-blind review'
       are dropped.
    2. The title is the first run of non-empty lines; it is joined with spaces and passed
       through ``str.capitalize()``.
    3. Everything up to and including a line equal to "abstract" (case-insensitive) is skipped.
    4. The abstract is everything up to a line equal to "introduction" (case-insensitive).
    5. Up to a line starting with "references" (case-insensitive), lines starting with "figure"
       or "table" (case-insensitive) begin a caption that runs to the next empty line; all other
       lines form the main content.
    6. The appendix starts at the first subsequent upper-case line beginning with "A"
       (e.g. "A ENVIRONMENT DETAILS") and runs to the end of the text.

    A section whose marker is never found is returned as an empty string; no exception is raised
    for empty or truncated input.

    Args:
        text (str): the extracted text from the PDF.

    Returns:
        A dict with the string-valued keys "Title", "Abstract", "Figures/Tables Captions",
        "Main Content" and "Appendix".
    """
    # Running headers/footers repeated on each page; str.startswith accepts a tuple of prefixes.
    boilerplate_prefixes = ('Under review', 'Published as', 'Paper under double-blind review')
    lines = [line for line in text.split('\n') if not line.startswith(boilerplate_prefixes)]
    total = len(lines)

    # A cursor replaces repeated list.pop(0), which is O(n) per call.
    pos = 0

    # Skip the blank lines before the title.
    while pos < total and lines[pos].strip() == "":
        pos += 1

    # Title: consecutive lines up to the first empty line.
    title_start = pos
    while pos < total and lines[pos] != "":
        pos += 1
    title = " ".join(lines[title_start:pos]).strip().capitalize()

    # Skip the author information between the title and the abstract, then the heading itself.
    while pos < total and lines[pos].lower() != "abstract":
        pos += 1
    pos += 1

    # Abstract: everything up to the "Introduction" heading. The heading line itself is not
    # consumed here, so it becomes the first line of the main content.
    abstract_start = pos
    while pos < total and lines[pos].lower() != "introduction":
        pos += 1
    abstract = " ".join(lines[abstract_start:pos])

    # Main body: separate figure/table captions from the running text until "References".
    main_parts = []
    figure_captions = []
    table_captions = []
    while pos < total and not lines[pos].lower().startswith("references"):
        lowered = lines[pos].lower()
        if lowered.startswith("figure"):
            caption, pos = _consume_caption(lines, pos)
            figure_captions.append(caption)
        elif lowered.startswith("table"):
            # The prefix must be lower-case because it is compared against the lower-cased line.
            caption, pos = _consume_caption(lines, pos)
            table_captions.append(caption)
        else:
            main_parts.append(lines[pos])
            pos += 1
    captions = "\n".join(figure_captions) + "\n" + "\n".join(table_captions)

    # Skip the references: advance to the first section title in the Appendix.
    # Example section title: "A ENVIRONMENT DETAILS"
    while pos < total and not (lines[pos].isupper() and lines[pos].startswith("A")):
        pos += 1
    appendix = " ".join(lines[pos:])

    return {
        "Title": title.strip(),
        "Abstract": abstract.strip(),
        "Figures/Tables Captions": captions.strip(),
        "Main Content": " ".join(main_parts).strip(),
        "Appendix": appendix.strip(),
    }
if __name__ == "__main__":
    # Demo entry point: parse one PDF and print the review generated for it.
    # The project imports are kept local so importing this module does not need them.
    from agentreview.utility.authentication_utils import read_and_set_openai_key
    from agentreview.review import get_lm_review

    # Set the OpenAI key before requesting a review.
    read_and_set_openai_key()

    pdf_path = "data/rejected/6359.pdf"

    # PDF -> raw text -> dictionary of paper fields.
    paper_fields = convert_text_into_dict(extract_text_from_pdf(pdf_path))

    review = get_lm_review(paper_fields)
    print(review["review_generated"])