Spaces:
Sleeping
Sleeping
File size: 4,888 Bytes
43c34cc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
"""
Read a paper from a PDF file and extract its title, abstract, figure and table captions, main content, and appendix.
These functions work best with ICLR / NeurIPS papers.
"""
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
def extract_text_from_pdf(path: str) -> str:
    """Return the text of every page of a PDF as one string.

    Args:
        path (str): Location of the PDF file to read.

    Returns:
        The text produced by pdfminer's ``TextConverter`` for all pages, with
        form feed characters (``\\x0c``) replaced by newlines.
    """
    # The resource manager is shared by the converter and the interpreter.
    manager = PDFResourceManager()
    # The converter writes the recovered text into this in-memory buffer.
    buffer = StringIO()
    device = TextConverter(manager, buffer, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(manager, device)

    with open(path, 'rb') as pdf_file:
        pages = PDFPage.get_pages(pdf_file, caching=True, check_extractable=True)
        for page in pages:
            page_interpreter.process_page(page)

        # Read the buffer before closing it; closed StringIO objects cannot be read.
        raw_text = buffer.getvalue()
        buffer.close()
        device.close()

    return raw_text.replace('\x0c', '\n')
def convert_text_into_dict(text: str) -> dict:
    """Converts the extracted text into a dictionary.

    The text is consumed top to bottom in this order: leading blank lines,
    title (up to the first empty line), author block (up to a line equal to
    "abstract"), abstract (up to a line equal to "introduction"), body (up to
    a line starting with "references"), then everything from the first
    upper-case line starting with "A" onwards as the appendix. Heading
    comparisons are case-insensitive.

    Args:
        text (str): the extracted text from the PDF.

    Returns:
        A dict with the keys "Title", "Abstract", "Figures/Tables Captions",
        "Main Content" and "Appendix", each mapped to a stripped string.
        Figure captions come first in "Figures/Tables Captions", followed by
        table captions, one caption per line.

    Raises:
        IndexError: if the text has no non-blank line, no empty line after the
            title, no line equal to "abstract", or no line equal to
            "introduction" after the abstract heading.
    """
    # Drop lines that start with one of the review/publication notice prefixes.
    notice_prefixes = ('Under review', 'Published as', 'Paper under double-blind review')
    lines = [line for line in text.split('\n') if not line.startswith(notice_prefixes)]
    total = len(lines)

    # A cursor replaces repeated list.pop(0), which is O(n) per call.
    pos = 0

    # Skip the blank lines before the title.
    while lines[pos].strip() == "":
        pos += 1

    # Title: every line up to the first empty line.
    start = pos
    while lines[pos] != "":
        pos += 1
    title = " ".join(lines[start:pos]).strip().capitalize()

    # Skip the author information between the title and the abstract heading,
    # then step over the heading itself.
    while lines[pos].lower() != "abstract":
        pos += 1
    pos += 1

    # Abstract: every line up to (not including) the introduction heading.
    start = pos
    while lines[pos].lower() != "introduction":
        pos += 1
    abstract = " ".join(lines[start:pos])

    main_parts = []
    figure_captions = []
    table_captions = []
    while pos < total and not lines[pos].lower().startswith("references"):
        lowered = lines[pos].lower()
        is_figure = lowered.startswith("figure")
        # Compare against the lower-case prefix: the line has been lower-cased,
        # so a capitalised "Table" prefix could never match.
        is_table = lowered.startswith("table")
        if is_figure or is_table:
            # A caption runs until the next empty line, or until the end of
            # the text if no empty line follows it.
            start = pos
            while pos < total and lines[pos] != "":
                pos += 1
            caption = " ".join(lines[start:pos]) + " "
            if is_figure:
                figure_captions.append(caption)
            else:
                table_captions.append(caption)
        else:
            main_parts.append(lines[pos])
            pos += 1
    main_content = " ".join(main_parts)
    captions = "\n".join(figure_captions) + "\n" + "\n".join(table_captions)

    # We are now at the "References" section (or at the end of the text).
    # Skip ahead to the first appendix section title, e.g. "A ENVIRONMENT DETAILS".
    while pos < total and not (lines[pos].isupper() and lines[pos].startswith("A")):
        pos += 1
    appendix = " ".join(lines[pos:])

    paper = {
        "Title": title.strip(),
        "Abstract": abstract.strip(),
        "Figures/Tables Captions": captions.strip(),
        "Main Content": main_content.strip(),
        "Appendix": appendix.strip(),
    }
    return paper
if __name__ == "__main__":
    # Script entry point: parse one sample paper and print a generated review.
    # The project imports are kept local so importing this module does not
    # require them.
    from agentreview.utility.authentication_utils import read_and_set_openai_key
    from agentreview.review import get_lm_review

    # Presumably configures the OpenAI API key before the review call — see
    # agentreview.utility.authentication_utils for details.
    read_and_set_openai_key()

    pdf_path = "data/rejected/6359.pdf"
    paper_fields = convert_text_into_dict(extract_text_from_pdf(pdf_path))

    # The review result is indexed by the "review_generated" key.
    lm_output = get_lm_review(paper_fields)
    print(lm_output["review_generated"])
|