File size: 4,888 Bytes
43c34cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
"""
Read papers from a PDF file and extract the title, abstract, figures and tables captions, and main content. These
functions work best with ICLR / NeurIPS papers.

"""

from collections import deque
from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage


def extract_text_from_pdf(path: str) -> str:
    """Extracts text from a PDF file.

    Every page of the PDF is run through a pdfminer ``TextConverter`` that
    writes into an in-memory buffer. Form feed characters (``\\x0c``) in the
    result are replaced with newlines.

    Args:
        path (str): A string specifying the path to the PDF file.

    Returns:
        A string containing the extracted text from the PDF.
    """

    # Shared resources used by both the converter and the interpreter.
    resource_manager = PDFResourceManager()

    # The ``with`` block closes the file and the text buffer even if a page
    # fails to parse.
    with open(path, 'rb') as file_handle, StringIO() as text_output:
        converter = TextConverter(resource_manager, text_output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(resource_manager, converter)

            # Process each page in the PDF.
            for page in PDFPage.get_pages(file_handle, caching=True, check_extractable=True):
                interpreter.process_page(page)

            # Read the text out while the buffer is still open.
            extracted_text = text_output.getvalue()
        finally:
            # Finalize the converter before the buffer it writes to is closed.
            converter.close()

    # Replace form feed characters with newlines.
    return extracted_text.replace('\x0c', '\n')


def _pop_until_blank(lines) -> str:
    """Pops lines from the front of ``lines`` up to (not including) the next empty line.

    Args:
        lines (deque): the remaining lines of the paper; consumed in place.

    Returns:
        The popped lines concatenated, each followed by a single space.
    """
    parts = []
    # The emptiness check lets a caption that is the very last thing in the
    # text end cleanly instead of indexing past the end of the input.
    while lines and lines[0] != "":
        parts.append(lines.popleft() + ' ')
    return "".join(parts)


def convert_text_into_dict(text: str) -> dict:
    """Converts the extracted text into a dictionary.

    The text is consumed line by line, in this order: leading blank lines,
    the title (up to the first empty line), everything up to an "abstract"
    line (skipped), the abstract (up to an "introduction" line), the main
    content with figure / table captions pulled out (up to a line starting
    with "references"), and finally the appendix (from the first all-uppercase
    line starting with "A" to the end).

    Args:
        text (str): the extracted text from the PDF.

    Returns:
        A dict with the keys "Title", "Abstract", "Figures/Tables Captions",
        "Main Content" and "Appendix", each mapping to a string.

    Raises:
        IndexError: if the text runs out before the title is terminated by an
            empty line, or before an "abstract" or "introduction" line is found.
    """

    # Conference boilerplate lines that carry no paper content.
    boilerplate_prefixes = ('Under review', 'Published as', 'Paper under double-blind review')

    # A deque gives O(1) removal from the front; the parser only ever consumes
    # lines from the head.
    filtered_lines = deque(line for line in text.split('\n')
                           if not line.startswith(boilerplate_prefixes))

    # Remove the first few empty lines before the title
    while filtered_lines[0].strip() == "":
        filtered_lines.popleft()

    # Get title: every line up to the first empty line
    title_parts = []
    while filtered_lines[0] != "":
        title_parts.append(filtered_lines.popleft())
    title = ' '.join(title_parts).strip().capitalize()

    # Remove the author information between the title and the abstract,
    # then drop the "abstract" heading itself
    while filtered_lines[0].lower() != "abstract":
        filtered_lines.popleft()
    filtered_lines.popleft()

    # Get abstract. The "introduction" heading is left in place and becomes
    # the first line of the main content.
    abstract_parts = []
    while filtered_lines[0].lower() != "introduction":
        abstract_parts.append(filtered_lines.popleft())
    abstract = ' '.join(abstract_parts)

    main_parts = []
    figures_captions = []
    tables_captions = []

    while filtered_lines and not filtered_lines[0].lower().startswith("references"):
        lowered = filtered_lines[0].lower()

        if lowered.startswith("figure"):
            figures_captions.append(_pop_until_blank(filtered_lines))

        # Compare against lowercase "table": the line has already been
        # lowercased, so a capitalized prefix could never match.
        elif lowered.startswith("table"):
            tables_captions.append(_pop_until_blank(filtered_lines))

        else:
            main_parts.append(filtered_lines.popleft())

    main_content = ' '.join(main_parts)
    captions = "\n".join(figures_captions) + "\n" + "\n".join(tables_captions)

    # Skip the "References" section until the first section title in the Appendix
    # Example section title: "A ENVIRONMENT DETAILS"
    # NOTE(review): any all-uppercase line starting with "A" matches here, not
    # only appendix headings - confirm this is acceptable for the target papers.
    while filtered_lines and not (filtered_lines[0].isupper() and filtered_lines[0][0] == "A"):
        filtered_lines.popleft()

    # Everything that remains belongs to the appendix.
    appendix = ' '.join(filtered_lines)

    paper = {
        "Title": title.strip(),
        "Abstract": abstract.strip(),
        "Figures/Tables Captions": captions.strip(),
        "Main Content": main_content.strip(),
        "Appendix": appendix.strip(),
    }

    return paper


if __name__ == "__main__":
    from agentreview.utility.authentication_utils import read_and_set_openai_key
    from agentreview.review import get_lm_review

    read_and_set_openai_key()

    path = "data/rejected/6359.pdf"
    text = extract_text_from_pdf(path)

    parsed_paper = convert_text_into_dict(text)

    review_generated = get_lm_review(parsed_paper)

    print(review_generated["review_generated"])