AgentReview / agentreview /dataset /download_openreview_paper.py
USTC975's picture
create the app
43c34cc
"""
Download all papers from one year of the ICLR conference using the OpenReview API.
This script downloads every paper PDF and its corresponding metadata for the
ICLR year encoded in `args.conference` (e.g. ICLR2023) using the OpenReview API.
Alternative methods to download can be found in this
[colab notebook](https://colab.research.google.com/drive/1vXXNxn8lnO3j1dgoidjybbKIN0DW0Bt2),
though it's not used here.
"""
import glob
import json
import os
import time
import requests
from agentreview.arguments import parse_args
try:
import openreview
except ImportError:
raise ImportError("Please install openreview package using `pip install openreview-py`")
def _existing_paper_ids(papers_directory):
    """Return the numeric IDs of the PDFs already stored in *papers_directory*."""
    paper_ids = set()
    for pdf_path in glob.glob(os.path.join(papers_directory, "*.pdf")):
        stem = os.path.basename(pdf_path).split(".pdf")[0]
        try:
            paper_ids.add(int(stem))
        except ValueError:
            # A stray PDF that is not named after a paper number must not
            # abort the whole download run.
            continue
    return paper_ids


def _fetch_pdf_and_notes(note, pdf_link, paper_path, note_path, max_tries=10, timeout=60):
    """Download one PDF and save the note's metadata; return True on success.

    The PDF is first written to a ``.part`` file and only renamed to
    *paper_path* once the metadata JSON has been written as well. A final PDF
    on disk therefore always has matching metadata, and an interrupted run
    never leaves a truncated PDF that a later run would skip.
    """
    partial_path = f"{paper_path}.part"
    for tries in range(max_tries):
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(pdf_link, timeout=timeout)
            if response.status_code == 200:
                with open(partial_path, "wb") as pdf_file:
                    pdf_file.write(response.content)
                # Save metadata as JSON, which contains the reviews, rebuttals, and decisions.
                with open(note_path, "w", encoding="utf-8") as note_file:
                    json.dump(note.to_json(), note_file, indent=2)
                os.replace(partial_path, paper_path)
                print(f"PDF downloaded successfully as {paper_path}")
                return True
            print(f"Attempt {tries} failed. Status code: {response.status_code}")
            if response.status_code == 429:  # Too many requests
                print("Too many requests. Sleeping for 10 seconds.")
                time.sleep(10)
        except Exception as e:
            # Best effort: any failure (network, disk, serialization) only
            # costs one attempt rather than aborting the run.
            print(f"Attempt {tries} failed with error: {e}")
    if os.path.exists(partial_path):
        os.remove(partial_path)
    return False


def download_papers(args):
    """Download all papers of one ICLR conference year using the OpenReview API.

    The function reads the OpenReview credentials from environment variables,
    pages through the blind submissions of the requested year, and for every
    paper that is not on disk yet downloads the PDF and saves the note's
    metadata as JSON.

    Args:
        args: Object with a ``data_dir`` attribute (root output directory) and
            a ``conference`` attribute containing "ICLR" followed by the year.
            PDFs go to ``<data_dir>/<conference>/paper`` and metadata to
            ``<data_dir>/<conference>/notes``.

    Raises:
        AssertionError: If the OPENREVIEW_USERNAME or OPENREVIEW_PASSWORD environment
            variables are not set.
        AssertionError: If the conference argument is not for ICLR.
        ValueError: If the text after "ICLR" in the conference argument is not
            an integer year.
    """
    openreview_username = os.environ.get("OPENREVIEW_USERNAME")
    openreview_password = os.environ.get("OPENREVIEW_PASSWORD")

    # Raised explicitly (rather than via `assert`) so the checks survive
    # `python -O`; the exception type callers see is unchanged.
    if openreview_username is None:
        raise AssertionError(
            "Please set your OpenReview username through the OPENREVIEW_USERNAME environment variable."
        )
    if openreview_password is None:
        raise AssertionError(
            "Please set your OpenReview password through the OPENREVIEW_PASSWORD environment variable."
        )

    # Validate the arguments before logging in, so a bad conference name
    # fails fast without any network traffic.
    if "ICLR" not in args.conference:
        raise AssertionError("Only works for ICLR conferences!")
    year = int(args.conference.split("ICLR")[-1])  # Only works for ICLR currently

    client = openreview.Client(
        baseurl='https://api.openreview.net',
        username=openreview_username,
        password=openreview_password
    )

    page_size = 1000
    offset = 0
    papers_directory = os.path.join(args.data_dir, args.conference, "paper")
    notes_directory = os.path.join(args.data_dir, args.conference, "notes")

    # Create directories if they don't exist
    for path in [papers_directory, notes_directory]:
        os.makedirs(path, exist_ok=True)

    # Scan the disk once; papers fetched during this run are added as we go.
    existing_paper_ids = _existing_paper_ids(papers_directory)

    while True:
        # Fetch submissions with pagination
        notes = client.get_notes(
            invitation=f'ICLR.cc/{year}/Conference/-/Blind_Submission',
            details='all',
            offset=offset,
            limit=page_size
        )
        if not notes:
            break  # Exit if no more notes are available

        for note in notes:
            paper_id = note.number
            paper_path = os.path.join(papers_directory, f"{paper_id}.pdf")
            note_path = os.path.join(notes_directory, f"{paper_id}.json")

            # Skip existing papers
            if paper_id in existing_paper_ids:
                print(f"Paper {paper_id} already downloaded.")
                continue

            print(f"Title: {note.content.get('title', 'N/A')}")
            print(f"Abstract: {note.content.get('abstract', 'N/A')}")
            print(f"TL;DR: {note.content.get('TL;DR', 'N/A')}")
            pdf_link = f"https://openreview.net/pdf?id={note.id}"
            print(f"PDF Link: {pdf_link}")

            # Attempt to download the paper PDF, retry if fails
            if _fetch_pdf_and_notes(note, pdf_link, paper_path, note_path):
                existing_paper_ids.add(paper_id)
            else:
                print(f"Giving up on paper {paper_id}; it will be retried on the next run.")

        offset += page_size
if __name__ == "__main__":
    # Script entry point: parse the command-line arguments and hand them
    # straight to the downloader.
    download_papers(parse_args())