Spaces:
Runtime error
Runtime error
from selenium import webdriver | |
from selenium.webdriver import FirefoxOptions | |
from selenium.webdriver.common.keys import Keys | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support.wait import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC | |
from bs4 import BeautifulSoup | |
import time | |
def load_driver():
    """Start a headless Firefox session and return its WebDriver.

    Progress is reported on stdout before and after the browser is started.
    """
    print("Loading driver...")
    firefox_options = FirefoxOptions()
    firefox_options.add_argument("--headless")
    browser = webdriver.Firefox(options=firefox_options)
    print("OK.")
    return browser
def parse_review(html):
    """Extract the text fields of a single review block from its HTML.

    Args:
        html: Markup of one review item.

    Returns:
        A dict with the keys ``"positive"``, ``"negative"``, ``"room"`` and
        ``"time"``. Each value is a string, or ``None`` when the matching
        part of the markup is absent.

    Raises:
        ValueError: If a review row has no label or body span, or if its
            label is neither "Понравилось" nor "Не понравилось".
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Review text
    positive_review = None
    negative_review = None
    for row in soup.find_all("div", class_="c-review__row"):
        # Rows containing a translation loader are skipped.
        if row.find("span", class_="c-review__translation-loader"):
            continue

        label = row.find("span", class_="bui-u-sr-only")
        body = row.find("span", class_='c-review__body')
        if label is None or body is None:
            # Fail with a descriptive error instead of an AttributeError on None.
            raise ValueError("review row is missing its label or body span")

        delimiter = label.text.strip()
        review_text = body.text.strip()
        # The labels are matched against the Russian-language page text
        # ("Понравилось" = liked, "Не понравилось" = disliked).
        if delimiter == "Понравилось":
            positive_review = review_text
        elif delimiter == "Не понравилось":
            negative_review = review_text
        else:
            raise ValueError(f"unexpected review row label: {delimiter!r}")

    # Room name
    room_name = None
    room_info = soup.find('div', class_='c-review-block__room-info-row')
    if room_info is not None:
        room_body = room_info.find('div', class_='bui-list__body')
        if room_body is not None:
            room_name = room_body.get_text(strip=True)

    # Datetime of the review
    # datetime_review = soup.find('span', class_='c-review-block__date').get_text(strip=True)

    # Number of nights + date; optional, like the room name.
    date_info = None
    stay_date_info = soup.find('ul', class_='c-review-block__stay-date')
    if stay_date_info is not None:
        date_info = stay_date_info.get_text(strip=True).replace(" ·", ", ")

    return {
        "positive": positive_review,
        "negative": negative_review,
        "room": room_name,
        "time": date_info,
    }
def scrape_page(driver: webdriver.Firefox, url: str, page_count: int = 5, wait_time: int = 3):
    """Collect parsed reviews from up to ``page_count`` pages of a review list.

    Args:
        driver: Firefox WebDriver used to load the page and paginate.
        url: Address to open, e.g.
            "https://www.booking.com/hotel/th/queen-boutique.ru.html#tab-reviews".
        page_count: Maximum number of review pages to read.
        wait_time: Seconds to sleep on every page once a review block is
            present, before the reviews are read.

    Returns:
        A list with one ``parse_review`` result per review block, in the
        order the blocks were encountered. If fewer than ``page_count``
        pages exist, the reviews gathered so far are returned.
    """
    # Imported here so the block needs no change to the file-level imports.
    from selenium.common.exceptions import WebDriverException

    review_infos = []
    driver.get(url)
    for i in range(page_count):
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "review_list_new_item_block"))
        )
        # Fixed pause after the presence check (presumably lets the list
        # finish rendering after a page switch; confirm).
        time.sleep(wait_time)

        # Remove cookie banner. Best effort: the script fails when the banner
        # is not in the document, which is fine. Only WebDriver errors are
        # ignored so that KeyboardInterrupt/SystemExit still propagate.
        try:
            driver.execute_script("return document.getElementById('onetrust-banner-sdk').remove();")
        except WebDriverException:
            pass

        elems = driver.find_elements(By.CLASS_NAME, "review_list_new_item_block")
        for elem in elems:
            html = elem.get_attribute('outerHTML')
            review_infos.append(parse_review(html))
        print(f"Done page {i+1} of {page_count}")

        # Nothing left to read after the last requested page, so do not
        # navigate further.
        if i + 1 >= page_count:
            break
        # find_elements returns an empty list instead of raising, so running
        # out of pages ends the loop without losing the collected reviews.
        next_buttons = driver.find_elements(By.CLASS_NAME, "pagenext")
        if not next_buttons:
            break
        next_buttons[0].click()
    return review_infos
def extract_url(url: str) -> str:
    """Return ``url`` reduced to its base address plus ``#tab-reviews``.

    Everything from the first ``?`` onwards is dropped, and so is any
    fragment already present, so a URL that already ends in
    ``#tab-reviews`` is returned unchanged instead of getting the fragment
    appended a second time.

    Args:
        url: Page address, with or without a query string or fragment.

    Returns:
        The address without query string or fragment, followed by
        ``#tab-reviews``.
    """
    base = url.partition("?")[0]
    # Also strip an existing fragment so the result has exactly one "#...".
    base = base.partition("#")[0]
    return base + "#tab-reviews"