# booking_reviews_chatgpt_summary / selenium_parser.py
# max-unfinity
# chatgpt
# 9beb013
from selenium import webdriver
from selenium.webdriver import FirefoxOptions
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
def load_driver():
    """Launch Firefox in headless mode and return the WebDriver instance.

    A progress message is printed before the browser starts and another
    one once it is up.
    """
    print("Loading driver...")
    firefox_options = FirefoxOptions()
    firefox_options.add_argument("--headless")
    browser = webdriver.Firefox(options=firefox_options)
    print("OK.")
    return browser
def parse_review(html):
    """Extract review texts, room name and stay date from one review block.

    Args:
        html: Markup of a single review item.

    Returns:
        A dict with the keys ``"positive"``, ``"negative"``, ``"room"`` and
        ``"time"``. Each value is a string, or ``None`` when the
        corresponding element is not present in the markup.

    Raises:
        ValueError: If a review row lacks its label or body span, or if the
            label is neither "Понравилось" nor "Не понравилось".
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Review text
    positive_review = None
    negative_review = None
    for row in soup.find_all("div", class_="c-review__row"):
        # Rows that contain a translation-loader span are skipped.
        if row.find("span", class_="c-review__translation-loader"):
            continue

        label_tag = row.find("span", class_="bui-u-sr-only")
        body_tag = row.find("span", class_='c-review__body')
        if label_tag is None or body_tag is None:
            # find() returns None on a miss; fail with a clear message
            # instead of an AttributeError on None.
            raise ValueError("Review row is missing its label or body span")

        delimiter = label_tag.text.strip()
        review_text = body_tag.text.strip()
        if delimiter == "Понравилось":
            positive_review = review_text
        elif delimiter == "Не понравилось":
            negative_review = review_text
        else:
            raise ValueError(f"Unexpected review row label: {delimiter!r}")

    # Room name (optional: both the info row and its body may be absent)
    room_info = soup.find('div', class_='c-review-block__room-info-row')
    room_body = (
        room_info.find('div', class_='bui-list__body')
        if room_info is not None else None
    )
    room_name = room_body.get_text(strip=True) if room_body is not None else None

    # NOTE: the review's own timestamp (span.c-review-block__date) is
    # currently not extracted.

    # Number of nights + date (optional, handled like the room name)
    stay_date_info = soup.find('ul', class_='c-review-block__stay-date')
    date_info = (
        stay_date_info.get_text(strip=True).replace(" ·", ", ")
        if stay_date_info is not None else None
    )

    return {
        "positive": positive_review,
        "negative": negative_review,
        "room": room_name,
        "time": date_info,
    }
def scrape_page(driver: webdriver.Firefox, url: str, page_count: int = 5, wait_time: int = 3):
    """Collect parsed reviews from up to ``page_count`` pages of a listing.

    Args:
        driver: Firefox WebDriver used to load and navigate the page.
        url: Address of the reviews page to open,
            e.g. "https://www.booking.com/hotel/th/queen-boutique.ru.html#tab-reviews".
        page_count: Maximum number of review pages to read.
        wait_time: Seconds to sleep on each page after the first review
            item has appeared and before reading the items.

    Returns:
        A list with one ``parse_review`` result per review item, in the
        order the items were encountered.
    """
    review_infos = []
    driver.get(url)
    for i in range(page_count):
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "review_list_new_item_block"))
        )
        # NOTE(review): presumably gives the page time to finish updating
        # after navigation, since the presence wait above can already be
        # satisfied by items of the previous page -- confirm.
        time.sleep(wait_time)

        # Remove cookie banner. Best effort: the script throws when the
        # banner element does not exist, which is fine to ignore.
        try:
            driver.execute_script("return document.getElementById('onetrust-banner-sdk').remove();")
        except Exception:
            pass

        for elem in driver.find_elements(By.CLASS_NAME, "review_list_new_item_block"):
            review_infos.append(parse_review(elem.get_attribute('outerHTML')))
        print(f"Done page {i+1} of {page_count}")

        # No navigation is needed once the last requested page was read.
        if i + 1 == page_count:
            break
        # find_elements returns an empty list instead of raising, so a
        # listing with fewer pages than requested ends the loop cleanly
        # and keeps the reviews gathered so far.
        next_buttons = driver.find_elements(By.CLASS_NAME, "pagenext")
        if not next_buttons:
            break
        next_buttons[0].click()
    return review_infos
def extract_url(url: str) -> str:
    """Return ``url`` rewritten to point at the reviews tab.

    The query string and any fragment already present are removed, then
    ``#tab-reviews`` is appended. Dropping an existing fragment keeps the
    function idempotent: passing in a URL that already ends in
    ``#tab-reviews`` returns it unchanged instead of doubling the fragment.

    Args:
        url: Page address, with or without query string and fragment.

    Returns:
        The address up to (not including) the first ``#`` or ``?``,
        followed by ``#tab-reviews``.
    """
    # Cut the fragment first so a "?" inside the fragment is dropped too.
    base, _, _ = url.partition("#")
    base, _, _ = base.partition("?")
    return base + "#tab-reviews"