File size: 3,019 Bytes
4c404f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5cec537
4c404f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9beb013
4c404f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9beb013
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from selenium import webdriver
from selenium.webdriver import FirefoxOptions
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time


def load_driver():
    """Start a headless Firefox WebDriver session and return it.

    Progress is reported on stdout before and after the browser is launched.

    Returns:
        The newly created ``webdriver.Firefox`` instance.
    """
    print("Loading driver...")

    # Run Firefox without a visible window.
    firefox_options = FirefoxOptions()
    firefox_options.add_argument("--headless")

    browser = webdriver.Firefox(options=firefox_options)
    print("OK.")
    return browser


def parse_review(html):
    """Extract review texts, room name and stay info from one review block.

    Args:
        html: HTML markup of a single review element.

    Returns:
        A dict with the keys ``"positive"``, ``"negative"``, ``"room"`` and
        ``"time"``. Each value is the stripped text found in the markup, or
        ``None`` when the corresponding element is absent.

    Raises:
        ValueError: If a review row lacks its label or body span, or if its
            label is neither "Понравилось" nor "Не понравилось".
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Review text: every row has a screen-reader-only label telling whether
    # the text is the "liked" or the "disliked" part of the review.
    positive_review = None
    negative_review = None
    for row in soup.find_all("div", class_="c-review__row"):
        # Rows containing a translation loader are skipped entirely.
        if row.find("span", class_="c-review__translation-loader"):
            continue

        label = row.find("span", class_="bui-u-sr-only")
        body = row.find("span", class_='c-review__body')
        if label is None or body is None:
            raise ValueError("Review row is missing its label or body span")

        delimiter = label.text.strip()
        review_text = body.text.strip()
        if delimiter == "Понравилось":  # Russian for "Liked"
            positive_review = review_text
        elif delimiter == "Не понравилось":  # Russian for "Disliked"
            negative_review = review_text
        else:
            raise ValueError(f"Unexpected review row label: {delimiter!r}")

    # Room name: optional, and the inner body element may be missing as well.
    room_name = None
    room_info = soup.find('div', class_='c-review-block__room-info-row')
    if room_info is not None:
        room_body = room_info.find('div', class_='bui-list__body')
        if room_body is not None:
            room_name = room_body.get_text(strip=True)

    # Number of nights + date: optional, guarded the same way as the room.
    date_info = None
    stay_date_info = soup.find('ul', class_='c-review-block__stay-date')
    if stay_date_info is not None:
        date_info = stay_date_info.get_text(strip=True).replace(" ·", ", ")

    return {
        "positive": positive_review,
        "negative": negative_review,
        "room": room_name,
        "time": date_info,
    }


def scrape_page(driver: webdriver.Firefox, url: str, page_count: int = 5, wait_time: int = 3):
    """Collect parsed reviews from up to ``page_count`` pages of a review list.

    Args:
        driver: Firefox WebDriver used to load and navigate the pages.
        url: Address of the page to open first.
        page_count: Maximum number of review pages to read.
        wait_time: Seconds to sleep after the review list is detected on
            each page (presumably to let the list finish rendering).

    Returns:
        A list with one ``parse_review`` result per review element found.
        If the "next page" control is missing before ``page_count`` pages
        were read, the reviews collected so far are returned.
    """
    # Local import: these exception types are only needed by this function.
    from selenium.common.exceptions import NoSuchElementException, WebDriverException

    review_infos = []

    driver.get(url)

    for i in range(page_count):
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "review_list_new_item_block")))
        time.sleep(wait_time)

        # Remove cookie banner. Best effort: the script fails when the banner
        # is not in the DOM, which is fine - only WebDriver errors are ignored.
        try:
            driver.execute_script("return document.getElementById('onetrust-banner-sdk').remove();")
        except WebDriverException:
            pass

        elems = driver.find_elements(By.CLASS_NAME, "review_list_new_item_block")
        review_infos.extend(parse_review(elem.get_attribute('outerHTML')) for elem in elems)

        print(f"Done page {i+1} of {page_count}")

        # Nothing left to load after the last requested page.
        if i == page_count - 1:
            break

        # Fewer pages than requested: keep what was collected instead of
        # failing on the missing "next" control.
        try:
            pagenext = driver.find_element(By.CLASS_NAME, "pagenext")
        except NoSuchElementException:
            print(f"No next page after page {i+1}; stopping.")
            break
        pagenext.click()

    return review_infos


def extract_url(url: str) -> str:
    """Return ``url`` reduced to its base address plus ``#tab-reviews``.

    Any existing fragment (everything from the first ``#``) and any query
    string (everything from the first ``?``) are removed before the
    ``#tab-reviews`` fragment is appended, so a URL that already ends in
    ``#tab-reviews`` is returned unchanged rather than with a doubled fragment.

    Args:
        url: Page address, with or without query string and fragment.

    Returns:
        The address without query string or fragment, followed by
        ``#tab-reviews``.
    """
    # Drop the fragment first: a "?" inside a fragment is not a query string.
    base = url.split("#", 1)[0]
    base = base.split("?", 1)[0]
    return base + "#tab-reviews"