Spaces:
Runtime error
Runtime error
acecalisto3
committed on
Create background_tasks.py
Browse files- background_tasks.py +105 -0
background_tasks.py
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import threading
|
2 |
+
import time
|
3 |
+
from selenium import webdriver
|
4 |
+
from selenium.webdriver.chrome.service import Service
|
5 |
+
from selenium.webdriver.chrome.options import Options
|
6 |
+
import hashlib
|
7 |
+
import sqlite3
|
8 |
+
import csv
|
9 |
+
import os
|
10 |
+
import logging
|
11 |
+
import traceback
|
12 |
+
|
13 |
+
def create_database(db_path='/home/user/app/scraped_data/culver/culvers_changes.db'):
    """Create the SQLite ``changes`` table if it does not already exist.

    Args:
        db_path: Path of the SQLite database file. Defaults to the location
            this module has always used, so existing zero-argument callers
            are unaffected.

    Any failure is logged and its traceback printed; no exception is
    propagated to the caller.
    """
    conn = None
    try:
        # sqlite3.connect() creates the file but not missing parent folders.
        parent_dir = os.path.dirname(db_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        conn = sqlite3.connect(db_path)
        conn.execute('''CREATE TABLE IF NOT EXISTS changes
                        (id INTEGER PRIMARY KEY AUTOINCREMENT,
                         date TEXT,
                         time TEXT,
                         url TEXT,
                         change TEXT)''')
        conn.commit()
        logging.info("Database created or already exists")
    except Exception as e:
        logging.error(f"Error creating database: {e}")
        traceback.print_exc()
    finally:
        # Close on every path so a failed execute/commit does not leak the handle.
        if conn is not None:
            conn.close()
|
29 |
+
|
30 |
+
def insert_change(date, time, url, change,
                  db_path='/home/user/app/scraped_data/culver/culvers_changes.db'):
    """Append one row to the ``changes`` table.

    Args:
        date: Date string stored in the ``date`` column.
        time: Time string stored in the ``time`` column. The name shadows the
            ``time`` module inside this function only; it is kept because it
            is part of the public signature.
        url: URL the change was observed on.
        change: Free-text description of the change.
        db_path: Path of the SQLite database file. Defaults to the location
            this module has always used.

    Any failure (including a missing table) is logged and its traceback
    printed; no exception is propagated to the caller.
    """
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        # Parameterized query: values are bound, never interpolated into SQL.
        conn.execute(
            "INSERT INTO changes (date, time, url, change) VALUES (?, ?, ?, ?)",
            (date, time, url, change),
        )
        conn.commit()
        logging.info(f"Change inserted: {date} {time} {url}")
    except Exception as e:
        logging.error(f"Error inserting change: {e}")
        traceback.print_exc()
    finally:
        # Close on every path so a failed insert does not leak the handle.
        if conn is not None:
            conn.close()
|
42 |
+
|
43 |
+
def _extract_content(driver, content_type):
    """Return a string snapshot of the watched part of the loaded page."""
    if content_type == "media":
        # find_elements_by_tag_name was removed in Selenium 4.3; use the
        # locator API that matches the Service-based setup used below.
        from selenium.webdriver.common.by import By

        images = driver.find_elements(By.TAG_NAME, "img")
        # Hash the image sources, not the WebElement reprs: the reprs embed
        # per-load element ids and would report a change on every pass.
        return "\n".join(img.get_attribute("src") or "" for img in images)
    # "text" and any unrecognized content type both watch the full page source.
    return driver.page_source


def _record_change(storage_location, url):
    """Append a change record for *url* to the CSV file and the database."""
    date_time_str = time.strftime("%Y-%m-%d %H:%M:%S")
    date, time_str = date_time_str.split()
    change = "Content changed"

    with open(storage_location, "a", newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["date", "time", "url", "change"])
        writer.writerow({"date": date, "time": time_str, "url": url, "change": change})

    insert_change(date, time_str, url, change)
    logging.info(f"Change detected at {url} on {date_time_str}")


def continuous_monitoring(storage_location, urls, scrape_interval, content_type):
    """Poll *urls* forever with headless Chrome and record content changes.

    Each pass loads every URL, hashes the watched content and, when the hash
    differs from the previous pass, appends a row to the CSV file at
    *storage_location* and to the SQLite ``changes`` table. The first pass
    always records a change for every URL because the initial hash is empty.

    Args:
        storage_location: Path of the CSV file that change rows are appended to.
        urls: Iterable of URLs to monitor.
        scrape_interval: Minutes to sleep between passes.
        content_type: ``"media"`` watches the ``src`` of ``<img>`` elements;
            ``"text"`` or any other value watches the full page source.

    This function does not return under normal operation. A per-URL failure
    is logged and the loop continues; a failure outside the per-URL handler
    (e.g. the browser cannot start) is logged and ends monitoring.
    """
    create_database()

    # dirname() is "" for a bare filename, and os.makedirs("") raises.
    storage_dir = os.path.dirname(storage_location)
    if storage_dir:
        os.makedirs(storage_dir, exist_ok=True)

    previous_hashes = {url: "" for url in urls}

    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    service = Service('/usr/bin/chromedriver')

    logging.info(f"Starting continuous monitoring for URLs: {urls}")

    try:
        with webdriver.Chrome(service=service, options=options) as driver:
            while True:
                for url in urls:
                    try:
                        logging.info(f"Accessing URL: {url}")
                        driver.get(url)
                        time.sleep(2)  # Wait for the page to load

                        current_content = _extract_content(driver, content_type)
                        # MD5 is used only as a cheap change fingerprint, not for security.
                        current_hash = hashlib.md5(
                            str(current_content).encode('utf-8')
                        ).hexdigest()

                        if current_hash != previous_hashes.get(url, ""):
                            previous_hashes[url] = current_hash
                            _record_change(storage_location, url)
                        else:
                            logging.info(f"No change detected at {url}")
                    except Exception as e:
                        # Keep monitoring the remaining URLs if one fails.
                        logging.error(f"Error accessing {url}: {e}")
                        traceback.print_exc()

                logging.info(f"Sleeping for {scrape_interval} minutes")
                time.sleep(scrape_interval * 60)  # Check every scrape_interval minutes
    except Exception as e:
        logging.error(f"Error in continuous monitoring: {e}")
        traceback.print_exc()
|
99 |
+
|
100 |
+
def start_background_monitoring(storage_location, urls, scrape_interval, content_type):
    """Start ``continuous_monitoring`` on a daemon thread and return at once.

    The four arguments are forwarded unchanged to ``continuous_monitoring``.
    Because the thread is a daemon, it does not keep the process alive.
    """
    monitor_args = (storage_location, urls, scrape_interval, content_type)
    worker = threading.Thread(
        target=continuous_monitoring,
        args=monitor_args,
        daemon=True,
    )
    worker.start()
    logging.info("Background monitoring started")
|
105 |
+
|