# MedCleave / crawler.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os
from transformers import BertModel, BertTokenizer
import torch
import numpy as np
import faiss
from concurrent.futures import ThreadPoolExecutor
from retrying import retry
import time
from ratelimit import limits, sleep_and_retry
import threading
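# Third-party dependencies assumed by this script (the usual PyPI package names):
#   pip install requests beautifulsoup4 transformers torch numpy faiss-cpu retrying ratelimit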
# Global counter for crawled URLs
total_urls_crawled = 0
index_file = 'faiss_index.bin'  # Path of the FAISS index file on disk
# Set of visited URLs to prevent duplicates
visited_urls = set()
# Directory to save crawled URLs
urls_dir = 'crawled_urls'
os.makedirs(urls_dir, exist_ok=True)
urls_file = os.path.join(urls_dir, 'crawled_urls.txt')
# Initialize FAISS index
def initialize_faiss_index(dimension):
    # Remove any index file left over from a previous run
    if os.path.exists(index_file):
        os.remove(index_file)
        print("Deleted previous FAISS index file.")
    # IndexFlatL2 stores raw vectors and performs exact L2 (Euclidean) search
    index = faiss.IndexFlatL2(dimension)
    return index
# Initialize the FAISS index (a fresh index is created on every run)
dimension = 768  # Dimension of BERT embeddings
index = initialize_faiss_index(dimension)
# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
# Lock for thread-safe update of total_urls_crawled
lock = threading.Lock()
# Function to update and print live count of crawled URLs
def update_live_count():
    global total_urls_crawled
    while True:
        with lock:
            print(f"\rURLs crawled: {total_urls_crawled}", end='', flush=True)
        time.sleep(1)  # Update every second
# Start live count update thread
live_count_thread = threading.Thread(target=update_live_count, daemon=True)
live_count_thread.start()
# Function to save crawled URLs to a file
def save_crawled_urls(url):
    with open(urls_file, 'a') as f:
        f.write(f"{url}\n")
        f.flush()  # Flush buffer to ensure immediate write
        os.fsync(f.fileno())  # Ensure write is flushed to disk
# Function to get all links from a webpage with retry mechanism and rate limiting
@retry(stop_max_attempt_number=3, wait_fixed=2000)
@sleep_and_retry
@limits(calls=10, period=1)  # Adjust calls and period based on the website's rate limits
def get_links(url, domain):
    global total_urls_crawled
    links = []
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=50)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        for link in soup.find_all('a', href=True):
            normalized_url = normalize_url(link['href'], domain)
            if not normalized_url:
                continue
            # visited_urls and the counter are shared across worker threads,
            # so check and update them atomically under the lock
            with lock:
                if normalized_url in visited_urls:
                    continue
                visited_urls.add(normalized_url)
                total_urls_crawled += 1
            links.append(normalized_url)
            save_crawled_urls(normalized_url)  # Save crawled URL to file
        # Convert the page text to a BERT embedding and add it to the FAISS index
        try:
            text = soup.get_text()
            if text:
                embeddings = convert_text_to_bert_embeddings(text, tokenizer, model)
                # FAISS expects a 2D float32 array; index.add is not thread-safe,
                # so the update is also guarded by the lock
                with lock:
                    index.add(np.array([embeddings], dtype='float32'))
        except Exception as e:
            print(f"Error adding embeddings to FAISS index: {e}")
    except requests.HTTPError as e:
        if e.response.status_code == 404:
            print(f"HTTP 404 Error: {e}")
        else:
            print(f"HTTP error occurred: {e}")
    except requests.RequestException as e:
        print(f"Error accessing {url}: {e}")
    return links
# Function to normalize and validate URLs
def normalize_url(url, domain):
    parsed_url = urlparse(url)
    if not parsed_url.scheme:
        url = urljoin(domain, url)
    if url.startswith(domain):
        return url
    return None
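# Illustrative variant (not used by the crawler): dropping URL fragments would keep
# in-page anchors (a trailing '#section' suffix) from being treated as new pages.
# The function name is a hypothetical example, not part of the original script.
def normalize_url_without_fragment(url, domain):
    from urllib.parse import urldefrag  # local import keeps this sketch self-contained
    url, _fragment = urldefrag(urljoin(domain, url))
    return url if url.startswith(domain) else None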
# Function to recursively get all pages and collect links with retry mechanism and rate limiting
@retry(stop_max_attempt_number=3, wait_fixed=2000)
@sleep_and_retry
@limits(calls=10, period=1)  # Adjust calls and period based on the website's rate limits
def crawl_site(base_url, domain, depth=0, max_depth=10):  # Increased max_depth to 10
    if depth > max_depth:
        return []
    # Mark this page as visited; child URLs are already claimed inside get_links,
    # so the visited check is not repeated for them here.
    with lock:
        visited_urls.add(base_url)
    # get_links fetches the page once, records the newly discovered in-domain URLs
    # and returns them, so downloading base_url a second time here is unnecessary.
    links = get_links(base_url, domain)
    print(f"Crawled {len(links)} links from {base_url} at depth {depth}.")  # Debugging info
    try:
        # Crawl the newly discovered links in parallel. Each recursion level opens its
        # own pool, so max_workers is kept modest to bound the total thread count.
        child_links = []
        with ThreadPoolExecutor(max_workers=20) as executor:
            results = executor.map(lambda u: crawl_site(u, domain, depth + 1, max_depth), links)
            for result in results:
                child_links.extend(result)
        links.extend(child_links)
    except Exception as e:
        print(f"Error while crawling links from {base_url}: {e}")
    return links
# Function to convert text to BERT embeddings
def convert_text_to_bert_embeddings(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()  # Average pool last layer's output
    return embeddings
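# Illustrative sketch (not part of the original crawler): once the index has been written
# to disk it could be queried with a BERT-embedded query string. The name `search_index`
# and the default k=5 are assumptions for this example; the crawler stores no id-to-URL
# mapping, so the returned ids only reflect the order in which pages were embedded.
def search_index(query_text, k=5):
    query_embedding = convert_text_to_bert_embeddings(query_text, tokenizer, model)
    saved_index = faiss.read_index(index_file)
    # FAISS expects a 2D float32 array of shape (n_queries, dimension)
    distances, ids = saved_index.search(np.array([query_embedding], dtype='float32'), k)
    return distances, ids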
# Main process
def main():
    global total_urls_crawled
    domain = 'https://go.drugbank.com/'  # Replace with your target domain
    start_url = 'https://go.drugbank.com/drugs/DB00001'  # Replace with your starting URL
    try:
        # Save the FAISS index at the beginning of the execution
        faiss.write_index(index, index_file)
        print("Initial FAISS index saved.")
        urls = crawl_site(start_url, domain)
        print(f"\n\nFound {total_urls_crawled} URLs.")
        # Save the FAISS index at the end of execution
        faiss.write_index(index, index_file)
        print("Final FAISS index saved.")
    except Exception as e:
        print(f"Exception encountered: {e}")

if __name__ == "__main__":
    main()
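# Usage note: running `python crawler.py` crawls the configured domain, appends each
# discovered URL to crawled_urls/crawled_urls.txt, and writes the BERT-embedding FAISS
# index to faiss_index.bin in the working directory.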