'''
# Web Scraping
[@dwancin on HuggingFace](https://huggingface.co/spaces/dwancin/web-scraping)
'''
import os, re, requests, uuid, zipfile, hashlib, shutil
import gradio as gr
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
# Function to validate URLs
def validator(url):
parsed = urlparse(url)
return bool(parsed.netloc) and bool(parsed.scheme)
# Function to find files on webpage
def finder(url, soup, media_type):
files = []
# find image files
if media_type == "image":
        extensions = ['jpg', 'jpeg', 'png', 'svg', 'gif', 'webp', 'tiff', 'psd', 'eps', 'ai', 'indd', 'raw']
        for tag in soup.find_all('img'):
            file = tag.get('src')
            # skip <img> tags without a src, and sources whose URL lacks a known image extension
            if file and any(ext in file for ext in extensions):
                file_url = file
                if not validator(file_url):
                    file_url = urljoin(url, file_url)
                files.append(file_url)
# find text
elif media_type == "text":
        # HTML tags whose text content should be collected
        text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong']
for tag in text_tags:
for element in soup.find_all(tag):
files.append(element.get_text())
# find links
else:
for link in soup.find_all('a'):
            file = link.get('href')
            # skip anchors without an href; keep links whose URL mentions the requested media type
            if file and media_type in file:
                file_url = file
                if not validator(file_url):
                    file_url = urljoin(url, file_url)
                files.append(file_url)
return files
# Function to download the files
def downloader(urls, folder_name):
os.makedirs(folder_name, exist_ok=True)
for i, url in enumerate(urls):
response = requests.get(url, stream=True)
        # take the extension from the URL, dropping any query-string parameters
        file_extension = url.split(".")[-1].split("?")[0].split("&")[0]
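        # name each file from an MD5 hash of its URL plus a short random suffix to avoid collisions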
url_hash = hashlib.md5(url.encode()).hexdigest()
unique_id = str(uuid.uuid4())[:8]
file_name = f'{url_hash}-{unique_id}.{file_extension}'
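        # truncate and sanitize the name so it is a safe filename on common filesystems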
file_name = file_name[:255]
file_name = re.sub(r'[\\/:"*?<>|]+', '_', file_name)
with open(f'{folder_name}/{file_name}', 'wb') as out_file:
out_file.write(response.content)
print(f"Downloaded file: {file_name}")
# Function to create zip file
def zipper(folder_name):
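    # only build an archive when the folder contains at least one downloaded file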
if os.listdir(folder_name):
with zipfile.ZipFile(f'{folder_name}.zip', 'w') as zipf:
for file in os.listdir(folder_name):
zipf.write(f'{folder_name}/{file}')
return f'{folder_name}.zip'
else:
return ""
# Function to access website
def scrapper(url, images=False, text=False):
try:
response = requests.get(url, timeout=10)
response.raise_for_status()
    except requests.exceptions.HTTPError:
        # re-raise so checker() can report status-specific messages (e.g. 403 Forbidden)
        raise
    except (requests.exceptions.RequestException, ValueError):
        raise gr.Error(f"Unable to access URL: {url}")
soup = BeautifulSoup(response.content, 'html.parser')
    # Clear out any previous folder data
if images:
shutil.rmtree('images', ignore_errors=True)
if text:
shutil.rmtree('text', ignore_errors=True)
# Add images to the image folder
if images:
image_urls = finder(url, soup, 'image')
os.makedirs('images', exist_ok=True)
if image_urls:
downloader(image_urls, 'images')
else:
raise gr.Error("Found no images.")
# Add text files to the text folder
if text:
text_content = finder(url, soup, 'text')
os.makedirs('text', exist_ok=True)
if text_content:
            with open('text/content.txt', 'w', encoding='utf-8') as text_file:
for line in text_content:
text_file.write(line + '\n')
# Output folder(s) as zip files
images_zip_file, text_zip_file = None, None
if images and os.path.exists('images') and os.listdir('images'):
images_zip_file = zipper('images')
if text and os.path.exists('text') and os.listdir('text'):
text_zip_file = zipper('text')
return images_zip_file, text_zip_file
# Function to validate inputs and surface request errors
def checker(url, media_types):
if not url:
raise gr.Error("URL cannot be empty.")
if not url.startswith("https://"):
raise gr.Error("The URL must begin with https://")
if not media_types:
raise gr.Error("At least one media type must be selected.")
try:
image_file, text_file = scrapper(url, "Images" in media_types, "Text" in media_types)
except requests.exceptions.HTTPError as e:
if e.response.status_code == 403:
raise gr.Error("HTTP Error: Forbidden. Access to the URL is forbidden.")
else:
raise gr.Error(f"HTTP Error: {e.response.status_code}")
except TypeError as e:
raise gr.Error(f"TypeError: {str(e)}")
except (requests.exceptions.RequestException, ValueError):
raise gr.Error(f"Unable to access URL: {url}")
files = []
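    # raise an explicit error when a requested media type produced no downloadable content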
if "Text" in media_types and not text_file:
raise gr.Error("Found no text.")
if "Images" in media_types and not image_file:
raise gr.Error("Found no images.")
if image_file:
files.append(image_file)
if text_file:
files.append(text_file)
print(f"Returning downloaded files from {url} in {files} ...")
return files
# Gradio Interface
with gr.Blocks(theme="dwancin/theme") as app:
    title = gr.Markdown('''# Web Scraping 🕵️''')
description = gr.Markdown('''Get all media files from your desired webpages with just a few clicks.''')
with gr.Row():
with gr.Column(scale=0, min_width=480, variant="panel", elem_id="sd-panel"):
url_name = gr.Textbox(
placeholder="Enter URL here",
show_label=True,
label="Website",
)
media_types = gr.CheckboxGroup(
["Images", "Text"],
value="Images",
label="Media types",
)
submit_button = gr.Button(
"Submit",
variant="primary",
interactive=True,
)
with gr.Column(scale=2):
output_files = gr.Files(
label="Output",
elem_id="file-list",
size="lg",
show_label=False,
)
submit_button.click(
checker,
inputs=[url_name, media_types],
outputs=[output_files],
)
app.launch()