'''
# Web Scraping
[@dwancin on HuggingFace](https://huggingface.co/spaces/dwancin/web-scraping)
'''

import hashlib
import os
import re
import shutil
import uuid
import zipfile
from urllib.parse import urljoin, urlparse

import gradio as gr
import requests
from bs4 import BeautifulSoup


# Function to validate URLs
def validator(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)


# Function to find files on a webpage
def finder(url, soup, media_type):
    files = []

    # find image files
    if media_type == "image":
        extensions = ['jpg', 'jpeg', 'png', 'svg', 'gif', 'webp',
                      'tiff', 'psd', 'eps', 'ai', 'indd', 'raw']
        for tag in soup.find_all('img'):
            file = tag.get('src')
            if not file:
                continue  # skip <img> tags without a src attribute
            if any(ext in file for ext in extensions):
                file_url = file
                if not validator(file_url):
                    file_url = urljoin(url, file_url)
                files.append(file_url)

    # find text
    elif media_type == "text":
        # Note: entries like 'pdf' or 'txt' are file extensions, not HTML tags,
        # so soup.find_all() would never match them; only real tags are listed.
        text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong']
        for tag in text_tags:
            for element in soup.find_all(tag):
                files.append(element.get_text())

    # find links
    else:
        for link in soup.find_all('a'):
            file = link.get('href')
            if not file:
                continue  # skip <a> tags without an href attribute
            if media_type in file:
                file_url = file
                if not validator(file_url):
                    file_url = urljoin(url, file_url)
                files.append(file_url)

    return files


# Function to download the files
def downloader(urls, folder_name):
    os.makedirs(folder_name, exist_ok=True)
    for url in urls:
        try:
            response = requests.get(url, stream=True, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException:
            print(f"Skipping file that could not be downloaded: {url}")
            continue
        # Derive the extension from the URL, ignoring any query string
        file_extension = url.split(".")[-1].split("?")[0].split("&")[0]
        # Hash the URL so the same file always gets the same name prefix
        url_hash = hashlib.md5(url.encode()).hexdigest()
        unique_id = str(uuid.uuid4())[:8]
        file_name = f'{url_hash}-{unique_id}.{file_extension}'
        file_name = file_name[:255]
        file_name = re.sub(r'[\\/:"*?<>|]+', '_', file_name)
        with open(f'{folder_name}/{file_name}', 'wb') as out_file:
            out_file.write(response.content)
        print(f"Downloaded file: {file_name}")


# Function to create a zip file
def zipper(folder_name):
    if os.listdir(folder_name):
        with zipfile.ZipFile(f'{folder_name}.zip', 'w') as zipf:
            for file in os.listdir(folder_name):
                zipf.write(f'{folder_name}/{file}')
        return f'{folder_name}.zip'
    else:
        return ""


# Function to access the website
def scrapper(url, images=False, text=False):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        # Re-raise so checker() can report the specific status code
        raise
    except (requests.exceptions.RequestException, ValueError):
        raise gr.Error(f"Unable to access URL: {url}")
    soup = BeautifulSoup(response.content, 'html.parser')

    # Clear any previous results
    if images:
        shutil.rmtree('images', ignore_errors=True)
    if text:
        shutil.rmtree('text', ignore_errors=True)

    # Add images to the image folder
    if images:
        image_urls = finder(url, soup, 'image')
        os.makedirs('images', exist_ok=True)
        if image_urls:
            downloader(image_urls, 'images')
        else:
            raise gr.Error("Found no images.")

    # Add text files to the text folder
    if text:
        text_content = finder(url, soup, 'text')
        os.makedirs('text', exist_ok=True)
        if text_content:
            with open('text/content.txt', 'w', encoding='utf-8') as text_file:
                for line in text_content:
                    text_file.write(line + '\n')

    # Output folder(s) as zip files
    images_zip_file, text_zip_file = None, None
    if images and os.path.exists('images') and os.listdir('images'):
        images_zip_file = zipper('images')
    if text and os.path.exists('text') and os.listdir('text'):
        text_zip_file = zipper('text')
    return images_zip_file, text_zip_file
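# --- Optional usage sketch (an addition, not part of the original app) ---
# A minimal way to exercise scrapper() outside the Gradio UI, e.g. from a
# REPL or a quick smoke test. The helper name `demo_scrape` and the example
# URL are illustrative assumptions.
def demo_scrape(url="https://example.com"):
    """Scrape `url` for images and text; return the resulting zip paths."""
    images_zip, text_zip = scrapper(url, images=True, text=True)
    print(f"images zip: {images_zip}, text zip: {text_zip}")
    return images_zip, text_zip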
# Function to surface request errors to the UI
def checker(url, media_types):
    if not url:
        raise gr.Error("URL cannot be empty.")
    if not url.startswith("https://"):
        raise gr.Error("The URL must begin with https://")
    if not media_types:
        raise gr.Error("At least one media type must be selected.")
    try:
        image_file, text_file = scrapper(url, "Images" in media_types, "Text" in media_types)
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            raise gr.Error("HTTP Error: Forbidden. Access to the URL is forbidden.")
        else:
            raise gr.Error(f"HTTP Error: {e.response.status_code}")
    except TypeError as e:
        raise gr.Error(f"TypeError: {str(e)}")
    except (requests.exceptions.RequestException, ValueError):
        raise gr.Error(f"Unable to access URL: {url}")

    files = []
    if "Text" in media_types and not text_file:
        raise gr.Error("Found no text.")
    if "Images" in media_types and not image_file:
        raise gr.Error("Found no images.")
    if image_file:
        files.append(image_file)
    if text_file:
        files.append(text_file)
    print(f"Returning downloaded files from {url} in {files} ...")
    return files


# Gradio Interface
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as app:
    title = gr.Markdown('''# Web Scraping 🕵️''')
    description = gr.Markdown('''Get all media files from your desired webpages with just a few clicks.''')

    with gr.Row():
        with gr.Column(scale=0, min_width=480, variant="panel", elem_id="sd-panel"):
            url_name = gr.Textbox(
                placeholder="Enter URL here",
                show_label=True,
                label="Website",
            )
            media_types = gr.CheckboxGroup(
                ["Images", "Text"],
                value=["Images"],
                label="Media types",
            )
            submit_button = gr.Button(
                "Submit",
                variant="primary",
                interactive=True,
            )
        with gr.Column(scale=2):
            output_files = gr.Files(
                label="Output",
                elem_id="file-list",
                size="lg",
                show_label=False,
            )

    submit_button.click(
        checker,
        inputs=[url_name, media_types],
        outputs=[output_files],
    )

app.launch()
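# Note: launch() serves the app locally by default. For a temporary public
# link (e.g. when running outside HuggingFace Spaces), one could instead call
# app.launch(share=True) -- an optional tweak, not part of the original app.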