Nymbo dwancin committed on
Commit 3e48a1e
0 Parent(s):

Duplicate from dwancin/web-scraping


Co-authored-by: DW <[email protected]>

Files changed (4)
  1. .gitattributes +35 -0
  2. README.md +13 -0
  3. app.py +193 -0
  4. requirements.txt +12 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Web Scraping
+ emoji: 🕵️
+ colorFrom: red
+ colorTo: red
+ sdk: gradio
+ sdk_version: 3.35.2
+ app_file: app.py
+ pinned: false
+ duplicated_from: dwancin/web-scraping
+ ---
+
+ https://huggingface.co/spaces/dwancin/web-scraping
app.py ADDED
@@ -0,0 +1,193 @@
+ '''
+ # Web Scraping
+ [@dwancin on HuggingFace](https://huggingface.co/spaces/dwancin/web-scraping)
+ '''
+
+ import os, re, requests, uuid, zipfile, hashlib, shutil
+ import gradio as gr
+ from bs4 import BeautifulSoup
+ from urllib.parse import urljoin, urlparse
+
+ # Function to validate URLs
+ def validator(url):
+     parsed = urlparse(url)
+     return bool(parsed.netloc) and bool(parsed.scheme)
+
+
+ # Function to find files on a webpage
+ def finder(url, soup, media_type):
+     files = []
+
+     # Find image files
+     if media_type == "image":
+         extensions = ['jpg', 'jpeg', 'png', 'svg', 'gif', 'webp', 'tiff', 'psd', 'eps', 'ai', 'indd', 'raw']
+         for tag in soup.find_all('img'):
+             file = tag.get('src')
+             if file and any(ext in file for ext in extensions):
+                 file_url = file
+                 if not validator(file_url):
+                     file_url = urljoin(url, file_url)
+                 files.append(file_url)
+
+     # Find text
+     elif media_type == "text":
+         text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong', 'pdf', 'txt', 'doc', 'rtf', 'docx']
+         for tag in text_tags:
+             for element in soup.find_all(tag):
+                 files.append(element.get_text())
+
+     # Find links
+     else:
+         for link in soup.find_all('a'):
+             file = link.get('href')
+             if file and media_type in file:
+                 file_url = file
+                 if not validator(file_url):
+                     file_url = urljoin(url, file_url)
+                 files.append(file_url)
+
+     return files
+
+
+ # Function to download the files
+ def downloader(urls, folder_name):
+     os.makedirs(folder_name, exist_ok=True)
+     for url in urls:
+         response = requests.get(url, stream=True, timeout=10)
+         file_extension = url.split(".")[-1].split("&")[0]
+         url_hash = hashlib.md5(url.encode()).hexdigest()
+         unique_id = str(uuid.uuid4())[:8]
+         file_name = f'{url_hash}-{unique_id}.{file_extension}'
+         file_name = file_name[:255]
+         file_name = re.sub(r'[\\/:"*?<>|]+', '_', file_name)
+         with open(f'{folder_name}/{file_name}', 'wb') as out_file:
+             out_file.write(response.content)
+         print(f"Downloaded file: {file_name}")
+
+
+ # Function to create a zip file
+ def zipper(folder_name):
+     if os.listdir(folder_name):
+         with zipfile.ZipFile(f'{folder_name}.zip', 'w') as zipf:
+             for file in os.listdir(folder_name):
+                 zipf.write(f'{folder_name}/{file}')
+         return f'{folder_name}.zip'
+     else:
+         return ""
+
+
+ # Function to access the website
+ def scrapper(url, images=False, text=False):
+     try:
+         response = requests.get(url, timeout=10)
+         response.raise_for_status()
+     except (requests.exceptions.RequestException, ValueError):
+         raise gr.Error(f"Unable to access URL: {url}")
+     soup = BeautifulSoup(response.content, 'html.parser')
+
+     # Clear any previous folder data
+     if images:
+         shutil.rmtree('images', ignore_errors=True)
+     if text:
+         shutil.rmtree('text', ignore_errors=True)
+
+     # Add images to the images folder
+     if images:
+         image_urls = finder(url, soup, 'image')
+         os.makedirs('images', exist_ok=True)
+         if image_urls:
+             downloader(image_urls, 'images')
+         else:
+             raise gr.Error("Found no images.")
+
+     # Add text files to the text folder
+     if text:
+         text_content = finder(url, soup, 'text')
+         os.makedirs('text', exist_ok=True)
+         if text_content:
+             with open('text/content.txt', 'w', encoding='utf-8') as text_file:
+                 for line in text_content:
+                     text_file.write(line + '\n')
+
+     # Output folder(s) as zip files
+     images_zip_file, text_zip_file = None, None
+     if images and os.path.exists('images') and os.listdir('images'):
+         images_zip_file = zipper('images')
+     if text and os.path.exists('text') and os.listdir('text'):
+         text_zip_file = zipper('text')
+     return images_zip_file, text_zip_file
+
+
+ # Function to validate input and surface request errors
+ def checker(url, media_types):
+     if not url:
+         raise gr.Error("URL cannot be empty.")
+     if not url.startswith("https://"):
+         raise gr.Error("The URL must begin with https://")
+     if not media_types:
+         raise gr.Error("At least one media type must be selected.")
+     try:
+         image_file, text_file = scrapper(url, "Images" in media_types, "Text" in media_types)
+     except requests.exceptions.HTTPError as e:
+         if e.response.status_code == 403:
+             raise gr.Error("HTTP Error: Forbidden. Access to the URL is forbidden.")
+         else:
+             raise gr.Error(f"HTTP Error: {e.response.status_code}")
+     except TypeError as e:
+         raise gr.Error(f"TypeError: {str(e)}")
+     except (requests.exceptions.RequestException, ValueError):
+         raise gr.Error(f"Unable to access URL: {url}")
+     files = []
+     if "Text" in media_types and not text_file:
+         raise gr.Error("Found no text.")
+     if "Images" in media_types and not image_file:
+         raise gr.Error("Found no images.")
+     if image_file:
+         files.append(image_file)
+     if text_file:
+         files.append(text_file)
+
+     print(f"Returning downloaded files from {url} in {files} ...")
+
+     return files
+
+ # Gradio Interface
+ with gr.Blocks(theme="dwancin/theme") as app:
+     title = gr.Markdown('''# Web Scraping 🕵️''')
+     description = gr.Markdown('''Get all media files from your desired webpages with just a few clicks.''')
+     with gr.Row():
+         with gr.Column(scale=0, min_width=480, variant="panel", elem_id="sd-panel"):
+             url_name = gr.Textbox(
+                 placeholder="Enter URL here",
+                 show_label=True,
+                 label="Website",
+             )
+
+             media_types = gr.CheckboxGroup(
+                 ["Images", "Text"],
+                 value="Images",
+                 label="Media types",
+             )
+
+             submit_button = gr.Button(
+                 "Submit",
+                 variant="primary",
+                 interactive=True,
+             )
+
+         with gr.Column(scale=2):
+             output_files = gr.Files(
+                 label="Output",
+                 elem_id="file-list",
+                 size="lg",
+                 show_label=False,
+             )
+
+     submit_button.click(
+         checker,
+         inputs=[url_name, media_types],
+         outputs=[output_files],
+     )
+
+ if __name__ == "__main__":
+     app.launch()
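Note (not part of the commit): the snippet below is a minimal standalone sketch of the image-collection step that finder() performs in app.py, handy for checking a page outside the Gradio UI. It assumes requests and beautifulsoup4 from requirements.txt are installed; the page URL is a placeholder.

# sketch: collect absolute <img src> URLs the way finder()/validator() do
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

page_url = "https://example.com"  # placeholder URL
response = requests.get(page_url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

image_urls = []
for img in soup.find_all("img"):
    src = img.get("src")
    if not src:
        continue
    parsed = urlparse(src)
    # Relative paths (e.g. "/logo.png") are resolved against the page URL,
    # mirroring the validator()/urljoin() logic in app.py.
    if not (parsed.scheme and parsed.netloc):
        src = urljoin(page_url, src)
    image_urls.append(src)

print(image_urls)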
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ gradio>=3.35.2
+ beautifulsoup4>=4.12.2
+ Pillow>=9.5.0
+ requests>=2.31.0
+ Flask>=2.3.2
+ zipfile2>=0.0.12
+ urllib3>=2.0.3
+ pytest-shutil>=1.7.0
+ mime>=0.1.0
+ mimetypes-extensions>=0.1.0
+ uuid>=1.30
+ pytube>=15.0.0
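Note (not part of the commit): a minimal sketch of the download-and-zip flow that downloader() and zipper() implement in app.py, using the same naming scheme (MD5 hash of the URL plus a short UUID suffix). The file URL below is a placeholder.

# sketch: save one remote file to the images folder, then zip the folder
import hashlib, os, re, uuid, zipfile
import requests

file_url = "https://example.com/image.png"  # placeholder URL
folder = "images"
os.makedirs(folder, exist_ok=True)

# Build a collision-resistant, filesystem-safe name, as downloader() does.
extension = file_url.split(".")[-1].split("&")[0]
name = f"{hashlib.md5(file_url.encode()).hexdigest()}-{str(uuid.uuid4())[:8]}.{extension}"
name = re.sub(r'[\\/:"*?<>|]+', "_", name[:255])

response = requests.get(file_url, stream=True, timeout=10)
with open(os.path.join(folder, name), "wb") as out_file:
    out_file.write(response.content)

# Zip the folder, as zipper() does, and print the archive path handed to the UI.
with zipfile.ZipFile(f"{folder}.zip", "w") as zipf:
    for f in os.listdir(folder):
        zipf.write(os.path.join(folder, f))
print(f"{folder}.zip")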