gyrojeff committed on
Commit
a100da0
·
1 Parent(s): 10fb84b

feat: add japanese lyrics spider

Browse files
Files changed (1) hide show
  1. lyrics_corpus/crawler_uta_net.py +362 -0
lyrics_corpus/crawler_uta_net.py ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import traceback
2
+ import click
3
+ import requests
4
+ import os
5
+ import sqlite3
6
+ from lxml import html
7
+ import concurrent.futures
8
+ from typing import List, Tuple
9
+ import io
10
+ from tqdm import tqdm
11
+
12
+
13
# Size of the thread pool used by each scraping stage.
num_workers = 100

# Site endpoints the crawler requests.
url_prefix = "https://www.uta-net.com"
name_list_url_prefix = "https://www.uta-net.com/name_list/"
artist_url_prefix = "https://www.uta-net.com/artist/"
song_url_prefix = "https://www.uta-net.com/song/"

# Filesystem layout: the SQLite cache lives in a "cache" directory that sits
# next to this script.
root_dir = os.path.split(os.path.realpath(__file__))[0]
cache_dir = os.path.join(root_dir, "cache")
artist_page_cache_path = os.path.join(cache_dir, "uta-net.db")
23
+
24
# Suffixes of the name-list index pages to crawl: 0 through 44, plus 70.
artist_page_suffixes = [*range(45), 70]
72
+
73
# Create the cache directory if it is missing. exist_ok avoids the
# check-then-create race when another process creates it first.
os.makedirs(cache_dir, exist_ok=True)
75
+
76
+ # init db
77
def init_artist_page_db(db_path=None):
    """Create the cache tables if they do not exist yet.

    Tables created: ``index`` (scraped index pages), ``artists`` (with a
    ``done`` flag defaulting to 0), ``songs`` and ``lyrics``.

    Args:
        db_path: SQLite file to initialise. Defaults to the module-level
            ``artist_page_cache_path``.
    """
    conn = sqlite3.connect(artist_page_cache_path if db_path is None else db_path)
    try:
        conn.execute(
            """CREATE TABLE IF NOT EXISTS `index` (index_id INTEGER PRIMARY KEY)"""
        )
        conn.execute(
            """CREATE TABLE IF NOT EXISTS artists (artist_id INTEGER PRIMARY KEY, artist_name TEXT, done INTEGER DEFAULT 0)"""
        )
        conn.execute(
            """CREATE TABLE IF NOT EXISTS songs (song_id INTEGER PRIMARY KEY)"""
        )
        conn.execute(
            """CREATE TABLE IF NOT EXISTS lyrics (song_id INTEGER PRIMARY KEY, lyrics TEXT)"""
        )
        conn.commit()
    finally:
        # Always release the connection, even if a statement raises.
        conn.close()
89
+
90
+
91
+ init_artist_page_db()
92
+
93
+
94
+ # scraper abstraction
95
class Scraper:
    """Apply a function to every workload item on a thread pool, with a progress bar."""

    def __init__(self, desc: str, num_workers: int, func, workload: List):
        """Store the job description.

        Args:
            desc: Label shown on the progress bar.
            num_workers: Maximum number of worker threads.
            func: Callable invoked once per workload item.
            workload: Items to process; its length is the progress-bar total.
        """
        self.desc, self.num_workers = desc, num_workers
        self.func, self.workload = func, workload

    def run(self):
        """Process the whole workload and block until every item has finished."""
        pool = concurrent.futures.ThreadPoolExecutor(max_workers=self.num_workers)
        with pool as executor:
            results = executor.map(self.func, self.workload)
            progress = tqdm(
                results,
                total=len(self.workload),
                leave=True,
                desc=self.desc,
                miniters=1,
            )
            # Drain the iterator so every task completes; the results are not used.
            for _ in progress:
                pass
115
+
116
+
117
+ # db ops
118
def has_index_done_db(index_id, db_path=None):
    """Return True if index page ``index_id`` is recorded in the ``index`` table.

    Args:
        index_id: Identifier of the name-list index page.
        db_path: SQLite file to query. Defaults to the module-level
            ``artist_page_cache_path``.
    """
    conn = sqlite3.connect(artist_page_cache_path if db_path is None else db_path)
    try:
        row = conn.execute(
            "SELECT index_id FROM `index` WHERE index_id=?", (index_id,)
        ).fetchone()
    finally:
        # Always release the connection, even if the query raises.
        conn.close()
    return row is not None
126
+
127
+
128
def complete_index_page_db(index_id, artists, db_path=None):
    """Record an index page as scraped and store the artists found on it.

    Both writes are committed together.

    Args:
        index_id: Identifier of the name-list index page.
        artists: Iterable of ``(artist_id, artist_name)`` pairs.
        db_path: SQLite file to update. Defaults to the module-level
            ``artist_page_cache_path``.
    """
    conn = sqlite3.connect(artist_page_cache_path if db_path is None else db_path)
    try:
        conn.execute("INSERT OR REPLACE INTO `index` VALUES (?)", (index_id,))
        # OR IGNORE leaves an existing artist row untouched, so its name and
        # ``done`` flag survive when an index page is scraped again.
        conn.executemany(
            "INSERT OR IGNORE INTO artists(artist_id, artist_name) VALUES (?, ?)",
            ((artist_id, artist_name) for artist_id, artist_name in artists),
        )
        conn.commit()
    finally:
        # Always release the connection, even if a statement raises.
        conn.close()
143
+
144
+
145
def get_all_artists_db(db_path=None):
    """Return every cached artist as a list of ``(artist_id, artist_name)`` tuples.

    Args:
        db_path: SQLite file to query. Defaults to the module-level
            ``artist_page_cache_path``.
    """
    conn = sqlite3.connect(artist_page_cache_path if db_path is None else db_path)
    try:
        return conn.execute("SELECT artist_id, artist_name FROM artists").fetchall()
    finally:
        # Always release the connection, even if the query raises.
        conn.close()
151
+
152
+
153
def has_artist_done_db(artist_id, db_path=None):
    """Return True if the artist exists in the cache with ``done=1``.

    Args:
        artist_id: Identifier of the artist.
        db_path: SQLite file to query. Defaults to the module-level
            ``artist_page_cache_path``.
    """
    conn = sqlite3.connect(artist_page_cache_path if db_path is None else db_path)
    try:
        row = conn.execute(
            "SELECT artist_id FROM artists WHERE artist_id=? AND done=1",
            (artist_id,),
        ).fetchone()
    finally:
        # Always release the connection, even if the query raises.
        conn.close()
    return row is not None
163
+
164
+
165
def complete_artist_page_db(artist_id, songs, db_path=None):
    """Mark an artist as done and store the song ids found on the artist page.

    Both writes are committed together.

    Args:
        artist_id: Identifier of the artist whose page was scraped.
        songs: Iterable of song ids; duplicates collapse onto one row.
        db_path: SQLite file to update. Defaults to the module-level
            ``artist_page_cache_path``.
    """
    conn = sqlite3.connect(artist_page_cache_path if db_path is None else db_path)
    try:
        conn.execute("UPDATE artists SET done=? WHERE artist_id=?", (1, artist_id))
        conn.executemany(
            "INSERT OR REPLACE INTO songs VALUES (?)",
            ((song_id,) for song_id in songs),
        )
        conn.commit()
    finally:
        # Always release the connection, even if a statement raises.
        conn.close()
174
+
175
+
176
def get_all_songs_db(db_path=None):
    """Return every cached song id as a flat list.

    Args:
        db_path: SQLite file to query. Defaults to the module-level
            ``artist_page_cache_path``.
    """
    conn = sqlite3.connect(artist_page_cache_path if db_path is None else db_path)
    try:
        rows = conn.execute("SELECT song_id FROM songs").fetchall()
    finally:
        # Always release the connection, even if the query raises.
        conn.close()
    # Each row is a 1-tuple; unwrap it to the bare id.
    return [row[0] for row in rows]
183
+
184
+
185
def has_lyrics_db(song_id, db_path=None):
    """Return True if a ``lyrics`` row exists for ``song_id``.

    Args:
        song_id: Identifier of the song.
        db_path: SQLite file to query. Defaults to the module-level
            ``artist_page_cache_path``.
    """
    conn = sqlite3.connect(artist_page_cache_path if db_path is None else db_path)
    try:
        row = conn.execute(
            "SELECT song_id FROM lyrics WHERE song_id=?", (song_id,)
        ).fetchone()
    finally:
        # Always release the connection, even if the query raises.
        conn.close()
    return row is not None
193
+
194
+
195
def update_lyrics_db(song_id, lyrics, db_path=None):
    """Store the lyrics for ``song_id``, replacing any previously stored text.

    Args:
        song_id: Identifier of the song.
        lyrics: Lyrics text to store.
        db_path: SQLite file to update. Defaults to the module-level
            ``artist_page_cache_path``.
    """
    conn = sqlite3.connect(artist_page_cache_path if db_path is None else db_path)
    try:
        conn.execute("INSERT OR REPLACE INTO lyrics VALUES (?, ?)", (song_id, lyrics))
        conn.commit()
    finally:
        # Always release the connection, even if the statement raises.
        conn.close()
200
+
201
+
202
+ # scraping ops
203
def scrape_artist_list(index_id, timeout=30):
    """Fetch one name-list index page and return the artists it links to.

    Args:
        index_id: Suffix appended to ``name_list_url_prefix``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``(artist_id, artist_name)`` tuples.

    Raises:
        requests.RequestException: On network failure, timeout, or a 5xx
            response.
        ValueError: If an artist link does not reduce to an integer id.
    """
    # requests applies no timeout by default, so a stalled connection would
    # otherwise block the calling thread indefinitely.
    response = requests.get(f"{name_list_url_prefix}{index_id}", timeout=timeout)

    # A server-error page would parse as "no artists" and the caller would
    # then cache the page as done, so surface it as an error instead.
    if response.status_code >= 500:
        response.raise_for_status()

    # Parse the HTML content of the response
    tree = html.fromstring(response.text)

    # Read the id and the name from the same anchor element. Two separate
    # XPath queries can return lists of different lengths, which would pair
    # ids with the wrong names when zipped.
    anchors = tree.xpath(
        '//*[contains(@id,"anchor_")]/dl/dd/ul//li/p[@class="flex-glow"]/a'
    )

    ret = []
    for anchor in anchors:
        link = anchor.get("href")
        if link is None:
            continue
        # Strip the "artist" path segment and slashes, leaving the numeric id.
        artist_id = int(link.replace("artist", "").replace("/", ""))
        ret.append((artist_id, anchor.text or ""))

    return ret
225
+
226
+
227
def scrape_song_list(artist_id, timeout=30):
    """Fetch an artist page and return the ids of the songs listed on it.

    Args:
        artist_id: Suffix appended to ``artist_url_prefix``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of integer song ids. Links that do not reduce to an integer
        are skipped.

    Raises:
        requests.RequestException: On network failure, timeout, or a 5xx
            response.
    """
    # requests applies no timeout by default, so a stalled connection would
    # otherwise block the calling thread indefinitely.
    response = requests.get(f"{artist_url_prefix}{artist_id}", timeout=timeout)

    # A server-error page would parse as "no songs" and the caller would then
    # mark the artist as done, so surface it as an error instead.
    if response.status_code >= 500:
        response.raise_for_status()

    # Parse the HTML content of the response
    tree = html.fromstring(response.text)

    # Find all of the links using the given XPath pattern
    links = tree.xpath(
        '//*[@id="list-song"]/div[2]/div[1]/div[2]/div[2]//table/tbody//tr/td[1]/a/@href'
    )

    # Convert the links to integers
    ids = []
    for link in links:
        try:
            ids.append(int(link.replace("song", "").replace("/", "")))
        except ValueError:
            # Not a numeric song link; skip it. Catching only ValueError
            # keeps KeyboardInterrupt and real bugs visible.
            continue

    return ids
248
+
249
+
250
def scrape_lyrics(song_id, timeout=30):
    """Fetch a song page and return its lyrics as newline-joined text.

    Args:
        song_id: Suffix appended to ``song_url_prefix``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text nodes of the ``kashi_area`` element joined with ``"\\n"``;
        an empty string if the element has no text or is absent.

    Raises:
        requests.RequestException: On network failure, timeout, or a 5xx
            response.
    """
    # requests applies no timeout by default, so a stalled connection would
    # otherwise block the calling thread indefinitely.
    response = requests.get(f"{song_url_prefix}{song_id}", timeout=timeout)

    # A server-error page would otherwise be stored as empty lyrics.
    if response.status_code >= 500:
        response.raise_for_status()

    # Parse the HTML content of the response
    tree = html.fromstring(response.text)

    # Collect the direct text nodes of the lyrics container
    song_lyrics = tree.xpath('//*[@id="kashi_area"]/text()')

    return "\n".join(song_lyrics)
261
+
262
+
263
@click.command()
@click.option(
    "--no-cache-index",
    is_flag=True,
    default=False,
    help="Do not use cached index page",
)
@click.option(
    "--no-cache-artist",
    is_flag=True,
    default=False,
    help="Do not use cached artist page",
)
@click.option(
    "--no-cache-lyrics", is_flag=True, default=False, help="Do not use cached lyrics"
)
def scrape(no_cache_index, no_cache_artist, no_cache_lyrics):
    """Crawl uta-net in three cached stages: index pages, artist pages, lyrics."""
    # Function-scope import: only the retry back-off below needs it.
    import time

    # An item that still fails after this many attempts is skipped. Nothing is
    # written to the cache for it, so a later run picks it up again.
    max_attempts = 5

    def run_with_retry(task, item):
        """Call task(item), retrying failures with capped exponential back-off."""
        for attempt in range(1, max_attempts + 1):
            try:
                task(item)
                return
            except Exception as e:
                traceback.print_exc()
                print(f"Error: {e}")
                if attempt == max_attempts:
                    print(f"Giving up on {item!r} after {max_attempts} attempts")
                    return
                print("Retrying ...")
                # Back off so a failing item does not spin or hammer the site.
                time.sleep(min(2 ** attempt, 30))

    # artists
    def fetch_index(index_id):
        if no_cache_index or not has_index_done_db(index_id):
            artists = scrape_artist_list(index_id)
            complete_index_page_db(index_id, artists)

    def scrape_artist(index_id):
        run_with_retry(fetch_index, index_id)

    Scraper(
        desc="Scraping artists",
        num_workers=num_workers,
        func=scrape_artist,
        workload=artist_page_suffixes,
    ).run()

    artists = get_all_artists_db()
    for artist_id, name in artists:
        print(f"Detected: {name} -> {artist_id}")
    print(f"{len(artists)} artists detected")

    # songs
    def fetch_artist(artist: Tuple[int, str]):
        artist_id, name = artist
        if not no_cache_artist and has_artist_done_db(artist_id):
            print(f"Skipping: {name} -> {artist_id}")
        else:
            print(f"Scraping: {name} -> {artist_id}")
            songs = scrape_song_list(artist_id)
            complete_artist_page_db(artist_id, songs)

    def scrape_song(artist: Tuple[int, str]):
        run_with_retry(fetch_artist, artist)

    Scraper(
        desc="Scraping songs",
        num_workers=num_workers,
        func=scrape_song,
        workload=artists,
    ).run()

    songs = get_all_songs_db()

    # lyrics
    def fetch_lyrics(song_id):
        if no_cache_lyrics or not has_lyrics_db(song_id):
            lyrics = scrape_lyrics(song_id)
            update_lyrics_db(song_id, lyrics)

    def scrape_lyric(song_id):
        run_with_retry(fetch_lyrics, song_id)

    Scraper(
        desc="Scraping lyrics",
        num_workers=num_workers,
        func=scrape_lyric,
        workload=songs,
    ).run()


# Run the crawler only when executed as a script, not on import.
if __name__ == "__main__":
    scrape()