# ameerazam08's picture
# Upload folder using huggingface_hub
# e34aada verified
import argparse
import multiprocessing as mp
import os
from functools import partial
from time import time as timer
from pytube import YouTube
from tqdm import tqdm
def _build_arg_parser():
    r"""Create the command-line parser with the script's three options."""
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--input_list',
        required=True,
        type=str,
        help='List of youtube video ids',
    )
    cli.add_argument(
        '--output_dir',
        default='data/youtube_videos',
        type=str,
        help='Location to download videos',
    )
    cli.add_argument(
        '--num_workers',
        default=8,
        type=int,
        help='How many multiprocessing workers?',
    )
    return cli


# Parsed at module level: the script entry point reads its options from `args`.
parser = _build_arg_parser()
args = parser.parse_args()
def download_video(output_dir, video_id):
    r"""Download one YouTube video to ``<output_dir>/<video_id>.mp4``.

    Nothing is downloaded when the target file already exists. Every error
    is printed and swallowed, so a failure never propagates to the caller.
    A file left behind by a failed attempt is removed so that a later run
    retries the video instead of reporting it as already present.

    Args:
        output_dir (str): Directory the mp4 file is written to.
        video_id (str): YouTube video id (the ``v=`` URL parameter).

    Returns:
        None
    """
    video_path = os.path.join(output_dir, '%s.mp4' % (video_id))
    if os.path.isfile(video_path):
        print('File exists: %s' % (video_id))
        return

    try:
        yt = YouTube('https://www.youtube.com/watch?v=%s' % (video_id))
        # Prefer a video-only adaptive mp4 stream; fall back to any mp4 stream.
        # NOTE(review): `.first()` takes the first match in pytube's listing;
        # no explicit ordering by resolution is applied here.
        stream = yt.streams.filter(
            subtype='mp4', only_video=True, adaptive=True).first()
        if stream is None:
            stream = yt.streams.filter(subtype='mp4').first()
        if stream is None:
            # Report the real cause instead of an AttributeError on None.
            raise RuntimeError('No mp4 stream available for %s' % (video_id))
        stream.download(output_path=output_dir, filename=video_id + '.mp4')
    except Exception as e:
        print(e)
        print('Failed to download %s' % (video_id))
        # The file did not exist before this attempt, so anything at
        # video_path now is an incomplete download; remove it (best effort)
        # so the existence check above does not skip this video next time.
        try:
            os.remove(video_path)
        except OSError:
            pass
if __name__ == '__main__':
    # Read the list of video ids, one per line. Blank lines are skipped (an
    # empty id would trigger a pointless download attempt for '.mp4'), and
    # duplicate ids are dropped, keeping first-seen order, so that two workers
    # never write the same output file at the same time.
    with open(args.input_list) as fin:
        stripped_lines = (line.strip() for line in fin)
        video_ids = list(dict.fromkeys(vid for vid in stripped_lines if vid))

    # Create output folder.
    os.makedirs(args.output_dir, exist_ok=True)

    # Download videos in parallel; each worker call receives one video id.
    downloader = partial(download_video, args.output_dir)
    start = timer()
    pool_size = args.num_workers
    print('Using pool size of %d' % (pool_size))
    with mp.Pool(processes=pool_size) as p:
        # Drain the lazy result iterator so every task completes before the
        # pool is torn down; tqdm advances once per finished video.
        for _ in tqdm(p.imap_unordered(downloader, video_ids),
                      total=len(video_ids)):
            pass
    print('Elapsed time: %.2f' % (timer() - start))