whisper-youtube-2-hf_dataset / preprocessing /youtubevideopreprocessor.py
juancopi81's picture
Duplicate from Whispering-GPT/whisper-youtube-2-hf_dataset
7288748
raw
history blame
4.02 kB
from itertools import islice
from pathlib import Path
from typing import List, Generator, Tuple
from urllib.parse import parse_qs, urlparse

import scrapetube
from pytube import Playlist
from youtubesearchpython import ChannelsSearch

from loading.serialization import Serializer
from utils import accepts_types
class YoutubeVideoPreprocessor:
    """Create one small JSON file per video of a YouTube channel or playlist.

    Each JSON file holds:
        - channel_name: The channel name passed in (or, in playlist mode,
          the playlist title).
        - url: The watch URL of the video.

    Args:
        mode (`str`):
            "channel_name" to look a channel up by name, or "playlist" to
            read a playlist URL. Any other value makes `preprocess` return
            hard-coded placeholder paths.
        serializer:
            Object exposing `dump(obj=..., save_path=...)`, used to write
            each video dictionary to disk.

    TODO: Change it to accept also URL of video list, name of video list, etc.
    """

    def __init__(self,
                 mode: str = "channel_name",
                 serializer = Serializer) -> None:
        self.mode = mode
        self.serializer = serializer

    def preprocess(self,
                   name: str,
                   num_videos: int,
                   videos_in_ds: List[str]) -> Tuple[List[Path], Path]:
        """Write JSON files for up to `num_videos` videos not yet in the dataset.

        Args:
            name (`str`):
                Channel name (mode "channel_name") or playlist URL
                (mode "playlist").
            num_videos (`int`):
                Maximum number of JSON files to create.
            videos_in_ds (`List[str]`):
                Video ids to skip.

        Returns:
            load_paths (`List[Path]`):
                Paths of the JSON files that were written.
            dataset_folder (`Path`):
                Folder that contains those files.

        Raises:
            IndexError: In "channel_name" mode, when the channel search
                returns no result.
        """
        if self.mode == "channel_name":
            # TODO: Add credits
            search_results = ChannelsSearch(name, limit=1).result()['result']
            if not search_results:
                # Same exception type as indexing the empty list would raise,
                # but with a message that points at the actual cause.
                raise IndexError(f"No YouTube channel found for {name!r}")
            videos = scrapetube.get_channel(channel_id=search_results[0]['id'])
        elif self.mode == "playlist":
            playlist_id = self._extract_playlist_id(name)
            # The playlist title replaces the URL as the folder/channel name.
            name = Playlist(name).title
            videos = scrapetube.get_playlist(playlist_id)
        else:
            # TODO: implement this part
            youtube_folder = Path.home()/"whisper_gpt_pipeline/youtube_transcriber"
            test_files_folder = youtube_folder/"test/files"
            return [Path("test.json"), Path("test1.json")], test_files_folder

        return self._convert_videos_to_json_files(name,
                                                  videos,
                                                  num_videos,
                                                  videos_in_ds)

    @staticmethod
    def _extract_playlist_id(url: str) -> str:
        """Return the playlist id carried by the `list` query parameter of `url`.

        Falls back to the text after the last "=" when no `list` parameter is
        present, so inputs that worked with the plain split keep working.
        """
        list_values = parse_qs(urlparse(url).query).get("list")
        if list_values:
            return list_values[0]
        return url.split("=")[-1]

    def _convert_videos_to_json_files(self,
                                      name: str,
                                      videos: Generator,
                                      num_videos: int,
                                      videos_in_ds: List[str]) -> Tuple[List[Path], Path]:
        """Serialize up to `num_videos` unseen videos into `<index>.json` files.

        Args:
            name (`str`):
                Stored as "channel_name" in every file and used as the name
                of the output folder.
            videos (`Generator`):
                Yields dictionaries that contain a "videoId" key.
            num_videos (`int`):
                Maximum number of files to write.
            videos_in_ds (`List[str]`):
                Video ids to skip.

        Returns:
            load_paths (`List[Path]`):
                Paths of the files written, in creation order.
            dataset_folder (`Path`):
                The folder the files were written to.
        """
        youtube_folder = Path.home()/"whisper_gpt_pipeline/youtube_transcriber"
        dataset_folder = youtube_folder/name
        dataset_folder.mkdir(parents=True, exist_ok=True)

        # Build the lookup once: set membership is O(1) per video.
        known_ids = set(videos_in_ds or ())
        new_videos = (video for video in videos
                      if video["videoId"] not in known_ids)

        load_paths = []
        # islice stops after `num_videos` items without pulling an extra one
        # from the underlying generator; max() keeps a negative count from
        # raising and yields no files instead.
        for index, video in enumerate(islice(new_videos, max(num_videos, 0))):
            save_path = dataset_folder/f"{index}.json"
            save_path.touch(exist_ok=True)
            video_dict = {"channel_name": name,
                          "url": f"https://www.youtube.com/watch?v={video['videoId']}"}
            self.serializer.dump(obj=video_dict, save_path=save_path)
            load_paths.append(save_path)
        return load_paths, dataset_folder