juancopi81's picture
Duplicate from Whispering-GPT/whisper-youtube-2-hf_dataset
7288748
raw
history blame
1.32 kB
# Adapted from Eduardo Matallanas
from datasets import load_dataset, Dataset
from datasets.data_files import EmptyDatasetError
class HFDataset():
    """
    Create a dataset to save the transcripts from Youtube.

    Attributes set by the constructor:
        name: the identifier passed to ``load_dataset`` / ``push_to_hub``.
        dataset: the loaded dataset, or an empty ``Dataset`` when nothing
            could be loaded.
        exist: ``False`` when ``name`` is empty or loading raised
            ``FileNotFoundError``; ``True`` otherwise.
        is_empty: ``True`` whenever ``dataset`` is the empty placeholder.
        list_of_ids: IDs derived from the ``"URL"`` column of every split
            (empty list when there is no data).
    """

    def __init__(self, name: str) -> None:
        """Load the dataset called *name*, or start empty if *name* is ""."""
        self.name = name
        if name != "":
            self._init_dataset()
        else:
            # No name given: nothing to load. All attributes (including
            # ``list_of_ids``) are still initialised so later reads never
            # hit an AttributeError.
            self._set_empty(exist=False)

    def _set_empty(self, *, exist: bool) -> None:
        """Put the instance in the "no data" state; *exist* is stored as-is."""
        self.dataset = Dataset.from_dict({})
        self.exist = exist
        self.is_empty = True
        self.list_of_ids = []

    def _init_dataset(self) -> None:
        """Try to load ``self.name`` and populate every state attribute."""
        # Only the load itself is guarded, so an error raised while
        # extracting IDs is not mistaken for "dataset not found".
        try:
            loaded = load_dataset(self.name)
        # NOTE(review): EmptyDatasetError is believed to subclass
        # FileNotFoundError in `datasets`, so it must stay listed first.
        except EmptyDatasetError:
            self._set_empty(exist=True)
        except FileNotFoundError:
            self._set_empty(exist=False)
        else:
            self.dataset = loaded
            self.exist = True
            self.is_empty = False
            self.list_of_ids = self._get_list_of_id()

    def upload(self) -> None:
        """Push ``self.dataset`` to the hub under ``self.name``."""
        self.dataset.push_to_hub(self.name)

    def _get_list_of_id(self):
        """Return the IDs of every row, across all splits, as one flat list.

        ``self.dataset`` is iterated to obtain split names, and each split
        must expose a ``"URL"`` column. An ID is whatever follows the last
        ``"="`` in the URL (the whole URL when it contains no ``"="``).
        """
        return [
            url.split("=")[-1]
            for split in self.dataset
            for url in self.dataset[split]["URL"]
        ]