# -*- coding: utf-8 -*-
"""Copy of AudioCourse_MusicGenreClassifier_P2.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/19qAGS31MqX04p9EeUgAM2dGvI3SB3pm6
"""

!pip install --upgrade transformers
!pip install datasets
!pip install gradio

from datasets import load_dataset

gtzan = load_dataset("marsyas/gtzan", "all")
gtzan

# GTZAN does not provide a predefined split, so we create one ourselves
gtzan = gtzan["train"].train_test_split(test_size=0.1, shuffle=True, seed=42)
gtzan

gtzan["train"][0]

# The genre is represented as an integer, so let's use the int2str() method of
# the genre feature to map these integers to human-readable names
id2label_fn = gtzan["train"].features["genre"].int2str
id2label_fn(gtzan["train"][0]["genre"])

# Let's now listen to a few more examples by using Gradio to build a simple
# interface with the Blocks API
import gradio as gr


def generate_audio():
    example = gtzan["train"].shuffle()[0]
    audio = example["audio"]
    return (
        audio["sampling_rate"],
        audio["array"],
    ), id2label_fn(example["genre"])


with gr.Blocks() as demo:
    with gr.Column():
        for _ in range(4):
            audio, label = generate_audio()
            output = gr.Audio(audio, label=label)

demo.launch(debug=True)

from transformers import AutoFeatureExtractor

model_id = "ntu-spml/distilhubert"
feature_extractor = AutoFeatureExtractor.from_pretrained(
    model_id, do_normalize=True, return_attention_mask=True
)

# As we saw above, the sampling rate of the audio samples in the dataset is
# roughly 22 kHz. Let's find the sampling rate expected by the model.
sampling_rate = feature_extractor.sampling_rate
sampling_rate

# The model expects 16 kHz audio, so we can use the cast_column() method to
# downsample the examples to match the model's requirements.
from datasets import Audio

gtzan = gtzan.cast_column("audio", Audio(sampling_rate=sampling_rate))

# Let's verify that the change was successful.
gtzan["train"][0]

# Works! The 1-D NumPy array of the audio has changed, because the example is
# now decoded at 16 kHz. But what exactly does the feature extractor do?
# Let's look into this by comparing the raw audio with the extractor's output.
import numpy as np

test_sample = gtzan["train"][0]["audio"]

print(
    f"Mean: {np.mean(test_sample['array']):.3}, Variance: {np.var(test_sample['array']):.3}"
)

inputs = feature_extractor(
    test_sample["array"], sampling_rate=test_sample["sampling_rate"]
)

print(f"inputs keys: {list(inputs.keys())}")
print(
    f"Mean: {np.mean(inputs['input_values']):.3}, Variance: {np.var(inputs['input_values']):.3}"
)

# The model cannot process audio clips longer than 30 seconds, so we need to
# truncate examples with longer durations. Let's define a function that sets
# the max_duration and applies the feature extractor; later, we use the .map()
# method to apply it to all examples.
max_duration = 30.0


def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=int(feature_extractor.sampling_rate * max_duration),
        truncation=True,
        return_attention_mask=True,
    )
    return inputs


gtzan_encoded = gtzan.map(
    preprocess_function,
    remove_columns=["audio", "file"],
    batched=True,
    batch_size=100,
    num_proc=1,
)
gtzan_encoded

gtzan_encoded = gtzan_encoded.rename_column("genre", "label")

id2label = {
    str(i): id2label_fn(i)
    for i in range(len(gtzan_encoded["train"].features["label"].names))
}
label2id = {v: k for k, v in id2label.items()}

id2label["7"]

# Begin fine-tuning the model
from transformers import AutoModelForAudioClassification

num_labels = len(id2label)

model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

from huggingface_hub import notebook_login

notebook_login()

!pip install transformers[torch]
!pip install accelerate -U

from transformers import TrainingArguments

model_name = model_id.split("/")[-1]
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 10

training_args = TrainingArguments(
    f"{model_name}-finetuned-gtzan",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    push_to_hub=False,
)

!pip install evaluate

import evaluate
import numpy as np

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions."""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)


# Now we have all the required pieces: instantiate the Trainer class and train
# the model.
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=gtzan_encoded["train"],
    eval_dataset=gtzan_encoded["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

# The huggingface-cli tool is provided by the huggingface_hub package
!pip install huggingface_hub
!huggingface-cli login

kwargs = {
    "dataset_tags": "marsyas/gtzan",
    "dataset": "GTZAN",
    "model_name": f"{model_name}-finetuned-gtzan",
    "finetuned_from": model_id,
    "tasks": "audio-classification",
}
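# The kwargs dictionary above can be passed to trainer.push_to_hub() as
# model-card metadata. A minimal sketch of that final step, assuming training
# has finished and you are logged in to the Hub:
trainer.push_to_hub(**kwargs)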
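# Optional sanity check (a quick sketch, assuming the best weights were loaded
# back into `model` via load_best_model_at_end=True): classify one held-out
# clip and map the predicted index back to a genre name using the id2label
# mapping defined earlier.
import torch

sample = gtzan["test"][0]["audio"]
inputs = feature_extractor(
    sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt"
)
with torch.no_grad():
    logits = model(**inputs.to(model.device)).logits
predicted_id = int(logits.argmax(dim=-1))
print(id2label[str(predicted_id)])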