bpietrzak committed · Commit c49f003 · Parent(s): 330654a

App
Files changed:
- app.py +50 -0
- encode_dataset.py +0 -34
- push_model.py +0 -29
- train.py +0 -134
app.py
ADDED
@@ -0,0 +1,50 @@
import torch
import torchaudio
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
import gradio as gr
import json


# Feature-extractor path, model path and the number of classes to display
# are read from config.json in the Space root.
config = json.load(open("config.json"))

feature_extractor = AutoFeatureExtractor.from_pretrained(
    config['extractor_path'], do_normalize=True, return_attention_mask=True
)

model = AutoModelForAudioClassification.from_pretrained(
    config['model_path']
)


def audio_pipeline(audio_file):
    # Accept either a plain file path or a file-like object with a .name attribute.
    if isinstance(audio_file, str):
        waveform, sample_rate = torchaudio.load(audio_file)
    else:
        waveform, sample_rate = torchaudio.load(audio_file.name)
    # Downmix to mono and resample to the rate the feature extractor expects.
    waveform = waveform.mean(dim=0)
    if sample_rate != feature_extractor.sampling_rate:
        transform = torchaudio.transforms.Resample(
            orig_freq=sample_rate,
            new_freq=feature_extractor.sampling_rate)
        waveform = transform(waveform)
    inputs = feature_extractor(waveform,
                               sampling_rate=feature_extractor.sampling_rate,
                               return_tensors="pt",
                               padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.nn.functional.softmax(logits, dim=-1)[0]

    # Keep only the top-k classes for the Gradio Label component.
    top_probs, top_ids = torch.topk(probs, config['top_k'])
    top_labels = [model.config.id2label[idx.item()] for idx in top_ids]

    results = {label: prob.item() for label, prob in zip(top_labels, top_probs)}
    return results


demo = gr.Interface(
    fn=audio_pipeline,
    inputs=[gr.Audio(type="filepath", label="Upload Audio")],
    outputs=gr.Label(num_top_classes=config['top_k']),
    title="Music Mind",
)

demo.launch(debug=True)
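Note: app.py expects a config.json alongside it with at least the keys extractor_path, model_path and top_k. The commit does not include that file, so the values below are illustrative placeholders only:

{
    "extractor_path": "path-or-hub-id-of-the-feature-extractor",
    "model_path": "path-or-hub-id-of-the-fine-tuned-classifier",
    "top_k": 5
}

With such a file in place, running python app.py starts the Gradio interface locally.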
encode_dataset.py
DELETED
@@ -1,34 +0,0 @@
import numpy as np
import librosa
from transformers import AutoFeatureExtractor
import argparse
import os


def parse_args():
    ap = argparse.ArgumentParser()
    ap.add_argument("--input_dir", type=str, default="data")
    ap.add_argument("--output_dir", type=str, default="features")
    ap.add_argument("--model_id", type=str)

    return vars(ap.parse_args())

def main(args):
    feature_extractor = AutoFeatureExtractor.from_pretrained(args["model_id"])
    fe_sr = feature_extractor.sampling_rate

    for root, _, files in os.walk(args["input_dir"]):
        for file in files:
            if not file.endswith(".wav"):
                continue
            path = os.path.join(root, file)
            waveform, _ = librosa.load(path, sr=fe_sr)
            if waveform.ndim == 1:
                waveform = waveform.reshape(1, -1)
            inputs = feature_extractor(waveform, sampling_rate=fe_sr)
            output_path = os.path.join(args["output_dir"], file.replace(".wav", ".npy"))
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            np.save(output_path, inputs['input_values'])

if __name__ == "__main__":
    main(parse_args())
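Note: this deleted preprocessing script walks input_dir for .wav files, resamples them with librosa to the feature extractor's sampling rate, and saves the extracted input_values as .npy arrays under output_dir. A typical invocation would have looked roughly like the following (the model id is a placeholder, not necessarily the checkpoint used in this project):

python encode_dataset.py --input_dir data --output_dir features --model_id <hub-model-id>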
push_model.py
DELETED
@@ -1,29 +0,0 @@
import argparse
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from huggingface_hub import HfApi, HfFolder, Repository

def push_model_to_hub(local_model_path, model_name):
    model = AutoModelForSequenceClassification.from_pretrained(local_model_path)
    tokenizer = AutoTokenizer.from_pretrained(local_model_path)
    config = AutoConfig.from_pretrained(local_model_path)

    temp_dir = f"./{model_name}"
    model.save_pretrained(temp_dir)
    tokenizer.save_pretrained(temp_dir)
    config.save_pretrained(temp_dir)

    api = HfApi()
    user = HfFolder.get_user()
    repo_url = api.create_repo(name=model_name, token=HfFolder.get_token())

    repo = Repository(local_dir=temp_dir, clone_from=repo_url)
    repo.push_to_hub(commit_message="Initial commit")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Push a model to the Hugging Face Hub")
    parser.add_argument("--local_model_path", type=str, required=True, help="Path to the local model")
    parser.add_argument("--model_name", type=str, required=True, help="Desired name of the model on the Hugging Face Hub")

    args = parser.parse_args()

    push_model_to_hub(args.local_model_path, args.model_name)
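Note: this deleted upload script relied on huggingface_hub's Repository class and HfFolder helpers, which are deprecated in current releases (the user variable it looks up is also never used). A minimal sketch of the equivalent upload with the current HfApi interface, with placeholder repo and folder names, might look like:

from huggingface_hub import HfApi

api = HfApi()
# Create the model repo if it does not exist yet; uses the locally cached login token.
api.create_repo(repo_id="username/model-name", exist_ok=True)
# Upload the saved model directory as a single commit.
api.upload_folder(folder_path="./model-name",
                  repo_id="username/model-name",
                  commit_message="Initial commit")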
train.py
DELETED
@@ -1,134 +0,0 @@
from transformers import AutoModelForAudioClassification
from torch.utils.data import DataLoader
import evaluate
import torch
from tqdm import tqdm
import argparse
import json
import os
import shutil
import mlflow
import mlflow.pytorch

from gtzan import GtzanDataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

metric = evaluate.load("accuracy")

def parse_args():
    ap = argparse.ArgumentParser()
    ap.add_argument("--label2id", type=str)
    ap.add_argument("--model_id", type=str)
    ap.add_argument("--batch_size", type=int, default=32)
    ap.add_argument("--train_dir", type=str, default="data/train")
    ap.add_argument("--val_dir", type=str, default="data/val")
    ap.add_argument("--num_workers", type=int, default=4)
    ap.add_argument("--lr", type=float, default=1e-4)
    ap.add_argument("--epochs", type=int, default=10)
    ap.add_argument("--output_dir", type=str, default="./weights")
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--name", type=str, default="model")
    return vars(ap.parse_args())

def train(args):
    torch.manual_seed(args["seed"])

    label2id = json.load(open(args["label2id"]))
    id2label = {v: k for k, v in label2id.items()}
    num_labels = len(label2id)
    if not os.path.exists(args["output_dir"]):
        os.makedirs(args["output_dir"])

    train_dataset = GtzanDataset(args["train_dir"], label2id)
    val_dataset = GtzanDataset(args["val_dir"], label2id)

    train_loader = DataLoader(
        train_dataset,
        batch_size=args["batch_size"],
        shuffle=True,
        num_workers=args["num_workers"])

    val_loader = DataLoader(
        val_dataset,
        batch_size=args["batch_size"],
        shuffle=False,
        num_workers=args["num_workers"])

    model = AutoModelForAudioClassification.from_pretrained(
        args['model_id'],
        num_labels=num_labels,
        label2id=label2id,
        id2label=id2label,
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=args["lr"])
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=len(train_loader) * args["epochs"]
    )

    max_val_accuracy = 0
    best_path = ""

    with mlflow.start_run():
        mlflow.log_params({
            "model_id": args["model_id"],
            "batch_size": args["batch_size"],
            "lr": args["lr"],
            "epochs": args["epochs"],
            "seed": args["seed"]
        })

        for epoch in tqdm(range(args["epochs"])):
            model.train()
            train_progress_bar = tqdm(train_loader, desc=f"Training Epoch {epoch + 1}")
            for batch in train_progress_bar:
                input_values, attention_mask, label = [b.to(device) for b in batch]
                outputs = model(input_values=input_values,
                                attention_mask=attention_mask,
                                labels=label)
                loss = outputs.loss
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

                train_progress_bar.set_postfix({"loss": loss.item()})
                train_progress_bar.update(1)
                mlflow.log_metric("train_loss", loss.item())  # Log training loss

            torch.cuda.empty_cache()
            model.eval()

            val_progress_bar = tqdm(val_loader, desc="Validation")
            for batch in val_progress_bar:
                input_values, attention_mask, label = [b.to(device) for b in batch]
                with torch.no_grad():
                    outputs = model(input_values=input_values,
                                    attention_mask=attention_mask,
                                    labels=label)

                logits = outputs.logits
                predictions = torch.argmax(logits, dim=-1)
                metric.add_batch(predictions=predictions, references=label)
                val_progress_bar.update(1)

            val_accuracy = metric.compute()
            mlflow.log_metric("val_accuracy", val_accuracy["accuracy"], step=epoch)  # Log validation accuracy
            torch.cuda.empty_cache()
            if val_accuracy["accuracy"] > max_val_accuracy:
                if best_path:
                    shutil.rmtree(best_path)
                model_save_dir = os.path.join(
                    args["output_dir"],
                    args['name'],
                    f"{int(round(val_accuracy['accuracy'], 2) * 100)}")
                if not os.path.exists(model_save_dir):
                    os.makedirs(model_save_dir, exist_ok=True)
                model.save_pretrained(model_save_dir)
                max_val_accuracy = val_accuracy["accuracy"]
                best_path = model_save_dir

        mlflow.pytorch.log_model(model, "model")

if __name__ == "__main__":
    train(parse_args())
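Note: the --label2id argument of this deleted training script points to a JSON file mapping class names to integer ids. The actual file is not part of the commit; since the data loader is GtzanDataset, a plausible mapping over the ten GTZAN genres would look like the following illustration:

{
    "blues": 0, "classical": 1, "country": 2, "disco": 3, "hiphop": 4,
    "jazz": 5, "metal": 6, "pop": 7, "reggae": 8, "rock": 9
}

A typical invocation, with a placeholder model id, would then have been:

python train.py --label2id label2id.json --model_id <hub-model-id> --epochs 10 --batch_size 32

The script keeps only the best checkpoint (highest validation accuracy) under output_dir/name/<accuracy> and logs parameters and metrics to MLflow.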