support cuda
model.py CHANGED
@@ -28,13 +28,12 @@ class AudioMAEConfig(PretrainedConfig):


 class AudioMAEEncoder(VisionTransformer):
-    def __init__(self,
-        super().__init__(
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
         """
         - img_size of (1024, 128) = (temporal_length, n_freq_bins) is fixed, as described in the paper
         - AudioMAE accepts a mono-channel (i.e., in_chans=1)
         """
-        self.device = device
         self.MEAN = -4.2677393 # written on the paper
         self.STD = 4.5689974 # written on the paper

@@ -96,13 +95,13 @@ class AudioMAEEncoder(VisionTransformer):
         return mel_spectrogram

     @torch.no_grad()
-    def encode(self, file_path:str):
+    def encode(self, file_path:str, device):
         self.eval()

         waveform = self.load_wav_file(file_path)
         melspec = self.waveform_to_melspec(waveform) # (length, n_freq_bins) = (1024, 128)
         melspec = melspec[None,None,:,:] # (1, 1, length, n_freq_bins) = (1, 1, 1024, 128)
-        z = self.forward_features(melspec.to(self.device)).cpu() # (b, 1+n, d); d=768
+        z = self.forward_features(melspec.to(device)).cpu() # (b, 1+n, d); d=768
         z = z[:,1:,:] # (b n d); remove [CLS], the class token

         b, c, w, h = melspec.shape # w: temporal dim; h: freq dim
@@ -123,7 +122,8 @@ class PretrainedAudioMAEEncoder(PreTrainedModel):

     def __init__(self, config):
         super().__init__(config)
-        self.encoder = AudioMAEEncoder(img_size=config.img_size, in_chans=config.in_chans, num_classes=config.num_classes
+        self.encoder = AudioMAEEncoder(img_size=config.img_size, in_chans=config.in_chans, num_classes=config.num_classes)

     def forward(self, file_path:str):
-        return self.encoder.encode(file_path) # (d h' w')
+        device = self.device
+        return self.encoder.encode(file_path, device) # (d h' w')
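In effect, the device is no longer pinned in AudioMAEEncoder.__init__; PretrainedAudioMAEEncoder.forward reads self.device (which PreTrainedModel derives from the parameters) and threads it into encode(), so a plain .to("cuda") on the model is enough. A minimal usage sketch under that assumption; the checkpoint id and audio path below are placeholders, and it assumes the repo maps the class for AutoModel via auto_map (otherwise call PretrainedAudioMAEEncoder.from_pretrained directly):

import torch
from transformers import AutoModel

# Placeholder repo id; substitute the repo that ships this model.py.
model = AutoModel.from_pretrained("user/audiomae-encoder", trust_remote_code=True)

# PreTrainedModel.device tracks the parameters, so moving the model is all
# that's needed: forward() will pass self.device to encoder.encode(), which
# moves the mel spectrogram onto that device before forward_features().
model.to("cuda" if torch.cuda.is_available() else "cpu")

z = model("example.wav")  # placeholder path; encode() returns a CPU tensor, (d, h', w')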