hance-ai
/

audiomae

dslee2601 committed on Aug 16, 2024

Commit

c60507b

1 Parent(s): 2fccb59

support cuda

Files changed (1) hide show

model.py CHANGED Viewed

@@ -28,12 +28,13 @@ class AudioMAEConfig(PretrainedConfig):
 class AudioMAEEncoder(VisionTransformer):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
         """
         - img_size of (1024, 128) = (temporal_length, n_freq_bins) is fixed, as described in the paper
         - AudioMAE accepts a mono-channel (i.e., in_chans=1)
         """
         self.MEAN = -4.2677393  # written on the paper
         self.STD = 4.5689974  # written on the paper
@@ -101,7 +102,7 @@ class AudioMAEEncoder(VisionTransformer):
         waveform = self.load_wav_file(file_path)
         melspec = self.waveform_to_melspec(waveform)  # (length, n_freq_bins) = (1024, 128)
         melspec = melspec[None,None,:,:]  # (1, 1, length, n_freq_bins) = (1, 1, 1024, 128)
-        z = self.forward_features(melspec)  # (b, 1+n, d); d=768
         z = z[:,1:,:]  # (b n d); remove [CLS], the class token
         b, c, w, h = melspec.shape  # w: temporal dim; h:freq dim
@@ -122,7 +123,7 @@ class PretrainedAudioMAEEncoder(PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
-        self.encoder = AudioMAEEncoder(img_size=config.img_size, in_chans=config.in_chans, num_classes=config.num_classes)
     def forward(self, file_path:str):
         return self.encoder.encode(file_path)  # (d h' w')

 class AudioMAEEncoder(VisionTransformer):
+    def __init__(self, img_size, in_chans, num_classes, device):
+        super().__init__(img_size, in_chans, num_classes)
         """
         - img_size of (1024, 128) = (temporal_length, n_freq_bins) is fixed, as described in the paper
         - AudioMAE accepts a mono-channel (i.e., in_chans=1)
         """
+        self.device = device
         self.MEAN = -4.2677393  # written on the paper
         self.STD = 4.5689974  # written on the paper
         waveform = self.load_wav_file(file_path)
         melspec = self.waveform_to_melspec(waveform)  # (length, n_freq_bins) = (1024, 128)
         melspec = melspec[None,None,:,:]  # (1, 1, length, n_freq_bins) = (1, 1, 1024, 128)
+        z = self.forward_features(melspec.to(self.device)).cpu()  # (b, 1+n, d); d=768
         z = z[:,1:,:]  # (b n d); remove [CLS], the class token
         b, c, w, h = melspec.shape  # w: temporal dim; h:freq dim
     def __init__(self, config):
         super().__init__(config)
+        self.encoder = AudioMAEEncoder(img_size=config.img_size, in_chans=config.in_chans, num_classes=config.num_classes, device=self.device)
     def forward(self, file_path:str):
         return self.encoder.encode(file_path)  # (d h' w')