Update README.md
README.md (changed)
@@ -22,7 +22,10 @@ pip install numpy torch torchaudio einops transformers efficientnet_pytorch
 import torch
 from transformers import AutoModel, PreTrainedTokenizerFast
 import torchaudio
+
+
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
 # use the model trained on AudioCaps
 model = AutoModel.from_pretrained(
     "wsntxxn/effb2-trm-audiocaps-captioning",
@@ -31,6 +34,7 @@ model = AutoModel.from_pretrained(
 tokenizer = PreTrainedTokenizerFast.from_pretrained(
     "wsntxxn/audiocaps-simple-tokenizer"
 )
+
 # inference on a single audio clip
 wav, sr = torchaudio.load("/path/to/file.wav")
 wav = torchaudio.functional.resample(wav, sr, model.config.sample_rate)
@@ -43,14 +47,18 @@ with torch.no_grad():
     )
 caption = tokenizer.decode(word_idxs[0], skip_special_tokens=True)
 print(caption)
+
 # inference on a batch
 wav1, sr1 = torchaudio.load("/path/to/file1.wav")
 wav1 = torchaudio.functional.resample(wav1, sr1, model.config.sample_rate)
 wav1 = wav1.mean(0) if wav1.size(0) > 1 else wav1[0]
+
 wav2, sr2 = torchaudio.load("/path/to/file2.wav")
 wav2 = torchaudio.functional.resample(wav2, sr2, model.config.sample_rate)
 wav2 = wav2.mean(0) if wav2.size(0) > 1 else wav2[0]
+
 wav_batch = torch.nn.utils.rnn.pad_sequence([wav1, wav2], batch_first=True)
+
 with torch.no_grad():
     word_idxs = model(
         audio=wav_batch,
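The commit itself only inserts blank lines between the steps of the example; no code behavior changes. Because the hunks skip the unchanged lines between them, the remaining `from_pretrained` arguments and the forward call for the single-clip case are not visible above. The following is a rough end-to-end sketch of how the updated single-clip snippet plausibly reads; `trust_remote_code=True`, the `.to(device)` placement, the mono downmix, and the `audio_length` keyword are assumptions filled in for illustration, not part of this diff.

```python
import torch
import torchaudio
from transformers import AutoModel, PreTrainedTokenizerFast

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# use the model trained on AudioCaps
# (trust_remote_code=True and .to(device) are assumptions; the diff elides
# the remaining from_pretrained arguments)
model = AutoModel.from_pretrained(
    "wsntxxn/effb2-trm-audiocaps-captioning",
    trust_remote_code=True,
).to(device)
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "wsntxxn/audiocaps-simple-tokenizer"
)

# inference on a single audio clip
wav, sr = torchaudio.load("/path/to/file.wav")
wav = torchaudio.functional.resample(wav, sr, model.config.sample_rate)
wav = wav.mean(0) if wav.size(0) > 1 else wav[0]  # downmix to mono, mirroring the batch example

with torch.no_grad():
    # only `audio=` is visible in the diff; `audio_length` is an assumed
    # keyword carrying the clip length in samples
    word_idxs = model(
        audio=wav.unsqueeze(0).to(device),
        audio_length=[len(wav)],
    )
caption = tokenizer.decode(word_idxs[0], skip_special_tokens=True)
print(caption)
```

The batch example in the diff follows the same pattern, passing the `pad_sequence`-padded `wav_batch` as `audio` and decoding one caption per clip.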