---
library_name: transformers
pipeline_tag: automatic-speech-recognition
inference: true
---

This model is intended for debugging. It is randomly initialized using the configuration of [openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3), scaled down to a much smaller size.

Code:

```python
# Build a tiny, randomly initialized Whisper checkpoint for debugging,
# smoke-test it with the ASR pipeline, and publish it to the Hugging Face Hub.
import os
import shutil

import torch
from datasets import load_dataset
from huggingface_hub import create_repo, upload_folder
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutoTokenizer,
    GenerationConfig,
    pipeline,
    set_seed,
)

model_id = "openai/whisper-large-v3"
repo_id = "yujiepan/whisper-v3-tiny-random"
save_path = f"/tmp/{repo_id}"

# Start from an empty output folder so stale files are never uploaded.
shutil.rmtree(save_path, ignore_errors=True)
os.makedirs(save_path, exist_ok=True)

device = "cuda"
torch_dtype = torch.float16

# Reuse the reference config but shrink every size-related field.
config = AutoConfig.from_pretrained(model_id)
config.num_hidden_layers = 2
config.d_model = 8
config.decoder_attention_heads = 2
config.decoder_ffn_dim = 16
config.decoder_layers = 2
config.encoder_ffn_dim = 16
config.encoder_attention_heads = 2
config.encoder_layers = 2

model = AutoModelForSpeechSeq2Seq.from_config(config)
model.to(device).to(torch_dtype)
# Keep the generation settings and the processor of the reference checkpoint.
model.generation_config = GenerationConfig.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

# Seed first so the random weights are reproducible across runs.
set_seed(42)
num_params = 0
with torch.no_grad():
    # Sort on the name only, so parameter tensors are never compared.
    for name, p in sorted(model.named_parameters(), key=lambda item: item[0]):
        print(name, p.shape)
        torch.nn.init.uniform_(p, -0.5, 0.5)
        num_params += p.numel()
print("Total number of parameters:", num_params)

# Write the checkpoint and the processor files into save_path; without this
# step the upload at the end would publish an empty folder.
model.save_pretrained(save_path)
processor.save_pretrained(save_path)

# Smoke test: run the tiny model end to end on one long-form audio sample.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

sample = load_dataset(
    "distil-whisper/librispeech_long",
    "clean",
    split="validation",
)[0]["audio"]
result = pipe(sample, return_timestamps=True)
print(result["text"])

create_repo(repo_id, exist_ok=True)
upload_folder(repo_id=repo_id, folder_path=save_path, repo_type='model')
```