yujiepan
/

whisper-v3-tiny-random

Automatic Speech Recognition

Inference Endpoints

Model card Files Files and versions Community

whisper-v3-tiny-random / README.md

yujiepan's picture

Upload folder using huggingface_hub

0ff26a2 verified 2 months ago

|

history blame contribute delete

2.15 kB

	---
	library_name: transformers
	pipeline_tag: automatic-speech-recognition
	inference: true
	---

	This model is intended for debugging only. It is randomly initialized using the configuration of [openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3), scaled down to a much smaller size.

	Code used to create this model:
	```python
	import os

	import torch

	from huggingface_hub import create_repo, upload_folder
	from transformers import (
	AutoModelForCausalLM,
	AutoTokenizer,
	GenerationConfig,
	AutoConfig,
	pipeline,
	set_seed,
	)
	import torch
	from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, AutoConfig
	from datasets import load_dataset

	# Build a tiny, randomly initialized Whisper checkpoint for debugging:
	# load the reference config, shrink it, fill the weights with seeded random
	# values, smoke-test the result through an ASR pipeline, then save the
	# checkpoint locally and upload it to the Hub.
	model_id = "openai/whisper-large-v3"
	repo_id = "yujiepan/whisper-v3-tiny-random"
	save_path = f"/tmp/{repo_id}"
	# Start from an empty output directory so stale files are never uploaded.
	os.system(f'rm -rf {save_path}')
	os.makedirs(save_path, exist_ok=True)

	device = "cuda"
	torch_dtype = torch.float16

	# Reuse the reference configuration and override only the fields below.
	config = AutoConfig.from_pretrained(model_id)
	config.num_hidden_layers = 2
	config.d_model = 8
	config.decoder_attention_heads = 2
	config.decoder_ffn_dim = 16
	config.decoder_layers = 2
	config.encoder_ffn_dim = 16
	config.encoder_attention_heads = 2
	config.encoder_layers = 2

	model = AutoModelForSpeechSeq2Seq.from_config(config)
	model.to(device).to(torch_dtype)
	# Keep the generation settings of the reference model.
	model.generation_config = GenerationConfig.from_pretrained(model_id)
	processor = AutoProcessor.from_pretrained(model_id)

	# Overwrite every parameter with uniform noise in [-0.5, 0.5]. The seed is
	# set first and parameters are visited in sorted-name order so the values
	# are reproducible across runs.
	set_seed(42)
	num_params = 0
	with torch.no_grad():
	    for name, p in sorted(model.named_parameters()):
	        print(name, p.shape)
	        torch.nn.init.uniform_(p, -0.5, 0.5)
	        num_params += p.numel()
	print("Total number of parameters:", num_params)

	# Smoke test: run the random model end to end on one audio sample.
	pipe = pipeline(
	    "automatic-speech-recognition",
	    model=model,
	    tokenizer=processor.tokenizer,
	    feature_extractor=processor.feature_extractor,
	    torch_dtype=torch_dtype,
	    device=device,
	)

	sample = load_dataset(
	    "distil-whisper/librispeech_long", "clean",
	    split="validation",
	)[0]["audio"]
	result = pipe(sample, return_timestamps=True)
	print(result["text"])

	# Write the model and processor files into save_path; without this step
	# the upload below would publish an empty folder.
	model.save_pretrained(save_path)
	processor.save_pretrained(save_path)

	create_repo(repo_id, exist_ok=True)
	upload_folder(repo_id=repo_id, folder_path=save_path, repo_type='model')
	```