indonesian-nlp/wav2vec2-indonesian-javanese-sundanese

@@ -29,7 +29,10 @@ model-index:
 # Multilingual Speech Recognition for Indonesian Languages
 Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53)
-on the [Indonesian Common Voice dataset](https://huggingface.co/datasets/common_voice).
 When using this model, make sure that your speech input is sampled at 16kHz.
 ## Usage
@@ -42,8 +45,8 @@ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 test_dataset = load_dataset("common_voice", "id", split="test[:2%]")
-processor = Wav2Vec2Processor.from_pretrained("cahya/wav2vec2-large-xlsr-indonesian")
-model = Wav2Vec2ForCTC.from_pretrained("cahya/wav2vec2-large-xlsr-indonesian")
 resampler = torchaudio.transforms.Resample(48_000, 16_000)
@@ -81,8 +84,8 @@ import re
 test_dataset = load_dataset("common_voice", "id", split="test")
 wer = load_metric("wer")
-processor = Wav2Vec2Processor.from_pretrained("cahya/wav2vec2-large-xlsr-indonesian")
-model = Wav2Vec2ForCTC.from_pretrained("cahya/wav2vec2-large-xlsr-indonesian")
 model.to("cuda")
 chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\'\”\�]'
@@ -90,7 +93,7 @@ chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\'\”\�]'
 resampler = torchaudio.transforms.Resample(48_000, 16_000)
 # Preprocessing the datasets.
-# We need to read the aduio files as arrays
 def speech_file_to_array_fn(batch):
     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
     speech_array, sampling_rate = torchaudio.load(batch["path"])
@@ -100,7 +103,7 @@ def speech_file_to_array_fn(batch):
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 # Preprocessing the datasets.
-# We need to read the aduio files as arrays
 def evaluate(batch):
     inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
@@ -116,7 +119,7 @@ result = test_dataset.map(evaluate, batched=True, batch_size=8)
 print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
 ```
-**Test Result**: 25.86 %
 ## Training

 # Multilingual Speech Recognition for Indonesian Languages
 Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53)
+on the [Indonesian Common Voice dataset](https://huggingface.co/datasets/common_voice),
+[High-quality TTS data for Javanese - SLR41](https://huggingface.co/datasets/openslr), and
+[High-quality TTS data for Sundanese - SLR44](https://huggingface.co/datasets/openslr).
 When using this model, make sure that your speech input is sampled at 16kHz.
 ## Usage
 test_dataset = load_dataset("common_voice", "id", split="test[:2%]")
+processor = Wav2Vec2Processor.from_pretrained("indonesian-nlp/wav2vec2-indonesian-javanese-sundanese")
+model = Wav2Vec2ForCTC.from_pretrained("indonesian-nlp/wav2vec2-indonesian-javanese-sundanese")
 resampler = torchaudio.transforms.Resample(48_000, 16_000)
 test_dataset = load_dataset("common_voice", "id", split="test")
 wer = load_metric("wer")
+processor = Wav2Vec2Processor.from_pretrained("indonesian-nlp/wav2vec2-indonesian-javanese-sundanese")
+model = Wav2Vec2ForCTC.from_pretrained("indonesian-nlp/wav2vec2-indonesian-javanese-sundanese")
 model.to("cuda")
 chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\'\”\�]'
 resampler = torchaudio.transforms.Resample(48_000, 16_000)
 # Preprocessing the datasets.
+# We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
     speech_array, sampling_rate = torchaudio.load(batch["path"])
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 # Preprocessing the datasets.
+# We need to pass the audio arrays through the processor to get the model inputs
 def evaluate(batch):
     inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
 print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
 ```
+**Test Result**: 11.57 %
 ## Training