cahya committed on
Commit
287938a
·
1 Parent(s): 93fe023

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +11 -8
README.md CHANGED
@@ -29,7 +29,10 @@ model-index:
29
  # Multilingual Speech Recognition for Indonesian Languages
30
 
31
  Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53)
32
- on the [Indonesian Common Voice dataset](https://huggingface.co/datasets/common_voice).
 
 
 
33
  When using this model, make sure that your speech input is sampled at 16kHz.
34
 
35
  ## Usage
@@ -42,8 +45,8 @@ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
42
 
43
  test_dataset = load_dataset("common_voice", "id", split="test[:2%]")
44
 
45
- processor = Wav2Vec2Processor.from_pretrained("cahya/wav2vec2-large-xlsr-indonesian")
46
- model = Wav2Vec2ForCTC.from_pretrained("cahya/wav2vec2-large-xlsr-indonesian")
47
 
48
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
49
 
@@ -81,8 +84,8 @@ import re
81
  test_dataset = load_dataset("common_voice", "id", split="test")
82
  wer = load_metric("wer")
83
 
84
- processor = Wav2Vec2Processor.from_pretrained("cahya/wav2vec2-large-xlsr-indonesian")
85
- model = Wav2Vec2ForCTC.from_pretrained("cahya/wav2vec2-large-xlsr-indonesian")
86
  model.to("cuda")
87
 
88
  chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\'\”\�]'
@@ -90,7 +93,7 @@ chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\'\”\�]'
90
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
91
 
92
  # Preprocessing the datasets.
93
- # We need to read the aduio files as arrays
94
  def speech_file_to_array_fn(batch):
95
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
96
  speech_array, sampling_rate = torchaudio.load(batch["path"])
@@ -100,7 +103,7 @@ def speech_file_to_array_fn(batch):
100
  test_dataset = test_dataset.map(speech_file_to_array_fn)
101
 
102
  # Preprocessing the datasets.
103
- # We need to read the aduio files as arrays
104
  def evaluate(batch):
105
  inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
106
 
@@ -116,7 +119,7 @@ result = test_dataset.map(evaluate, batched=True, batch_size=8)
116
  print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
117
  ```
118
 
119
- **Test Result**: 25.86 %
120
 
121
  ## Training
122
 
 
29
  # Multilingual Speech Recognition for Indonesian Languages
30
 
31
  Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53)
32
+ on the [Indonesian Common Voice dataset](https://huggingface.co/datasets/common_voice),
33
+ [High-quality TTS data for Javanese - SLR41](https://huggingface.co/datasets/openslr), and
34
+ [High-quality TTS data for Sundanese - SLR44](https://huggingface.co/datasets/openslr).
35
+
36
  When using this model, make sure that your speech input is sampled at 16kHz.
37
 
38
  ## Usage
 
45
 
46
  test_dataset = load_dataset("common_voice", "id", split="test[:2%]")
47
 
48
+ processor = Wav2Vec2Processor.from_pretrained("indonesian-nlp/wav2vec2-indonesian-javanese-sundanese")
49
+ model = Wav2Vec2ForCTC.from_pretrained("indonesian-nlp/wav2vec2-indonesian-javanese-sundanese")
50
 
51
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
52
 
 
84
  test_dataset = load_dataset("common_voice", "id", split="test")
85
  wer = load_metric("wer")
86
 
87
+ processor = Wav2Vec2Processor.from_pretrained("indonesian-nlp/wav2vec2-indonesian-javanese-sundanese")
88
+ model = Wav2Vec2ForCTC.from_pretrained("indonesian-nlp/wav2vec2-indonesian-javanese-sundanese")
89
  model.to("cuda")
90
 
91
  chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\'\”\�]'
 
93
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
94
 
95
  # Preprocessing the datasets.
96
+ # We need to read the audio files as arrays
97
  def speech_file_to_array_fn(batch):
98
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
99
  speech_array, sampling_rate = torchaudio.load(batch["path"])
 
103
  test_dataset = test_dataset.map(speech_file_to_array_fn)
104
 
105
  # Preprocessing the datasets.
106
+ # We need to read the audio files as arrays
107
  def evaluate(batch):
108
  inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
109
 
 
119
  print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
120
  ```
121
 
122
+ **Test Result**: 11.57 %
123
 
124
  ## Training
125