Update README.md

README.md (CHANGED)
@@ -27,7 +27,7 @@ model-index:

    value: {wer_result_on_test} # TODO (IMPORTANT): replace {wer_result_on_test} with the WER you achieved on the common_voice test set, in the format XX.XX (don't add the % sign here). **Please** remember to fill in this value after evaluating your model, so that your model appears on the leaderboard; if you fill out this model card before evaluating, edit the card afterward to add the value.
---

# Wav2Vec2-Large-XLSR-53-Georgian

Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Georgian using the [Common Voice](https://huggingface.co/datasets/common_voice) dataset.
When using this model, make sure that your speech input is sampled at 16 kHz.
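The usage example below assumes 48 kHz Common Voice clips. As a minimal sketch (not part of the diff) of how you might handle audio at an arbitrary sampling rate, where the file name is a placeholder, you can resample only when the reported rate differs from 16 kHz:

```python
import torchaudio

# Hypothetical input file; torchaudio reports the native sampling rate,
# so resample to the 16 kHz the model expects only if it differs.
speech_array, sampling_rate = torchaudio.load("my_clip.wav")
if sampling_rate != 16_000:
    resampler = torchaudio.transforms.Resample(sampling_rate, 16_000)
    speech_array = resampler(speech_array)
```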
@@ -52,15 +52,15 @@ resampler = torchaudio.transforms.Resample(48_000, 16_000)

# Preprocessing the datasets.
# We need to read the audio files as arrays.
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = resampler(speech_array).squeeze().numpy()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)
inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)

with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)
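The hunk stops at the argmax. In the standard XLSR model-card template this is followed by decoding the predicted IDs back into text; a sketch of that step, assuming the same `processor` object as above:

```python
# Decode the greedy CTC predictions into transcription strings.
print("Prediction:", processor.batch_decode(predicted_ids))
```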
@@ -88,30 +88,30 @@ processor = Wav2Vec2Processor.from_pretrained("Temur/wav2vec2-Georgian-Daytona")

model = Wav2Vec2ForCTC.from_pretrained("Temur/wav2vec2-Georgian-Daytona")
model.to("cuda")

chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“]'  # TODO: adapt this list to include all special characters you removed from the data
resampler = torchaudio.transforms.Resample(48_000, 16_000)

# Preprocessing the datasets.
# We need to read the audio files as arrays.
def speech_file_to_array_fn(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = resampler(speech_array).squeeze().numpy()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)

# Run inference over the preprocessed batches.
def evaluate(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    return batch

result = test_dataset.map(evaluate, batched=True, batch_size=8)
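The diff ends at the `map` call. In the full evaluation script of the standard template, `result` is then scored with the WER metric; a sketch of that final step, assuming the script imports `load_metric` from `datasets` (the API current at the time of this model card):

```python
from datasets import load_metric

wer = load_metric("wer")
print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
```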