joselobenitezg committed on
Commit
c106947
·
verified ·
1 Parent(s): 11aaa55

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. README.md +48 -3
  2. config.json +79 -48
  3. tokenizer_config.json +11 -6
  4. vocab.json +56 -0
README.md CHANGED
@@ -1,5 +1,50 @@
1
- # TTSMMS Model - grn
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- Text-to-speech model from the Massively Multilingual Speech (MMS) project
4
 
5
- This model was converted from the original MMS VITS model for use with 🤗 Transformers.
 
1
+ ---
2
+ language: gn
3
+ tags:
4
+ - guarani
5
+ - tts
6
+ - speech
7
+ - tts-mms
8
+ license: mit
9
+ datasets:
10
+ - mozilla-foundation/common_voice_11_0
11
+ ---
12
+
13
+ # Guarani TTS-MMS Model
14
+
15
+ This is a Text-to-Speech model for the Guarani language, based on Meta's Massively Multilingual Speech (MMS) architecture.
16
+
17
+ ## Model Description
18
+
19
+ This model is designed for Guarani text-to-speech synthesis, utilizing the TTS-MMS architecture. It can generate natural-sounding speech from Guarani text input.
20
+
21
+ ## Usage
22
+ ```python
23
+ from transformers import AutoProcessor, AutoModel
24
+ processor = AutoProcessor.from_pretrained("joselobenitezg/mms-grn-tts")
25
+ model = AutoModel.from_pretrained("joselobenitezg/mms-grn-tts")
26
+ # Example usage
27
+ text = "Mba'éichapa"
28
+ inputs = processor(text=text, return_tensors="pt")
29
+ speech = model.generate(inputs)
30
+ ```
31
+ ## Training Data
32
+
33
+ The model was trained using:
34
+ - Guarani Common Voice dataset
35
+ - [Add other data sources if applicable]
36
+
37
+ ## Model Architecture
38
+
39
+ The model uses the TTS-MMS architecture with the following key components:
40
+ - Encoder-decoder architecture
41
+ - Self-attention mechanisms
42
+ - [Add specific architectural details]
43
+
44
+ ## Limitations
45
+
46
+ - [List any known limitations]
47
+ - [Add performance considerations]
48
+
49
 
 
50
 
 
config.json CHANGED
@@ -1,52 +1,83 @@
1
  {
2
- "architectures": [
3
- "SynthesizerTrn"
4
- ],
5
- "model_type": "ttsmms",
6
- "vocab_size": 53,
7
- "spec_channels": 513,
8
- "segment_size": 32,
9
- "inter_channels": 192,
10
- "hidden_channels": 192,
11
- "filter_channels": 768,
12
- "n_heads": 2,
13
- "n_layers": 6,
14
- "kernel_size": 3,
15
- "p_dropout": 0.1,
16
- "resblock": "1",
17
- "resblock_kernel_sizes": [
18
- 3,
19
- 7,
20
- 11
21
- ],
22
- "resblock_dilation_sizes": [
23
- [
24
- 1,
25
- 3,
26
- 5
27
  ],
28
- [
29
- 1,
30
- 3,
31
- 5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  ],
33
- [
34
- 1,
35
  3,
36
- 5
37
- ]
38
- ],
39
- "upsample_rates": [
40
- 8,
41
- 8,
42
- 2,
43
- 2
44
- ],
45
- "upsample_initial_channel": 512,
46
- "upsample_kernel_sizes": [
47
- 16,
48
- 16,
49
- 4,
50
- 4
51
- ]
52
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  {
2
+ "activation_dropout": 0.1,
3
+ "architectures": [
4
+ "VitsModel"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  ],
6
+ "attention_dropout": 0.1,
7
+ "depth_separable_channels": 2,
8
+ "depth_separable_num_layers": 3,
9
+ "duration_predictor_dropout": 0.5,
10
+ "duration_predictor_filter_channels": 256,
11
+ "duration_predictor_flow_bins": 10,
12
+ "duration_predictor_kernel_size": 3,
13
+ "duration_predictor_num_flows": 4,
14
+ "duration_predictor_tail_bound": 5.0,
15
+ "ffn_dim": 768,
16
+ "ffn_kernel_size": 3,
17
+ "flow_size": 192,
18
+ "hidden_act": "relu",
19
+ "hidden_dropout": 0.1,
20
+ "hidden_size": 192,
21
+ "initializer_range": 0.02,
22
+ "layer_norm_eps": 1e-05,
23
+ "layerdrop": 0.1,
24
+ "leaky_relu_slope": 0.1,
25
+ "model_type": "vits",
26
+ "noise_scale": 0.667,
27
+ "noise_scale_duration": 0.8,
28
+ "num_attention_heads": 2,
29
+ "num_hidden_layers": 6,
30
+ "num_speakers": 1,
31
+ "posterior_encoder_num_wavenet_layers": 16,
32
+ "prior_encoder_num_flows": 4,
33
+ "prior_encoder_num_wavenet_layers": 4,
34
+ "resblock_dilation_sizes": [
35
+ [
36
+ 1,
37
+ 3,
38
+ 5
39
+ ],
40
+ [
41
+ 1,
42
+ 3,
43
+ 5
44
+ ],
45
+ [
46
+ 1,
47
+ 3,
48
+ 5
49
+ ]
50
  ],
51
+ "resblock_kernel_sizes": [
 
52
  3,
53
+ 7,
54
+ 11
55
+ ],
56
+ "sampling_rate": 16000,
57
+ "speaker_embedding_size": 0,
58
+ "speaking_rate": 1.0,
59
+ "spectrogram_bins": 513,
60
+ "torch_dtype": "float32",
61
+ "transformers_version": "4.33.0.dev0",
62
+ "upsample_initial_channel": 512,
63
+ "upsample_kernel_sizes": [
64
+ 16,
65
+ 16,
66
+ 4,
67
+ 4
68
+ ],
69
+ "upsample_rates": [
70
+ 8,
71
+ 8,
72
+ 2,
73
+ 2
74
+ ],
75
+ "use_bias": true,
76
+ "use_stochastic_duration_prediction": true,
77
+ "vocab_size": 53,
78
+ "wavenet_dilation_rate": 1,
79
+ "wavenet_dropout": 0.0,
80
+ "wavenet_kernel_size": 5,
81
+ "window_size": 4
82
+ }
83
+
tokenizer_config.json CHANGED
@@ -1,7 +1,12 @@
1
  {
2
- "model_type": "ttsmms",
3
- "tokenizer_class": "TTSMMSTokenizer",
4
- "pad_token": "_",
5
- "unk_token": "_",
6
- "do_lower_case": true
7
- }
 
 
 
 
 
 
1
  {
2
+ "add_blank": true,
3
+ "clean_up_tokenization_spaces": true,
4
+ "is_uroman": false,
5
+ "language": "grn",
6
+ "model_max_length": 1000000000000000019884624838656,
7
+ "normalize": true,
8
+ "pad_token": "3",
9
+ "phonemize": false,
10
+ "tokenizer_class": "VitsTokenizer",
11
+ "unk_token": "<unk>"
12
+ }
vocab.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ " ": 11,
3
+ "'": 46,
4
+ "-": 37,
5
+ "0": 24,
6
+ "1": 23,
7
+ "2": 42,
8
+ "3": 0,
9
+ "4": 21,
10
+ "5": 41,
11
+ "6": 8,
12
+ "7": 16,
13
+ "8": 9,
14
+ "9": 26,
15
+ "_": 35,
16
+ "a": 10,
17
+ "b": 22,
18
+ "c": 29,
19
+ "d": 15,
20
+ "e": 38,
21
+ "f": 27,
22
+ "g": 45,
23
+ "h": 6,
24
+ "i": 43,
25
+ "j": 30,
26
+ "k": 7,
27
+ "l": 3,
28
+ "m": 2,
29
+ "n": 18,
30
+ "o": 50,
31
+ "p": 17,
32
+ "q": 19,
33
+ "r": 36,
34
+ "s": 52,
35
+ "t": 40,
36
+ "u": 39,
37
+ "v": 13,
38
+ "x": 51,
39
+ "y": 31,
40
+ "z": 47,
41
+ "á": 44,
42
+ "ã": 20,
43
+ "é": 34,
44
+ "í": 33,
45
+ "ñ": 1,
46
+ "ó": 32,
47
+ "õ": 48,
48
+ "ú": 25,
49
+ "ý": 14,
50
+ "ĩ": 49,
51
+ "ũ": 5,
52
+ "ẽ": 12,
53
+ "ỹ": 4,
54
+ "—": 28
55
+ }
56
+