File size: 1,356 Bytes
37ced70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import os
import torch
from fireredtts.modules.tokenizer.whisper_tokenizer import get_tokenizer
from fireredtts.modules.text_normalizer.normalize import TextNormalizer


# Location of the tokenizer vocabulary JSON, expressed relative to the
# directory that holds this module (symlinks are resolved by realpath).
# The "../" segment is kept as-is; the path is joined, not normalized.
_MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
DEFAULT_VOCAB_FILE = os.path.join(_MODULE_DIR, "../data/tokenizer.json")


class VoiceBpeTokenizer:
    """Text front-end that normalizes input text and tokenizes it.

    Combines a ``TextNormalizer`` (used to clean the text and report its
    language) with the tokenizer returned by
    ``get_tokenizer(multilingual=True)``.
    """

    def __init__(self):
        """Create the underlying tokenizer and the text-normalization engine."""
        self.tokenizer = get_tokenizer(multilingual=True)
        self.tn_engine = TextNormalizer()

    def redtts_text_cleaner(self, text):
        """Strip surrounding whitespace from ``text`` and normalize it.

        Returns:
            The ``(normalized_text, language)`` pair produced by
            ``self.tn_engine.tn``.
        """
        normalized, detected_lang = self.tn_engine.tn(text.strip())
        return normalized, detected_lang

    def encode(self, text, lang="auto"):
        """Normalize ``text``, prefix it with a language tag and tokenize it.

        Args:
            text: Raw input text.
            lang: Language tag to put in front of the text. The default
                ``"auto"`` uses the language reported by the normalizer.

        Returns:
            Whatever ``self.tokenizer.encode`` returns for the string
            ``"[<lang>]<normalized text>"``.
        """
        cleaned, detected_lang = self.redtts_text_cleaner(text=text)
        tag = detected_lang if lang == "auto" else lang
        return self.tokenizer.encode(f"[{tag}]{cleaned}")

    def decode(self, seq):
        """Turn a token sequence back into text.

        A ``torch.Tensor`` is moved to the CPU and converted to a NumPy
        array first; any other input is handed to the tokenizer unchanged.
        """
        token_ids = seq.cpu().numpy() if isinstance(seq, torch.Tensor) else seq
        return self.tokenizer.decode(token_ids)

    def __len__(self):
        """Return the vocabulary size reported by the underlying tokenizer."""
        vocab_size = self.tokenizer.get_vocab_size()
        return vocab_size

    def get_number_tokens(self):
        """Return the vocabulary size reported by the underlying tokenizer."""
        vocab_size = self.tokenizer.get_vocab_size()
        return vocab_size


if __name__ == "__main__":
    # Manual smoke test: encode a mixed Chinese/English sentence, then decode
    # each token id on its own so the individual pieces can be inspected.
    tokenizer = VoiceBpeTokenizer()
    token_ids = tokenizer.encode("我、真是hello USA啊?谢谢你world!")
    pieces = [tokenizer.decode([token_id]) for token_id in token_ids]
    print(pieces)