Spaces:
Running
on
Zero
Running
on
Zero
File size: 1,356 Bytes
37ced70 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
import os
import torch
from fireredtts.modules.tokenizer.whisper_tokenizer import get_tokenizer
from fireredtts.modules.text_normalizer.normalize import TextNormalizer
# Path to the tokenizer vocabulary JSON, expressed relative to the directory
# that contains this module (symlinks resolved via realpath). The "../"
# component is left un-normalized in the resulting string.
_MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
DEFAULT_VOCAB_FILE = os.path.join(_MODULE_DIR, "../data/tokenizer.json")
class VoiceBpeTokenizer:
    """Text front-end that normalizes input and converts it to token ids.

    Wraps the tokenizer returned by ``get_tokenizer(multilingual=True)``
    together with a ``TextNormalizer`` instance. Text is normalized first,
    then prefixed with a ``[lang]`` tag before being encoded.
    """

    def __init__(self):
        """Create the underlying tokenizer and the text-normalization engine."""
        self.tokenizer = get_tokenizer(multilingual=True)
        self.tn_engine = TextNormalizer()

    def redtts_text_cleaner(self, text):
        """Strip surrounding whitespace and run the text normalizer.

        Args:
            text: Raw input string.

        Returns:
            The two values produced by ``self.tn_engine.tn`` as a tuple:
            ``(normalized_text, text_language)``.
        """
        normalized, detected_lang = self.tn_engine.tn(text.strip())
        return normalized, detected_lang

    def encode(self, text, lang="auto"):
        """Normalize ``text``, prepend a language tag, and tokenize it.

        Args:
            text: Raw input string.
            lang: Language tag to prepend. With the default ``"auto"`` the
                language reported by the text normalizer is used instead.

        Returns:
            Whatever ``self.tokenizer.encode`` returns for the tagged text.
        """
        cleaned, text_lang = self.redtts_text_cleaner(text=text)
        # An explicit language from the caller wins over the detected one.
        tag = text_lang if lang == "auto" else lang
        return self.tokenizer.encode(f"[{tag}]{cleaned}")

    def decode(self, seq):
        """Turn a sequence of token ids back into text.

        Args:
            seq: Token ids. A ``torch.Tensor`` is moved to the CPU and
                converted to a NumPy array first; any other value is handed
                to the tokenizer unchanged.

        Returns:
            The result of ``self.tokenizer.decode``.
        """
        ids = seq.cpu().numpy() if isinstance(seq, torch.Tensor) else seq
        return self.tokenizer.decode(ids)

    def __len__(self):
        """Return the vocabulary size reported by the wrapped tokenizer."""
        vocab_size = self.tokenizer.get_vocab_size()
        return vocab_size

    def get_number_tokens(self):
        """Return the vocabulary size reported by the wrapped tokenizer."""
        vocab_size = self.tokenizer.get_vocab_size()
        return vocab_size
if __name__ == "__main__":
    # Manual smoke test: encode a mixed Chinese/English sentence, then decode
    # each token id on its own so the individual pieces can be inspected.
    demo_tokenizer = VoiceBpeTokenizer()
    token_ids = demo_tokenizer.encode("我、真是hello USA啊?谢谢你world!")
    decoded_pieces = [demo_tokenizer.decode([token_id]) for token_id in token_ids]
    print(decoded_pieces)
|