Yurii Paniv committed on
Commit
349b2ad
·
1 Parent(s): 1ce668d

Improve number handling

Browse files
Files changed (2) hide show
  1. app.py +8 -4
  2. formatter.py +42 -13
app.py CHANGED
@@ -7,7 +7,6 @@ import requests
7
  from os.path import exists
8
  from formatter import preprocess_text
9
  from datetime import datetime
10
- from stress import sentence_to_stress
11
  from enum import Enum
12
  import torch
13
 
@@ -46,11 +45,15 @@ if synthesizer is None:
46
  raise NameError("model not found")
47
 
48
  def tts(text: str, stress: str):
49
- text = preprocess_text(text)
 
 
 
 
 
50
  text_limit = 1200
51
  text = text if len(text) < text_limit else text[0:text_limit] # mitigate crashes on hf space
52
- text = sentence_to_stress(text) if stress == StressOption.AutomaticStress.value else text
53
- print(text, stress, datetime.utcnow())
54
 
55
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
56
  with torch.no_grad():
@@ -81,6 +84,7 @@ iface = gr.Interface(
81
  ["Введ+іть, б+удь л+аска, сво+є р+ечення.", StressOption.ManualStress.value],
82
  ["Введіть, будь ласка, своє речення.", StressOption.ManualStress.value],
83
  ["Привіт, як тебе звати?", StressOption.AutomaticStress.value],
 
84
  ]
85
  )
86
  iface.launch(enable_queue=True, prevent_thread_lock=True)
 
7
  from os.path import exists
8
  from formatter import preprocess_text
9
  from datetime import datetime
 
10
  from enum import Enum
11
  import torch
12
 
 
45
  raise NameError("model not found")
46
 
47
  def tts(text: str, stress: str):
48
+ print("============================")
49
+ print("Original text:", text)
50
+ print("Stress:", stress)
51
+ print("Time:", datetime.utcnow())
52
+ autostress = True if stress == StressOption.AutomaticStress.value else False
53
+ text = preprocess_text(text, autostress)
54
  text_limit = 1200
55
  text = text if len(text) < text_limit else text[0:text_limit] # mitigate crashes on hf space
56
+ print("Converted:", text)
 
57
 
58
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
59
  with torch.no_grad():
 
84
  ["Введ+іть, б+удь л+аска, сво+є р+ечення.", StressOption.ManualStress.value],
85
  ["Введіть, будь ласка, своє речення.", StressOption.ManualStress.value],
86
  ["Привіт, як тебе звати?", StressOption.AutomaticStress.value],
87
+ ["Договір підписано 4 квітня 1949 року.", StressOption.AutomaticStress.value],
88
  ]
89
  )
90
  iface.launch(enable_queue=True, prevent_thread_lock=True)
formatter.py CHANGED
@@ -1,18 +1,42 @@
1
- def preprocess_text(text):
 
 
 
 
 
 
 
 
2
  # replace apostrophe
3
  text = text.replace("`", "'")
4
  text = text.replace("ʼ", "'")
5
  # numbers
6
- text = text.replace("1", "од+ин ")
7
- text = text.replace("2", "дв+а ")
8
- text = text.replace("3", "тр+и ")
9
- text = text.replace("4", "чот+ири ")
10
- text = text.replace("5", "п'+ять ")
11
- text = text.replace("6", "ш+ість ")
12
- text = text.replace("7", "с+ім ")
13
- text = text.replace("8", "в+ісім ")
14
- text = text.replace("9", "д+ев'ять ")
15
- text = text.replace("0", "н+уль ")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  # speak english alphabet using brute force transliteration
17
  english = {
18
  "a": "а",
@@ -47,9 +71,14 @@ def preprocess_text(text):
47
  text = text.replace(english_char.upper(), english[english_char].upper())
48
  text = text.replace(english_char, english[english_char])
49
 
50
- # TODO: autostress support here
 
 
51
  return text
52
 
53
 
54
  if __name__ == "__main__":
55
- print(preprocess_text("Quality of life update"))
 
 
 
 
1
+ import num2words
2
+ import re
3
+ from stress import sentence_to_stress
4
+
5
+ def preprocess_text(text, autostress=False):
6
+ # currencies
7
+ text = text.replace("$", "долар")
8
+ text = text.replace("₴", "гривня")
9
+ text = text.replace("€", "євро")
10
  # replace apostrophe
11
  text = text.replace("`", "'")
12
  text = text.replace("ʼ", "'")
13
  # numbers
14
+ text = re.sub(r'(\d)\s+(\d)', r'\1\2', text)
15
+
16
+ def detect_num_and_convert(word):
17
+ numbers = "0123456789,."
18
+ is_number = all(map(lambda x: x in numbers, word))
19
+ if is_number:
20
+ try:
21
+ return num2words.num2words(word, lang="uk")
22
+ except:
23
+ return word
24
+ else:
25
+ return word
26
+
27
+ text = " ".join([detect_num_and_convert(word) for word in text.split(" ")])
28
+
29
+ # fallback numbers
30
+ text = text.replace("1", "один ")
31
+ text = text.replace("2", "два ")
32
+ text = text.replace("3", "три ")
33
+ text = text.replace("4", "чотири ")
34
+ text = text.replace("5", "п'ять ")
35
+ text = text.replace("6", "шість ")
36
+ text = text.replace("7", "сім ")
37
+ text = text.replace("8", "вісім ")
38
+ text = text.replace("9", "дев'ять ")
39
+ text = text.replace("0", "нуль ")
40
  # speak english alphabet using brute force transliteration
41
  english = {
42
  "a": "а",
 
71
  text = text.replace(english_char.upper(), english[english_char].upper())
72
  text = text.replace(english_char, english[english_char])
73
 
74
+ if autostress:
75
+ text = sentence_to_stress(text)
76
+
77
  return text
78
 
79
 
80
  if __name__ == "__main__":
81
+ print(preprocess_text("Quality of life update"))
82
+ print(preprocess_text("Він украв 20000000 $"))
83
+ print(preprocess_text("111 000 000 000 доларів державного боргу."))
84
+ print(preprocess_text("11100000001 доларів державного боргу."))