Daimon committed on
Commit
37d9263
·
1 Parent(s): 74a7d13

Testing with SMALL-100

Browse files
Files changed (1) hide show
  1. app.py +19 -11
app.py CHANGED
@@ -1,19 +1,25 @@
1
  import streamlit as st
2
  import pandas as pd
3
  from pathlib import Path
4
- from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
 
 
5
 
6
  st.set_page_config(page_title="Translation Demo", page_icon=":milky_way:", layout="wide")
7
 
8
 
9
  def get_translation(src_code, trg_code, src):
10
 
11
- tokenizer.src_lang = src_code
 
 
 
 
 
 
 
12
  encoded = tokenizer(src, return_tensors="pt")
13
- generated_tokens = model.generate(
14
- **encoded,
15
- forced_bos_token_id=tokenizer.lang_code_to_id[trg_code]
16
- )
17
  trg = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
18
 
19
  return trg
@@ -29,16 +35,18 @@ def open_input(the_file):
29
  return parsed
30
 
31
 
32
- st.subheader("MBART-50 Translator")
33
 
34
  source = "In the beginning the Universe was created. This has made a lot of people very angry and been widely regarded as a bad move."
35
  target = ""
36
- model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
37
- tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
38
-
39
 
40
- valid_languages = ['de_DE', 'en_XX', 'it_IT']
 
41
 
 
 
42
 
43
 
44
  valid_languages_tuple = (lang for lang in valid_languages)
 
1
  import streamlit as st
2
  import pandas as pd
3
  from pathlib import Path
4
+ #from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
5
+ from transformers import M2M100ForConditionalGeneration
6
+ from tokenization_small100 import SMALL100Tokenizer
7
 
8
  st.set_page_config(page_title="Translation Demo", page_icon=":milky_way:", layout="wide")
9
 
10
 
11
  def get_translation(src_code, trg_code, src):
12
 
13
+ #tokenizer.src_lang = src_code
14
+ #encoded = tokenizer(src, return_tensors="pt")
15
+ #generated_tokens = model.generate(
16
+ #**encoded,
17
+ #forced_bos_token_id=tokenizer.lang_code_to_id[trg_code]
18
+ #)
19
+ #trg = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
20
+ tokenizer.tgt_lang = trg_code
21
  encoded = tokenizer(src, return_tensors="pt")
22
+ generated_tokens = model.generate(**encoded)
 
 
 
23
  trg = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
24
 
25
  return trg
 
35
  return parsed
36
 
37
 
38
+ st.subheader("SMALL-100 Translator")
39
 
40
  source = "In the beginning the Universe was created. This has made a lot of people very angry and been widely regarded as a bad move."
41
  target = ""
42
+ #model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
43
+ #tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
 
44
 
45
+ model = M2M100ForConditionalGeneration.from_pretrained("alirezamsh/small100")
46
+ tokenizer = SMALL100Tokenizer.from_pretrained("alirezamsh/small100")
47
 
48
+ #valid_languages = ['de_DE', 'en_XX', 'it_IT']
49
+ valid_languages = ['de', 'it', 'en']
50
 
51
 
52
  valid_languages_tuple = (lang for lang in valid_languages)