Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -17,25 +17,7 @@ with st.container():
|
|
17 |
[Twitter](https://twitter.com/FrancescoDaimon)
|
18 |
""")
|
19 |
|
20 |
-
|
21 |
-
|
22 |
-
st.subheader('Input TSV/CSV')
|
23 |
-
uploaded_file = st.file_uploader("Choose a file")
|
24 |
-
with st.spinner("Loading..."):
|
25 |
-
if uploaded_file is not None:
|
26 |
-
if uploaded_file.name.endswith('.tsv'):
|
27 |
-
data = pd.read_csv(uploaded_file, sep="\t")
|
28 |
-
else:
|
29 |
-
data = pd.read_csv(uploaded_file)
|
30 |
-
|
31 |
-
st.subheader("DataFrame")
|
32 |
-
st.write(data)
|
33 |
-
st.write(data.describe())
|
34 |
-
|
35 |
-
else:
|
36 |
-
st.info("☝️ Upload a TSV/CSV file")
|
37 |
-
|
38 |
-
|
39 |
st.subheader("MBART-50 Translator")
|
40 |
|
41 |
source = "In the beginning the Universe was created. This has made a lot of people very angry and been widely regarded as a bad move."
|
@@ -44,6 +26,7 @@ model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-m
|
|
44 |
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
|
45 |
|
46 |
|
|
|
47 |
def get_translation(src_code, trg_code, src):
|
48 |
|
49 |
tokenizer.src_lang = src_code
|
@@ -56,19 +39,29 @@ def get_translation(src_code, trg_code, src):
|
|
56 |
|
57 |
return trg
|
58 |
|
59 |
-
valid_languages = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
|
61 |
with st.form("my_form"):
|
62 |
left_c, right_c = st.columns(2)
|
63 |
-
with left_c:
|
64 |
-
|
65 |
'Source language',
|
66 |
-
|
67 |
)
|
68 |
-
with right_c:
|
69 |
-
|
70 |
'Target language',
|
71 |
-
|
72 |
)
|
73 |
source = st.text_area("Source", value=source, height=130, placeholder="Enter the source text...")
|
74 |
|
@@ -88,29 +81,64 @@ with st.form("my_form"):
|
|
88 |
st.write("Please enter the source text, source language and target language.")
|
89 |
|
90 |
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
|
95 |
-
local_css("style/style.css")
|
96 |
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
[Twitter](https://twitter.com/FrancescoDaimon)
|
18 |
""")
|
19 |
|
20 |
+
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
st.subheader("MBART-50 Translator")
|
22 |
|
23 |
source = "In the beginning the Universe was created. This has made a lot of people very angry and been widely regarded as a bad move."
|
|
|
26 |
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
|
27 |
|
28 |
|
29 |
+
|
30 |
def get_translation(src_code, trg_code, src):
|
31 |
|
32 |
tokenizer.src_lang = src_code
|
|
|
39 |
|
40 |
return trg
|
41 |
|
42 |
+
# Language codes offered in the source/target language pickers.
# Every entry must be a separate string literal: a missing comma makes Python
# concatenate adjacent literals (e.g. 'vi_VN' 'zh_CN' -> 'vi_VNzh_CN'), which
# silently drops two real codes and adds an invalid one.
valid_languages = [
    'ar_AR', 'cs_CZ', 'de_DE', 'en_XX', 'es_XX', 'et_EE', 'fi_FI', 'fr_XX',
    'gu_IN', 'hi_IN', 'it_IT', 'ja_XX', 'kk_KZ', 'ko_KR', 'lt_LT', 'lv_LV', 'my_MM', 'ne_NP',
    'nl_XX', 'ro_RO', 'ru_RU', 'si_LK', 'tr_TR', 'vi_VN', 'zh_CN', 'af_ZA', 'az_AZ', 'bn_IN',
    'fa_IR', 'he_IL', 'hr_HR', 'id_ID', 'ka_GE', 'km_KH', 'mk_MK', 'ml_IN', 'mn_MN', 'mr_IN',
    'pl_PL', 'ps_AF', 'pt_XX', 'sv_SE', 'sw_KE', 'ta_IN', 'te_IN', 'th_TH', 'tl_XX', 'uk_UA',
    'ur_PK', 'xh_ZA', 'gl_ES', 'sl_SI',
]

# Real tuples (not one-shot generators) so the option sequences can be
# iterated, indexed and measured any number of times.
valid_languages_tuple = tuple(valid_languages)
valid_languages_tuple_trg = tuple(valid_languages)
|
53 |
|
54 |
with st.form("my_form"):
|
55 |
left_c, right_c = st.columns(2)
|
56 |
+
#with left_c:
|
57 |
+
src_lang = st.selectbox(
|
58 |
'Source language',
|
59 |
+
valid_languages_tuple,
|
60 |
)
|
61 |
+
#with right_c:
|
62 |
+
trg_lang = st.selectbox(
|
63 |
'Target language',
|
64 |
+
valid_languages_tuple_trg,
|
65 |
)
|
66 |
source = st.text_area("Source", value=source, height=130, placeholder="Enter the source text...")
|
67 |
|
|
|
81 |
st.write("Please enter the source text, source language and target language.")
|
82 |
|
83 |
|
84 |
+
# --- TSV upload: translate one column of an uploaded file -------------------
# Flow: upload a .tsv -> preview it -> pick a column and a language pair ->
# translate every non-empty string cell of that column -> show and offer the
# translated table for download.
st.subheader('Input TSV')
uploaded_file = st.file_uploader("Choose a file")
done = False  # set to True only after a complete, successful translation

if uploaded_file is not None:
    if uploaded_file.name.endswith('.tsv'):
        data = pd.read_csv(uploaded_file, sep="\t")
        st.subheader("DataFrame")
        st.write(data)
        st.write(data.describe())

        src_col = st.selectbox(
            'Select the column to translate:',
            list(data.columns),
        )
        # Compare against None instead of relying on truthiness so that a
        # falsy column label (e.g. 0 or "") is still accepted.
        if src_col is not None:
            col_src_lang = st.selectbox(
                'Source language:',
                list(valid_languages),
            )
            col_trg_lang = st.selectbox(
                'Target language:',
                list(valid_languages),
            )
            submitted_cols = st.button("Translate column")
            if submitted_cols:
                # The language check does not depend on the row, so do it once.
                if col_src_lang in valid_languages and col_trg_lang in valid_languages:
                    translated_data = []
                    failed = False
                    with st.spinner("Translating..."):
                        for text in data[src_col]:
                            # Missing values (NaN), numbers and empty strings
                            # cannot be translated; keep them unchanged so the
                            # result stays aligned with the original rows.
                            if not isinstance(text, str) or not text:
                                translated_data.append(text)
                                continue
                            try:
                                # get_translation's result is indexed; the
                                # first element is the translated text.
                                target_text = get_translation(col_src_lang, col_trg_lang, text)[0]
                            except Exception:
                                st.subheader("Translation failed :sad:")
                                failed = True
                                break
                            translated_data.append(target_text)

                    # Only build the result when every row was processed;
                    # assigning a shorter list would raise a length-mismatch
                    # ValueError.
                    if not failed:
                        # Work on a copy so the uploaded frame is not mutated.
                        new_df = data.copy()
                        new_df[src_col] = translated_data
                        done = True
                else:
                    st.write("Please enter the source text, source language and target language.")

    else:
        # NOTE(review): non-.tsv uploads are parsed as CSV but nothing is done
        # with the result — confirm whether CSV input should be supported.
        data = pd.read_csv(uploaded_file)

    if done:
        st.subheader("Translated DataFrame")
        st.write(new_df)
        st.write(new_df.describe())
        to_dl = new_df.to_csv(index=False, sep='\t').encode('utf-8')
        st.download_button('Download TSV', to_dl, 'translated_file.tsv', 'text/tsv', key='download-tsv')

else:
    st.info("☝️ Upload a TSV file")
|