Daimon committed on
Commit
312ff3c
·
1 Parent(s): 0d3d09b

Expanded Excel functionality to support multiple sheet translation

Browse files
Files changed (1) hide show
  1. app.py +98 -47
app.py CHANGED
@@ -4,6 +4,8 @@ from pathlib import Path
4
  #from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
5
  from transformers import M2M100ForConditionalGeneration
6
  from tokenization_small100 import SMALL100Tokenizer
 
 
7
 
8
  st.set_page_config(page_title="Translation Demo", page_icon=":milky_way:", layout="wide")
9
 
@@ -27,13 +29,67 @@ def get_translation(src_code, trg_code, src):
27
 
28
  def open_input(the_file):
29
 
 
 
30
  if the_file.name.endswith('.tsv'):
31
  parsed = pd.read_csv(the_file, sep="\t")
 
32
  elif the_file.name.endswith('.xlsx'):
33
- parsed = pd.read_excel(the_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
- return parsed
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  st.subheader("SMALL-100 Translator")
39
 
@@ -82,60 +138,55 @@ with st.form("my_form"):
82
  st.write("Please enter the source text, source language and target language.")
83
 
84
 
85
- st.subheader('Input Excel/TSV')
86
  uploaded_file = st.file_uploader("Choose a file")
87
  done = False
88
 
89
-
90
  if uploaded_file is not None:
91
- valid_languages_col = (lang for lang in valid_languages)
92
- valid_languages_col_trg = (lang for lang in valid_languages)
93
- data = open_input(uploaded_file)
94
- st.subheader("DataFrame")
95
- st.write(data)
96
- st.write(data.describe())
97
-
98
- columns = (col for col in data.columns)
99
- src_col = st.selectbox(
100
- 'Select the column to translate:',
101
- columns,
102
- )
103
-
104
- if src_col:
105
- col_src_lang = st.selectbox(
106
- 'Source language:',
107
- valid_languages_col,
108
- )
109
- col_trg_lang = st.selectbox(
110
- 'Target language:',
111
- valid_languages_col_trg,
112
- )
113
- submitted_cols = st.button("Translate column")
114
 
 
 
 
 
 
 
 
115
  if submitted_cols:
116
- translated_data = []
117
- new_df = data
118
- for text in data[src_col]:
119
- if len(text) > 0 and col_src_lang in valid_languages and col_trg_lang in valid_languages:
120
- with st.spinner("Translating..."):
121
- try:
122
- target_text = get_translation(col_src_lang, col_trg_lang, text)[0]
123
- translated_data.append(target_text)
124
- except:
125
- st.subheader("Translation failed :sad:")
126
- break
127
- else:
128
- st.write("Please enter the source text, source language and target language.")
129
-
130
- new_df[src_col] = translated_data
131
- done = True
132
 
133
  if done:
134
  st.subheader("Translated DataFrame")
135
- st.write(new_df)
136
- st.write(new_df.describe())
137
- to_dl = new_df.to_csv(index=False, sep='\t').encode('utf-8')
138
- st.download_button('Download TSV', to_dl, 'translated_file.tsv', 'text/tsv', key='download-tsv')
 
 
 
 
 
 
 
 
 
 
 
 
139
 
140
 
141
  else:
 
4
  #from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
5
  from transformers import M2M100ForConditionalGeneration
6
  from tokenization_small100 import SMALL100Tokenizer
7
+ import io
8
+
9
 
10
  st.set_page_config(page_title="Translation Demo", page_icon=":milky_way:", layout="wide")
11
 
 
29
 
30
  def open_input(the_file):
31
 
32
+ sheets = []
33
+
34
  if the_file.name.endswith('.tsv'):
35
  parsed = pd.read_csv(the_file, sep="\t")
36
+
37
  elif the_file.name.endswith('.xlsx'):
38
+ xlsx = pd.ExcelFile(the_file)
39
+ if len(xlsx.sheet_names) > 1:
40
+ sheets = [sheet for sheet in xlsx.sheet_names]
41
+ parsed = [pd.read_excel(xlsx, sheet) for sheet in sheets]
42
+ else:
43
+ parsed = pd.read_excel(the_file)
44
+
45
+ return parsed, sheets
46
+
47
+
48
+ def translate_data(data, valid_languages_col, valid_languages_col_trg, col_for_translation, languages):
49
+ translated_data = []
50
+ new_df = data
51
+ for text in data[col_for_translation]:
52
+ if len(text) > 0 and col_src_lang in languages and col_trg_lang in languages:
53
+ with st.spinner("Translating..."):
54
+ try:
55
+ target_text = get_translation(valid_languages_col, valid_languages_col_trg, text)[0]
56
+ translated_data.append(target_text)
57
+ except:
58
+ st.subheader("Translation failed :sad:")
59
+ break
60
+ else:
61
+ st.write("Please enter the source text, source language and target language.")
62
+
63
+ new_df["SMALL-100 translation"] = translated_data
64
+
65
+ return new_df
66
 
 
67
 
68
+ def select_column(data, is_excel=False):
69
+
70
+ if is_excel:
71
+ columns = (col for col in data[0].columns)
72
+ else:
73
+ columns = (col for col in data.columns)
74
+
75
+ src_col = st.selectbox(
76
+ 'Select the column to translate (WARNING: You can only select a single column - please make sure all columns are named accordingly):',
77
+ columns,
78
+ )
79
+
80
+ if src_col:
81
+ col_src_lang = st.selectbox(
82
+ 'Source language:',
83
+ valid_languages_col,
84
+ )
85
+ col_trg_lang = st.selectbox(
86
+ 'Target language:',
87
+ valid_languages_col_trg,
88
+ )
89
+ submitted_cols = st.button("Translate column")
90
+
91
+ return submitted_cols, src_col, col_src_lang, col_trg_lang
92
+
93
 
94
  st.subheader("SMALL-100 Translator")
95
 
 
138
  st.write("Please enter the source text, source language and target language.")
139
 
140
 
141
+ st.subheader('Input XLSX/TSV')
142
  uploaded_file = st.file_uploader("Choose a file")
143
  done = False
144
 
 
145
  if uploaded_file is not None:
146
+ valid_col = (lang for lang in valid_languages)
147
+ valid_col_trg = (lang for lang in valid_languages)
148
+ data, sheets = open_input(uploaded_file)
149
+
150
+ if len(sheets) > 0:
151
+ translated_sheets = []
152
+ submitted_cols, src_col, valid_col, valid_col_trg = select_column(data, is_excel=True)
153
+
154
+ if submitted_cols:
155
+ for sheet in data:
156
+ translated_sheets.append(translate_data(data, valid_col, valid_col_trg, src_col, valid_languages))
157
+
158
+ done = True
 
 
 
 
 
 
 
 
 
 
159
 
160
+ else:
161
+ submitted_cols, src_col, valid_col, valid_col_trg = select_column(data)
162
+
163
+ st.subheader("DataFrame")
164
+ st.write(data)
165
+ st.write(data.describe())
166
+
167
  if submitted_cols:
168
+ new_df = translate_data(data, valid_col, valid_col_trg, src_col, valid_languages)
169
+
170
+ done = True
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
  if done:
173
  st.subheader("Translated DataFrame")
174
+
175
+ if len(sheets) > 0:
176
+ pass
177
+ buffer = io.BytesIO()
178
+ with pd.ExcelWriter(buffer) as writer:
179
+ for idx, sheet in enumerate(translated_sheets):
180
+ sheet.to_excel(writer, sheet_name=sheets[idx])
181
+
182
+ st.download_button('Download XLSX', buffer, 'translated_file.xlsx', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', key='download-xlsx')
183
+
184
+
185
+ else:
186
+ st.write(new_df)
187
+ st.write(new_df.describe())
188
+ to_dl = new_df.to_csv(index=False, sep='\t').encode('utf-8')
189
+ st.download_button('Download TSV', to_dl, 'translated_file.tsv', 'text/tsv', key='download-tsv')
190
 
191
 
192
  else: