kargaranamir committed on
Commit
e94a434
·
1 Parent(s): 4ab4a60
Files changed (5) hide show
  1. README.md +6 -5
  2. app.py +183 -0
  3. assets/GlotLID_logo.svg +0 -0
  4. constants.py +4 -0
  5. requirements.txt +3 -0
README.md CHANGED
@@ -1,12 +1,13 @@
1
  ---
2
- title: GlotLID Space
3
- emoji: 📉
4
- colorFrom: blue
5
- colorTo: green
6
  sdk: streamlit
7
  sdk_version: 1.27.2
8
  app_file: app.py
9
- pinned: false
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: GlotLID
3
+ emoji:
4
+ colorFrom: indigo
5
+ colorTo: purple
6
  sdk: streamlit
7
  sdk_version: 1.27.2
8
  app_file: app.py
9
+ pinned: true
10
+ tags: [multilingual]
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023 The GlotLID Authors.
3
+ # Lint as: python3
4
+ """
5
+ GlotLID Space
6
+ """
7
+
8
+ """ This space is built based on AMR-KELEG/ALDi space """
9
+
10
+
11
+ import constants
12
+ import pandas as pd
13
+ import streamlit as st
14
+ from huggingface_hub import hf_hub_download
15
+ from GlotScript import get_script_predictor
16
+ import matplotlib.pyplot as plt
17
+ import fasttext
18
+ import altair as alt
19
+ from altair import X, Y, Scale
20
+ import base64
21
+
22
+
23
@st.cache_resource
def load_sp():
    """Build the GlotScript script predictor.

    Wrapped in ``st.cache_resource`` so the predictor object is created
    once and reused on later Streamlit reruns.

    Returns:
        The object returned by ``get_script_predictor()``.
    """
    return get_script_predictor()


# Module-level predictor instance used by ``get_script``.
sp = load_sp()
30
+
31
def get_script(text):
    """Return the writing system predicted for ``text``.

    Args:
        text: The text whose script should be identified.

    Returns:
        The first element of the module-level script predictor's result
        for ``text`` (the predicted writing system).
    """
    prediction = sp(text)
    return prediction[0]
42
+
43
@st.cache_data
def render_svg(svg):
    """Show an SVG document as a centered inline image.

    Args:
        svg: The SVG markup as a string.
    """
    # Embed the SVG as a base64 data URI so it can be placed in an <img> tag.
    encoded = base64.b64encode(svg.encode("utf-8")).decode("utf-8")
    markup = rf'<p align="center"> <img src="data:image/svg+xml;base64,{encoded}"/> </p>'
    st.container().write(markup, unsafe_allow_html=True)
50
+
51
+
52
@st.cache_data
def convert_df(df):
    """Serialize a DataFrame to UTF-8 encoded CSV bytes, without the index.

    Cached with ``st.cache_data`` so the conversion is not recomputed on
    every rerun.

    Args:
        df: The DataFrame to serialize.

    Returns:
        The CSV content as ``bytes``.
    """
    csv_text = df.to_csv(index=None)
    return csv_text.encode("utf-8")
56
+
57
+
58
@st.cache_resource
def load_model(model_name):
    """Download ``model.bin`` from a Hugging Face Hub repo and load it with fastText.

    Wrapped in ``st.cache_resource`` so the loaded model is reused across
    Streamlit reruns.

    Args:
        model_name: The Hub repository id that hosts ``model.bin``.

    Returns:
        The loaded fastText model.
    """
    local_path = hf_hub_download(repo_id=model_name, filename="model.bin")
    return fasttext.load_model(local_path)


# Module-level model instance used by ``compute``.
model = load_model(constants.MODEL_NAME)
66
+
67
+
68
def compute(sentences):
    """Computes the language labels for the given sentences.

    A Streamlit progress bar is shown while the sentences are processed
    and removed once all of them are done.

    Args:
        sentences: A list of sentences.

    Returns:
        A ``(probs, labels)`` tuple of two lists: the prediction
        probabilities (clamped to the range [0, 1]) and the predicted
        labels, in the order the sentences were given.
    """
    progress_text = "Computing Language..."
    my_bar = st.progress(0, text=progress_text)

    BATCH_SIZE = 1
    total = len(sentences)
    probs = []
    labels = []

    for first_index in range(0, total, BATCH_SIZE):
        outputs = model.predict(sentences[first_index : first_index + BATCH_SIZE])

        # Only the first entry of the batch result is read; this covers the
        # whole batch because BATCH_SIZE is 1.
        outputs_labels = outputs[0][0]
        outputs_probs = outputs[1][0]

        # Extend in place instead of rebuilding the lists on every iteration,
        # which made the loop quadratic in the number of sentences.
        probs.extend(max(min(o, 1), 0) for o in outputs_probs)
        labels.extend(outputs_labels)

        my_bar.progress(
            min((first_index + BATCH_SIZE) / total, 1),
            text=progress_text,
        )
    my_bar.empty()
    return probs, labels
102
+
103
+
104
# Read the logo through a context manager so the file handle is closed.
with open("assets/GlotLID_logo.svg", encoding="utf-8") as logo_file:
    render_svg(logo_file.read())

tab1, tab2 = st.tabs(["Input a Sentence", "Upload a File"])

# Tab 1: identify the language of a single sentence and plot the confidence.
with tab1:
    sent = st.text_input(
        "Sentence:", placeholder="Enter a sentence.", on_change=None
    )

    # TODO: Check if this is needed!
    clicked = st.button("Submit")

    if sent:
        probs, labels = compute([sent])
        prob = probs[0]
        label = labels[0]

        ORANGE_COLOR = "#FF8000"
        fig, ax = plt.subplots(figsize=(8, 1))
        # Transparent backgrounds so the chart blends with the page theme.
        fig.patch.set_facecolor("none")
        ax.set_facecolor("none")

        ax.spines["left"].set_color(ORANGE_COLOR)
        ax.spines["bottom"].set_color(ORANGE_COLOR)
        ax.tick_params(axis="x", colors=ORANGE_COLOR)

        ax.spines[["right", "top"]].set_visible(False)

        # A single horizontal bar whose length is the prediction confidence.
        ax.barh(y=[0], width=[prob], color=ORANGE_COLOR)
        ax.set_xlim(0, 1)
        ax.set_ylim(-1, 1)
        ax.set_title(f"Language is: {label}", color=ORANGE_COLOR)
        ax.get_yaxis().set_visible(False)
        ax.set_xlabel("Confidence", color=ORANGE_COLOR)
        st.pyplot(fig)
        # pyplot keeps every figure alive until it is closed; close it so
        # figures do not pile up across Streamlit reruns.
        plt.close(fig)

        print(sent)
        # Explicit UTF-8: the input may be in any language/script.
        with open("logs.txt", "a", encoding="utf-8") as f:
            f.write(sent + "\n")
143
+
144
# Tab 2: identify the language of every line of an uploaded text file.
with tab2:
    file = st.file_uploader("Upload a file", type=["txt"])
    if file is not None:
        # The file holds one sentence per line. Read the raw text instead of
        # parsing it as TSV/CSV so that tabs or quote characters inside a
        # sentence cannot split or merge rows, and numeric-looking lines stay
        # strings. "utf-8-sig" also accepts files that start with a BOM.
        raw_text = file.getvalue().decode("utf-8-sig")
        sentences = [line for line in raw_text.splitlines() if line.strip()]

        if not sentences:
            st.warning("The uploaded file does not contain any sentences.")
        else:
            df = pd.DataFrame({"Sentence": sentences})

            # Run the model on every sentence.
            df["Probs"], df["Language"] = compute(sentences)

            # A horizontal rule
            st.markdown("""---""")

            # Area chart of the prediction probability per sentence index.
            chart = (
                alt.Chart(df.reset_index())
                .mark_area(color="darkorange", opacity=0.5)
                .encode(
                    x=X(field="index", title="Sentence Index"),
                    y=Y("Probs", scale=Scale(domain=[0, 1])),
                )
            )
            st.altair_chart(chart.interactive(), use_container_width=True)

            col1, col2 = st.columns([4, 1])

            with col1:
                # Display the output
                st.table(
                    df,
                )

            with col2:
                # Add a download button
                csv = convert_df(df)
                st.download_button(
                    label=":file_folder: Download predictions as CSV",
                    data=csv,
                    file_name="GlotLID.csv",
                    mime="text/csv",
                )
assets/GlotLID_logo.svg ADDED
constants.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ CHOICE_TEXT = "Input Text"
2
+ CHOICE_FILE = "Upload File"
3
+ TITLE = "GlotLID: Language Identification for Around 2000 Languages"
4
+ MODEL_NAME = "cis-lmu/GlotLID"
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ fasttext
2
+ huggingface_hub
3
+ GlotScript