ola13 committed on
Commit
0b73201
·
1 Parent(s): 1419aad

Update tool - fix

Browse files
Files changed (3) hide show
  1. README.md +6 -6
  2. app.py +484 -278
  3. spaces.code-workspace +0 -8
README.md CHANGED
@@ -1,13 +1,13 @@
1
  ---
2
- title: GÆA / gaia / gæa
3
- emoji: 🌍🌖
4
  colorFrom: blue
5
- colorTo: red
6
- sdk: streamlit
7
- sdk_version: 1.18.1
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
13
-
 
1
  ---
2
+ title: Roots Search Tool - dev tier
3
+ emoji: 🌖
4
  colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 3.18.0
8
  app_file: app.py
9
  pinned: false
10
+ license: apache-2.0
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
app.py CHANGED
@@ -1,153 +1,59 @@
1
  import json
2
  import os
3
- import pprint
 
4
 
5
- import streamlit as st
6
- import streamlit.components.v1 as components
7
  import requests
 
8
 
9
- from typing import Union
10
-
11
- pp = pprint.PrettyPrinter(indent=2)
12
-
13
- os.environ["address"] = "http://34.79.83.149:8080"
14
-
15
- st.set_page_config(page_title="Gaia Search 🌖🌍", layout="wide")
16
-
17
- os.makedirs(os.path.join(os.getcwd(), ".streamlit"), exist_ok=True)
18
- with open(os.path.join(os.getcwd(), ".streamlit/config.toml"), "w") as file:
19
- file.write('[theme]\nbase="light"')
20
-
21
-
22
- corpus_name_map = {
23
- "LAION": "laion",
24
- "ROOTS": "roots",
25
- "The Pile": "pile",
26
- "C4": "c4",
27
  }
28
 
29
- st.sidebar.markdown(
30
- """
31
- <style>
32
- .aligncenter {
33
- text-align: center;
34
- font-weight: bold;
35
- font-size: 36px;
36
- }
37
- </style>
38
- <p class="aligncenter">Gaia Search 🌖🌍</p>
39
- <p>A search engine for large scale texual
40
- corpora. Most of the datasets included in the tool are based on Common
41
- Crawl. By using the tool, you are also bound by the Common Crawl terms
42
- of use in respect of the content contained in the datasets.
43
- </p>
44
- """,
45
- unsafe_allow_html=True,
46
- )
47
 
48
- st.sidebar.markdown(
49
- """
50
- <style>
51
- .aligncenter {
52
- text-align: center;
53
- }
54
- </style>
55
- <p style='text-align: center'>
56
- <a href="" style="color:#7978FF;">GitHub</a> | <a href="" style="color:#7978FF;" >Project Report</a> | <a href="" style="color:#7978FF;" >Colab</a>
57
- </p>
58
- """,
59
- unsafe_allow_html=True,
60
- )
61
-
62
- # <p class="aligncenter">
63
- # <a href="" target="_blank">
64
- # <img src="https://colab.research.google.com/assets/colab-badge.svg"/>
65
- # </a>
66
- # </p>
67
-
68
-
69
- query = st.sidebar.text_input(label="Query", placeholder="Type your query here")
70
- corpus = st.sidebar.selectbox(
71
- "Corpus",
72
- tuple(corpus_name_map.keys()),
73
- index=2,
74
- )
75
- max_results = st.sidebar.slider(
76
- "Max Results",
77
- min_value=1,
78
- max_value=100,
79
- step=1,
80
- value=10,
81
- help="Max Number of Documents to return",
82
- )
83
-
84
- # dark_mode_toggle = """
85
- # <script>
86
- # function load_image(id){
87
- # console.log(id)
88
- # var x = document.getElementById(id);
89
- # console.log(x)
90
- # if (x.style.display === "none") {
91
- # x.style.display = "block";
92
- # } else {
93
- # x.style.display = "none";
94
- # }
95
- # };
96
- # function myFunction() {
97
- # var element = document.body;
98
- # element.classList.toggle("dark-mode");
99
- # }
100
- # </script>
101
- # <button onclick="myFunction()">Toggle dark mode</button>
102
- # """
103
- # st.sidebar.markdown(dark_mode_toggle, unsafe_allow_html=True)
104
-
105
-
106
- footer = """
107
- <style>
108
- .footer {
109
- position: fixed;
110
- left: 0;
111
- bottom: 0;
112
- width: 100%;
113
- background-color: white;
114
- color: black;
115
- text-align: center;
116
- }
117
- </style>
118
- <div class="footer">
119
- <p>Powered by <a href="https://huggingface.co/" >HuggingFace 🤗</a> and <a href="https://github.com/castorini/pyserini" >Pyserini 🦆</a></p>
120
- </div>
121
- """
122
- st.sidebar.markdown(footer, unsafe_allow_html=True)
123
-
124
-
125
- def scisearch(query, corpus, num_results=10):
126
- try:
127
- print(query, corpus, num_results)
128
- query = query.strip()
129
- if query == "" or query is None:
130
- return
131
-
132
- post_data = {"query": query, "corpus": corpus, "k": num_results, "lang": "all"}
133
- address = (
134
- os.environ.get("address")
135
- if corpus != "roots"
136
- else "http://34.116.206.238:8080"
137
  )
138
-
139
- output = requests.post(
140
- address,
141
- headers={"Content-type": "application/json"},
142
- data=json.dumps(post_data),
143
- timeout=60,
 
 
 
 
 
 
 
 
 
144
  )
145
-
146
- payload = json.loads(output.text)
147
- return payload["results"], payload["highlight_terms"]
148
-
149
- except Exception as e:
150
- print(e)
151
 
152
 
153
  PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
@@ -165,55 +71,103 @@ def process_pii(text):
165
  return text
166
 
167
 
168
- def highlight_string(paragraph: str, highlight_terms: list) -> str:
169
- tokens = paragraph.split()
170
- tokens_html = []
171
- for token in tokens:
172
- if token in highlight_terms:
173
- tokens_html.append("<b>{}</b>".format(token))
174
- else:
175
- tokens_html.append(token)
176
- tokens_html = " ".join(tokens_html)
177
- return process_pii(tokens_html)
178
-
179
-
180
  def extract_lang_from_docid(docid):
181
  return docid.split("_")[1]
182
 
183
 
184
- def format_result(result, highlight_terms):
185
- text = result["text"]
186
- docid = result["docid"]
187
- tokens_html = highlight_string(text, highlight_terms)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
  language = extract_lang_from_docid(docid)
189
- result_html = """
190
  <span style='font-size:14px; font-family: Arial; color:MediumAquaMarine'>Language: {} | </span>
191
- <span style='font-size:14px; font-family: Arial; color:#7978FF; text-align: left;'>Document ID: {} | </span><br>
 
 
 
192
  <span style='font-family: Arial;'>{}</span><br>
193
  <br>
194
  """.format(
195
- language, docid, tokens_html
196
  )
197
  return "<p>" + result_html + "</p>"
198
 
199
 
200
- def process_results(corpus: str, hits: Union[list, dict], highlight_terms: list) -> str:
201
- hit_list = []
 
 
 
 
202
 
203
- if corpus == "roots":
204
- result_page_html = ""
205
- for lang, results_for_lang in hits.items():
206
- print("Processing language", lang)
207
- if len(results_for_lang) == 0:
 
 
 
 
 
 
 
 
 
208
  result_page_html += """<div style='font-family: Arial; color:Silver; text-align: left; line-height: 3em'>
209
  No results for language: <b>{}</b></div>""".format(
210
  lang
211
  )
212
- continue
213
- results_for_lang_html = ""
214
- for result in results_for_lang:
215
- result_html = format_result(result, highlight_terms)
216
- results_for_lang_html += result_html
 
 
 
 
 
217
  results_for_lang_html = f"""
218
  <details>
219
  <summary style='font-family: Arial; color:MediumAquaMarine; text-align: left; line-height: 3em'>
@@ -221,117 +175,369 @@ def process_results(corpus: str, hits: Union[list, dict], highlight_terms: list)
221
  </summary>
222
  {results_for_lang_html}
223
  </details>"""
224
- result_page_html += results_for_lang_html
225
- return result_page_html
226
-
227
- for hit in hits:
228
- res_head = f"""
229
- <p class="searchresult" style="color: #7978FF;">Document ID: {hit['docid']} | Score: {round(hit['score'], 2)}</p>
230
- """
231
- if corpus == "laion":
232
- res_head += f"""
233
- <p style="color: #7978FF;">Caption:</p>
234
- <p>{highlight_string(hit['text'], highlight_terms)}</p>
235
- """
236
- if (
237
- "meta" in hit
238
- and hit["meta"] is not None
239
- and "docs" in hit["meta"]
240
- and len(hit["meta"]["docs"]) > 0
241
- ):
242
- res_head += """<p style="color: #7978FF;"> Image links:</p><ul>"""
243
- for subhit in hit["meta"]["docs"]:
244
- res_head += f"""<li><a href={subhit["URL"]} target="_blank" style="color:#ffcdf8; ">{subhit["URL"]}</a></li>"""
245
- res_head += "</ul>"
246
- res_head += "<hr>"
247
- else:
248
- res_head += (
249
- f"""<p>{highlight_string(hit['text'], highlight_terms)}</p></div><hr>"""
 
 
 
 
 
 
 
 
250
  )
251
- hit_list.append(res_head)
252
- return " ".join(hit_list)
 
 
 
 
253
 
254
 
255
- submit_button = st.sidebar.button("Search", type="primary")
 
 
 
 
256
 
257
- if submit_button or query:
258
- query = query.strip()
259
- if query is None or query == "":
260
- components.html(
261
- """<p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
262
- Please provide a non-empty query.
 
 
263
  </p><br><hr><br>"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  )
265
- else:
266
- hits, highlight_terms = scisearch(query, corpus_name_map[corpus], max_results)
267
- html_results = process_results(corpus_name_map[corpus], hits, highlight_terms)
268
- rendered_results = f"""
269
- <div id="searchresultsarea">
270
- <br>
271
- <p id="searchresultsnumber">About {max_results} results</p>
272
- {html_results}
273
- </div>"""
274
- # st.markdown(
275
- # """
276
- # <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css" rel="stylesheet"
277
- # integrity="sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC" crossorigin="anonymous">
278
- # """,
279
- # unsafe_allow_html=True,
280
- # )
281
- # st.markdown(
282
- # """
283
- # <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
284
- # """,
285
- # unsafe_allow_html=True,
286
- # )
287
- # st.markdown(
288
- # f"""
289
- # <div class="row no-gutters mt-3 align-items-center">
290
- # Gaia Search 🌖🌍
291
- # <div class="col col-md-4">
292
- # <input class="form-control border-secondary rounded-pill pr-5" type="search" value="{query}" id="example-search-input2">
293
- # </div>
294
- # <div class="col-auto">
295
- # <button class="btn btn-outline-light text-dark border-0 rounded-pill ml-n5" type="button">
296
- # <i class="fa fa-search"></i>
297
- # </button>
298
- # </div>
299
- # </div>
300
- # """,
301
- # unsafe_allow_html=True,
302
- # )
303
- # .bk-root{position:relative;width:auto;height:auto;box-sizing:border-box;font-family:Helvetica, Arial, sans-serif;font-size:13px;}.bk-root .bk,.bk-root .bk:before,.bk-root .bk:after{box-sizing:inherit;margin:0;border:0;padding:0;background-image:none;font-family:inherit;font-size:100%;line-height:1.42857143;}.bk-root pre.bk{font-family:Courier, monospace;}
304
- components.html(
305
- """
306
- <head>
307
- <link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
308
- </head>
309
- <style>
310
- #searchresultsarea {
311
- font-family: "Source Sans Pro", sans-serif;
312
- }
313
- #searchresultsnumber {
314
- font-size: 0.8rem;
315
- color: gray;
316
- }
317
- .searchresult h2 {
318
- font-size: 19px;
319
- line-height: 18px;
320
- font-weight: normal;
321
- color: rgb(7, 111, 222);
322
- margin-bottom: 0px;
323
- margin-top: 25px;
324
- color: #7978FF;"
325
- }
326
- .searchresult a {
327
- font-size: 12px;
328
- line-height: 12px;
329
- color: green;
330
- margin-bottom: 0px;
331
- }
332
- </style>
333
- """
334
- + rendered_results,
335
- height=800,
336
- scrolling=True,
337
  )
 
 
1
  import json
2
  import os
3
+ import traceback
4
+ from typing import List, Tuple
5
 
6
+ import gradio as gr
 
7
  import requests
8
+ from huggingface_hub import HfApi
9
 
10
+ hf_api = HfApi()
11
+ roots_datasets = {
12
+ dset.id.split("/")[-1]: dset
13
+ for dset in hf_api.list_datasets(
14
+ author="bigscience-data", use_auth_token=os.environ.get("bigscience_data_token")
15
+ )
 
 
 
 
 
 
 
 
 
 
 
 
16
  }
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
+ def get_docid_html(docid):
20
+ data_org, dataset, docid = docid.split("/")
21
+ metadata = roots_datasets[dataset]
22
+ locked_color = "LightGray"
23
+ open_color = "#7978FF"
24
+ if metadata.private:
25
+ docid_html = """
26
+ <a title="This dataset is private. See the introductory text for more information"
27
+ style="color:{locked_color}; font-weight: bold; text-decoration:none"
28
+ onmouseover="style='color:{locked_color}; font-weight: bold; text-decoration:underline'"
29
+ onmouseout="style='color:{locked_color}; font-weight: bold; text-decoration:none'"
30
+ href="https://huggingface.co/datasets/bigscience-data/{dataset}"
31
+ target="_blank">
32
+ 🔒{dataset}
33
+ </a>
34
+ <span style="color:{open_color}; ">/{docid}</span>""".format(
35
+ dataset=dataset,
36
+ docid=docid,
37
+ locked_color=locked_color,
38
+ open_color=open_color,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  )
40
+ else:
41
+ docid_html = """
42
+ <a title="This dataset is licensed {metadata}"
43
+ style="color:{open_color}; font-weight: bold; text-decoration:none"
44
+ onmouseover="style='color:{open_color}; font-weight: bold; text-decoration:underline'"
45
+ onmouseout="style='color:{open_color}; font-weight: bold; text-decoration:none'"
46
+ href="https://huggingface.co/datasets/bigscience-data/{dataset}"
47
+ target="_blank">
48
+ {dataset}
49
+ </a>
50
+ <span style="color:{open_color}; ">/{docid}</span>""".format(
51
+ metadata=metadata.tags[0].split(":")[-1],
52
+ dataset=dataset,
53
+ docid=docid,
54
+ open_color=open_color,
55
  )
56
+ return docid_html
 
 
 
 
 
57
 
58
 
59
  PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
 
71
  return text
72
 
73
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  def extract_lang_from_docid(docid):
75
  return docid.split("_")[1]
76
 
77
 
78
+ def format_result(result, highlight_terms, exact_search, datasets_filter=None):
79
+ text, url, docid = result
80
+ if datasets_filter is not None:
81
+ datasets_filter = set(datasets_filter)
82
+ dataset = docid.split("/")[1]
83
+ if not dataset in datasets_filter:
84
+ return ""
85
+
86
+ if exact_search:
87
+ query_start = text.find(highlight_terms)
88
+ query_end = query_start + len(highlight_terms)
89
+ tokens_html = text[0:query_start]
90
+ tokens_html += "<b>{}</b>".format(text[query_start:query_end])
91
+ tokens_html += text[query_end:]
92
+ else:
93
+ tokens = text.split()
94
+ tokens_html = []
95
+ for token in tokens:
96
+ if token in highlight_terms:
97
+ tokens_html.append("<b>{}</b>".format(token))
98
+ else:
99
+ tokens_html.append(token)
100
+ tokens_html = " ".join(tokens_html)
101
+ tokens_html = process_pii(tokens_html)
102
+
103
+ url_html = (
104
+ """
105
+ <span style='font-size:12px; font-family: Arial; color:Silver; text-align: left;'>
106
+ <a style='text-decoration:none; color:Silver;'
107
+ onmouseover="style='text-decoration:underline; color:Silver;'"
108
+ onmouseout="style='text-decoration:none; color:Silver;'"
109
+ href='{url}'
110
+ target="_blank">
111
+ {url}
112
+ </a>
113
+ </span><br>
114
+ """.format(
115
+ url=url
116
+ )
117
+ if url is not None
118
+ else ""
119
+ )
120
+ docid_html = get_docid_html(docid)
121
  language = extract_lang_from_docid(docid)
122
+ result_html = """{}
123
  <span style='font-size:14px; font-family: Arial; color:MediumAquaMarine'>Language: {} | </span>
124
+ <span style='font-size:14px; font-family: Arial; color:#7978FF; text-align: left;'>Document ID: {} | </span>
125
+ <a href="https://forms.gle/AdBLLwRApqcLkHYA8" target="_blank">
126
+ <button style="color:#ffcdf8; ">🏴‍☠️ Flag result 🏴‍☠️</button>
127
+ </a><br>
128
  <span style='font-family: Arial;'>{}</span><br>
129
  <br>
130
  """.format(
131
+ url_html, language, docid_html, tokens_html
132
  )
133
  return "<p>" + result_html + "</p>"
134
 
135
 
136
+ def format_result_page(
137
+ language, results, highlight_terms, num_results, exact_search, datasets_filter=None
138
+ ) -> gr.HTML:
139
+
140
+ filtered_num_results = 0
141
+ header_html = ""
142
 
143
+ if language == "detect_language" and not exact_search:
144
+ header_html += """<div style='font-family: Arial; color:MediumAquaMarine; text-align: center; line-height: 3em'>
145
+ Detected language: <b style='color:MediumAquaMarine'>{}</b></div>""".format(
146
+ list(results.keys())[0]
147
+ )
148
+
149
+ result_page_html = ""
150
+ for lang, results_for_lang in results.items():
151
+ print("Processing language", lang)
152
+ if len(results_for_lang) == 0:
153
+ if exact_search:
154
+ result_page_html += """<div style='font-family: Arial; color:Silver; text-align: left; line-height: 3em'>
155
+ No results found.</div>"""
156
+ else:
157
  result_page_html += """<div style='font-family: Arial; color:Silver; text-align: left; line-height: 3em'>
158
  No results for language: <b>{}</b></div>""".format(
159
  lang
160
  )
161
+ continue
162
+ results_for_lang_html = ""
163
+ for result in results_for_lang:
164
+ result_html = format_result(
165
+ result, highlight_terms, exact_search, datasets_filter
166
+ )
167
+ if result_html != "":
168
+ filtered_num_results += 1
169
+ results_for_lang_html += result_html
170
+ if language == "all" and not exact_search:
171
  results_for_lang_html = f"""
172
  <details>
173
  <summary style='font-family: Arial; color:MediumAquaMarine; text-align: left; line-height: 3em'>
 
175
  </summary>
176
  {results_for_lang_html}
177
  </details>"""
178
+ result_page_html += results_for_lang_html
179
+
180
+ if num_results is not None:
181
+ header_html += """<div style='font-family: Arial; color:MediumAquaMarine; text-align: center; line-height: 3em'>
182
+ Total number of matches: <b style='color:MediumAquaMarine'>{}</b></div>""".format(
183
+ num_results
184
+ )
185
+ return header_html + result_page_html
186
+
187
+
188
+ def extract_results_from_payload(query, language, payload, exact_search):
189
+ results = payload["results"]
190
+ processed_results = dict()
191
+ datasets = set()
192
+ highlight_terms = None
193
+ num_results = None
194
+
195
+ if exact_search:
196
+ highlight_terms = query
197
+ num_results = payload["num_results"]
198
+ results = {"dummy": results}
199
+ else:
200
+ highlight_terms = payload["highlight_terms"]
201
+
202
+ for lang, results_for_lang in results.items():
203
+ processed_results[lang] = list()
204
+ for result in results_for_lang:
205
+ text = result["text"]
206
+ url = (
207
+ result["meta"]["url"]
208
+ if "meta" in result
209
+ and result["meta"] is not None
210
+ and "url" in result["meta"]
211
+ else None
212
  )
213
+ docid = result["docid"]
214
+ _, dataset, _ = docid.split("/")
215
+ datasets.add(dataset)
216
+ processed_results[lang].append((text, url, docid))
217
+
218
+ return processed_results, highlight_terms, num_results, list(datasets)
219
 
220
 
221
+ def no_query_error_message():
222
+ return f"""
223
+ <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
224
+ Please provide a non-empty query.
225
+ </p><br><hr><br>"""
226
 
227
+
228
+ def process_error(error_type, payload):
229
+ if error_type == "unsupported_lang":
230
+ detected_lang = payload["err"]["meta"]["detected_lang"]
231
+ return f"""
232
+ <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
233
+ Detected language <b>{detected_lang}</b> is not supported.<br>
234
+ Please choose a language from the dropdown or type another query.
235
  </p><br><hr><br>"""
236
+
237
+
238
+ def extract_error_from_payload(payload):
239
+ if "err" in payload:
240
+ return payload["err"]["type"]
241
+ return None
242
+
243
+
244
+ def request_payload(query, language, exact_search, num_results=10, received_results=0):
245
+ post_data = {"query": query, "k": num_results, "received_results": received_results}
246
+ if language != "detect_language":
247
+ post_data["lang"] = language
248
+ address = "http://34.105.160.81:8080" if exact_search else os.environ.get("address")
249
+ output = requests.post(
250
+ address,
251
+ headers={"Content-type": "application/json"},
252
+ data=json.dumps(post_data),
253
+ timeout=60,
254
+ )
255
+ payload = json.loads(output.text)
256
+ return payload
257
+
258
+
259
+ title = (
260
+ """<p style="text-align: center; font-size:28px"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>"""
261
+ )
262
+ description = """
263
+ The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
264
+ of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). This tool allows
265
+ you to search through the ROOTS corpus. We serve a BM25 index for each language or group of languages included in
266
+ ROOTS. You can read more about the details of the tool design
267
+ [here](https://huggingface.co/spaces/bigscience-data/scisearch/blob/main/roots_search_tool_specs.pdf). For more
268
+ information and instructions on how to access the full corpus check [this form](https://forms.gle/qyYswbEL5kA23Wu99)."""
269
+
270
+
271
+ if __name__ == "__main__":
272
+ demo = gr.Blocks(css=".underline-on-hover:hover { text-decoration: underline; }")
273
+
274
+ with demo:
275
+ processed_results_state = gr.State([])
276
+ highlight_terms_state = gr.State([])
277
+ num_results_state = gr.State(0)
278
+ exact_search_state = gr.State(False)
279
+ received_results_state = gr.State(0)
280
+
281
+ with gr.Row():
282
+ gr.Markdown(value=title)
283
+ with gr.Row():
284
+ gr.Markdown(value=description)
285
+ with gr.Row():
286
+ query = gr.Textbox(
287
+ lines=1,
288
+ max_lines=1,
289
+ placeholder="Put your query in double quotes for exact search.",
290
+ label="Query",
291
+ )
292
+ with gr.Row():
293
+ lang = gr.Dropdown(
294
+ choices=[
295
+ "ar",
296
+ "ca",
297
+ "code",
298
+ "en",
299
+ "es",
300
+ "eu",
301
+ "fr",
302
+ "id",
303
+ "indic",
304
+ "nigercongo",
305
+ "pt",
306
+ "vi",
307
+ "zh",
308
+ "detect_language",
309
+ "all",
310
+ ],
311
+ value="en",
312
+ label="Language",
313
+ )
314
+ k = gr.Slider(
315
+ 1,
316
+ 100,
317
+ value=10,
318
+ step=1,
319
+ label="Max Results in fuzzy search or Max Results per page in exact search",
320
+ )
321
+ with gr.Row():
322
+ submit_btn = gr.Button("Submit")
323
+ with gr.Row(visible=False) as datasets_filter:
324
+ available_datasets = gr.Dropdown(
325
+ type="value",
326
+ choices=[],
327
+ value=[],
328
+ label="Datasets Filter",
329
+ multiselect=True,
330
+ )
331
+ with gr.Row():
332
+ result_page_html = gr.HTML(label="Results")
333
+
334
+ with gr.Row(visible=False) as pagination:
335
+ next_page_btn = gr.Button("Next Page")
336
+
337
+ def run_query(query, lang, k, dropdown_input, received_results):
338
+ query = query.strip()
339
+ exact_search = False
340
+ if query.startswith('"') and query.endswith('"') and len(query) >= 2:
341
+ exact_search = True
342
+ query = query[1:-1]
343
+ else:
344
+ query = " ".join(query.split())
345
+ if query == "" or query is None:
346
+ return (
347
+ [],
348
+ [],
349
+ 0,
350
+ False,
351
+ no_query_error_message(),
352
+ [],
353
+ )
354
+
355
+ payload = request_payload(query, lang, exact_search, k, received_results)
356
+ err = extract_error_from_payload(payload)
357
+ if err is not None:
358
+ return (
359
+ [],
360
+ [],
361
+ 0,
362
+ False,
363
+ process_error(err, payload),
364
+ [],
365
+ )
366
+
367
+ (
368
+ processed_results,
369
+ highlight_terms,
370
+ num_results,
371
+ ds,
372
+ ) = extract_results_from_payload(
373
+ query,
374
+ lang,
375
+ payload,
376
+ exact_search,
377
+ )
378
+ result_page = format_result_page(
379
+ lang, processed_results, highlight_terms, num_results, exact_search
380
+ )
381
+ return (
382
+ processed_results,
383
+ highlight_terms,
384
+ num_results,
385
+ exact_search,
386
+ result_page,
387
+ ds,
388
+ )
389
+
390
+ def submit(query, lang, k, dropdown_input):
391
+ print("submitting", query, lang, k)
392
+ (
393
+ processed_results,
394
+ highlight_terms,
395
+ num_results,
396
+ exact_search,
397
+ result_page,
398
+ datasets,
399
+ ) = run_query(query, lang, k, dropdown_input, 0)
400
+ has_more_results = exact_search and (num_results > k)
401
+ current_results = (
402
+ len(next(iter(processed_results.values())))
403
+ if len(processed_results) > 0
404
+ else 0
405
+ )
406
+ return [
407
+ processed_results,
408
+ highlight_terms,
409
+ num_results,
410
+ exact_search,
411
+ gr.update(visible=True)
412
+ if current_results > 0
413
+ else gr.update(visible=False),
414
+ gr.Dropdown.update(choices=datasets, value=datasets),
415
+ gr.update(visible=has_more_results),
416
+ current_results,
417
+ result_page,
418
+ ]
419
+
420
+ def next_page(
421
+ query,
422
+ lang,
423
+ k,
424
+ dropdown_input,
425
+ received_results,
426
+ processed_results,
427
+ ):
428
+ (
429
+ processed_results,
430
+ highlight_terms,
431
+ num_results,
432
+ exact_search,
433
+ result_page,
434
+ datasets,
435
+ ) = run_query(query, lang, k, dropdown_input, received_results)
436
+ current_results = sum(
437
+ len(results) for results in processed_results.values()
438
+ )
439
+ has_more_results = exact_search and (
440
+ received_results + current_results < num_results
441
+ )
442
+ print("received_results", received_results)
443
+ print("current_results", current_results)
444
+ print("has_more_results", has_more_results)
445
+ return [
446
+ processed_results,
447
+ highlight_terms,
448
+ num_results,
449
+ exact_search,
450
+ gr.update(visible=True)
451
+ if current_results > 0
452
+ else gr.update(visible=False),
453
+ gr.Dropdown.update(choices=datasets, value=datasets),
454
+ gr.update(visible=current_results >= k and has_more_results),
455
+ received_results + current_results,
456
+ result_page,
457
+ ]
458
+
459
+ def filter_datasets(
460
+ lang,
461
+ processed_results,
462
+ highlight_terms,
463
+ num_results,
464
+ exact_search,
465
+ datasets_filter,
466
+ ):
467
+ result_page_html = format_result_page(
468
+ lang,
469
+ processed_results,
470
+ highlight_terms,
471
+ num_results,
472
+ exact_search,
473
+ datasets_filter,
474
+ )
475
+ return result_page_html
476
+
477
+ query.submit(
478
+ fn=submit,
479
+ inputs=[query, lang, k, available_datasets],
480
+ outputs=[
481
+ processed_results_state,
482
+ highlight_terms_state,
483
+ num_results_state,
484
+ exact_search_state,
485
+ datasets_filter,
486
+ available_datasets,
487
+ pagination,
488
+ received_results_state,
489
+ result_page_html,
490
+ ],
491
  )
492
+ submit_btn.click(
493
+ submit,
494
+ inputs=[query, lang, k, available_datasets],
495
+ outputs=[
496
+ processed_results_state,
497
+ highlight_terms_state,
498
+ num_results_state,
499
+ exact_search_state,
500
+ datasets_filter,
501
+ available_datasets,
502
+ pagination,
503
+ received_results_state,
504
+ result_page_html,
505
+ ],
506
+ )
507
+
508
+ next_page_btn.click(
509
+ next_page,
510
+ inputs=[
511
+ query,
512
+ lang,
513
+ k,
514
+ available_datasets,
515
+ received_results_state,
516
+ processed_results_state,
517
+ ],
518
+ outputs=[
519
+ processed_results_state,
520
+ highlight_terms_state,
521
+ num_results_state,
522
+ exact_search_state,
523
+ datasets_filter,
524
+ available_datasets,
525
+ pagination,
526
+ received_results_state,
527
+ result_page_html,
528
+ ],
529
+ )
530
+
531
+ available_datasets.change(
532
+ filter_datasets,
533
+ inputs=[
534
+ lang,
535
+ processed_results_state,
536
+ highlight_terms_state,
537
+ num_results_state,
538
+ exact_search_state,
539
+ available_datasets,
540
+ ],
541
+ outputs=result_page_html,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
542
  )
543
+ demo.launch(enable_queue=True, debug=True)
spaces.code-workspace DELETED
@@ -1,8 +0,0 @@
1
- {
2
- "folders": [
3
- {
4
- "path": ".."
5
- }
6
- ],
7
- "settings": {}
8
- }