momenaca committed on
Commit
9b4020a
·
1 Parent(s): c780ad4

update app main file by performing a major cleanup and simplification

Browse files
app.py CHANGED
@@ -1,27 +1,29 @@
1
  import gradio as gr
2
  import time
3
- import yaml
4
- from langchain.prompts.chat import ChatPromptTemplate
5
- from huggingface_hub import hf_hub_download
6
  from spinoza_project.source.backend.llm_utils import (
7
- get_llm,
8
  get_llm_api,
9
- get_vectorstore,
10
  get_vectorstore_api,
11
  )
12
- from spinoza_project.source.backend.document_store import pickle_to_document_store
13
- from spinoza_project.source.backend.get_prompts import get_qa_prompts
14
  from spinoza_project.source.frontend.utils import (
15
- make_html_source,
16
- make_html_presse_source,
17
- make_html_afp_source,
18
- make_html_politique_source,
19
- parse_output_llm_with_sources,
20
  init_env,
 
21
  )
22
- from spinoza_project.source.backend.prompt_utils import (
23
- to_chat_instruction,
24
- SpecialTokens,
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  )
26
 
27
  from assets.utils_javascript import (
@@ -33,172 +35,65 @@ from assets.utils_javascript import (
33
  )
34
 
35
  init_env()
 
36
 
37
- with open("./spinoza_project/config.yaml") as f:
38
- config = yaml.full_load(f)
39
-
40
- prompts = {}
41
- for source in config["prompt_naming"]:
42
- with open(f"./spinoza_project/prompt_{source}.yaml") as f:
43
- prompts[source] = yaml.full_load(f)
44
 
45
  ## Building LLM
46
  print("Building LLM")
47
- model = "gpt35turbo"
48
  llm = get_llm_api()
49
 
50
- ## Loading_tools
51
  print("Loading Databases")
52
  bdd_presse = get_vectorstore_api("presse")
53
  bdd_afp = get_vectorstore_api("afp")
54
- qdrants = {
55
- tab: pickle_to_document_store(
56
- hf_hub_download(
57
- repo_id="SpinozaProject/spinoza-database",
58
- filename=f"database_{tab}.pickle",
59
- repo_type="dataset",
60
- )
61
- )
62
- for tab in config["prompt_naming"]
63
- if tab != "Presse" and tab != "AFP"
64
- }
65
-
66
- ## Load Prompts
67
- print("Loading Prompts")
68
- chat_qa_prompts, chat_reformulation_prompts, chat_summarize_memory_prompts = {}, {}, {}
69
- for source, prompt in prompts.items():
70
- chat_qa_prompt, chat_reformulation_prompt = get_qa_prompts(config, prompt)
71
- chat_qa_prompts[source] = chat_qa_prompt
72
- chat_reformulation_prompts[source] = chat_reformulation_prompt
73
-
74
-
75
- with open("./assets/style.css", "r") as f:
76
- css = f.read()
77
-
78
-
79
- special_tokens = SpecialTokens(config)
80
-
81
- synthesis_template = """You are a factual journalist that summarize the secialized awnsers from thechnical sources.
82
-
83
- Based on the folowing question:
84
- {question}
85
 
86
- And the following expert answer:
87
- {answers}
 
 
88
 
89
- - When using legal answers, keep tracking of the name of the articles.
90
- - When using ADEME answers, name the sources that are mainly used.
91
- - List the different elements mentionned, and highlight the agreement points between the sources, as well as the contradictions or differences.
92
- - Contradictions don't lie in whether or not a subject is dealt with, but more in the opinion given or the way the subject is dealt with.
93
- - Generate the answer as markdown, with an aerated layout, and headlines in bold
94
- - When you use information from a passage, mention where it came from by using [Doc i] at the end of the sentence. i stands for the number of the document.",
95
- - Do not use the sentence 'Doc i says ...' to say where information came from.",
96
- - If the same thing is said in more than one document, you can mention all of them like this: [Doc i, Doc j, Doc k]",
97
- - Start by highlighting contradictions, then do a general summary and finally get into the details that might be interesting for article writing. Where relevant, quote them.
98
- - Awnser in French / Répond en Français
99
- """
100
 
101
- synthesis_prompt = to_chat_instruction(synthesis_template, special_tokens)
102
- synthesis_prompt_template = ChatPromptTemplate.from_messages([synthesis_prompt])
103
-
104
-
105
- def zip_longest_fill(*args, fillvalue=None):
106
- # zip_longest('ABCD', 'xy', fillvalue='-') --> Ax By C- D-
107
- iterators = [iter(it) for it in args]
108
- num_active = len(iterators)
109
- if not num_active:
110
- return
111
-
112
- cond = True
113
- fillvalues = [None] * len(iterators)
114
- while cond:
115
- values = []
116
- for i, it in enumerate(iterators):
117
- try:
118
- value = next(it)
119
- except StopIteration:
120
- value = fillvalues[i]
121
- values.append(value)
122
-
123
- new_cond = False
124
- for i, elt in enumerate(values):
125
- if elt != fillvalues[i]:
126
- new_cond = True
127
- cond = new_cond
128
-
129
- fillvalues = values.copy()
130
- yield tuple(values)
131
-
132
-
133
- def format_question(question):
134
- return f"{question}" # ###
135
-
136
-
137
- def parse_question(question):
138
- x = question.replace("<p>", "").replace("</p>\n", "")
139
- if "### " in x:
140
- return x.split("### ")[1]
141
- return x
142
-
143
-
144
- def reformulate(question, tab, config=config):
145
- if tab in list(config["tabs"].keys()):
146
- return llm.stream(
147
- chat_reformulation_prompts[config["source_mapping"][tab]],
148
- {"question": parse_question(question)},
149
- )
150
- else:
151
- return iter([None] * 5)
152
-
153
-
154
- def reformulate_single_question(question, tab, config=config):
155
- for elt in reformulate(question, tab, config=config):
156
- time.sleep(0.02)
157
- yield elt
158
-
159
-
160
- def reformulate_questions(question, config=config):
161
  for elt in zip_longest_fill(
162
- *[reformulate(question, tab, config=config) for tab in config["tabs"]]
 
 
 
163
  ):
164
  time.sleep(0.02)
165
  yield elt
166
 
167
 
168
- def add_question(question):
169
- return question
170
-
171
-
172
- def answer(question, source, tab, config=config):
173
- if tab in list(config["tabs"].keys()):
174
- if len(source) < 10:
175
- return iter(["Aucune source trouvée, veuillez reformuler votre question"])
176
- else:
177
-
178
- return llm.stream(
179
- chat_qa_prompts[config["source_mapping"][tab]],
180
- {
181
- "question": parse_question(question),
182
- "sources": source.replace("<p>", "").replace("</p>\n", ""),
183
- },
184
- )
185
- else:
186
- return iter([None] * 5)
187
-
188
 
189
- def answer_single_question(source, question, tab, config=config):
190
- for elt in answer(question, source, tab, config=config):
191
- time.sleep(0.02)
192
- yield elt
193
 
194
 
195
- def answer_questions(*questions_sources, config=config):
 
 
196
  questions = [elt for elt in questions_sources[: len(questions_sources) // 2]]
197
  sources = [elt for elt in questions_sources[len(questions_sources) // 2 :]]
198
 
199
  for elt in zip_longest_fill(
200
  *[
201
- answer(question, source, tab, config=config)
202
  for question, source, tab in zip(questions, sources, config["tabs"])
203
  ]
204
  ):
@@ -209,105 +104,13 @@ def answer_questions(*questions_sources, config=config):
209
  ]
210
 
211
 
212
- def get_sources(
213
- questions, qdrants=qdrants, bdd_presse=bdd_presse, bdd_afp=bdd_afp, config=config
 
 
 
 
214
  ):
215
- k = config["num_document_retrieved"]
216
- min_similarity = config["min_similarity"]
217
- text, formated = [], []
218
- for i, (question, tab) in enumerate(zip(questions, list(config["tabs"].keys()))):
219
- if tab == "Presse":
220
- sources = bdd_presse.similarity_search_with_relevance_scores(
221
- question.replace("<p>", "").replace("</p>\n", ""), k=k
222
- )
223
- sources = [
224
- (doc, score) for doc, score in sources if score >= min_similarity
225
- ]
226
- formated.extend(
227
- [
228
- make_html_presse_source(source[0], j, source[1])
229
- for j, source in zip(range(k * i + 1, k * (i + 1) + 1), sources)
230
- ]
231
- )
232
-
233
- elif tab == "AFP":
234
- sources = bdd_afp.similarity_search_with_relevance_scores(
235
- question.replace("<p>", "").replace("</p>\n", ""), k=k
236
- )
237
- sources = [
238
- (doc, score) for doc, score in sources if score >= min_similarity
239
- ]
240
- formated.extend(
241
- [
242
- make_html_afp_source(source[0], j, source[1])
243
- for j, source in zip(range(k * i + 1, k * (i + 1) + 1), sources)
244
- ]
245
- )
246
-
247
- elif tab == "Documents Stratégiques":
248
- sources = qdrants[
249
- config["source_mapping"][tab]
250
- ].similarity_search_with_relevance_scores(
251
- config["query_preprompt"]
252
- + question.replace("<p>", "").replace("</p>\n", ""),
253
- k=k,
254
- )
255
- sources = [
256
- (doc, score) for doc, score in sources if score >= min_similarity
257
- ]
258
- formated.extend(
259
- [
260
- make_html_politique_source(source[0], j, source[1], config)
261
- for j, source in zip(range(k * i + 1, k * (i + 1) + 1), sources)
262
- ]
263
- )
264
-
265
- else:
266
- sources = qdrants[
267
- config["source_mapping"][tab]
268
- ].similarity_search_with_relevance_scores(
269
- config["query_preprompt"]
270
- + question.replace("<p>", "").replace("</p>\n", ""),
271
- k=k,
272
- )
273
- sources = [
274
- (doc, score) for doc, score in sources if score >= min_similarity
275
- ]
276
- formated.extend(
277
- [
278
- make_html_source(source[0], j, source[1], config)
279
- for j, source in zip(range(k * i + 1, k * (i + 1) + 1), sources)
280
- ]
281
- )
282
-
283
- text.extend(
284
- [
285
- "\n\n".join(
286
- [
287
- f"Doc {str(j)} with source type {source[0].metadata.get('file_source_type')}:\n"
288
- + source[0].page_content
289
- for j, source in zip(range(k * i + 1, k * (i + 1) + 1), sources)
290
- ]
291
- )
292
- ]
293
- )
294
-
295
- formated = "".join(formated)
296
-
297
- return formated, text
298
-
299
-
300
- def retrieve_sources(
301
- *questions, qdrants=qdrants, bdd_presse=bdd_presse, bdd_afp=bdd_afp, config=config
302
- ):
303
- formated_sources, text_sources = get_sources(
304
- questions, qdrants, bdd_presse, bdd_afp, config
305
- )
306
-
307
- return (formated_sources, *text_sources)
308
-
309
-
310
- def get_synthesis(question, *answers, config=config):
311
  answer = []
312
  for i, tab in enumerate(config["tabs"]):
313
  if len(str(answers[i])) >= 100:
@@ -329,47 +132,6 @@ def get_synthesis(question, *answers, config=config):
329
  yield [(question, parse_output_llm_with_sources(elt))]
330
 
331
 
332
- theme = gr.themes.Base(
333
- primary_hue="blue",
334
- secondary_hue="red",
335
- font=[gr.themes.GoogleFont("Poppins"), "ui-sans-serif", "system-ui", "sans-serif"],
336
- )
337
-
338
- with open("./assets/style.css", "r") as f:
339
- css = f.read()
340
-
341
- with open("./assets/source_information.md", "r") as f:
342
- source_information = f.read()
343
-
344
-
345
- def start_agents():
346
- gr.Info(message="The agents and Spinoza are loading...", duration=3)
347
-
348
- return [
349
- (None, "I am waiting until all the agents are done to generate an answer...")
350
- ]
351
-
352
-
353
- def end_agents():
354
- gr.Info(
355
- message="The agents and Spinoza have finished answering your question",
356
- duration=3,
357
- )
358
-
359
-
360
- def next_call():
361
- return
362
-
363
-
364
- init_prompt = """
365
- Hello, I am Spinoza, a conversational assistant designed to help you in your journalistic journey. I will answer your questions based **on the provided sources**.
366
-
367
- ⚠️ Limitations
368
- *Please note that this chatbot is in an early stage, it is not perfect and may sometimes give irrelevant answers. If you are not satisfied with the answer, please ask a more specific question or report your feedback to help us improve the system.*
369
-
370
- What do you want to learn ?
371
- """
372
-
373
  with gr.Blocks(
374
  title=f"🔍 Spinoza",
375
  css=css,
@@ -388,132 +150,43 @@ with gr.Blocks(
388
  with gr.Row(elem_id="chatbot-row"):
389
  with gr.Column(scale=2, elem_id="center-panel"):
390
  with gr.Group(elem_id="chatbot-group"):
391
- with gr.Accordion(
392
- "Science agent",
393
- open=False,
394
- elem_id="accordion-science",
395
- elem_classes="accordion",
396
- ):
397
- chatbots[list(config["tabs"].keys())[0]] = gr.Chatbot(
398
- show_copy_button=True,
399
- show_share_button=False,
400
- show_label=False,
401
- elem_id="chatbot-science",
402
- layout="panel",
403
- avatar_images=(
404
- "./assets/logos/help.png",
405
- None,
406
- ),
407
- )
408
-
409
- with gr.Accordion(
410
- "Law agent",
411
- open=False,
412
- elem_id="accordion-legal",
413
- elem_classes="accordion",
414
- ):
415
- chatbots[list(config["tabs"].keys())[1]] = gr.Chatbot(
416
- show_copy_button=True,
417
- show_share_button=False,
418
- show_label=False,
419
- elem_id="chatbot-legal",
420
- layout="panel",
421
- avatar_images=(
422
- "./assets/logos/help.png",
423
- None,
424
- ),
425
- )
426
-
427
- with gr.Accordion(
428
- "Politics agent",
429
- open=False,
430
- elem_id="accordion-politique",
431
- elem_classes="accordion",
432
- ):
433
- chatbots[list(config["tabs"].keys())[2]] = gr.Chatbot(
434
- show_copy_button=True,
435
- show_share_button=False,
436
- show_label=False,
437
- elem_id="chatbot-politique",
438
- layout="panel",
439
- avatar_images=(
440
- "./assets/logos/help.png",
441
- None,
442
- ),
443
- )
444
-
445
- with gr.Accordion(
446
- "ADEME agent",
447
- open=False,
448
- elem_id="accordion-ademe",
449
- elem_classes="accordion",
450
- ):
451
- chatbots[list(config["tabs"].keys())[3]] = gr.Chatbot(
452
- show_copy_button=True,
453
- show_share_button=False,
454
- show_label=False,
455
- elem_id="chatbot-ademe",
456
- layout="panel",
457
- avatar_images=(
458
- "./assets/logos/help.png",
459
- None,
460
- ),
461
- )
462
-
463
- with gr.Accordion(
464
- "Press agent",
465
- open=False,
466
- elem_id="accordion-presse",
467
- elem_classes="accordion",
468
- ):
469
- chatbots[list(config["tabs"].keys())[4]] = gr.Chatbot(
470
- show_copy_button=True,
471
- show_share_button=False,
472
- show_label=False,
473
- elem_id="chatbot-presse",
474
- layout="panel",
475
- avatar_images=(
476
- "./assets/logos/help.png",
477
- None,
478
- ),
479
- )
480
-
481
- with gr.Accordion(
482
- "AFP agent",
483
- open=False,
484
- elem_id="accordion-afp",
485
- elem_classes="accordion",
486
- ):
487
- chatbots[list(config["tabs"].keys())[5]] = gr.Chatbot(
488
- show_copy_button=True,
489
- show_share_button=False,
490
- show_label=False,
491
- elem_id="chatbot-afp",
492
- layout="panel",
493
- avatar_images=(
494
- "./assets/logos/help.png",
495
- None,
496
- ),
497
- )
498
-
499
- with gr.Accordion(
500
- "Spinoza",
501
- open=True,
502
- elem_id="accordion-spinoza",
503
- elem_classes="accordion",
504
- ):
505
- chatbots["spinoza"] = gr.Chatbot(
506
- value=[(None, init_prompt)],
507
- show_copy_button=True,
508
- show_share_button=False,
509
- show_label=False,
510
- elem_id="chatbot-spinoza",
511
- layout="panel",
512
- avatar_images=(
513
- "./assets/logos/help.png",
514
- "./assets/logos/spinoza.png",
515
- ),
516
- )
517
 
518
  with gr.Row(elem_id="input-message"):
519
  ask = gr.Textbox(
@@ -542,7 +215,7 @@ with gr.Blocks(
542
  gr.Markdown("For any issue contact **[email protected]**.")
543
 
544
  ask.submit(
545
- start_agents, inputs=[], outputs=[chatbots["spinoza"]], js=accordion_trigger()
546
  ).then(
547
  fn=reformulate_questions,
548
  inputs=[ask],
@@ -564,7 +237,7 @@ with gr.Blocks(
564
  fn=get_synthesis,
565
  inputs=[agent_questions[list(config["tabs"].keys())[1]]]
566
  + [chatbots[tab] for tab in config["tabs"]],
567
- outputs=[chatbots["spinoza"]],
568
  ).then(
569
  fn=next_call, inputs=[], outputs=[], js=accordion_trigger_spinoza_end()
570
  ).then(
 
1
  import gradio as gr
2
  import time
 
 
 
3
  from spinoza_project.source.backend.llm_utils import (
 
4
  get_llm_api,
 
5
  get_vectorstore_api,
6
  )
 
 
7
  from spinoza_project.source.frontend.utils import (
 
 
 
 
 
8
  init_env,
9
+ parse_output_llm_with_sources,
10
  )
11
+ from spinoza_project.source.frontend.gradio_utils import (
12
+ get_sources,
13
+ set_prompts,
14
+ get_config,
15
+ get_prompts,
16
+ get_assets,
17
+ get_theme,
18
+ get_init_prompt,
19
+ get_synthesis_prompt,
20
+ get_qdrants,
21
+ start_agents,
22
+ end_agents,
23
+ next_call,
24
+ zip_longest_fill,
25
+ reformulate,
26
+ answer,
27
  )
28
 
29
  from assets.utils_javascript import (
 
35
  )
36
 
37
  init_env()
38
+ config = get_config()
39
 
40
+ ## Loading Prompts
41
+ print("Loading Prompts")
42
+ prompts = get_prompts(config)
43
+ chat_qa_prompts, chat_reformulation_prompts = set_prompts(prompts, config)
44
+ synthesis_prompt_template = get_synthesis_prompt(config)
 
 
45
 
46
  ## Building LLM
47
  print("Building LLM")
 
48
  llm = get_llm_api()
49
 
50
+ ## Loading BDDs
51
  print("Loading Databases")
52
  bdd_presse = get_vectorstore_api("presse")
53
  bdd_afp = get_vectorstore_api("afp")
54
+ qdrants = get_qdrants(config)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
+ ## Loading Assets
57
+ css, source_information = get_assets()
58
+ theme = get_theme()
59
+ init_prompt = get_init_prompt()
60
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
+ def reformulate_questions(
63
+ question,
64
+ llm=llm,
65
+ chat_reformulation_prompts=chat_reformulation_prompts,
66
+ config=config,
67
+ ):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  for elt in zip_longest_fill(
69
+ *[
70
+ reformulate(llm, chat_reformulation_prompts, question, tab, config=config)
71
+ for tab in config["tabs"]
72
+ ]
73
  ):
74
  time.sleep(0.02)
75
  yield elt
76
 
77
 
78
+ def retrieve_sources(
79
+ *questions, qdrants=qdrants, bdd_presse=bdd_presse, bdd_afp=bdd_afp, config=config
80
+ ):
81
+ formated_sources, text_sources = get_sources(
82
+ questions, qdrants, bdd_presse, bdd_afp, config
83
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
+ return (formated_sources, *text_sources)
 
 
 
86
 
87
 
88
+ def answer_questions(
89
+ *questions_sources, llm=llm, chat_qa_prompts=chat_qa_prompts, config=config
90
+ ):
91
  questions = [elt for elt in questions_sources[: len(questions_sources) // 2]]
92
  sources = [elt for elt in questions_sources[len(questions_sources) // 2 :]]
93
 
94
  for elt in zip_longest_fill(
95
  *[
96
+ answer(llm, chat_qa_prompts, question, source, tab, config)
97
  for question, source, tab in zip(questions, sources, config["tabs"])
98
  ]
99
  ):
 
104
  ]
105
 
106
 
107
+ def get_synthesis(
108
+ question,
109
+ *answers,
110
+ llm=llm,
111
+ synthesis_prompt_template=synthesis_prompt_template,
112
+ config=config,
113
  ):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  answer = []
115
  for i, tab in enumerate(config["tabs"]):
116
  if len(str(answers[i])) >= 100:
 
132
  yield [(question, parse_output_llm_with_sources(elt))]
133
 
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  with gr.Blocks(
136
  title=f"🔍 Spinoza",
137
  css=css,
 
150
  with gr.Row(elem_id="chatbot-row"):
151
  with gr.Column(scale=2, elem_id="center-panel"):
152
  with gr.Group(elem_id="chatbot-group"):
153
+ for tab in list(config["tabs"].keys()) + ["Spinoza"]:
154
+ if tab == "Spinoza":
155
+ agent_name = f"Spinoza"
156
+ elem_id = f"accordion-{tab}"
157
+ elem_classes = "accordion accordion-agent spinoza-agent"
158
+ else:
159
+ agent_name = f"Agent {config['source_mapping'][tab]}"
160
+ elem_id = f"accordion-{config['source_mapping'][tab]}"
161
+ elem_classes = "accordion accordion-agent"
162
+
163
+ with gr.Accordion(
164
+ agent_name,
165
+ open=True if agent_name == "Spinoza" else False,
166
+ elem_id=elem_id,
167
+ elem_classes=elem_classes,
168
+ ):
169
+ # chatbot_key = agent_name.lower().replace(" ", "_")
170
+ chatbots[tab] = gr.Chatbot(
171
+ value=(
172
+ [(None, init_prompt)]
173
+ if agent_name == "Spinoza"
174
+ else None
175
+ ),
176
+ show_copy_button=True,
177
+ show_share_button=False,
178
+ show_label=False,
179
+ elem_id=f"chatbot-{agent_name.lower().replace(' ', '-')}",
180
+ layout="panel",
181
+ avatar_images=(
182
+ "./assets/logos/help.png",
183
+ (
184
+ "./assets/logos/spinoza.png"
185
+ if agent_name == "Spinoza"
186
+ else None
187
+ ),
188
+ ),
189
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
  with gr.Row(elem_id="input-message"):
192
  ask = gr.Textbox(
 
215
  gr.Markdown("For any issue contact **[email protected]**.")
216
 
217
  ask.submit(
218
+ start_agents, inputs=[], outputs=[chatbots["Spinoza"]], js=accordion_trigger()
219
  ).then(
220
  fn=reformulate_questions,
221
  inputs=[ask],
 
237
  fn=get_synthesis,
238
  inputs=[agent_questions[list(config["tabs"].keys())[1]]]
239
  + [chatbots[tab] for tab in config["tabs"]],
240
+ outputs=[chatbots["Spinoza"]],
241
  ).then(
242
  fn=next_call, inputs=[], outputs=[], js=accordion_trigger_spinoza_end()
243
  ).then(
assets/style.css CHANGED
@@ -118,53 +118,20 @@ a {
118
  height: calc(-100px + 100vh) !important;
119
  }
120
 
121
- #accordion-spinoza {
122
  height: 15cm;
123
  }
124
 
125
-
126
- #accordion-spinoza>open>span:nth-child(1) {
127
  color: #000000;
128
  font-size: large;
129
  font-weight: bold;
130
  }
131
 
132
- #accordion-spinoza>button:nth-child(2)>span:nth-child(1) {
133
- color: #000000;
134
- font-size: large;
135
- font-weight: bold;
136
- }
137
-
138
- #accordion-science>button:nth-child(2)>span:nth-child(1) {
139
- color: #9ca1a5e7;
140
- font-weight: bold;
141
- }
142
-
143
- #accordion-presse>button:nth-child(2)>span:nth-child(1) {
144
- color: #9ca1a5e7;
145
- font-weight: bold;
146
- }
147
-
148
- #accordion-legal>button:nth-child(2)>span:nth-child(1) {
149
  color: #9ca1a5e7;
150
  font-weight: bold;
151
  }
152
-
153
- #accordion-politique>button:nth-child(2)>span:nth-child(1) {
154
- color: #9ca1a5e7;
155
- font-weight: bold;
156
- }
157
-
158
- #accordion-ademe>button:nth-child(2)>span:nth-child(1) {
159
- color: #9ca1a5e7;
160
- font-weight: bold;
161
- }
162
-
163
- #accordion-afp>button:nth-child(2)>span:nth-child(1) {
164
- color: #9ca1a5e7;
165
- font-weight: bold;
166
- }
167
-
168
  }
169
 
170
  textarea.scroll-hide {
 
118
  height: calc(-100px + 100vh) !important;
119
  }
120
 
121
+ .accordion-agent.spinoza-agent {
122
  height: 15cm;
123
  }
124
 
125
+ .accordion-agent.spinoza-agent > button > span {
 
126
  color: #000000;
127
  font-size: large;
128
  font-weight: bold;
129
  }
130
 
131
+ .accordion-agent > button > span {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  color: #9ca1a5e7;
133
  font-weight: bold;
134
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  }
136
 
137
  textarea.scroll-hide {
assets/utils_javascript.py CHANGED
@@ -15,37 +15,19 @@ def update_footer():
15
  def accordion_trigger():
16
  return """
17
  function accordion_trigger() {
18
- input_textbox = document.getElementById("input-textbox")
19
  input_textbox.addEventListener('keyup', function (e) {
20
  if (e.key === 'Enter' || e.keyCode === 13) {
21
- var accordion_science = document.getElementById("accordion-science")
22
- var accordion_presse = document.getElementById("accordion-presse")
23
- var accordion_politique = document.getElementById("accordion-politique")
24
- var accordion_legal = document.getElementById("accordion-legal")
25
- var accordion_ademe= document.getElementById("accordion-ademe")
26
- var accordion_afp= document.getElementById("accordion-afp")
27
- var accordion_spinoza = document.getElementById("accordion-spinoza")
28
- document.querySelectorAll(".loader").forEach(el => el.remove());
29
- document.querySelectorAll(".loader-helper").forEach(el => el.remove());
30
- accordion_science.children[1].children[0].textContent = "Science agent";
31
- accordion_presse.children[1].children[0].textContent = "Press agent";
32
- accordion_politique.children[1].children[0].textContent = "Politics agent";
33
- accordion_legal.children[1].children[0].textContent = "Law agent";
34
- accordion_ademe.children[1].children[0].textContent = "ADEME agent";
35
- accordion_afp.children[1].children[0].textContent = "AFP agent";
36
- accordion_spinoza.children[1].children[0].textContent = "Spinoza";
37
- accordion_science.children[1].children[0].innerHTML += "<span class='loader-helper'> - </span>";
38
- accordion_science.children[1].children[0].innerHTML += "<span class='loader'>loading</span>";
39
- accordion_presse.children[1].children[0].innerHTML += "<span class='loader-helper'> - </span>";
40
- accordion_presse.children[1].children[0].innerHTML += "<span class='loader'>loading</span>";
41
- accordion_politique.children[1].children[0].innerHTML += "<span class='loader-helper'> - </span>";
42
- accordion_politique.children[1].children[0].innerHTML += "<span class='loader'>loading</span>";
43
- accordion_legal.children[1].children[0].innerHTML += "<span class='loader-helper'> - </span>";
44
- accordion_legal.children[1].children[0].innerHTML += "<span class='loader'>loading</span>";
45
- accordion_ademe.children[1].children[0].innerHTML += "<span class='loader-helper'> - </span>";
46
- accordion_ademe.children[1].children[0].innerHTML += "<span class='loader'>loading</span>";
47
- accordion_afp.children[1].children[0].innerHTML += "<span class='loader-helper'> - </span>";
48
- accordion_afp.children[1].children[0].innerHTML += "<span class='loader'>loading</span>";
49
  }
50
  });
51
  }
@@ -55,18 +37,15 @@ def accordion_trigger():
55
  def accordion_trigger_end():
56
  return """
57
  function accordion_trigger_end() {
58
- var accordion_science = document.getElementById("accordion-science")
59
- var accordion_presse = document.getElementById("accordion-presse")
60
- var accordion_politique = document.getElementById("accordion-politique")
61
- var accordion_legal = document.getElementById("accordion-legal")
62
- var accordion_ademe = document.getElementById("accordion-ademe")
63
- var accordion_afp= document.getElementById("accordion-afp")
64
- accordion_science.children[1].children[0].textContent = "Science agent - ready";
65
- accordion_presse.children[1].children[0].textContent = "Press agent - ready";
66
- accordion_politique.children[1].children[0].textContent = "Politics agent - ready";
67
- accordion_legal.children[1].children[0].textContent = "Law agent - ready";
68
- accordion_ademe.children[1].children[0].textContent = "ADEME agent - ready";
69
- accordion_afp.children[1].children[0].textContent = "AFP agent - ready";
70
  }
71
  """
72
 
@@ -74,12 +53,11 @@ def accordion_trigger_end():
74
  def accordion_trigger_spinoza():
75
  return """
76
  function accordion_trigger_spinoza() {
77
- var accordion_spinoza = document.getElementById("accordion-spinoza")
78
- document.querySelectorAll(".loader").forEach(el => el.remove());
79
- document.querySelectorAll(".loader-helper").forEach(el => el.remove());
80
- accordion_spinoza.children[1].children[0].textContent = "Spinoza";
81
- accordion_spinoza.children[1].children[0].innerHTML += "<span class='loader-helper'> - </span>";
82
- accordion_spinoza.children[1].children[0].innerHTML += "<span class='loader'>generating</span>";
83
  }
84
  """
85
 
@@ -87,7 +65,8 @@ def accordion_trigger_spinoza():
87
  def accordion_trigger_spinoza_end():
88
  return """
89
  function accordion_trigger_spinoza_end() {
90
- var accordion_spinoza = document.getElementById("accordion-spinoza")
91
- accordion_spinoza.children[1].children[0].textContent = "Spinoza - ready";
 
92
  }
93
  """
 
15
  def accordion_trigger():
16
  return """
17
  function accordion_trigger() {
18
+ var input_textbox = document.getElementById("input-textbox");
19
  input_textbox.addEventListener('keyup', function (e) {
20
  if (e.key === 'Enter' || e.keyCode === 13) {
21
+ document.querySelectorAll(".loader, .loader-helper").forEach(el => el.remove());
22
+ var accordions = document.querySelectorAll('.accordion-agent');
23
+ accordions.forEach(function (accordion) {
24
+ var agentName = "Agent " + accordion.id.split('-')[1];
25
+ var buttonSpan = accordion.querySelector('button > span');
26
+ if (!accordion.classList.contains('spinoza-agent')) {
27
+ buttonSpan.textContent = agentName;
28
+ buttonSpan.innerHTML += "<span class='loader-helper'> - </span><span class='loader'>loading</span>";
29
+ }
30
+ });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  }
32
  });
33
  }
 
37
  def accordion_trigger_end():
38
  return """
39
  function accordion_trigger_end() {
40
+ var accordions = document.querySelectorAll('.accordion-agent');
41
+
42
+ accordions.forEach(function (accordion) {
43
+ if (!accordion.classList.contains('spinoza-agent')) {
44
+ var agentName = "Agent " + accordion.id.split('-')[1];
45
+ var buttonSpan = accordion.querySelector('button > span');
46
+ buttonSpan.textContent = agentName + " - ready";
47
+ }
48
+ });
 
 
 
49
  }
50
  """
51
 
 
53
  def accordion_trigger_spinoza():
54
  return """
55
  function accordion_trigger_spinoza() {
56
+ var accordion_spinoza = document.querySelector('.spinoza-agent');
57
+ document.querySelectorAll(".loader, .loader-helper").forEach(el => el.remove());
58
+ var buttonSpan = accordion_spinoza.querySelector('button > span');
59
+ buttonSpan.textContent = "Spinoza";
60
+ buttonSpan.innerHTML += "<span class='loader-helper'> - </span><span class='loader'>generating</span>";
 
61
  }
62
  """
63
 
 
65
  def accordion_trigger_spinoza_end():
66
  return """
67
  function accordion_trigger_spinoza_end() {
68
+ var accordion_spinoza = document.querySelector('.spinoza-agent');
69
+ var buttonSpan = accordion_spinoza.querySelector('button > span');
70
+ buttonSpan.textContent = "Spinoza - ready";
71
  }
72
  """
spinoza_project/prompt_Spinoza.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ prompt:
2
+ [
3
+ "You are a factual journalist that summarizes the specialized answers from technical sources.",
4
+ "Based on the following question:",
5
+ "{question}",
6
+ "",
7
+ "And the following expert answers:",
8
+ "{answers}",
9
+ "",
10
+ "- When using legal answers, keep tracking of the name of the articles.",
11
+ "- When using ADEME answers, name the sources that are mainly used.",
12
+ "- List the different elements mentioned, and highlight the agreement points between the sources, as well as the contradictions or differences.",
13
+ "- Contradictions don't lie in whether or not a subject is dealt with, but more in the opinion given or the way the subject is dealt with.",
14
+ "- Generate the answer as markdown, with an aerated layout, and headlines in bold",
15
+ "- When you use information from a passage, mention where it came from by using [Doc i] at the end of the sentence. i stands for the number of the document.",
16
+ "- If the same thing is said in more than one document, you can mention all of them like this: [Doc i, Doc j, Doc k]",
17
+ "- Start by highlighting contradictions, then do a general summary and finally get into the details that might be interesting for article writing. Where relevant, quote them.",
18
+ "- Answer in French / Répond en Français"
19
+ ]
spinoza_project/source/frontend/gradio_utils.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import yaml
3
+ from langchain.prompts.chat import ChatPromptTemplate
4
+ from huggingface_hub import hf_hub_download
5
+ from spinoza_project.source.frontend.utils import (
6
+ make_html_source,
7
+ make_html_presse_source,
8
+ make_html_afp_source,
9
+ make_html_politique_source,
10
+ )
11
+ from spinoza_project.source.backend.prompt_utils import (
12
+ to_chat_instruction,
13
+ SpecialTokens,
14
+ )
15
+ from spinoza_project.source.backend.get_prompts import get_qa_prompts
16
+ from spinoza_project.source.backend.document_store import pickle_to_document_store
17
+
18
+
19
def get_config():
    """Load the application configuration.

    Reads ``./spinoza_project/config.yaml`` (relative to the current working
    directory) and parses it with ``yaml.full_load``.

    Returns:
        The object produced by ``yaml.full_load`` for that file.
    """
    # Decode explicitly as UTF-8: the locale default is platform dependent,
    # and tab names such as "Documents Stratégiques" are matched against
    # values read from this configuration elsewhere in this module.
    with open("./spinoza_project/config.yaml", encoding="utf-8") as f:
        return yaml.full_load(f)
22
+
23
+
24
def get_prompts(config):
    """Load one prompt definition file per configured source.

    Args:
        config: Mapping whose ``"prompt_naming"`` entry iterates over the
            source names to load.

    Returns:
        A dict mapping each source name to the parsed content of
        ``./spinoza_project/prompt_<source>.yaml``.
    """
    prompts = {}
    for source in config["prompt_naming"]:
        # Prompt files hold natural-language text; read them as UTF-8 rather
        # than relying on the platform's default encoding.
        with open(
            f"./spinoza_project/prompt_{source}.yaml", encoding="utf-8"
        ) as f:
            prompts[source] = yaml.full_load(f)
    return prompts
30
+
31
+
32
def set_prompts(prompts, config):
    """Build the QA and reformulation chat prompts for every source.

    Args:
        prompts: Mapping of source name to its loaded prompt definition.
        config: Application configuration, forwarded to ``get_qa_prompts``.

    Returns:
        A pair ``(qa_prompts, reformulation_prompts)``; both are dicts keyed
        by source name.
    """
    qa_by_source = {}
    reformulation_by_source = {}
    for name in prompts:
        # get_qa_prompts returns the (QA, reformulation) pair for one source.
        qa_prompt, reformulation_prompt = get_qa_prompts(config, prompts[name])
        qa_by_source[name] = qa_prompt
        reformulation_by_source[name] = reformulation_prompt
    return qa_by_source, reformulation_by_source
40
+
41
+
42
def get_assets():
    """Read the static front-end assets from ``./assets``.

    Returns:
        A pair ``(css, source_information)`` holding the text of
        ``./assets/style.css`` and ``./assets/source_information.md``.
    """
    # Explicit UTF-8 so non-ASCII characters in the assets decode the same
    # way regardless of the platform's default encoding.
    with open("./assets/style.css", "r", encoding="utf-8") as f:
        css = f.read()
    with open("./assets/source_information.md", "r", encoding="utf-8") as f:
        source_information = f.read()
    return css, source_information
48
+
49
+
50
def get_qdrants(config):
    """Download and load the pickled document store of every local source.

    Every name in ``config["prompt_naming"]`` except ``"Presse"`` and
    ``"AFP"`` is fetched as ``database_<name>.pickle`` from the
    ``SpinozaProject/spinoza-database`` dataset repository and handed to
    ``pickle_to_document_store``.

    Returns:
        A dict mapping source name to its loaded document store.
    """
    # NOTE(review): the helper name suggests the downloaded file is
    # unpickled; that is only safe while the dataset repository is trusted.
    stores = {}
    for name in config["prompt_naming"]:
        if name in ("Presse", "AFP"):
            continue
        local_path = hf_hub_download(
            repo_id="SpinozaProject/spinoza-database",
            filename=f"database_{name}.pickle",
            repo_type="dataset",
        )
        stores[name] = pickle_to_document_store(local_path)
    return stores
64
+
65
+
66
def get_theme():
    """Return the Gradio theme used by the application.

    The theme is ``gr.themes.Base`` with a blue primary hue, a red secondary
    hue and a font list that starts with the Google font "Poppins".
    """
    font_stack = [
        gr.themes.GoogleFont("Poppins"),
        "ui-sans-serif",
        "system-ui",
        "sans-serif",
    ]
    return gr.themes.Base(primary_hue="blue", secondary_hue="red", font=font_stack)
77
+
78
+
79
def get_init_prompt():
    """Return the Markdown greeting text with which Spinoza introduces itself.

    The text presents the assistant, states its limitations and invites the
    user to ask a question.
    """
    return """
Hello, I am Spinoza, a conversational assistant designed to help you in your journalistic journey. I will answer your questions based **on the provided sources**.

⚠️ Limitations
*Please note that this chatbot is in an early stage, it is not perfect and may sometimes give irrelevant answers. If you are not satisfied with the answer, please ask a more specific question or report your feedback to help us improve the system.*

What do you want to learn ?
"""
88
+
89
+
90
def get_synthesis_prompt(config):
    """Build the chat prompt template used for the final synthesis answer.

    Args:
        config: Application configuration, used to build ``SpecialTokens``.

    Returns:
        A ``ChatPromptTemplate`` made of the single message produced by
        ``to_chat_instruction`` from the content of
        ``./spinoza_project/prompt_Spinoza.yaml``.
    """
    special_tokens = SpecialTokens(config)
    # The file contains French text, so decode it explicitly as UTF-8.
    # NOTE(review): the file is used as raw text, not parsed as YAML, so the
    # YAML punctuation ends up in the instruction - confirm this is intended.
    with open("./spinoza_project/prompt_Spinoza.yaml", "r", encoding="utf-8") as f:
        synthesis_template = f.read()

    synthesis_prompt = to_chat_instruction(synthesis_template, special_tokens)
    synthesis_prompt_template = ChatPromptTemplate.from_messages([synthesis_prompt])

    return synthesis_prompt_template
99
+
100
+
101
def zip_longest_fill(*args, fillvalue=None):
    """Zip iterables of unequal length, padding with each one's last item.

    Unlike ``itertools.zip_longest``, an exhausted iterable keeps
    contributing the last item it produced. ``fillvalue`` is used only for an
    iterable that has not produced anything yet.

    Iteration stops once every iterable is exhausted. The round in which
    nothing new was produced is still yielded, so the final row appears
    twice, e.g. ``zip_longest_fill('AB', 'x')`` gives ``Ax Bx Bx``.

    Termination is based on exhaustion, not on comparing successive rows, so
    an iterable that legitimately repeats an item is not cut short.

    Args:
        *args: The iterables to combine.
        fillvalue: Placeholder for iterables that are empty from the start.

    Yields:
        One tuple per round, with one entry per input iterable.
    """
    iterators = [iter(it) for it in args]
    if not iterators:
        return

    latest = [fillvalue] * len(iterators)
    exhausted = [False] * len(iterators)
    while True:
        progressed = False
        for i, it in enumerate(iterators):
            if exhausted[i]:
                continue
            try:
                latest[i] = next(it)
            except StopIteration:
                exhausted[i] = True
            else:
                progressed = True

        yield tuple(latest)
        if not progressed:
            return
127
+
128
+
129
def start_agents():
    """Notify the user that generation started and return a placeholder chat.

    Returns:
        A one-element chat history: a ``(None, message)`` pair carrying the
        waiting message.
    """
    gr.Info(message="The agents and Spinoza are loading...", duration=3)

    waiting_message = (
        "I am waiting until all the agents are done to generate an answer..."
    )
    return [(None, waiting_message)]
135
+
136
+
137
def end_agents():
    """Show a Gradio info notification saying every agent has finished."""
    notice = "The agents and Spinoza have finished answering your question"
    gr.Info(message=notice, duration=3)
142
+
143
+
144
def next_call():
    """Do nothing and return ``None``."""
    return None
146
+
147
+
148
def format_question(question):
    """Return ``question`` formatted as a string with the default format."""
    return "{}".format(question)
150
+
151
+
152
def parse_question(question):
    """Strip paragraph tags and extract the text that follows a ``### `` marker.

    Args:
        question: Question text, possibly wrapped in ``<p>`` tags.

    Returns:
        The text between the first and second ``"### "`` markers when a
        marker is present, otherwise the tag-free text.
    """
    without_open = question.replace("<p>", "")
    cleaned = without_open.replace("</p>\n", "")
    if "### " not in cleaned:
        return cleaned
    # Index 1 is the segment that follows the first marker.
    return cleaned.split("### ", 2)[1]
157
+
158
+
159
def reformulate(llm, chat_reformulation_prompts, question, tab, config):
    """Stream a reformulation of ``question`` for one tab.

    Args:
        llm: Object exposing ``stream(prompt, variables)``.
        chat_reformulation_prompts: Mapping of source name to prompt.
        question: Raw question text.
        tab: Name of the tab being processed.
        config: Configuration with ``"tabs"`` and ``"source_mapping"`` entries.

    Returns:
        The result of ``llm.stream`` when ``tab`` is a configured tab,
        otherwise an iterator over five ``None`` values.
    """
    known_tabs = tuple(config["tabs"].keys())
    if tab not in known_tabs:
        return iter([None] * 5)

    prompt = chat_reformulation_prompts[config["source_mapping"][tab]]
    variables = {"question": parse_question(question)}
    return llm.stream(prompt, variables)
167
+
168
+
169
def add_question(question):
    """Return ``question`` unchanged."""
    result = question
    return result
171
+
172
+
173
def answer(llm, chat_qa_prompts, question, source, tab, config):
    """Stream the answer of one agent from its retrieved sources.

    Args:
        llm: Object exposing ``stream(prompt, variables)``.
        chat_qa_prompts: Mapping of source name to QA prompt.
        question: Raw question text.
        source: Text of the retrieved sources for this tab.
        tab: Name of the tab being processed.
        config: Configuration with ``"tabs"`` and ``"source_mapping"`` entries.

    Returns:
        An iterator over five ``None`` values when ``tab`` is not configured,
        an iterator over a single "no source found" message when ``source``
        is shorter than 10 characters, otherwise the result of ``llm.stream``.
    """
    known_tabs = tuple(config["tabs"].keys())
    if tab not in known_tabs:
        return iter([None] * 5)

    if len(source) < 10:
        return iter(["Aucune source trouvée, veuillez reformuler votre question"])

    prompt = chat_qa_prompts[config["source_mapping"][tab]]
    variables = {
        "question": parse_question(question),
        "sources": source.replace("<p>", "").replace("</p>\n", ""),
    }
    return llm.stream(prompt, variables)
188
+
189
+
190
def _strip_paragraph_tags(text):
    """Remove the paragraph tags wrapped around a question."""
    return text.replace("<p>", "").replace("</p>\n", "")


def _render_source(tab, doc, index, score, config):
    """Render one retrieved document with the HTML formatter of its tab."""
    if tab == "Presse":
        return make_html_presse_source(doc, index, score)
    if tab == "AFP":
        return make_html_afp_source(doc, index, score)
    if tab == "Documents Stratégiques":
        return make_html_politique_source(doc, index, score, config)
    return make_html_source(doc, index, score, config)


def get_sources(questions, qdrants, bdd_presse, bdd_afp, config):
    """Retrieve and format the documents relevant to each tab's question.

    Questions are paired positionally with the keys of ``config["tabs"]``.
    The "Presse" and "AFP" tabs query ``bdd_presse`` / ``bdd_afp`` with the
    bare question. Every other tab queries
    ``qdrants[config["source_mapping"][tab]]`` with
    ``config["query_preprompt"]`` prepended to the question. Hits scoring
    below ``config["min_similarity"]`` are dropped.

    Tab number ``i`` (zero-based) numbers its documents from ``k * i + 1``,
    where ``k`` is ``config["num_document_retrieved"]``.

    Args:
        questions: One question per tab, possibly wrapped in ``<p>`` tags.
        qdrants: Mapping of source name to document store.
        bdd_presse: Store queried for the "Presse" tab.
        bdd_afp: Store queried for the "AFP" tab.
        config: Application configuration.

    Returns:
        A pair ``(formated, text)``: the concatenated HTML of every kept
        document, and a list with one plain-text block per processed tab.
    """
    k = config["num_document_retrieved"]
    min_similarity = config["min_similarity"]
    formatted, text = [], []

    for i, (question, tab) in enumerate(zip(questions, config["tabs"].keys())):
        query = _strip_paragraph_tags(question)
        if tab == "Presse":
            store = bdd_presse
        elif tab == "AFP":
            store = bdd_afp
        else:
            store = qdrants[config["source_mapping"][tab]]
            query = config["query_preprompt"] + query

        hits = [
            (doc, score)
            for doc, score in store.similarity_search_with_relevance_scores(
                query, k=k
            )
            if score >= min_similarity
        ]
        # Number the documents after filtering; the range caps a tab at k
        # entries so numbering never spills into the next tab's slots.
        numbered = list(zip(range(k * i + 1, k * (i + 1) + 1), hits))

        formatted.extend(
            _render_source(tab, doc, j, score, config)
            for j, (doc, score) in numbered
        )
        text.append(
            "\n\n".join(
                f"Doc {j} with source type {doc.metadata.get('file_source_type')}:\n"
                + doc.page_content
                for j, (doc, _score) in numbered
            )
        )

    return "".join(formatted), text