minko186 commited on
Commit
f14cff1
·
1 Parent(s): c85110b

Make Google Search optional via a checkbox, add a role/occupation field, and switch the domain filter from "domains to skip" to "domains to include"

Browse files
Files changed (2) hide show
  1. app.py +41 -26
  2. plagiarism.py +18 -9
app.py CHANGED
@@ -219,11 +219,9 @@ def ai_check(text: str, option: str):
219
 
220
 
221
  def generate_prompt(settings: Dict[str, str]) -> str:
222
- content_string = "\n".join(
223
- f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in settings["sources"].items()
224
- )
225
-
226
  prompt = f"""
 
 
227
  Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.
228
 
229
  Style and Tone:
@@ -244,9 +242,7 @@ def generate_prompt(settings: Dict[str, str]) -> str:
244
  - End with a {settings['conclusion_type']} conclusion
245
  - Add a "References" section at the end with at least 3 credible sources, formatted as [1], [2], etc.
246
  - Do not make any headline, title bold.
247
-
248
- Use the content here from the URLs I've found for you:
249
- {content_string}
250
 
251
  Ensure proper paragraph breaks for better readability.
252
  Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
@@ -255,11 +251,9 @@ def generate_prompt(settings: Dict[str, str]) -> str:
255
 
256
 
257
  def regenerate_prompt(settings: Dict[str, str]) -> str:
258
- content_string = "\n".join(
259
- f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in settings["sources"].items()
260
- )
261
-
262
  prompt = f"""
 
 
263
  "{settings['generated_article']}"
264
 
265
  Edit the given text based on user comments.
@@ -269,8 +263,7 @@ def regenerate_prompt(settings: Dict[str, str]) -> str:
269
  - The original content should not be changed. Make minor modifications based on user comments above.
270
  - Keep the references the same as the given text in the same format.
271
  - Do not make any headline, title bold.
272
- Use the content here from the URLs I've found for you:
273
- {content_string}
274
 
275
  Ensure proper paragraph breaks for better readability.
276
  Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
@@ -279,6 +272,7 @@ def regenerate_prompt(settings: Dict[str, str]) -> str:
279
 
280
 
281
  def generate_article(
 
282
  topic: str,
283
  keywords: str,
284
  article_length: str,
@@ -292,15 +286,13 @@ def generate_article(
292
  num_examples: str,
293
  conclusion_type: str,
294
  ai_model: str,
295
- sorted_date,
296
- domains_to_skip,
297
  api_key: str = None,
298
  generated_article: str = None,
299
  user_comments: str = None,
300
  ) -> str:
301
-
302
- url_content = google_search(topic, sorted_date, domains_to_skip)
303
  settings = {
 
304
  "topic": topic,
305
  "keywords": [k.strip() for k in keywords.split(",")],
306
  "article_length": article_length,
@@ -313,7 +305,7 @@ def generate_article(
313
  "references": [r.strip() for r in references.split(",")],
314
  "num_examples": num_examples,
315
  "conclusion_type": conclusion_type,
316
- "sources": url_content,
317
  "generated_article": generated_article,
318
  "user_comments": user_comments,
319
  }
@@ -379,7 +371,11 @@ def format_references(text: str) -> str:
379
  in_references = False
380
 
381
  for line in lines:
382
- if line.strip().lower() == "references" or line.strip().lower() == "references:":
 
 
 
 
383
  in_references = True
384
  continue
385
  if in_references:
@@ -396,6 +392,7 @@ def format_references(text: str) -> str:
396
 
397
 
398
  def generate_and_format(
 
399
  topic,
400
  keywords,
401
  article_length,
@@ -410,20 +407,29 @@ def generate_and_format(
410
  conclusion_type,
411
  ai_model,
412
  api_key,
 
413
  year_from,
414
  month_from,
415
  day_from,
416
  year_to,
417
  month_to,
418
  day_to,
419
- domains_to_skip,
420
  generated_article: str = None,
421
  user_comments: str = None,
422
  ):
423
  date_from = build_date(year_from, month_from, day_from)
424
  date_to = build_date(year_to, month_to, day_to)
425
  sorted_date = f"date:r:{date_from}:{date_to}"
 
 
 
 
 
 
 
426
  article = generate_article(
 
427
  topic,
428
  keywords,
429
  article_length,
@@ -437,9 +443,8 @@ def generate_and_format(
437
  num_examples,
438
  conclusion_type,
439
  ai_model,
 
440
  api_key,
441
- sorted_date,
442
- domains_to_skip,
443
  generated_article,
444
  user_comments,
445
  )
@@ -465,6 +470,7 @@ def create_interface():
465
  with gr.Column(scale=2):
466
  with gr.Group():
467
  gr.Markdown("## Article Configuration", elem_classes="text-xl mb-4")
 
468
  input_topic = gr.Textbox(
469
  label="Topic",
470
  placeholder="Enter the main topic of your article",
@@ -585,6 +591,10 @@ def create_interface():
585
  )
586
  gr.Markdown("# Search Options", elem_classes="text-center text-3xl mb-6")
587
  with gr.Group():
 
 
 
 
588
  with gr.Row():
589
  month_from = gr.Dropdown(
590
  choices=months,
@@ -606,10 +616,11 @@ def create_interface():
606
  year_to = gr.Textbox(label="To Year", value=d1[2])
607
 
608
  with gr.Row():
609
- domains_to_skip = gr.Dropdown(
610
  domain_list,
 
611
  multiselect=True,
612
- label="Domain To Skip",
613
  )
614
 
615
  with gr.Group():
@@ -691,6 +702,7 @@ def create_interface():
691
  generate_btn.click(
692
  fn=generate_and_format,
693
  inputs=[
 
694
  input_topic,
695
  input_keywords,
696
  input_length,
@@ -705,13 +717,14 @@ def create_interface():
705
  input_conclusion,
706
  ai_generator,
707
  input_api,
 
708
  year_from,
709
  month_from,
710
  day_from,
711
  year_to,
712
  month_to,
713
  day_to,
714
- domains_to_skip,
715
  ],
716
  outputs=[output_article],
717
  )
@@ -719,6 +732,7 @@ def create_interface():
719
  regenerate_btn.click(
720
  fn=generate_and_format,
721
  inputs=[
 
722
  input_topic,
723
  input_keywords,
724
  input_length,
@@ -733,13 +747,14 @@ def create_interface():
733
  input_conclusion,
734
  ai_generator,
735
  input_api,
 
736
  year_from,
737
  month_from,
738
  day_from,
739
  year_to,
740
  month_to,
741
  day_to,
742
- domains_to_skip,
743
  output_article,
744
  ai_comments,
745
  ],
 
219
 
220
 
221
  def generate_prompt(settings: Dict[str, str]) -> str:
 
 
 
 
222
  prompt = f"""
223
+ I am a {settings['role']}
224
+
225
  Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.
226
 
227
  Style and Tone:
 
242
  - End with a {settings['conclusion_type']} conclusion
243
  - Add a "References" section at the end with at least 3 credible sources, formatted as [1], [2], etc.
244
  - Do not make any headline, title bold.
245
+ {settings['sources']}
 
 
246
 
247
  Ensure proper paragraph breaks for better readability.
248
  Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
 
251
 
252
 
253
  def regenerate_prompt(settings: Dict[str, str]) -> str:
 
 
 
 
254
  prompt = f"""
255
+ I am a {settings['role']}
256
+
257
  "{settings['generated_article']}"
258
 
259
  Edit the given text based on user comments.
 
263
  - The original content should not be changed. Make minor modifications based on user comments above.
264
  - Keep the references the same as the given text in the same format.
265
  - Do not make any headline, title bold.
266
+ {settings['sources']}
 
267
 
268
  Ensure proper paragraph breaks for better readability.
269
  Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
 
272
 
273
 
274
  def generate_article(
275
+ input_role: str,
276
  topic: str,
277
  keywords: str,
278
  article_length: str,
 
286
  num_examples: str,
287
  conclusion_type: str,
288
  ai_model: str,
289
+ content_string: str,
 
290
  api_key: str = None,
291
  generated_article: str = None,
292
  user_comments: str = None,
293
  ) -> str:
 
 
294
  settings = {
295
+ "role": input_role,
296
  "topic": topic,
297
  "keywords": [k.strip() for k in keywords.split(",")],
298
  "article_length": article_length,
 
305
  "references": [r.strip() for r in references.split(",")],
306
  "num_examples": num_examples,
307
  "conclusion_type": conclusion_type,
308
+ "sources": content_string,
309
  "generated_article": generated_article,
310
  "user_comments": user_comments,
311
  }
 
371
  in_references = False
372
 
373
  for line in lines:
374
+ if (
375
+ line.strip().lower() == "references"
376
+ or line.strip().lower() == "references:"
377
+ or line.strip().lower().startswith("references:")
378
+ ):
379
  in_references = True
380
  continue
381
  if in_references:
 
392
 
393
 
394
  def generate_and_format(
395
+ input_role,
396
  topic,
397
  keywords,
398
  article_length,
 
407
  conclusion_type,
408
  ai_model,
409
  api_key,
410
+ google_search_check,
411
  year_from,
412
  month_from,
413
  day_from,
414
  year_to,
415
  month_to,
416
  day_to,
417
+ domains_to_include,
418
  generated_article: str = None,
419
  user_comments: str = None,
420
  ):
421
  date_from = build_date(year_from, month_from, day_from)
422
  date_to = build_date(year_to, month_to, day_to)
423
  sorted_date = f"date:r:{date_from}:{date_to}"
424
+ content_string = ""
425
+ if google_search_check:
426
+ url_content = google_search(topic, sorted_date, domains_to_include)
427
+ content_string = "\n".join(
428
+ f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in url_content.items()
429
+ )
430
+ content_string = "Use the trusted information here from the URLs I've found for you:\n" + content_string
431
  article = generate_article(
432
+ input_role,
433
  topic,
434
  keywords,
435
  article_length,
 
443
  num_examples,
444
  conclusion_type,
445
  ai_model,
446
+ content_string,
447
  api_key,
 
 
448
  generated_article,
449
  user_comments,
450
  )
 
470
  with gr.Column(scale=2):
471
  with gr.Group():
472
  gr.Markdown("## Article Configuration", elem_classes="text-xl mb-4")
473
+ input_role = gr.Textbox(label="I am a", placeholder="Enter your role", value="Student")
474
  input_topic = gr.Textbox(
475
  label="Topic",
476
  placeholder="Enter the main topic of your article",
 
591
  )
592
  gr.Markdown("# Search Options", elem_classes="text-center text-3xl mb-6")
593
  with gr.Group():
594
+ with gr.Row():
595
+ google_search_check = gr.Checkbox(
596
+ label="Enable Google Search For Recent Sources", value=True
597
+ )
598
  with gr.Row():
599
  month_from = gr.Dropdown(
600
  choices=months,
 
616
  year_to = gr.Textbox(label="To Year", value=d1[2])
617
 
618
  with gr.Row():
619
+ domains_to_include = gr.Dropdown(
620
  domain_list,
621
+ value=domain_list,
622
  multiselect=True,
623
+ label="Domains To Include",
624
  )
625
 
626
  with gr.Group():
 
702
  generate_btn.click(
703
  fn=generate_and_format,
704
  inputs=[
705
+ input_role,
706
  input_topic,
707
  input_keywords,
708
  input_length,
 
717
  input_conclusion,
718
  ai_generator,
719
  input_api,
720
+ google_search_check,
721
  year_from,
722
  month_from,
723
  day_from,
724
  year_to,
725
  month_to,
726
  day_to,
727
+ domains_to_include,
728
  ],
729
  outputs=[output_article],
730
  )
 
732
  regenerate_btn.click(
733
  fn=generate_and_format,
734
  inputs=[
735
+ input_role,
736
  input_topic,
737
  input_keywords,
738
  input_length,
 
747
  input_conclusion,
748
  ai_generator,
749
  input_api,
750
+ google_search_check,
751
  year_from,
752
  month_from,
753
  day_from,
754
  year_to,
755
  month_to,
756
  day_to,
757
+ domains_to_include,
758
  output_article,
759
  ai_comments,
760
  ],
plagiarism.py CHANGED
@@ -61,10 +61,18 @@ async def parallel_scrap(urls):
61
  return results
62
 
63
 
 
 
 
 
 
 
 
 
64
  def google_search_urls(
65
  text,
66
  sorted_date,
67
- domains_to_skip,
68
  api_key,
69
  cse_id,
70
  **kwargs,
@@ -75,7 +83,9 @@ def google_search_urls(
75
  if "items" in results and len(results["items"]) > 0:
76
  for count, link in enumerate(results["items"]):
77
  # skip user selected domains
78
- if (domains_to_skip is not None) and any(("." + domain) in link["link"] for domain in domains_to_skip):
 
 
79
  continue
80
  url = link["link"]
81
  if url not in url_list:
@@ -84,25 +94,24 @@ def google_search_urls(
84
 
85
 
86
  def google_search(
87
- input,
88
  sorted_date,
89
- domains_to_skip,
90
  ):
91
  # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
92
- api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
93
- # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
94
  # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
95
  # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
96
  # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
97
  # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
98
  cse_id = "851813e81162b4ed4"
99
-
100
  # get list of URLS to check
101
  start_time = time.perf_counter()
102
  url_list = google_search_urls(
103
- input,
104
  sorted_date,
105
- domains_to_skip,
106
  api_key,
107
  cse_id,
108
  )
 
61
  return results
62
 
63
 
64
+ def scrap(urls):
65
+ client = httpx.Client()
66
+ soups = []
67
+ for url in urls:
68
+ soups.append(get_url_data(url=url, client=client))
69
+ return soups
70
+
71
+
72
  def google_search_urls(
73
  text,
74
  sorted_date,
75
+ domains_to_include,
76
  api_key,
77
  cse_id,
78
  **kwargs,
 
83
  if "items" in results and len(results["items"]) > 0:
84
  for count, link in enumerate(results["items"]):
85
  # skip user selected domains
86
+ if (domains_to_include is None) or not any(
87
+ ("." + domain) in link["link"] for domain in domains_to_include
88
+ ):
89
  continue
90
  url = link["link"]
91
  if url not in url_list:
 
94
 
95
 
96
  def google_search(
97
+ topic,
98
  sorted_date,
99
+ domains_to_include,
100
  ):
101
  # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
102
+ # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
103
+ api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
104
  # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
105
  # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
106
  # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
107
  # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
108
  cse_id = "851813e81162b4ed4"
 
109
  # get list of URLS to check
110
  start_time = time.perf_counter()
111
  url_list = google_search_urls(
112
+ topic,
113
  sorted_date,
114
+ domains_to_include,
115
  api_key,
116
  cse_id,
117
  )