minko186 committed
Commit 70d74f0 · Parent: 89644d7

add google search and updated prompt

Files changed (3):
1. app.py (+73, -0)
2. plagiarism.py (+109, -0)
3. requirements.txt (+3, -1)
app.py CHANGED
@@ -14,6 +14,8 @@ from scipy.special import softmax
 from collections import defaultdict
 import nltk
 from utils import remove_special_characters
+from plagiarism import google_search, months, domain_list, build_date
+from datetime import date

 # Check if CUDA is available
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
@@ -217,6 +219,8 @@ def ai_check(text: str, option: str):


 def generate_prompt(settings: Dict[str, str]) -> str:
+    content_string = "\n".join(f"{url.strip()}: \n{content.strip()}" for url, content in settings["sources"].items())
+
     prompt = f"""
     Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.
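For context, the new `content_string` flattens the scraped `sources` mapping (built by `google_search`, added in plagiarism.py below) into url-plus-text blocks that get appended to the prompt. A minimal sketch with hypothetical data:

```python
# Hypothetical scraped sources, shaped like the dict google_search() returns.
sources = {
    "https://example.com/a": "Text of page A...",
    "https://example.org/b": "Text of page B...",
}
content_string = "\n".join(f"{url.strip()}: \n{content.strip()}" for url, content in sources.items())
print(content_string)
# https://example.com/a:
# Text of page A...
# https://example.org/b:
# Text of page B...
```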
 
 
@@ -238,6 +242,9 @@ def generate_prompt(settings: Dict[str, str]) -> str:
     - End with a {settings['conclusion_type']} conclusion
     - Add a "References" section at the end with at least 3 credible sources, formatted as [1], [2], etc.
     - Do not make any headline, title bold.
+
+    Use the content here from the URLs I've found for you:
+    {content_string}

     Ensure proper paragraph breaks for better readability.
     Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
 
@@ -246,6 +253,8 @@ def generate_prompt(settings: Dict[str, str]) -> str:


 def regenerate_prompt(settings: Dict[str, str]) -> str:
+    content_string = "\n".join(f"{url.strip()}: \n{content.strip()}" for url, content in settings["sources"].items())
+
     prompt = f"""
     "{settings['generated_article']}"

@@ -256,6 +265,8 @@ def regenerate_prompt(settings: Dict[str, str]) -> str:
     - The original content should not be changed. Make minor modifications based on user comments above.
     - Keep the references the same as the given text in the same format.
     - Do not make any headline, title bold.
+    Use the content here from the URLs I've found for you:
+    {content_string}

     Ensure proper paragraph breaks for better readability.
     Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
 
@@ -277,10 +288,14 @@ def generate_article(
     num_examples: str,
     conclusion_type: str,
     ai_model: str,
+    sorted_date,
+    domains_to_skip,
     api_key: str = None,
     generated_article: str = None,
     user_comments: str = None,
 ) -> str:
+
+    url_content = google_search(topic, sorted_date, domains_to_skip)
     settings = {
         "topic": topic,
         "keywords": [k.strip() for k in keywords.split(",")],
@@ -294,6 +309,7 @@ def generate_article(
         "references": [r.strip() for r in references.split(",")],
         "num_examples": num_examples,
         "conclusion_type": conclusion_type,
+        "sources": url_content,
         "generated_article": generated_article,
         "user_comments": user_comments,
     }
 
@@ -390,9 +406,19 @@ def generate_and_format(
     conclusion_type,
     ai_model,
     api_key,
+    year_from,
+    month_from,
+    day_from,
+    year_to,
+    month_to,
+    day_to,
+    domains_to_skip,
     generated_article: str = None,
     user_comments: str = None,
 ):
+    date_from = build_date(year_from, month_from, day_from)
+    date_to = build_date(year_to, month_to, day_to)
+    sorted_date = f"date:r:{date_from}:{date_to}"
     article = generate_article(
         topic,
         keywords,
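The `sorted_date` string follows the Custom Search JSON API's date-restrict sort syntax (`date:r:YYYYMMDD:YYYYMMDD`). A quick sketch of what the new lines produce, assuming the UI's default "From" values and an illustrative end date:

```python
from plagiarism import build_date

date_from = build_date("2000", "January", "01")  # -> "20000101"
date_to = build_date("2024", "March", "15")      # -> "20240315" (illustrative)
sorted_date = f"date:r:{date_from}:{date_to}"
print(sorted_date)  # date:r:20000101:20240315
```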
 
@@ -408,6 +434,8 @@ def generate_and_format(
         conclusion_type,
         ai_model,
         api_key,
+        sorted_date,
+        domains_to_skip,
         generated_article,
         user_comments,
     )
 
@@ -423,6 +451,10 @@ def create_interface():
         .input-highlight-pink block_label {background-color: #008080}
         """,
     ) as demo:
+        today = date.today()
+        # dd/MonthName/YYYY, e.g. "01/March/2024"
+        d1 = today.strftime("%d/%B/%Y")
+        d1 = d1.split("/")
         gr.Markdown("# Polygraf AI Content Writer", elem_classes="text-center text-3xl mb-6")

         with gr.Row():
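For reference, `d1` ends up as a `[day, month-name, year]` list used to pre-fill the "To" date fields added below; a quick sketch (output depends on the current date):

```python
from datetime import date

today = date.today()                        # e.g. 2024-03-01
d1 = today.strftime("%d/%B/%Y").split("/")  # e.g. ['01', 'March', '2024']
# d1[0] -> day, d1[1] -> month name, d1[2] -> year
```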
 
@@ -547,6 +579,33 @@ def create_interface():
                     label="Conclusion Type",
                     elem_classes="input-highlight-turquoise",
                 )
+                with gr.Group():
+                    with gr.Row():
+                        month_from = gr.Dropdown(
+                            choices=months,
+                            label="From Month",
+                            value="January",
+                            interactive=True,
+                        )
+                        day_from = gr.Textbox(label="From Day", value="01")
+                        year_from = gr.Textbox(label="From Year", value="2000")
+
+                    with gr.Row():
+                        month_to = gr.Dropdown(
+                            choices=months,
+                            label="To Month",
+                            value=d1[1],
+                            interactive=True,
+                        )
+                        day_to = gr.Textbox(label="To Day", value=d1[0])
+                        year_to = gr.Textbox(label="To Year", value=d1[2])
+
+                    with gr.Row():
+                        domains_to_skip = gr.Dropdown(
+                            domain_list,
+                            multiselect=True,
+                            label="Domain To Skip",
+                        )

                 with gr.Group():
                     gr.Markdown("## AI Model Configuration", elem_classes="text-xl mb-4")
 
@@ -641,6 +700,13 @@ def create_interface():
                 input_conclusion,
                 ai_generator,
                 input_api,
+                year_from,
+                month_from,
+                day_from,
+                year_to,
+                month_to,
+                day_to,
+                domains_to_skip,
             ],
             outputs=[output_article],
         )
 
@@ -662,6 +728,13 @@ def create_interface():
                 input_conclusion,
                 ai_generator,
                 input_api,
+                year_from,
+                month_from,
+                day_from,
+                year_to,
+                month_to,
+                day_to,
+                domains_to_skip,
                 output_article,
                 ai_comments,
             ],
plagiarism.py ADDED
@@ -0,0 +1,109 @@
+import time
+from googleapiclient.discovery import build
+import asyncio
+import httpx
+from bs4 import BeautifulSoup
+
+
+months = {
+    "January": "01",
+    "February": "02",
+    "March": "03",
+    "April": "04",
+    "May": "05",
+    "June": "06",
+    "July": "07",
+    "August": "08",
+    "September": "09",
+    "October": "10",
+    "November": "11",
+    "December": "12",
+}
+
+domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]
+
+
+def build_date(year=2024, month="March", day=1):
+    return f"{year}{months[month]}{day}"
+
+
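One caveat worth flagging: `build_date` does not zero-pad the day, so an integer day below 10 would yield a seven-digit stamp (e.g. `2024031`). The UI passes already-padded strings like `"01"`, so it works in practice; a padded variant for illustration (hypothetical helper, not in the commit):

```python
from plagiarism import months

def build_date_padded(year=2024, month="March", day=1):
    # Zero-pad the day so both int and str inputs yield YYYYMMDD.
    return f"{year}{months[month]}{int(day):02d}"

print(build_date_padded(2024, "March", 1))  # 20240301
```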
+async def get_url_data(url, client):
+    try:
+        r = await client.get(url)
+        if r.status_code == 200:
+            soup = BeautifulSoup(r.content, "html.parser")
+            return soup
+    except Exception:
+        return None
+
+
+async def parallel_scrap(urls):
+    async with httpx.AsyncClient(timeout=30) as client:
+        tasks = []
+        for url in urls:
+            tasks.append(get_url_data(url=url, client=client))
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+        return results
+
+
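The scraper can be exercised on its own; a minimal sketch (URLs are illustrative). Since `get_url_data` swallows exceptions and returns `None`, the `isinstance` guard below is belt-and-braces alongside `return_exceptions=True`:

```python
import asyncio
from plagiarism import parallel_scrap

urls = ["https://example.com", "https://example.org"]  # illustrative
soups = asyncio.run(parallel_scrap(urls))
for url, soup in zip(urls, soups):
    if soup is not None and not isinstance(soup, Exception):
        print(url, len(soup.text))  # length of extracted page text
```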
+def google_search_urls(
+    text,
+    sorted_date,
+    domains_to_skip,
+    api_key,
+    cse_id,
+    **kwargs,
+):
+    service = build("customsearch", "v1", developerKey=api_key)
+    num_pages = 5
+    results = service.cse().list(q=text, cx=cse_id, sort=sorted_date, **kwargs).execute()
+    url_list = []
+    if "items" in results and len(results["items"]) > 0:
+        for count, link in enumerate(results["items"]):
+            if count >= num_pages:
+                break
+            # skip user selected domains
+            if (domains_to_skip is not None) and any(("." + domain) in link["link"] for domain in domains_to_skip):
+                continue
+            url = link["link"]
+            if url not in url_list:
+                url_list.append(url)
+    return url_list
+
+
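Note how the skip filter works: it substring-matches `"." + domain` anywhere in the URL, so skipping `"com"` would also drop hosts like `example.com.br`. A standalone sketch of the check (URLs are illustrative):

```python
domains_to_skip = ["gov", "edu"]  # as chosen in the new dropdown
links = [
    "https://www.nasa.gov/mission",
    "https://news.example.com/article",
    "https://www.mit.edu/research",
]
kept = [u for u in links if not any(("." + d) in u for d in domains_to_skip)]
print(kept)  # ['https://news.example.com/article']
```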
+def google_search(
+    input,
+    sorted_date,
+    domains_to_skip,
+):
+    # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
+    api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
+    # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
+    # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
+    # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
+    # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
+    # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
+    cse_id = "851813e81162b4ed4"
+
+    # get list of URLs to check
+    start_time = time.perf_counter()
+    url_list = google_search_urls(
+        input,
+        sorted_date,
+        domains_to_skip,
+        api_key,
+        cse_id,
+    )
+    print("GOOGLE SEARCH PROCESSING TIME: ", time.perf_counter() - start_time)
+    # Scrape URLs in list
+    start_time = time.perf_counter()
+    soups = asyncio.run(parallel_scrap(url_list))
+    print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time)
+    result_content = {}
+    for url, soup in zip(url_list, soups):
+        if soup:
+            result_content[url] = soup.text
+    # for key, value in result_content.items():
+    #     print("-------------------URL: ", key)
+    #     print(value[:30])
+    return result_content
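End to end, the module turns a query into a `{url: page_text}` dict, which app.py stores under `settings["sources"]`. A minimal sketch of the call (topic and dates are illustrative; actually running it requires a valid API key and CSE id in the module):

```python
from plagiarism import build_date, google_search

sorted_date = f"date:r:{build_date('2000', 'January', '01')}:{build_date('2024', 'March', '15')}"
sources = google_search("quantum computing", sorted_date, domains_to_skip=["gov"])
for url, text in sources.items():
    print(url, text[:80])  # first 80 characters of each scraped page
```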
requirements.txt CHANGED
@@ -8,4 +8,6 @@ openai
 groq
 language_tool_python
 scipy
-Unidecode
+Unidecode
+BeautifulSoup4
+google-api-python-client