add google search and updated prompt

Files changed:
- app.py +73 -0
- plagiarism.py +109 -0
- requirements.txt +3 -1
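In short: a new plagiarism.py module queries the Google Custom Search API for the article topic within a user-selected date range, scrapes the result pages asynchronously with httpx and BeautifulSoup, and app.py feeds the scraped text into the generation and regeneration prompts; the Gradio interface gains From/To date controls and a domain-skip dropdown.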
app.py
CHANGED
@@ -14,6 +14,8 @@ from scipy.special import softmax
 from collections import defaultdict
 import nltk
 from utils import remove_special_characters
+from plagiarism import google_search, months, domain_list, build_date
+from datetime import date
 
 # Check if CUDA is available
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -217,6 +219,8 @@ def ai_check(text: str, option: str):
 
 
 def generate_prompt(settings: Dict[str, str]) -> str:
+    content_string = "\n".join(f"{url.strip()}: \n{content.strip()}" for url, content in settings["sources"].items())
+
     prompt = f"""
     Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.
 
@@ -238,6 +242,9 @@ def generate_prompt(settings: Dict[str, str]) -> str:
     - End with a {settings['conclusion_type']} conclusion
     - Add a "References" section at the end with at least 3 credible sources, formatted as [1], [2], etc.
     - Do not make any headline, title bold.
+
+    Use the content here from the URLs I've found for you:
+    {content_string}
 
     Ensure proper paragraph breaks for better readability.
     Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
@@ -246,6 +253,8 @@ def generate_prompt(settings: Dict[str, str]) -> str:
 
 
 def regenerate_prompt(settings: Dict[str, str]) -> str:
+    content_string = "\n".join(f"{url.strip()}: \n{content.strip()}" for url, content in settings["sources"].items())
+
     prompt = f"""
     "{settings['generated_article']}"
 
@@ -256,6 +265,8 @@ def regenerate_prompt(settings: Dict[str, str]) -> str:
     - The original content should not be changed. Make minor modifications based on user comments above.
     - Keep the references the same as the given text in the same format.
     - Do not make any headline, title bold.
+    Use the content here from the URLs I've found for you:
+    {content_string}
 
     Ensure proper paragraph breaks for better readability.
     Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
@@ -277,10 +288,14 @@ def generate_article(
     num_examples: str,
     conclusion_type: str,
     ai_model: str,
+    sorted_date,
+    domains_to_skip,
     api_key: str = None,
     generated_article: str = None,
     user_comments: str = None,
 ) -> str:
+
+    url_content = google_search(topic, sorted_date, domains_to_skip)
     settings = {
         "topic": topic,
         "keywords": [k.strip() for k in keywords.split(",")],
@@ -294,6 +309,7 @@ def generate_article(
         "references": [r.strip() for r in references.split(",")],
         "num_examples": num_examples,
         "conclusion_type": conclusion_type,
+        "sources": url_content,
         "generated_article": generated_article,
         "user_comments": user_comments,
     }
@@ -390,9 +406,19 @@ def generate_and_format(
     conclusion_type,
     ai_model,
     api_key,
+    year_from,
+    month_from,
+    day_from,
+    year_to,
+    month_to,
+    day_to,
+    domains_to_skip,
     generated_article: str = None,
     user_comments: str = None,
 ):
+    date_from = build_date(year_from, month_from, day_from)
+    date_to = build_date(year_to, month_to, day_to)
+    sorted_date = f"date:r:{date_from}:{date_to}"
     article = generate_article(
         topic,
         keywords,
@@ -408,6 +434,8 @@ def generate_and_format(
         conclusion_type,
         ai_model,
         api_key,
+        sorted_date,
+        domains_to_skip,
         generated_article,
         user_comments,
     )
@@ -423,6 +451,10 @@ def create_interface():
         .input-highlight-pink block_label {background-color: #008080}
         """,
     ) as demo:
+        today = date.today()
+        # dd/mm/YY
+        d1 = today.strftime("%d/%B/%Y")
+        d1 = d1.split("/")
         gr.Markdown("# Polygraf AI Content Writer", elem_classes="text-center text-3xl mb-6")
 
         with gr.Row():
@@ -547,6 +579,33 @@ def create_interface():
                         label="Conclusion Type",
                         elem_classes="input-highlight-turquoise",
                     )
+                    with gr.Group():
+                        with gr.Row():
+                            month_from = gr.Dropdown(
+                                choices=months,
+                                label="From Month",
+                                value="January",
+                                interactive=True,
+                            )
+                            day_from = gr.Textbox(label="From Day", value="01")
+                            year_from = gr.Textbox(label="From Year", value="2000")
+
+                        with gr.Row():
+                            month_to = gr.Dropdown(
+                                choices=months,
+                                label="To Month",
+                                value=d1[1],
+                                interactive=True,
+                            )
+                            day_to = gr.Textbox(label="To Day", value=d1[0])
+                            year_to = gr.Textbox(label="To Year", value=d1[2])
+
+                        with gr.Row():
+                            domains_to_skip = gr.Dropdown(
+                                domain_list,
+                                multiselect=True,
+                                label="Domain To Skip",
+                            )
 
             with gr.Group():
                 gr.Markdown("## AI Model Configuration", elem_classes="text-xl mb-4")
@@ -641,6 +700,13 @@ def create_interface():
                 input_conclusion,
                 ai_generator,
                 input_api,
+                year_from,
+                month_from,
+                day_from,
+                year_to,
+                month_to,
+                day_to,
+                domains_to_skip,
             ],
             outputs=[output_article],
         )
@@ -662,6 +728,13 @@ def create_interface():
                 input_conclusion,
                 ai_generator,
                 input_api,
+                year_from,
+                month_from,
+                day_from,
+                year_to,
+                month_to,
+                day_to,
+                domains_to_skip,
                 output_article,
                 ai_comments,
             ],
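For context on the app.py changes: the sorted_date string assembled in generate_and_format follows the Custom Search API's date-range sort syntax, sort=date:r:YYYYMMDD:YYYYMMDD. A minimal sketch of how the pieces combine, using build_date from the new plagiarism.py below (the dates are arbitrary examples):

    # Sketch: assemble the date-range restrict passed as sort= to the CSE call.
    months = {"January": "01", "March": "03"}  # excerpt of the full mapping

    def build_date(year=2024, month="March", day=1):
        return f"{year}{months[month]}{day}"

    date_from = build_date("2000", "January", "01")  # -> "20000101"
    date_to = build_date("2024", "March", "15")      # -> "20240315"
    sorted_date = f"date:r:{date_from}:{date_to}"    # -> "date:r:20000101:20240315"

Note that the UI supplies zero-padded day strings (the Textbox default "01" and today's %d), which keeps the YYYYMMDD values well-formed.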
plagiarism.py
ADDED
@@ -0,0 +1,109 @@
+import time
+from googleapiclient.discovery import build
+import asyncio
+import httpx
+from bs4 import BeautifulSoup
+
+
+months = {
+    "January": "01",
+    "February": "02",
+    "March": "03",
+    "April": "04",
+    "May": "05",
+    "June": "06",
+    "July": "07",
+    "August": "08",
+    "September": "09",
+    "October": "10",
+    "November": "11",
+    "December": "12",
+}
+
+domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]
+
+
+def build_date(year=2024, month="March", day=1):
+    return f"{year}{months[month]}{day}"
+
+
+async def get_url_data(url, client):
+    try:
+        r = await client.get(url)
+        if r.status_code == 200:
+            soup = BeautifulSoup(r.content, "html.parser")
+            return soup
+    except Exception:
+        return None
+
+
+async def parallel_scrap(urls):
+    async with httpx.AsyncClient(timeout=30) as client:
+        tasks = []
+        for url in urls:
+            tasks.append(get_url_data(url=url, client=client))
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+        return results
+
+
+def google_search_urls(
+    text,
+    sorted_date,
+    domains_to_skip,
+    api_key,
+    cse_id,
+    **kwargs,
+):
+    service = build("customsearch", "v1", developerKey=api_key)
+    num_pages = 5
+    results = service.cse().list(q=text, cx=cse_id, sort=sorted_date, **kwargs).execute()
+    url_list = []
+    if "items" in results and len(results["items"]) > 0:
+        for count, link in enumerate(results["items"]):
+            if count >= num_pages:
+                break
+            # skip user selected domains
+            if (domains_to_skip is not None) and any(("." + domain) in link["link"] for domain in domains_to_skip):
+                continue
+            url = link["link"]
+            if url not in url_list:
+                url_list.append(url)
+    return url_list
+
+
+def google_search(
+    input,
+    sorted_date,
+    domains_to_skip,
+):
+    # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
+    api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
+    # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
+    # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
+    # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
+    # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
+    # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
+    cse_id = "851813e81162b4ed4"
+
+    # get list of URLS to check
+    start_time = time.perf_counter()
+    url_list = google_search_urls(
+        input,
+        sorted_date,
+        domains_to_skip,
+        api_key,
+        cse_id,
+    )
+    print("GOOGLE SEARCH PROCESSING TIME: ", time.perf_counter() - start_time)
+    # Scrape URLs in list
+    start_time = time.perf_counter()
+    soups = asyncio.run(parallel_scrap(url_list))
+    print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time)
+    result_content = {}
+    for url, soup in zip(url_list, soups):
+        if soup:
+            result_content[url] = soup.text
+    # for key, value in result_content.items():
+    #     print("-------------------URL: ", key)
+    #     print(value[:30])
+    return result_content
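A minimal usage sketch for the new module, assuming the API key and cse_id hardcoded in google_search are valid (the topic and date range below are arbitrary examples):

    from plagiarism import google_search, build_date

    # Restrict results to pages dated between the two endpoints.
    sorted_date = f"date:r:{build_date('2020', 'January', '01')}:{build_date('2024', 'March', '15')}"

    # Returns {url: page_text} for up to 5 results, skipping .gov and .mil domains.
    url_content = google_search("quantum computing", sorted_date, ["gov", "mil"])
    for url, text in url_content.items():
        print(url, "->", text[:80])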
requirements.txt
CHANGED
@@ -8,4 +8,6 @@ openai
 groq
 language_tool_python
 scipy
-Unidecode
+Unidecode
+BeautifulSoup4
+google-api-python-client