minko186 committed
Commit 70d74f0 · Parent: 89644d7

add google search and updated prompt

Files changed (3):
1. app.py (+73, -0)
2. plagiarism.py (+109, -0)
3. requirements.txt (+3, -1)
app.py CHANGED
@@ -14,6 +14,8 @@ from scipy.special import softmax
 from collections import defaultdict
 import nltk
 from utils import remove_special_characters
+from plagiarism import google_search, months, domain_list, build_date
+from datetime import date

 # Check if CUDA is available
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
@@ -217,6 +219,8 @@ def ai_check(text: str, option: str):


 def generate_prompt(settings: Dict[str, str]) -> str:
+    content_string = "\n".join(f"{url.strip()}: \n{content.strip()}" for url, content in settings["sources"].items())
+
     prompt = f"""
     Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.
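For context, the new `content_string` flattens the scraped `sources` mapping (built by `google_search`, added in plagiarism.py below) into url-plus-text blocks that get appended to the prompt. A minimal sketch with hypothetical data:

```python
# Hypothetical scraped sources, shaped like the dict google_search() returns.
sources = {
    "https://example.com/a": "Text of page A...",
    "https://example.org/b": "Text of page B...",
}
content_string = "\n".join(f"{url.strip()}: \n{content.strip()}" for url, content in sources.items())
print(content_string)
# https://example.com/a:
# Text of page A...
# https://example.org/b:
# Text of page B...
```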
 
 
@@ -238,6 +242,9 @@ def generate_prompt(settings: Dict[str, str]) -> str:
     - End with a {settings['conclusion_type']} conclusion
     - Add a "References" section at the end with at least 3 credible sources, formatted as [1], [2], etc.
     - Do not make any headline, title bold.
+
+    Use the content here from the URLs I've found for you:
+    {content_string}

     Ensure proper paragraph breaks for better readability.
     Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
 
@@ -246,6 +253,8 @@ def generate_prompt(settings: Dict[str, str]) -> str:


 def regenerate_prompt(settings: Dict[str, str]) -> str:
+    content_string = "\n".join(f"{url.strip()}: \n{content.strip()}" for url, content in settings["sources"].items())
+
     prompt = f"""
     "{settings['generated_article']}"

@@ -256,6 +265,8 @@ def regenerate_prompt(settings: Dict[str, str]) -> str:
     - The original content should not be changed. Make minor modifications based on user comments above.
     - Keep the references the same as the given text in the same format.
     - Do not make any headline, title bold.
+    Use the content here from the URLs I've found for you:
+    {content_string}

     Ensure proper paragraph breaks for better readability.
     Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
 
@@ -277,10 +288,14 @@ def generate_article(
     num_examples: str,
     conclusion_type: str,
     ai_model: str,
+    sorted_date,
+    domains_to_skip,
     api_key: str = None,
     generated_article: str = None,
     user_comments: str = None,
 ) -> str:
+
+    url_content = google_search(topic, sorted_date, domains_to_skip)
     settings = {
         "topic": topic,
         "keywords": [k.strip() for k in keywords.split(",")],
@@ -294,6 +309,7 @@ def generate_article(
         "references": [r.strip() for r in references.split(",")],
         "num_examples": num_examples,
         "conclusion_type": conclusion_type,
+        "sources": url_content,
         "generated_article": generated_article,
         "user_comments": user_comments,
     }
 
@@ -390,9 +406,19 @@ def generate_and_format(
     conclusion_type,
     ai_model,
     api_key,
+    year_from,
+    month_from,
+    day_from,
+    year_to,
+    month_to,
+    day_to,
+    domains_to_skip,
     generated_article: str = None,
     user_comments: str = None,
 ):
+    date_from = build_date(year_from, month_from, day_from)
+    date_to = build_date(year_to, month_to, day_to)
+    sorted_date = f"date:r:{date_from}:{date_to}"
     article = generate_article(
         topic,
         keywords,
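The `sorted_date` string follows the Custom Search JSON API's date-restrict sort syntax (`date:r:YYYYMMDD:YYYYMMDD`). A quick sketch of what the new lines produce, assuming the UI's default "From" values and an illustrative end date:

```python
from plagiarism import build_date

date_from = build_date("2000", "January", "01")  # -> "20000101"
date_to = build_date("2024", "March", "15")      # -> "20240315" (illustrative)
sorted_date = f"date:r:{date_from}:{date_to}"
print(sorted_date)  # date:r:20000101:20240315
```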
 
@@ -408,6 +434,8 @@ def generate_and_format(
         conclusion_type,
         ai_model,
         api_key,
+        sorted_date,
+        domains_to_skip,
         generated_article,
         user_comments,
     )
 
@@ -423,6 +451,10 @@ def create_interface():
         .input-highlight-pink block_label {background-color: #008080}
         """,
     ) as demo:
+        today = date.today()
+        # dd/MonthName/YYYY, e.g. "01/March/2024"
+        d1 = today.strftime("%d/%B/%Y")
+        d1 = d1.split("/")
         gr.Markdown("# Polygraf AI Content Writer", elem_classes="text-center text-3xl mb-6")

         with gr.Row():
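For reference, `d1` ends up as a `[day, month-name, year]` list used to pre-fill the "To" date fields added below; a quick sketch (output depends on the current date):

```python
from datetime import date

today = date.today()                        # e.g. 2024-03-01
d1 = today.strftime("%d/%B/%Y").split("/")  # e.g. ['01', 'March', '2024']
# d1[0] -> day, d1[1] -> month name, d1[2] -> year
```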
 
@@ -547,6 +579,33 @@ def create_interface():
                     label="Conclusion Type",
                     elem_classes="input-highlight-turquoise",
                 )
+                with gr.Group():
+                    with gr.Row():
+                        month_from = gr.Dropdown(
+                            choices=months,
+                            label="From Month",
+                            value="January",
+                            interactive=True,
+                        )
+                        day_from = gr.Textbox(label="From Day", value="01")
+                        year_from = gr.Textbox(label="From Year", value="2000")
+
+                    with gr.Row():
+                        month_to = gr.Dropdown(
+                            choices=months,
+                            label="To Month",
+                            value=d1[1],
+                            interactive=True,
+                        )
+                        day_to = gr.Textbox(label="To Day", value=d1[0])
+                        year_to = gr.Textbox(label="To Year", value=d1[2])
+
+                    with gr.Row():
+                        domains_to_skip = gr.Dropdown(
+                            domain_list,
+                            multiselect=True,
+                            label="Domain To Skip",
+                        )

                 with gr.Group():
                     gr.Markdown("## AI Model Configuration", elem_classes="text-xl mb-4")
 
@@ -641,6 +700,13 @@ def create_interface():
                 input_conclusion,
                 ai_generator,
                 input_api,
+                year_from,
+                month_from,
+                day_from,
+                year_to,
+                month_to,
+                day_to,
+                domains_to_skip,
             ],
             outputs=[output_article],
         )
 
@@ -662,6 +728,13 @@ def create_interface():
                 input_conclusion,
                 ai_generator,
                 input_api,
+                year_from,
+                month_from,
+                day_from,
+                year_to,
+                month_to,
+                day_to,
+                domains_to_skip,
                 output_article,
                 ai_comments,
             ],
plagiarism.py ADDED
@@ -0,0 +1,109 @@
+import time
+from googleapiclient.discovery import build
+import asyncio
+import httpx
+from bs4 import BeautifulSoup
+
+
+months = {
+    "January": "01",
+    "February": "02",
+    "March": "03",
+    "April": "04",
+    "May": "05",
+    "June": "06",
+    "July": "07",
+    "August": "08",
+    "September": "09",
+    "October": "10",
+    "November": "11",
+    "December": "12",
+}
+
+domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]
+
+
+def build_date(year=2024, month="March", day=1):
+    return f"{year}{months[month]}{day}"
+
+
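One caveat worth flagging: `build_date` does not zero-pad the day, so an integer day below 10 would yield a seven-digit stamp (e.g. `2024031`). The UI passes already-padded strings like `"01"`, so it works in practice; a padded variant for illustration (hypothetical helper, not in the commit):

```python
from plagiarism import months

def build_date_padded(year=2024, month="March", day=1):
    # Zero-pad the day so both int and str inputs yield YYYYMMDD.
    return f"{year}{months[month]}{int(day):02d}"

print(build_date_padded(2024, "March", 1))  # 20240301
```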
+async def get_url_data(url, client):
+    try:
+        r = await client.get(url)
+        if r.status_code == 200:
+            soup = BeautifulSoup(r.content, "html.parser")
+            return soup
+    except Exception:
+        return None
+
+
+async def parallel_scrap(urls):
+    async with httpx.AsyncClient(timeout=30) as client:
+        tasks = []
+        for url in urls:
+            tasks.append(get_url_data(url=url, client=client))
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+        return results
+
+
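The scraper can be exercised on its own; a minimal sketch (URLs are illustrative). Since `get_url_data` swallows exceptions and returns `None`, the `isinstance` guard below is belt-and-braces alongside `return_exceptions=True`:

```python
import asyncio
from plagiarism import parallel_scrap

urls = ["https://example.com", "https://example.org"]  # illustrative
soups = asyncio.run(parallel_scrap(urls))
for url, soup in zip(urls, soups):
    if soup is not None and not isinstance(soup, Exception):
        print(url, len(soup.text))  # length of extracted page text
```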
+def google_search_urls(
+    text,
+    sorted_date,
+    domains_to_skip,
+    api_key,
+    cse_id,
+    **kwargs,
+):
+    service = build("customsearch", "v1", developerKey=api_key)
+    num_pages = 5
+    results = service.cse().list(q=text, cx=cse_id, sort=sorted_date, **kwargs).execute()
+    url_list = []
+    if "items" in results and len(results["items"]) > 0:
+        for count, link in enumerate(results["items"]):
+            if count >= num_pages:
+                break
+            # skip user selected domains
+            if (domains_to_skip is not None) and any(("." + domain) in link["link"] for domain in domains_to_skip):
+                continue
+            url = link["link"]
+            if url not in url_list:
+                url_list.append(url)
+    return url_list
+
+
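Note how the skip filter works: it substring-matches `"." + domain` anywhere in the URL, so skipping `"com"` would also drop hosts like `example.com.br`. A standalone sketch of the check (URLs are illustrative):

```python
domains_to_skip = ["gov", "edu"]  # as chosen in the new dropdown
links = [
    "https://www.nasa.gov/mission",
    "https://news.example.com/article",
    "https://www.mit.edu/research",
]
kept = [u for u in links if not any(("." + d) in u for d in domains_to_skip)]
print(kept)  # ['https://news.example.com/article']
```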
+def google_search(
+    input,
+    sorted_date,
+    domains_to_skip,
+):
+    # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
+    api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
+    # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
+    # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
+    # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
+    # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
+    # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
+    cse_id = "851813e81162b4ed4"
+
+    # get list of URLs to check
+    start_time = time.perf_counter()
+    url_list = google_search_urls(
+        input,
+        sorted_date,
+        domains_to_skip,
+        api_key,
+        cse_id,
+    )
+    print("GOOGLE SEARCH PROCESSING TIME: ", time.perf_counter() - start_time)
+    # Scrape URLs in list
+    start_time = time.perf_counter()
+    soups = asyncio.run(parallel_scrap(url_list))
+    print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time)
+    result_content = {}
+    for url, soup in zip(url_list, soups):
+        if soup:
+            result_content[url] = soup.text
+    # for key, value in result_content.items():
+    #     print("-------------------URL: ", key)
+    #     print(value[:30])
+    return result_content
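End to end, the module turns a query into a `{url: page_text}` dict, which app.py stores under `settings["sources"]`. A minimal sketch of the call (topic and dates are illustrative; actually running it requires a valid API key and CSE id in the module):

```python
from plagiarism import build_date, google_search

sorted_date = f"date:r:{build_date('2000', 'January', '01')}:{build_date('2024', 'March', '15')}"
sources = google_search("quantum computing", sorted_date, domains_to_skip=["gov"])
for url, text in sources.items():
    print(url, text[:80])  # first 80 characters of each scraped page
```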
requirements.txt CHANGED
@@ -8,4 +8,6 @@ openai
 groq
 language_tool_python
 scipy
-Unidecode
+Unidecode
+BeautifulSoup4
+google-api-python-client