minko186 committed
Commit 43d4e83 · 1 Parent(s): 48d4d11

merge main + multi pdfs + updated html cleaning + better references

Files changed (4):
1. ai_generate.py +9 -6
2. app.py +38 -43
3. plagiarism.py +62 -29
4. requirements.txt +2 -3
ai_generate.py CHANGED
@@ -77,17 +77,20 @@ rag_llms = {
 
 
 def create_db_with_langchain(path):
-    loader = PyMuPDFLoader(path)
-    data = loader.load()
-    # split it into chunks
-    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-    docs = text_splitter.split_documents(data)
+    all_docs = []
+    for file in path:
+        loader = PyMuPDFLoader(file)
+        data = loader.load()
+        # split it into chunks
+        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+        docs = text_splitter.split_documents(data)
+        all_docs.extend(docs)
 
     # create the open-source embedding function
     embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
 
     # load it into Chroma
-    db = Chroma.from_documents(docs, embedding_function)
+    db = Chroma.from_documents(all_docs, embedding_function)
     return db
 
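Note that `create_db_with_langchain` now expects an iterable of PDF paths rather than a single path, so even a single upload has to arrive wrapped in a list. A minimal sketch of the new call pattern (the file names are hypothetical, and the retriever line assumes the module's existing LangChain setup):

```python
# Hypothetical caller: `path` is now an iterable of PDF file paths,
# each loaded with PyMuPDFLoader and chunked before indexing into Chroma.
pdf_paths = ["study_1.pdf", "study_2.pdf"]
db = create_db_with_langchain(pdf_paths)

# The combined Chroma store can back a retriever exactly as before.
retriever = db.as_retriever(search_kwargs={"k": 4})
```

One small observation on the loop: the `CharacterTextSplitter` is re-instantiated on every iteration; hoisting it above the `for` would behave identically since it is stateless across files.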
app.py CHANGED
@@ -64,6 +64,11 @@ def clean_text(text: str) -> str:
     return "\n".join(cleaned_paragraphs)
 
 
+def format_references(text: str) -> str:
+    body, references = split_text_from_refs(text)
+    return body + references
+
+
 def split_text_from_refs(text: str, sep="\n"):
     lines = text.split("\n")
     references = []
@@ -72,25 +77,37 @@ def split_text_from_refs(text: str, sep="\n"):
     in_references = False
 
     for line in lines:
+        if line == "":
+            continue
+        match = re.search(r"[Rr]eferences:", line, re.DOTALL)
         if line.strip().lower() == "references" or line.strip().lower() == "references:":
             in_references = True
             continue
         if line.strip().lower().startswith("references:"):
             in_references = True
+        if match:
+            in_references = True
+            line = line[match.end() :]
         if in_references:
             matches = index_pattern.split(line)
             for match in matches:
                 if match.strip() and not match.isdigit() and not match.strip().lower().startswith("references:"):
                     references.append(match.strip())
         else:
-            article_text.append(line)
+            article_text.append(line.strip())
 
-    formatted_refs = []
-    for i, ref in enumerate(references, 1):
-        ref = remove_bracketed_numbers(ref)
-        formatted_refs.append(f"[{i}] {ref}{sep}")
+    if len(references) > 0:
+        formatted_refs = []
+        for i, ref in enumerate(references, 1):
+            ref = remove_bracketed_numbers(ref)
+            formatted_refs.append(f"[{i}] {ref}{sep}")
+        formatted_refs = f"{sep}{sep}References:{sep}" + f"{sep}".join(formatted_refs)
+    else:
+        formatted_refs = ""
 
-    return "\n\n".join(article_text), f"{sep}{sep}References:{sep}" + f"{sep}".join(formatted_refs)
+    body = f"{sep}{sep}".join(article_text)
+
+    return body, formatted_refs
 
 
 def ends_with_references(text):
@@ -225,7 +242,7 @@ def ai_generated_test_gptzero(text):
 def highlighter_polygraf(text, model="Polygraf AI (Base Model)"):
     body, references = split_text_from_refs(text, "<br>")
     score, text = detection_polygraf(text=body, model=model)
-    text = text + "<br>" + references
+    text = text + references
     return score, text
 
 
@@ -262,8 +279,10 @@ def generate_prompt(settings: Dict[str, str]) -> str:
     - Include {settings['num_examples']} relevant examples or case studies
     - Incorporate data or statistics from {', '.join(settings['references'])}
     - End with a {settings['conclusion_type']} conclusion
-    - Add a "References" section in the format "References:\n" at the end with at least 3 credible sources, formatted as [1], [2], etc. with each source on their own line
+    - Add a "References" section in the format "References:" on a new line at the end with at least 3 credible detailed sources, formatted as [1], [2], etc. with each source on their own line
+    - Do not repeat sources
     - Do not make any headline, title bold.
+
     {settings['sources']}
 
     Ensure proper paragraph breaks for better readability.
@@ -284,6 +303,7 @@ def regenerate_prompt(settings: Dict[str, str]) -> str:
     - The original content should not be changed. Make minor modifications based on user comments above.
     - Keep the references the same as the given text in the same format.
     - Do not make any headline, title bold.
+
     {settings['sources']}
 
     Ensure proper paragraph breaks for better readability.
@@ -355,6 +375,7 @@ def humanize(
     top_k: int = 50,
     length_penalty: float = 1,
 ) -> str:
+    print("Humanizing text...")
     body, references = split_text_from_refs(text)
     result = paraphrase_text(
         text=body,
@@ -364,7 +385,7 @@ def humanize(
         top_k=top_k,
         length_penalty=length_penalty,
     )
-    result = result + "\n\n" + references
+    result = result + references
     return format_and_correct_language_check(result)
 
 
@@ -375,35 +396,6 @@ def update_visibility_api(model: str):
         return gr.update(visible=False)
 
 
-def format_references(text: str) -> str:
-    lines = text.split("\n")
-    references = []
-    article_text = []
-    index_pattern = re.compile(r"\[(\d+)\]")
-    in_references = False
-
-    for line in lines:
-        if line.strip().lower() == "references" or line.strip().lower() == "references:":
-            in_references = True
-            continue
-        if line.strip().lower().startswith("references:"):
-            in_references = True
-        if in_references:
-            matches = index_pattern.split(line)
-            for match in matches:
-                if match.strip() and not match.isdigit() and not match.strip().lower().startswith("references:"):
-                    references.append(match.strip())
-        else:
-            article_text.append(line)
-
-    formatted_refs = []
-    for i, ref in enumerate(references, 1):
-        ref = remove_bracketed_numbers(ref)
-        formatted_refs.append(f"[{i}] {ref}\n")
-
-    return "\n\n".join(article_text) + "\n\nReferences:\n" + "\n".join(formatted_refs)
-
-
 def generate_and_format(
     input_role,
     topic,
@@ -450,7 +442,7 @@ def generate_and_format(
     print(f"Google Search Query: {final_query}")
     url_content = google_search(final_query, sorted_date, domains_to_include)
     content_string = "\n".join(
-        f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in url_content.items()
+        f"{url.strip()}: \n{content.strip()[:2500]}" for url, content in url_content.items()
     )
     content_string = (
         "Use the trusted information here from the URLs and add them as References:\n" + content_string
@@ -627,9 +619,12 @@ def create_interface():
                 elem_classes="input-highlight-turquoise",
             )
         gr.Markdown("# Search Options", elem_classes="text-center text-3xl mb-6")
+        google_default = False
         with gr.Row():
-            google_search_check = gr.Checkbox(label="Enable Google Search For Recent Sources", value=False)
-        with gr.Group(visible=True) as search_options:
+            google_search_check = gr.Checkbox(
+                label="Enable Google Search For Recent Sources", value=google_default
+            )
+        with gr.Group(visible=google_default) as search_options:
             with gr.Row():
                 include_sites = gr.Textbox(
                     label="Include Specific Websites",
@@ -669,8 +664,8 @@ def create_interface():
                     day_to = gr.Textbox(label="To Day", value=d1[0])
                     year_to = gr.Textbox(label="To Year", value=d1[2])
 
-        gr.Markdown("# Add Optional PDF File with Information", elem_classes="text-center text-3xl mb-6")
-        pdf_file_input = gr.File(label="Upload PDF")
+        gr.Markdown("# Add Optional PDF Files with Information", elem_classes="text-center text-3xl mb-6")
+        pdf_file_input = gr.File(label="Upload PDF(s)", file_count="multiple", file_types=[".pdf"])
 
         with gr.Group():
             gr.Markdown("## AI Model Configuration", elem_classes="text-xl mb-4")
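Reference handling is now centralized: `split_text_from_refs` returns an empty reference string when nothing is found (instead of always emitting a bare "References:" header), and the old standalone `format_references` collapses into a two-line wrapper over it. A quick sketch of the round trip, with made-up sample text and behavior inferred from the diff above:

```python
# Invented sample; illustrates the split/format round trip.
sample = """AI tools are reshaping research workflows.

References:
[1] Smith, J. (2023). AI and Society.
[2] Doe, A. (2022). Machine Learning Basics."""

body, refs = split_text_from_refs(sample)
# body -> the article text with the reference block stripped
# refs -> "\n\nReferences:\n[1] Smith, J. (2023). AI and Society.\n..." (or "" if none)

print(format_references(sample))  # body + refs, re-numbered from [1]
```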
plagiarism.py CHANGED
@@ -4,24 +4,72 @@ from googleapiclient.discovery import build
 import asyncio
 import httpx
 from bs4 import BeautifulSoup
-import justext
-import newspaper
 from dotenv import load_dotenv
+import html2text
+import requests
 
 load_dotenv()
 
+# load html2text and set up configs
+h2t = html2text.HTML2Text()
+h2t.body_width = 0  # No wrapping
+h2t.ignore_links = True  # Ignore hyperlinks
+h2t.ignore_images = True  # Ignore images
+h2t.ignore_emphasis = True  # Ignore emphasis
+h2t.ignore_tables = False  # Include tables
+h2t.skip_internal_links = True  # Skip internal links
+h2t.skip_external_links = True  # Skip external links
+h2t.single_line_break = True  # Use single line breaks
+h2t.protect_links = True  # Protect links from being split
+h2t.default_image_alt = "[image]"  # Default alt text for images
+
 
 def clean_html(text):
-    result = ""
-    article = newspaper.Article(url=" ")
-    article.set_html(text)
-    article.parse()
-    result += article.title + "\n"
-    paragraphs = justext.justext(text, justext.get_stoplist("English"))
-    for paragraph in paragraphs:
-        if not paragraph.is_boilerplate:
-            result += paragraph.text
-    return result
+    return h2t.handle(text)
+
+
+def build_results_beautifulsoup(url_list):
+    # Scrape URLs in list
+    start_time = time.perf_counter()
+    soups = asyncio.run(parallel_scrap(url_list))
+    print("Scraping processing time: ", time.perf_counter() - start_time)
+    result_content = {}
+    num_pages = 3
+    count = 0
+    for url, soup in zip(url_list, soups):
+        if count >= num_pages:
+            break
+        if soup:
+            text = clean_html(soup.text)
+            if len(text) > 500:
+                result_content[url] = text
+                count += 1
+    return result_content
+
+
+def build_results_extractor(url_list):
+    try:
+        endpoint = "https://extractorapi.com/api/v1/extractor"
+        result_content = {}
+        num_pages = 3
+        count = 0
+        for url in url_list:
+            if count >= num_pages:
+                break
+            params = {"apikey": os.environ.get("EXTRACTOR_API_KEY"), "url": url}
+            r = requests.get(endpoint, params=params)
+            if r.status_code == 200:
+                text = r.json()["text"]
+                if len(text) > 500:
+                    result_content[url] = text
+                    count += 1
+            if r.status_code == 403:
+                raise Exception("Error with API; using default implementation instead")
+        return result_content
+
+    except Exception as e:
+        print(e)
+        return build_results_beautifulsoup(url_list)
 
 
 months = {
@@ -112,21 +160,6 @@ def google_search(
         api_key,
         cse_id,
     )
-    print("URLS: ", url_list)
-    print("GOOGLE SEARCH PROCESSING TIME: ", time.perf_counter() - start_time)
-    # Scrape URLs in list
-    start_time = time.perf_counter()
-    soups = asyncio.run(parallel_scrap(url_list))
-    print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time)
-    result_content = {}
-    num_pages = 3
-    count = 0
-    for url, soup in zip(url_list, soups):
-        if count >= num_pages:
-            break
-        if soup:
-            text = clean_html(soup.text)
-            if len(text) > 500:
-                result_content[url] = text
-                count += 1
+    print("Google Search processing time: ", time.perf_counter() - start_time)
+    result_content = build_results_beautifulsoup(url_list)
     return result_content
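The newspaper3k/jusText extraction pipeline is replaced by a single module-level `html2text.HTML2Text` converter, with `build_results_extractor` available as an ExtractorAPI-backed alternative that falls back to the BeautifulSoup path on any failure (it needs an `EXTRACTOR_API_KEY` environment variable). A rough illustration of what the configured converter does to a fragment; the output shape is indicative and exact spacing depends on the installed html2text version:

```python
# Indicative only: with links, images, and emphasis ignored, markup
# collapses to plain text and headings keep their "#" prefixes.
snippet = "<h1>Title</h1><p>Some <em>emphasized</em> text and <a href='https://example.com'>a link</a>.</p>"
print(clean_html(snippet))
# Expected shape of the output:
#   # Title
#   Some emphasized text and a link.
```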
requirements.txt CHANGED
@@ -11,8 +11,6 @@ scipy
 Unidecode
 BeautifulSoup4
 google-api-python-client
-newspaper3k
-jusText
 langchain-groq
 langchainhub
 sentence-transformers
@@ -25,4 +23,5 @@ google-generativeai
 langchain-google-genai
 langchain-anthropic
 langchain-openai
-vertexai
+vertexai
+html2text