merge main + multi pdfs + updated html cleaning + better references
Files changed:
- ai_generate.py +9 -6
- app.py +38 -43
- plagiarism.py +62 -29
- requirements.txt +2 -3
ai_generate.py
CHANGED
@@ -77,17 +77,20 @@ rag_llms = {


 def create_db_with_langchain(path):
-    loader = PyMuPDFLoader(path)
-    data = loader.load()
-    # split it into chunks
-    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-    docs = text_splitter.split_documents(data)
+    all_docs = []
+    for file in path:
+        loader = PyMuPDFLoader(file)
+        data = loader.load()
+        # split it into chunks
+        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+        docs = text_splitter.split_documents(data)
+        all_docs.extend(docs)

     # create the open-source embedding function
     embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

     # load it into Chroma
-    db = Chroma.from_documents(docs, embedding_function)
+    db = Chroma.from_documents(all_docs, embedding_function)
     return db

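For context, a minimal sketch of how the reworked create_db_with_langchain would be called after this change. The import paths and file names are assumptions (the diff does not show the imports), based on the langchain-community packages this Space already uses:

    from langchain.text_splitter import CharacterTextSplitter
    from langchain_community.document_loaders import PyMuPDFLoader
    from langchain_community.embeddings import SentenceTransformerEmbeddings
    from langchain_community.vectorstores import Chroma

    # `path` is now a list of PDF paths rather than a single file
    db = create_db_with_langchain(["paper_one.pdf", "paper_two.pdf"])

    # one Chroma store now answers queries across all uploaded PDFs
    for doc in db.similarity_search("What method does the paper propose?", k=4):
        print(doc.metadata.get("source"), doc.page_content[:80])
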
app.py
CHANGED
@@ -64,6 +64,11 @@ def clean_text(text: str) -> str:
     return "\n".join(cleaned_paragraphs)


+def format_references(text: str) -> str:
+    body, references = split_text_from_refs(text)
+    return body + references
+
+
 def split_text_from_refs(text: str, sep="\n"):
     lines = text.split("\n")
     references = []

@@ -72,25 +77,37 @@ def split_text_from_refs(text: str, sep="\n"):
     in_references = False

     for line in lines:
+        if line == "":
+            continue
+        match = re.search(r"[Rr]eferences:", line, re.DOTALL)
         if line.strip().lower() == "references" or line.strip().lower() == "references:":
             in_references = True
             continue
         if line.strip().lower().startswith("references:"):
             in_references = True
+        if match:
+            in_references = True
+            line = line[match.end() :]
         if in_references:
             matches = index_pattern.split(line)
             for match in matches:
                 if match.strip() and not match.isdigit() and not match.strip().lower().startswith("references:"):
                     references.append(match.strip())
         else:
-            article_text.append(line)
+            article_text.append(line.strip())

-    formatted_refs = []
-    for i, ref in enumerate(references, 1):
-        ref = remove_bracketed_numbers(ref)
-        formatted_refs.append(f"[{i}] {ref}{sep}")
+    if len(references) > 0:
+        formatted_refs = []
+        for i, ref in enumerate(references, 1):
+            ref = remove_bracketed_numbers(ref)
+            formatted_refs.append(f"[{i}] {ref}{sep}")
+        formatted_refs = f"{sep}{sep}References:{sep}" + f"{sep}".join(formatted_refs)
+    else:
+        formatted_refs = ""

-    return
+    body = f"{sep}{sep}".join(article_text)

+    return body, formatted_refs


 def ends_with_references(text):

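A quick illustration of what the reworked split_text_from_refs returns; the sample text and sources below are placeholders:

    sample = (
        "Body paragraph one.\n"
        "\n"
        "References:\n"
        "[1] Placeholder Author. Placeholder title.\n"
        "[2] Another Placeholder. Another title.\n"
    )
    body, refs = split_text_from_refs(sample)
    # body -> "Body paragraph one."
    # refs -> roughly "\n\nReferences:\n[1] Placeholder Author. Placeholder title.\n..."
    # Blank lines are now skipped, and an inline "References:" marker also
    # flips the parser into reference mode via the new regex match.
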
@@ -225,7 +242,7 @@ def ai_generated_test_gptzero(text):
 def highlighter_polygraf(text, model="Polygraf AI (Base Model)"):
     body, references = split_text_from_refs(text, "<br>")
     score, text = detection_polygraf(text=body, model=model)
-    text = text +
+    text = text + references
     return score, text

@@ -262,8 +279,10 @@ def generate_prompt(settings: Dict[str, str]) -> str:
     - Include {settings['num_examples']} relevant examples or case studies
     - Incorporate data or statistics from {', '.join(settings['references'])}
     - End with a {settings['conclusion_type']} conclusion
-    - Add a "References" section in the format "References
+    - Add a "References" section in the format "References:" on a new line at the end with at least 3 credible detailed sources, formatted as [1], [2], etc. with each source on their own line
+    - Do not repeat sources
     - Do not make any headline, title bold.
+
     {settings['sources']}

     Ensure proper paragraph breaks for better readability.

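For clarity, the tail format this new prompt bullet asks the model to emit looks like the following (entries are placeholders, not real sources):

    References:
    [1] Placeholder Author. Placeholder source title.
    [2] Placeholder Author. Placeholder source title.
    [3] Placeholder Author. Placeholder source title.
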
@@ -284,6 +303,7 @@ def regenerate_prompt(settings: Dict[str, str]) -> str:
     - The original content should not be changed. Make minor modifications based on user comments above.
     - Keep the references the same as the given text in the same format.
     - Do not make any headline, title bold.
+
     {settings['sources']}

     Ensure proper paragraph breaks for better readability.

@@ -355,6 +375,7 @@ def humanize(
     top_k: int = 50,
     length_penalty: float = 1,
 ) -> str:
+    print("Humanizing text...")
     body, references = split_text_from_refs(text)
     result = paraphrase_text(
         text=body,
@@ -364,7 +385,7 @@ def humanize(
         top_k=top_k,
         length_penalty=length_penalty,
     )
-    result = result +
+    result = result + references
     return format_and_correct_language_check(result)

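highlighter_polygraf and humanize now share the same strip-then-reattach pattern, so references never pass through the detector or the paraphraser. A minimal sketch, with a stub standing in for the real paraphrase_text call:

    def paraphrase_stub(text: str) -> str:
        # illustration only; the real call is paraphrase_text(...) with model args
        return text.replace("utilize", "use")

    text = "Article body that may utilize jargon.\n\nReferences:\n[1] Placeholder source."
    body, refs = split_text_from_refs(text)
    result = paraphrase_stub(body) + refs  # references are reattached verbatim
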
@@ -375,35 +396,6 @@ def update_visibility_api(model: str):
     return gr.update(visible=False)


-def format_references(text: str) -> str:
-    lines = text.split("\n")
-    references = []
-    article_text = []
-    index_pattern = re.compile(r"\[(\d+)\]")
-    in_references = False
-
-    for line in lines:
-        if line.strip().lower() == "references" or line.strip().lower() == "references:":
-            in_references = True
-            continue
-        if line.strip().lower().startswith("references:"):
-            in_references = True
-        if in_references:
-            matches = index_pattern.split(line)
-            for match in matches:
-                if match.strip() and not match.isdigit() and not match.strip().lower().startswith("references:"):
-                    references.append(match.strip())
-        else:
-            article_text.append(line)
-
-    formatted_refs = []
-    for i, ref in enumerate(references, 1):
-        ref = remove_bracketed_numbers(ref)
-        formatted_refs.append(f"[{i}] {ref}\n")
-
-    return "\n\n".join(article_text) + "\n\nReferences:\n" + "\n".join(formatted_refs)
-
-
 def generate_and_format(
     input_role,
     topic,

@@ -450,7 +442,7 @@ def generate_and_format(
     print(f"Google Search Query: {final_query}")
     url_content = google_search(final_query, sorted_date, domains_to_include)
     content_string = "\n".join(
-        f"{url.strip()}: \n{content.strip()[:
+        f"{url.strip()}: \n{content.strip()[:2500]}" for url, content in url_content.items()
     )
     content_string = (
         "Use the trusted information here from the URLs and add them as References:\n" + content_string

@@ -627,9 +619,12 @@ def create_interface():
                 elem_classes="input-highlight-turquoise",
             )
             gr.Markdown("# Search Options", elem_classes="text-center text-3xl mb-6")
+            google_default = False
             with gr.Row():
-                google_search_check = gr.Checkbox(
-
+                google_search_check = gr.Checkbox(
+                    label="Enable Google Search For Recent Sources", value=google_default
+                )
+            with gr.Group(visible=google_default) as search_options:
                 with gr.Row():
                     include_sites = gr.Textbox(
                         label="Include Specific Websites",

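The diff adds the checkbox and the search_options group but does not show the event wiring; a plausible hookup, assuming Gradio's standard .change event, would be:

    # hypothetical wiring, not shown in this commit
    google_search_check.change(
        fn=lambda enabled: gr.update(visible=enabled),
        inputs=google_search_check,
        outputs=search_options,
    )
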
@@ -669,8 +664,8 @@ def create_interface():
                 day_to = gr.Textbox(label="To Day", value=d1[0])
                 year_to = gr.Textbox(label="To Year", value=d1[2])

-            gr.Markdown("# Add Optional PDF
-            pdf_file_input = gr.File(label="Upload PDF")
+            gr.Markdown("# Add Optional PDF Files with Information", elem_classes="text-center text-3xl mb-6")
+            pdf_file_input = gr.File(label="Upload PDF(s)", file_count="multiple", file_types=[".pdf"])

             with gr.Group():
                 gr.Markdown("## AI Model Configuration", elem_classes="text-xl mb-4")

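With file_count="multiple", gr.File passes the handler a list of uploaded files whose .name attributes are temp-file paths, which matches create_db_with_langchain now iterating over a list. A sketch of the handoff (the handler name is illustrative):

    def build_db_from_uploads(pdf_files):
        if not pdf_files:
            return None
        paths = [f.name for f in pdf_files]
        return create_db_with_langchain(paths)
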
plagiarism.py
CHANGED
@@ -4,24 +4,72 @@ from googleapiclient.discovery import build
 import asyncio
 import httpx
 from bs4 import BeautifulSoup
-import justext
-import newspaper
 from dotenv import load_dotenv
+import html2text
+import requests

 load_dotenv()

+# load html2text and set up configs
+h2t = html2text.HTML2Text()
+h2t.bodywidth = 0  # No wrapping
+h2t.ignore_links = True  # Ignore hyperlinks
+h2t.ignore_images = True  # Ignore images
+h2t.ignore_emphasis = True  # Ignore emphasis
+h2t.ignore_tables = False  # Include tables
+h2t.skip_internal_links = True  # Skip internal links
+h2t.skip_external_links = True  # Skip external links
+h2t.single_line_break = True  # Use single line breaks
+h2t.protect_links = True  # Protect links from being split
+h2t.default_image_alt = "[image]"  # Default alt text for images
+

 def clean_html(text):
+    return h2t.handle(text)
+
+
+def build_results_beautifulsoup(url_list):
+    # Scrape URLs in list
+    start_time = time.perf_counter()
+    soups = asyncio.run(parallel_scrap(url_list))
+    print("Scraping processing time: ", time.perf_counter() - start_time)
+    result_content = {}
+    num_pages = 3
+    count = 0
+    for url, soup in zip(url_list, soups):
+        if count >= num_pages:
+            break
+        if soup:
+            text = clean_html(soup.text)
+            if len(text) > 500:
+                result_content[url] = text
+                count += 1
+    return result_content
+
+
+def build_results_extractor(url_list):
+    try:
+        endpoint = "https://extractorapi.com/api/v1/extractor"
+        result_content = {}
+        num_pages = 3
+        count = 0
+        for url in url_list:
+            if count >= num_pages:
+                break
+            params = {"apikey": os.environ.get("EXTRACTOR_API_KEY"), "url": url}
+            r = requests.get(endpoint, params=params)
+            if r.status_code == 200:
+                text = r.json()["text"]
+                if len(text) > 500:
+                    result_content[url] = text
+                    count += 1
+            if r.status_code == 403:
+                raise Exception("Error with API; using default implementation instead")
+        return result_content
+
+    except Exception as e:
+        print(e)
+        return build_results_beautifulsoup(url_list)


 months = {

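A quick demonstration of the new html2text-based cleaner on a made-up snippet; with the configs above, emphasis and link targets are stripped while headings survive in markdown form:

    sample_html = (
        "<html><body><h1>Title</h1>"
        "<p>Some <b>bold</b> text with <a href='https://example.com'>a link</a>.</p>"
        "</body></html>"
    )
    print(clean_html(sample_html))
    # roughly:
    # # Title
    # Some bold text with a link.
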
@@ -112,21 +160,6 @@ def google_search(
         api_key,
         cse_id,
     )
-    print("
-
-    # Scrape URLs in list
-    start_time = time.perf_counter()
-    soups = asyncio.run(parallel_scrap(url_list))
-    print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time)
-    result_content = {}
-    num_pages = 3
-    count = 0
-    for url, soup in zip(url_list, soups):
-        if count >= num_pages:
-            break
-        if soup:
-            text = clean_html(soup.text)
-            if len(text) > 500:
-                result_content[url] = text
-                count += 1
+    print("Google Search processing time: ", time.perf_counter() - start_time)
+    result_content = build_results_beautifulsoup(url_list)
     return result_content

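parallel_scrap itself is not part of this diff; a minimal sketch of the shape it must have, given how it is consumed above (run with asyncio.run over a URL list, returning one BeautifulSoup object or None per URL, in order):

    import asyncio
    import httpx
    from bs4 import BeautifulSoup

    async def fetch_one(client: httpx.AsyncClient, url: str):
        try:
            resp = await client.get(url)
            return BeautifulSoup(resp.text, "html.parser")
        except Exception:
            return None  # keep positions aligned for zip(url_list, soups)

    async def parallel_scrap(url_list):
        async with httpx.AsyncClient(timeout=10, follow_redirects=True) as client:
            return await asyncio.gather(*(fetch_one(client, u) for u in url_list))
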
requirements.txt
CHANGED
@@ -11,8 +11,6 @@ scipy
 Unidecode
 BeautifulSoup4
 google-api-python-client
-newspaper3k
-jusText
 langchain-groq
 langchainhub
 sentence-transformers
@@ -25,4 +23,5 @@ google-generativeai
 langchain-google-genai
 langchain-anthropic
 langchain-openai
-vertexai
+vertexai
+html2text
|