minko186 committed
Commit 43d4e83 · 1 Parent(s): 48d4d11

merge main + multi pdfs + updated html cleaning + better references

Files changed (4):
1. ai_generate.py +9 -6
2. app.py +38 -43
3. plagiarism.py +62 -29
4. requirements.txt +2 -3
ai_generate.py CHANGED
@@ -77,17 +77,20 @@ rag_llms = {
 
 
 def create_db_with_langchain(path):
-    loader = PyMuPDFLoader(path)
-    data = loader.load()
-    # split it into chunks
-    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-    docs = text_splitter.split_documents(data)
+    all_docs = []
+    for file in path:
+        loader = PyMuPDFLoader(file)
+        data = loader.load()
+        # split it into chunks
+        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+        docs = text_splitter.split_documents(data)
+        all_docs.extend(docs)
 
     # create the open-source embedding function
     embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
 
     # load it into Chroma
-    db = Chroma.from_documents(docs, embedding_function)
+    db = Chroma.from_documents(all_docs, embedding_function)
     return db
 
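Note that `create_db_with_langchain` now expects an iterable of PDF paths rather than a single path, so even a single upload has to arrive wrapped in a list. A minimal sketch of the new call pattern (the file names are hypothetical, and the retriever line assumes the module's existing LangChain setup):

```python
# Hypothetical caller: `path` is now an iterable of PDF file paths,
# each loaded with PyMuPDFLoader and chunked before indexing into Chroma.
pdf_paths = ["study_1.pdf", "study_2.pdf"]
db = create_db_with_langchain(pdf_paths)

# The combined Chroma store can back a retriever exactly as before.
retriever = db.as_retriever(search_kwargs={"k": 4})
```

One small observation on the loop: the `CharacterTextSplitter` is re-instantiated on every iteration; hoisting it above the `for` would behave identically since it is stateless across files.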
app.py CHANGED
@@ -64,6 +64,11 @@ def clean_text(text: str) -> str:
     return "\n".join(cleaned_paragraphs)
 
 
+def format_references(text: str) -> str:
+    body, references = split_text_from_refs(text)
+    return body + references
+
+
 def split_text_from_refs(text: str, sep="\n"):
     lines = text.split("\n")
     references = []
@@ -72,25 +77,37 @@ def split_text_from_refs(text: str, sep="\n"):
     in_references = False
 
     for line in lines:
+        if line == "":
+            continue
+        match = re.search(r"[Rr]eferences:", line, re.DOTALL)
         if line.strip().lower() == "references" or line.strip().lower() == "references:":
             in_references = True
             continue
         if line.strip().lower().startswith("references:"):
             in_references = True
+        if match:
+            in_references = True
+            line = line[match.end() :]
         if in_references:
             matches = index_pattern.split(line)
             for match in matches:
                 if match.strip() and not match.isdigit() and not match.strip().lower().startswith("references:"):
                     references.append(match.strip())
         else:
-            article_text.append(line)
+            article_text.append(line.strip())
 
-    formatted_refs = []
-    for i, ref in enumerate(references, 1):
-        ref = remove_bracketed_numbers(ref)
-        formatted_refs.append(f"[{i}] {ref}{sep}")
+    if len(references) > 0:
+        formatted_refs = []
+        for i, ref in enumerate(references, 1):
+            ref = remove_bracketed_numbers(ref)
+            formatted_refs.append(f"[{i}] {ref}{sep}")
+        formatted_refs = f"{sep}{sep}References:{sep}" + f"{sep}".join(formatted_refs)
+    else:
+        formatted_refs = ""
 
-    return "\n\n".join(article_text), f"{sep}{sep}References:{sep}" + f"{sep}".join(formatted_refs)
+    body = f"{sep}{sep}".join(article_text)
+
+    return body, formatted_refs
 
 
 def ends_with_references(text):
@@ -225,7 +242,7 @@ def ai_generated_test_gptzero(text):
 def highlighter_polygraf(text, model="Polygraf AI (Base Model)"):
     body, references = split_text_from_refs(text, "<br>")
     score, text = detection_polygraf(text=body, model=model)
-    text = text + "<br>" + references
+    text = text + references
     return score, text
 
 
@@ -262,8 +279,10 @@ def generate_prompt(settings: Dict[str, str]) -> str:
     - Include {settings['num_examples']} relevant examples or case studies
     - Incorporate data or statistics from {', '.join(settings['references'])}
     - End with a {settings['conclusion_type']} conclusion
-    - Add a "References" section in the format "References:\n" at the end with at least 3 credible sources, formatted as [1], [2], etc. with each source on their own line
+    - Add a "References" section in the format "References:" on a new line at the end with at least 3 credible detailed sources, formatted as [1], [2], etc. with each source on their own line
+    - Do not repeat sources
     - Do not make any headline, title bold.
+
     {settings['sources']}
 
     Ensure proper paragraph breaks for better readability.
@@ -284,6 +303,7 @@ def regenerate_prompt(settings: Dict[str, str]) -> str:
     - The original content should not be changed. Make minor modifications based on user comments above.
     - Keep the references the same as the given text in the same format.
     - Do not make any headline, title bold.
+
     {settings['sources']}
 
     Ensure proper paragraph breaks for better readability.
@@ -355,6 +375,7 @@ def humanize(
     top_k: int = 50,
     length_penalty: float = 1,
 ) -> str:
+    print("Humanizing text...")
     body, references = split_text_from_refs(text)
     result = paraphrase_text(
         text=body,
@@ -364,7 +385,7 @@ def humanize(
         top_k=top_k,
         length_penalty=length_penalty,
     )
-    result = result + "\n\n" + references
+    result = result + references
     return format_and_correct_language_check(result)
 
 
@@ -375,35 +396,6 @@ def update_visibility_api(model: str):
         return gr.update(visible=False)
 
 
-def format_references(text: str) -> str:
-    lines = text.split("\n")
-    references = []
-    article_text = []
-    index_pattern = re.compile(r"\[(\d+)\]")
-    in_references = False
-
-    for line in lines:
-        if line.strip().lower() == "references" or line.strip().lower() == "references:":
-            in_references = True
-            continue
-        if line.strip().lower().startswith("references:"):
-            in_references = True
-        if in_references:
-            matches = index_pattern.split(line)
-            for match in matches:
-                if match.strip() and not match.isdigit() and not match.strip().lower().startswith("references:"):
-                    references.append(match.strip())
-        else:
-            article_text.append(line)
-
-    formatted_refs = []
-    for i, ref in enumerate(references, 1):
-        ref = remove_bracketed_numbers(ref)
-        formatted_refs.append(f"[{i}] {ref}\n")
-
-    return "\n\n".join(article_text) + "\n\nReferences:\n" + "\n".join(formatted_refs)
-
-
 def generate_and_format(
     input_role,
     topic,
@@ -450,7 +442,7 @@ def generate_and_format(
     print(f"Google Search Query: {final_query}")
     url_content = google_search(final_query, sorted_date, domains_to_include)
     content_string = "\n".join(
-        f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in url_content.items()
+        f"{url.strip()}: \n{content.strip()[:2500]}" for url, content in url_content.items()
     )
     content_string = (
         "Use the trusted information here from the URLs and add them as References:\n" + content_string
@@ -627,9 +619,12 @@ def create_interface():
                 elem_classes="input-highlight-turquoise",
             )
         gr.Markdown("# Search Options", elem_classes="text-center text-3xl mb-6")
+        google_default = False
         with gr.Row():
-            google_search_check = gr.Checkbox(label="Enable Google Search For Recent Sources", value=False)
-        with gr.Group(visible=True) as search_options:
+            google_search_check = gr.Checkbox(
+                label="Enable Google Search For Recent Sources", value=google_default
+            )
+        with gr.Group(visible=google_default) as search_options:
             with gr.Row():
                 include_sites = gr.Textbox(
                     label="Include Specific Websites",
@@ -669,8 +664,8 @@ def create_interface():
                     day_to = gr.Textbox(label="To Day", value=d1[0])
                     year_to = gr.Textbox(label="To Year", value=d1[2])
 
-        gr.Markdown("# Add Optional PDF File with Information", elem_classes="text-center text-3xl mb-6")
-        pdf_file_input = gr.File(label="Upload PDF")
+        gr.Markdown("# Add Optional PDF Files with Information", elem_classes="text-center text-3xl mb-6")
+        pdf_file_input = gr.File(label="Upload PDF(s)", file_count="multiple", file_types=[".pdf"])
 
         with gr.Group():
             gr.Markdown("## AI Model Configuration", elem_classes="text-xl mb-4")
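Reference handling is now centralized: `split_text_from_refs` returns an empty reference string when nothing is found (instead of always emitting a bare "References:" header), and the old standalone `format_references` collapses into a two-line wrapper over it. A quick sketch of the round trip, with made-up sample text and behavior inferred from the diff above:

```python
# Invented sample; illustrates the split/format round trip.
sample = """AI tools are reshaping research workflows.

References:
[1] Smith, J. (2023). AI and Society.
[2] Doe, A. (2022). Machine Learning Basics."""

body, refs = split_text_from_refs(sample)
# body -> the article text with the reference block stripped
# refs -> "\n\nReferences:\n[1] Smith, J. (2023). AI and Society.\n..." (or "" if none)

print(format_references(sample))  # body + refs, re-numbered from [1]
```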
plagiarism.py CHANGED
@@ -4,24 +4,72 @@ from googleapiclient.discovery import build
 import asyncio
 import httpx
 from bs4 import BeautifulSoup
-import justext
-import newspaper
 from dotenv import load_dotenv
+import html2text
+import requests
 
 load_dotenv()
 
+# load html2text and set up configs
+h2t = html2text.HTML2Text()
+h2t.body_width = 0  # No wrapping
+h2t.ignore_links = True  # Ignore hyperlinks
+h2t.ignore_images = True  # Ignore images
+h2t.ignore_emphasis = True  # Ignore emphasis
+h2t.ignore_tables = False  # Include tables
+h2t.skip_internal_links = True  # Skip internal links
+h2t.skip_external_links = True  # Skip external links
+h2t.single_line_break = True  # Use single line breaks
+h2t.protect_links = True  # Protect links from being split
+h2t.default_image_alt = "[image]"  # Default alt text for images
+
 
 def clean_html(text):
-    result = ""
-    article = newspaper.Article(url=" ")
-    article.set_html(text)
-    article.parse()
-    result += article.title + "\n"
-    paragraphs = justext.justext(text, justext.get_stoplist("English"))
-    for paragraph in paragraphs:
-        if not paragraph.is_boilerplate:
-            result += paragraph.text
-    return result
+    return h2t.handle(text)
+
+
+def build_results_beautifulsoup(url_list):
+    # Scrape URLs in list
+    start_time = time.perf_counter()
+    soups = asyncio.run(parallel_scrap(url_list))
+    print("Scraping processing time: ", time.perf_counter() - start_time)
+    result_content = {}
+    num_pages = 3
+    count = 0
+    for url, soup in zip(url_list, soups):
+        if count >= num_pages:
+            break
+        if soup:
+            text = clean_html(soup.text)
+            if len(text) > 500:
+                result_content[url] = text
+                count += 1
+    return result_content
+
+
+def build_results_extractor(url_list):
+    try:
+        endpoint = "https://extractorapi.com/api/v1/extractor"
+        result_content = {}
+        num_pages = 3
+        count = 0
+        for url in url_list:
+            if count >= num_pages:
+                break
+            params = {"apikey": os.environ.get("EXTRACTOR_API_KEY"), "url": url}
+            r = requests.get(endpoint, params=params)
+            if r.status_code == 200:
+                text = r.json()["text"]
+                if len(text) > 500:
+                    result_content[url] = text
+                    count += 1
+            if r.status_code == 403:
+                raise Exception("Error with API; using default implementation instead")
+        return result_content
+
+    except Exception as e:
+        print(e)
+        return build_results_beautifulsoup(url_list)
 
 
 months = {
@@ -112,21 +160,6 @@ def google_search(
         api_key,
         cse_id,
     )
-    print("URLS: ", url_list)
-    print("GOOGLE SEARCH PROCESSING TIME: ", time.perf_counter() - start_time)
-    # Scrape URLs in list
-    start_time = time.perf_counter()
-    soups = asyncio.run(parallel_scrap(url_list))
-    print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time)
-    result_content = {}
-    num_pages = 3
-    count = 0
-    for url, soup in zip(url_list, soups):
-        if count >= num_pages:
-            break
-        if soup:
-            text = clean_html(soup.text)
-            if len(text) > 500:
-                result_content[url] = text
-                count += 1
+    print("Google Search processing time: ", time.perf_counter() - start_time)
+    result_content = build_results_beautifulsoup(url_list)
     return result_content
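The newspaper3k/jusText extraction pipeline is replaced by a single module-level `html2text.HTML2Text` converter, with `build_results_extractor` available as an ExtractorAPI-backed alternative that falls back to the BeautifulSoup path on any failure (it needs an `EXTRACTOR_API_KEY` environment variable). A rough illustration of what the configured converter does to a fragment; the output shape is indicative and exact spacing depends on the installed html2text version:

```python
# Indicative only: with links, images, and emphasis ignored, markup
# collapses to plain text and headings keep their "#" prefixes.
snippet = "<h1>Title</h1><p>Some <em>emphasized</em> text and <a href='https://example.com'>a link</a>.</p>"
print(clean_html(snippet))
# Expected shape of the output:
#   # Title
#   Some emphasized text and a link.
```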
requirements.txt CHANGED
@@ -11,8 +11,6 @@ scipy
 Unidecode
 BeautifulSoup4
 google-api-python-client
-newspaper3k
-jusText
 langchain-groq
 langchainhub
 sentence-transformers
@@ -25,4 +23,5 @@ google-generativeai
 langchain-google-genai
 langchain-anthropic
 langchain-openai
-vertexai
+vertexai
+html2text