eljanmahammadli committed
Commit ef88cd6 · 1 Parent(s): fb4d683

added exact keyword match search

Files changed (5):
  1. .gitignore     +1 -1
  2. ai_generate.py +3 -0
  3. app.py         +18 -9
  4. humanize.py    +0 -1
  5. plagiarism.py  +1 -5
.gitignore CHANGED
@@ -1,4 +1,4 @@
-_pycache_
+__pycache__/
 .env
 nohup.out
 *.out
ai_generate.py CHANGED
@@ -16,12 +16,14 @@ from langchain_core.runnables import RunnablePassthrough
 from langchain.chains import RetrievalQA
 from langchain_groq import ChatGroq
 from dotenv import load_dotenv
+
 load_dotenv()
 
 groq_client = Groq(
     api_key=os.environ.get("GROQ_API_KEY"),
 )
 
+
 def create_db_with_langchain(path):
     loader = PyMuPDFLoader(path)
     data = loader.load()
@@ -75,6 +77,7 @@ def generate_groq_base(text, model):
         response += chunk.choices[0].delta.content or ""
     return response
 
+
 def generate_groq(text, model, path):
     if path:
         return generate_groq_rag(text, model, path)
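Note: the generate_groq dispatch in the hunk above routes through the RAG pipeline only when a PDF path is supplied. A minimal usage sketch — the model name and file path below are hypothetical, not part of this commit:

    # Falls back to plain Groq generation when no PDF is given.
    answer_plain = generate_groq("Summarize the findings.", "llama3-70b-8192", None)
    # Builds a LangChain vector DB from the PDF and answers with retrieval.
    answer_rag = generate_groq("Summarize the findings.", "llama3-70b-8192", "report.pdf")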
app.py CHANGED
@@ -5,7 +5,7 @@ import re
 from humanize import paraphrase_text
 from ai_generate import generate
 import requests
-import language_tool_python
+import language_tool_python
 import torch
 from gradio_client import Client
 from transformers import GPT2LMHeadModel, GPT2TokenizerFast
@@ -83,6 +83,7 @@ def format_and_correct_language_check(text: str) -> str:
     tool = language_tool_python.LanguageTool("en-US")
     return tool.correct(text)
 
+
 def predict(model, tokenizer, text):
     text = remove_special_characters(text)
     bc_token_size = 256
@@ -415,6 +416,7 @@ def generate_and_format(
     month_to,
     day_to,
     domains_to_include,
+    search_keywords,
    pdf_file_input,
     generated_article: str = None,
     user_comments: str = None,
@@ -423,8 +425,13 @@ def generate_and_format(
     date_to = build_date(year_to, month_to, day_to)
     sorted_date = f"date:r:{date_from}:{date_to}"
     content_string = ""
+    final_query = topic
+    if search_keywords != "":
+        quoted_keywords = [f'"{keyword.strip()}"' for keyword in search_keywords.split(",")]
+        final_query = final_query + " " + " ".join(quoted_keywords)
+    print(final_query)
     if google_search_check:
-        url_content = google_search(topic, sorted_date, domains_to_include)
+        url_content = google_search(final_query, sorted_date, domains_to_include)
         content_string = "\n".join(
             f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in url_content.items()
         )
@@ -622,6 +629,12 @@ def create_interface():
         multiselect=True,
         label="Domains To Include",
     )
+    with gr.Row():
+        search_keywords = gr.Textbox(
+            label="Keywords",
+            placeholder="Enter comma-separated keywords",
+            elem_classes="input-highlight-yellow",
+        )
     gr.Markdown("# Add Optional PDF File with Information", elem_classes="text-center text-3xl mb-6")
     pdf_file_input = gr.File(label="Upload PDF")
 
@@ -734,6 +747,7 @@ def create_interface():
         month_to,
         day_to,
         domains_to_include,
+        search_keywords,
         pdf_file_input,
     ],
     outputs=[output_article],
@@ -767,6 +781,7 @@ def create_interface():
         domains_to_include,
         pdf_file_input,
         output_article,
+        search_keywords,
         ai_comments,
     ],
     outputs=[output_article],
@@ -791,12 +806,6 @@ def create_interface():
         outputs=[humanized_output],
     )
 
-
-
-
-
-
-
     copy_to_input_btn.click(
         fn=copy_to_input,
         inputs=[humanized_output],
@@ -809,4 +818,4 @@ def create_interface():
 if __name__ == "__main__":
     demo = create_interface()
     # demo.launch(server_name="0.0.0.0", share=True, server_port=7890)
-    demo.launch(server_name="0.0.0.0")
+    demo.launch(server_name="0.0.0.0")
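The core of this commit is the query construction added to generate_and_format: each comma-separated keyword is wrapped in double quotes, which Google treats as an exact-match term. A standalone sketch of that logic — the helper name build_search_query is mine, not part of the commit:

    def build_search_query(topic: str, search_keywords: str) -> str:
        """Append each comma-separated keyword as a quoted exact-match term."""
        final_query = topic
        if search_keywords != "":
            quoted_keywords = [f'"{keyword.strip()}"' for keyword in search_keywords.split(",")]
            final_query = final_query + " " + " ".join(quoted_keywords)
        return final_query

    # e.g. build_search_query("renewable energy", "solar panels, wind turbines")
    # -> 'renewable energy "solar panels" "wind turbines"'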
humanize.py CHANGED
@@ -19,7 +19,6 @@ else:
     device = torch.device("cpu")
 
 
-
 # Configuration for models and their adapters
 model_config = {
     "Base Model": "polygraf-ai/poly-humanizer-base",
plagiarism.py CHANGED
@@ -15,8 +15,7 @@ def clean_html(text):
     result += article.title + "\n"
     paragraphs = justext.justext(text, justext.get_stoplist("English"))
     for paragraph in paragraphs:
-        if not paragraph.is_boilerplate:
-            result += paragraph.text
+        result += paragraph.text
     return result
 
 
@@ -130,7 +129,4 @@ def google_search(
     text = clean_html(soup.text)
     result_content[url] = text
     count += 1
-    # for key, value in result_content.items():
-    #     print("-------------------URL: ", key)
-    #     print(value[:30])
     return result_content
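A side effect worth noting: dropping the is_boilerplate check means clean_html now keeps every justext paragraph, boilerplate (menus, cookie notices, footers) included. A minimal sketch of the two behaviors for comparison — the sample HTML is hypothetical:

    import justext

    html = "<html><body><p>Main article text.</p><p>Cookie notice.</p></body></html>"
    paragraphs = justext.justext(html, justext.get_stoplist("English"))

    # Before this commit: only content paragraphs survived.
    filtered = "".join(p.text for p in paragraphs if not p.is_boilerplate)
    # After this commit: all paragraphs are kept, boilerplate included.
    everything = "".join(p.text for p in paragraphs)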