Spaces: Runtime error
eljanmahammadli committed · Commit ef88cd6 · 1 Parent(s): fb4d683

added exact keyword match search

Files changed:
- .gitignore +1 -1
- ai_generate.py +3 -0
- app.py +18 -9
- humanize.py +0 -1
- plagiarism.py +1 -5
.gitignore
CHANGED
@@ -1,4 +1,4 @@
-
+__pycache__/
 .env
 nohup.out
 *.out
ai_generate.py
CHANGED
@@ -16,12 +16,14 @@ from langchain_core.runnables import RunnablePassthrough
 from langchain.chains import RetrievalQA
 from langchain_groq import ChatGroq
 from dotenv import load_dotenv
+
 load_dotenv()
 
 groq_client = Groq(
     api_key=os.environ.get("GROQ_API_KEY"),
 )
 
+
 def create_db_with_langchain(path):
     loader = PyMuPDFLoader(path)
     data = loader.load()
@@ -75,6 +77,7 @@ def generate_groq_base(text, model):
         response += chunk.choices[0].delta.content or ""
     return response
 
+
 def generate_groq(text, model, path):
     if path:
         return generate_groq_rag(text, model, path)
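Note on the generate_groq_base hunk above: the `response += chunk.choices[0].delta.content or ""` line is the usual streaming-accumulation pattern for the Groq SDK, where delta.content can be None on some chunks (e.g. the final one). A minimal self-contained sketch of that pattern, with the model name purely illustrative rather than taken from the repo:

import os
from groq import Groq

client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

def stream_completion(prompt: str, model: str = "llama3-8b-8192") -> str:
    # Accumulate streamed chunks; `or ""` guards against None deltas.
    response = ""
    stream = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        stream=True,
    )
    for chunk in stream:
        response += chunk.choices[0].delta.content or ""
    return response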
app.py
CHANGED
@@ -5,7 +5,7 @@ import re
 from humanize import paraphrase_text
 from ai_generate import generate
 import requests
-import language_tool_python
+import language_tool_python
 import torch
 from gradio_client import Client
 from transformers import GPT2LMHeadModel, GPT2TokenizerFast
@@ -83,6 +83,7 @@ def format_and_correct_language_check(text: str) -> str:
     tool = language_tool_python.LanguageTool("en-US")
     return tool.correct(text)
 
+
 def predict(model, tokenizer, text):
     text = remove_special_characters(text)
     bc_token_size = 256
@@ -415,6 +416,7 @@ def generate_and_format(
     month_to,
     day_to,
     domains_to_include,
+    search_keywords,
     pdf_file_input,
     generated_article: str = None,
     user_comments: str = None,
@@ -423,8 +425,13 @@ def generate_and_format(
     date_to = build_date(year_to, month_to, day_to)
     sorted_date = f"date:r:{date_from}:{date_to}"
     content_string = ""
+    final_query = topic
+    if search_keywords != "":
+        quoted_keywords = [f'"{keyword.strip()}"' for keyword in search_keywords.split(",")]
+        final_query = final_query + " " + " ".join(quoted_keywords)
+    print(final_query)
     if google_search_check:
-        url_content = google_search(
+        url_content = google_search(final_query, sorted_date, domains_to_include)
         content_string = "\n".join(
             f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in url_content.items()
         )
@@ -622,6 +629,12 @@ def create_interface():
                 multiselect=True,
                 label="Domains To Include",
             )
+            with gr.Row():
+                search_keywords = gr.Textbox(
+                    label="Keywords",
+                    placeholder="Enter comma-separated keywords",
+                    elem_classes="input-highlight-yellow",
+                )
             gr.Markdown("# Add Optional PDF File with Information", elem_classes="text-center text-3xl mb-6")
             pdf_file_input = gr.File(label="Upload PDF")
 
@@ -734,6 +747,7 @@ def create_interface():
             month_to,
             day_to,
             domains_to_include,
+            search_keywords,
             pdf_file_input,
         ],
         outputs=[output_article],
@@ -767,6 +781,7 @@ def create_interface():
             domains_to_include,
             pdf_file_input,
             output_article,
+            search_keywords,
             ai_comments,
         ],
         outputs=[output_article],
@@ -791,12 +806,6 @@ def create_interface():
         outputs=[humanized_output],
     )
 
-
-
-
-
-
-
     copy_to_input_btn.click(
         fn=copy_to_input,
         inputs=[humanized_output],
@@ -809,4 +818,4 @@ def create_interface():
 if __name__ == "__main__":
     demo = create_interface()
     # demo.launch(server_name="0.0.0.0", share=True, server_port=7890)
-    demo.launch(server_name="0.0.0.0")
+    demo.launch(server_name="0.0.0.0")
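The core of this commit is the exact-match query builder added to generate_and_format above: each comma-separated keyword from the new Keywords textbox is wrapped in double quotes, which Google treats as an exact-phrase requirement, and the quoted phrases are appended to the topic before google_search runs. A standalone sketch of the same logic, where the function name and example inputs are illustrative and not from the repo:

def build_exact_match_query(topic: str, search_keywords: str) -> str:
    # Wrap each comma-separated keyword in double quotes so the search
    # engine matches it as an exact phrase, then append to the topic.
    final_query = topic
    if search_keywords != "":
        quoted_keywords = [f'"{keyword.strip()}"' for keyword in search_keywords.split(",")]
        final_query = final_query + " " + " ".join(quoted_keywords)
    return final_query

# Example (hypothetical inputs):
# build_exact_match_query("renewable energy", "solar panels, wind turbines")
# -> 'renewable energy "solar panels" "wind turbines"'

When the textbox is left empty the builder falls through to the bare topic, so existing searches behave as before.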
humanize.py
CHANGED
@@ -19,7 +19,6 @@ else:
     device = torch.device("cpu")
 
 
-
 # Configuration for models and their adapters
 model_config = {
     "Base Model": "polygraf-ai/poly-humanizer-base",
plagiarism.py
CHANGED
@@ -15,8 +15,7 @@ def clean_html(text):
     result += article.title + "\n"
     paragraphs = justext.justext(text, justext.get_stoplist("English"))
     for paragraph in paragraphs:
-
-        result += paragraph.text
+        result += paragraph.text
     return result
 
 
@@ -130,7 +129,4 @@ def google_search(
         text = clean_html(soup.text)
         result_content[url] = text
         count += 1
-    # for key, value in result_content.items():
-    #     print("-------------------URL: ", key)
-    #     print(value[:30])
     return result_content
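For context on the clean_html change: justext segments a fetched page into paragraph objects, and after this commit the loop concatenates every paragraph's text. A self-contained sketch of that extraction step, assuming raw HTML as input (the helper name is illustrative):

import justext

def extract_text(html: bytes) -> str:
    # Split the page into paragraphs using justext's English stoplist,
    # then concatenate the text of every paragraph, matching the
    # simplified loop in clean_html.
    result = ""
    paragraphs = justext.justext(html, justext.get_stoplist("English"))
    for paragraph in paragraphs:
        result += paragraph.text
    return result

justext paragraphs also expose an is_boilerplate flag that callers often check before concatenating; the simplified loop here keeps every paragraph.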