# Hugging Face Spaces page header captured with this file (Space status: Runtime error).
import openai | |
import gradio as gr | |
from typing import Dict, List | |
import re | |
from humanize import paraphrase_text | |
from ai_generate import generate | |
import requests | |
import language_tool_python | |
import torch | |
from gradio_client import Client | |
from transformers import GPT2LMHeadModel, GPT2TokenizerFast | |
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline | |
from scipy.special import softmax | |
from collections import defaultdict | |
import nltk | |
from utils import remove_special_characters | |
from plagiarism import google_search, months, domain_list, build_date | |
from datetime import date | |
# Pick the compute device once at import time: CUDA when torch reports it, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Detector display name -> Hugging Face checkpoint id. Both lookup tables below
# are derived from this single mapping so their keys always match.
_DETECTOR_CHECKPOINTS = {
    "Polygraf AI Watson (Base Model)": "polygraf-ai/bc-roberta-openai-2sent",
    "Polygraf AI Sherlock (Advanced Model)": "polygraf-ai/bc_combined_3sent",
}

# Sequence-classification models, moved to `device`, keyed by display name.
models = {
    label: AutoModelForSequenceClassification.from_pretrained(checkpoint).to(device)
    for label, checkpoint in _DETECTOR_CHECKPOINTS.items()
}

# Matching tokenizers, keyed by the same display names.
tokenizers = {
    label: AutoTokenizer.from_pretrained(checkpoint)
    for label, checkpoint in _DETECTOR_CHECKPOINTS.items()
}
def to_device(model):
    """Move ``model`` onto the module-level ``device`` and return the result."""
    moved = model.to(device)
    return moved
def copy_to_input(text):
    """Return ``text`` unchanged.

    Used as a Gradio callback to copy one component's value into another.
    """
    copied = text
    return copied
def remove_bracketed_numbers(text):
    """Strip a leading ``[<digits>]`` marker from ``text``.

    Only a marker at the very start of the string is removed; whitespace that
    follows the marker is kept.
    """
    leading_marker = re.compile(r"^\[\d+\]")
    return leading_marker.sub("", text)
def clean_text(text: str) -> str:
    """Normalise whitespace and sentence capitalisation, paragraph by paragraph.

    Paragraphs are the pieces of ``text`` separated by a blank line (``"\\n\\n"``).
    Inside each paragraph every run of whitespace collapses to one space, and a
    lowercase letter that follows ``". "`` is upper-cased.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned paragraphs joined with a single newline.
    """
    cleaned_paragraphs = []
    for paragraph in text.split("\n\n"):
        cleaned = re.sub(r"\s+", " ", paragraph).strip()
        # The space sits inside the lookbehind so it is NOT consumed: the old
        # pattern matched " x" and replaced it with "X", gluing sentences
        # together ("one. two" -> "one.Two").
        cleaned = re.sub(r"(?<=\. )([a-z])", lambda m: m.group(1).upper(), cleaned)
        cleaned_paragraphs.append(cleaned)
    return "\n".join(cleaned_paragraphs)
def format_and_correct(text: str) -> str:
    """Ask the "Groq" generator to fix formatting, grammar and spelling in ``text``.

    The reply from ``generate`` is passed through ``clean_text`` before being
    returned.
    """
    instruction = f"""
Please correct the formatting, grammar, and spelling errors in the following text without changing its content significantly. Ensure proper paragraph breaks and maintain the original content:
{text}
"""
    return clean_text(generate(instruction, "Groq", None))
def format_and_correct_para(text: str) -> str:
    """Run ``format_and_correct`` on every newline-separated line of ``text``.

    The corrected pieces are joined back together with a blank line between them.
    """
    return "\n\n".join(format_and_correct(piece) for piece in text.split("\n"))
def format_and_correct_language_check(text: str) -> str:
    """Apply LanguageTool's (en-US) automatic corrections to ``text``.

    Args:
        text: Text to correct.

    Returns:
        The text as returned by ``LanguageTool.correct``.
    """
    tool = language_tool_python.LanguageTool("en-US")
    try:
        return tool.correct(text)
    finally:
        # A fresh tool is created for every call; close it so instances (and
        # whatever backend each one holds) do not pile up across requests.
        tool.close()
def predict(model, tokenizer, text):
    """Classify ``text`` with ``model`` and return HUMAN / AI probabilities.

    The text is first passed through ``remove_special_characters``, then
    tokenised with padding/truncation to 256 tokens. The logits of the single
    input are soft-maxed; index 0 is reported as "HUMAN" and index 1 as "AI".
    """
    max_tokens = 256
    cleaned = remove_special_characters(text)
    model.eval()
    with torch.no_grad():
        encoded = tokenizer(
            cleaned,
            padding="max_length",
            truncation=True,
            max_length=max_tokens,
            return_tensors="pt",
        ).to(device)
        logits = model(**encoded).logits
        probabilities = softmax(logits.detach().cpu().numpy(), 1)[0]
    human_prob, ai_prob = probabilities[0], probabilities[1]
    return {"HUMAN": human_prob, "AI": ai_prob}
def ai_generated_test(text, model="Polygraf AI Watson (Base Model)"):
    """Score ``text`` with the detector registered under ``model``.

    Args:
        text: Text to classify.
        model: Key into the module-level ``models`` / ``tokenizers`` tables.
            The previous default ("BC Original") is not a key of those tables,
            so calling without ``model`` always raised ``KeyError``; the default
            is now the base detector.

    Returns:
        The ``{"HUMAN": ..., "AI": ...}`` mapping produced by ``predict``.

    Raises:
        KeyError: If ``model`` is not a registered detector name.
    """
    return predict(models[model], tokenizers[model], text)
def process_text(text, model="Polygraf AI Watson (Base Model)"):
    """Score ``text`` sentence by sentence and build highlighted HTML.

    Every window of up to three consecutive sentences is scored with
    ``ai_generated_test``; each sentence's score is the mean "AI" score of the
    windows that contain it. Sentences averaging >= 0.65 are wrapped in a
    red-background ``<span>``.

    Args:
        text: Article text; non-blank lines are treated as paragraphs.
        model: Detector name forwarded to ``ai_generated_test``. The previous
            default ("BC Original") is not a registered detector, so the
            default is now the base detector.

    Returns:
        A tuple ``(overall, html)``: ``overall`` is ``{"HUMAN": 1 - s, "AI": s}``
        with ``s`` the mean of the per-sentence averages, and ``html`` is the
        paragraphs joined with ``<br><br>`` and passed through
        ``format_references``. When nothing could be scored (e.g. empty text)
        the result is ``({"HUMAN": 0.0, "AI": 0.0}, "")`` instead of a
        ``ZeroDivisionError``.
    """
    sentences = nltk.sent_tokenize(text)
    num_sentences = len(sentences)
    scores = defaultdict(list)

    # Score each sliding window of 3 sentences and credit every member sentence.
    for i in range(num_sentences):
        chunk = " ".join(sentences[i : i + 3])
        if chunk:
            score = ai_generated_test(chunk, model)["AI"]
            for j in range(i, min(i + 3, num_sentences)):
                scores[j].append(score)

    # Average per sentence and colour-code, walking paragraph by paragraph.
    # NOTE(review): `i` indexes sentences produced by per-paragraph tokenisation,
    # while `scores` was filled from whole-text tokenisation; this assumes both
    # yield the same sentence sequence - confirm.
    # NOTE(review): sentences are inserted into HTML without escaping.
    paragraphs = [p for p in text.split("\n") if p.strip()]
    colored_paragraphs = []
    overall_scores = []
    i = 0
    for paragraph in paragraphs:
        colored_sentences = []
        for sentence in nltk.sent_tokenize(paragraph):
            if scores[i]:
                avg_score = sum(scores[i]) / len(scores[i])
                if avg_score >= 0.65:
                    colored_sentence = f"<span style='background-color:red;'>{sentence}</span>"
                else:
                    colored_sentence = sentence
                colored_sentences.append(colored_sentence)
                overall_scores.append(avg_score)
            i += 1
        combined_sentences = " ".join(colored_sentences)
        print(combined_sentences)
        colored_paragraphs.append(combined_sentences)

    if not overall_scores:
        # Nothing was scored; avoid dividing by zero below.
        return {"HUMAN": 0.0, "AI": 0.0}, ""

    overall_score = sum(overall_scores) / len(overall_scores)
    overall_score = {"HUMAN": 1 - overall_score, "AI": overall_score}
    return overall_score, format_references("<br><br>".join(colored_paragraphs))
# Detector names offered by the "Select AI Detector" radio in the UI.
ai_check_options = ["Polygraf AI Watson (Base Model)", "Polygraf AI Sherlock (Advanced Model)"]
def ai_generated_test_sapling(text: str) -> Dict:
    """Score ``text`` with the Sapling AI-detection HTTP API.

    Args:
        text: Text to classify.

    Returns:
        ``{"AI": score, "HUMAN": 1 - score}`` where ``score`` is the ``"score"``
        field of the JSON response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If no response arrives within 30 seconds.
        KeyError: If the response JSON has no ``"score"`` field.
    """
    import os  # local import: `os` is not among this module's top-level imports

    # NOTE(security): a credential is hard-coded in source. It is kept only as a
    # fallback so existing deployments keep working; prefer SAPLING_API_KEY and
    # rotate/remove the embedded key.
    api_key = os.environ.get("SAPLING_API_KEY", "60L9BPSVPIIOEZM0CD1DQWRBPJIUR7SB")
    response = requests.post(
        "https://api.sapling.ai/api/v1/aidetect",
        json={"key": api_key, "text": f"{text}"},
        timeout=30,
    )
    response.raise_for_status()
    score = response.json()["score"]  # parse the body once
    return {"AI": score, "HUMAN": 1 - score}
class GPT2PPL:
    """Perplexity scorer built on the pretrained ``gpt2`` language model."""

    def __init__(self):
        """Load the ``gpt2`` model (moved to the module-level device) and tokenizer."""
        self.device = device
        self.model = to_device(GPT2LMHeadModel.from_pretrained("gpt2"))
        self.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

    def __call__(self, text):
        """Compute a perplexity-style score for ``text``.

        The token sequence is processed in strides of 512 tokens; each window
        may include earlier tokens as context (up to the model's
        ``n_positions``) whose labels are masked with -100 so only the new
        tokens contribute to the loss.

        Args:
            text: Text to score.

        Returns:
            ``{"AI": ppl, "HUMAN": 1 - ppl}`` where ``ppl`` is
            ``exp(sum(window losses * window target length) / sequence length)``.
            NOTE(review): ``ppl`` is not bounded to [0, 1], so these values are
            not probabilities - confirm how callers interpret them.

        Raises:
            ValueError: If ``text`` tokenises to zero tokens.
        """
        encodings = self.tokenizer(text, return_tensors="pt")
        # Index by key: the tokenizer output is turned into a plain dict on the
        # next line in the original flow, and a dict has no `.input_ids`
        # attribute (that made every call raise AttributeError).
        all_input_ids = encodings["input_ids"].to(self.device)
        max_length = self.model.config.n_positions
        stride = 512
        seq_len = all_input_ids.size(1)
        if seq_len == 0:
            raise ValueError("Cannot compute perplexity of empty text")

        nlls = []
        end_loc = 0
        for i in range(0, seq_len, stride):
            begin_loc = max(i + stride - max_length, 0)
            end_loc = min(i + stride, seq_len)
            trg_len = end_loc - i  # tokens that are new in this window
            input_ids = all_input_ids[:, begin_loc:end_loc].to(self.device)
            target_ids = input_ids.clone()
            target_ids[:, :-trg_len] = -100  # mask the context-only tokens
            with torch.no_grad():
                outputs = self.model(input_ids, labels=target_ids)
                neg_log_likelihood = outputs.loss * trg_len
            nlls.append(neg_log_likelihood)

        # After the loop `end_loc` equals the total number of tokens scored.
        ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
        return {"AI": float(ppl), "HUMAN": 1 - float(ppl)}
def ai_generated_test_gptzero(text):
    """Score ``text`` with a freshly constructed ``GPT2PPL`` scorer.

    Returns a ``(scores, None)`` pair; the scores are also printed.
    """
    scorer = GPT2PPL()
    scores = scorer(text)
    print(scores)
    return scores, None
def highlighter_polygraf(text, model="Polygraf AI Watson (Base Model)"):
    """Delegate to ``process_text`` with the given detector name."""
    result = process_text(text=text, model=model)
    return result
def ai_check(text: str, option: str):
    """Run the AI detector selected by ``option`` on ``text``.

    Args:
        text: Text to analyse.
        option: Detector name. ``"Sapling AI"`` and ``"GPTZero"`` select those
            detectors; any other value is forwarded to ``highlighter_polygraf``
            as the model name.

    Returns:
        A 2-tuple ``(scores, highlighted_html_or_None)``. Every branch returns
        two values because the UI binds this callback to two outputs; the
        Sapling branch previously returned only the score dict.
    """
    if option == "Sapling AI":
        return ai_generated_test_sapling(text), None
    if option == "GPTZero":
        return ai_generated_test_gptzero(text)
    # "Polygraf AI ..." names and any unrecognised option take the same path.
    return highlighter_polygraf(text, option)
def generate_prompt(settings: Dict[str, str]) -> str:
    """Build the article-writing prompt from ``settings``.

    ``settings["sources"]`` maps URL -> content; each content is stripped and
    cut to its first 2000 characters. ``structure``, ``keywords`` and
    ``references`` are iterables of strings that are joined with ``", "``.
    """
    source_entries = []
    for url, content in settings["sources"].items():
        source_entries.append(f"{url.strip()}: \n{content.strip()[:2000]}")
    sources_block = "\n".join(source_entries)

    return f"""
Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.
Style and Tone:
- Writing style: {settings['writing_style']}
- Tone: {settings['tone']}
- Target audience: {settings['user_category']}
Content:
- Depth: {settings['depth_of_content']}
- Structure: {', '.join(settings['structure'])}
Keywords to incorporate:
{', '.join(settings['keywords'])}
Additional requirements:
- Include {settings['num_examples']} relevant examples or case studies
- Incorporate data or statistics from {', '.join(settings['references'])}
- End with a {settings['conclusion_type']} conclusion
- Add a "References" section at the end with at least 3 credible sources, formatted as [1], [2], etc.
- Do not make any headline, title bold.
Use the content here from the URLs I've found for you:
{sources_block}
Ensure proper paragraph breaks for better readability.
Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
"""
def regenerate_prompt(settings: Dict[str, str]) -> str:
    """Build the prompt that asks for an edit of an existing article.

    Uses ``settings["generated_article"]`` (quoted at the top of the prompt),
    ``settings["user_comments"]`` and ``settings["sources"]`` (URL -> content,
    each content stripped and cut to 2000 characters).
    """
    source_entries = []
    for url, content in settings["sources"].items():
        source_entries.append(f"{url.strip()}: \n{content.strip()[:2000]}")
    sources_block = "\n".join(source_entries)

    return f"""
"{settings['generated_article']}"
Edit the given text based on user comments.
Comments:
- {settings['user_comments']}
- The original content should not be changed. Make minor modifications based on user comments above.
- Keep the references the same as the given text in the same format.
- Do not make any headline, title bold.
Use the content here from the URLs I've found for you:
{sources_block}
Ensure proper paragraph breaks for better readability.
Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
"""
def generate_article(
    topic: str,
    keywords: str,
    article_length: str,
    format: str,
    writing_style: str,
    tone: str,
    user_category: str,
    depth_of_content: str,
    structure: str,
    references: str,
    num_examples: str,
    conclusion_type: str,
    ai_model: str,
    sorted_date,
    domains_to_skip,
    api_key: str = None,
    generated_article: str = None,
    user_comments: str = None,
) -> str:
    """Generate (or regenerate) an article and return it cleaned by ``clean_text``.

    Source material is fetched with ``google_search(topic, sorted_date,
    domains_to_skip)`` and embedded in the prompt. When ``generated_article``
    is truthy the edit prompt (``regenerate_prompt``) is used, otherwise the
    fresh-article prompt (``generate_prompt``).

    Args:
        topic: Article topic; also the search query.
        keywords, structure, references: Comma-separated strings, split into
            stripped items.
        article_length, format, writing_style, tone, user_category,
            depth_of_content, num_examples, conclusion_type: Inserted into the
            prompt as given.
        ai_model: ``"OpenAI GPT 3.5"`` / ``"OpenAI GPT 4"`` use the OpenAI chat
            API; any other value is handed to ``generate``.
        sorted_date: Date filter forwarded to ``google_search``.
        domains_to_skip: Domains forwarded to ``google_search``.
        api_key: API key for the chosen model. It is now also forwarded to the
            OpenAI call, which previously ignored it.
        generated_article: Existing article to edit, if any.
        user_comments: Edit instructions used together with ``generated_article``.

    Returns:
        The model's article text after ``clean_text``.
    """
    url_content = google_search(topic, sorted_date, domains_to_skip)
    settings = {
        "topic": topic,
        "keywords": [k.strip() for k in keywords.split(",")],
        "article_length": article_length,
        "format": format,
        "writing_style": writing_style,
        "tone": tone,
        "user_category": user_category,
        "depth_of_content": depth_of_content,
        "structure": [s.strip() for s in structure.split(",")],
        "references": [r.strip() for r in references.split(",")],
        "num_examples": num_examples,
        "conclusion_type": conclusion_type,
        "sources": url_content,
        "generated_article": generated_article,
        "user_comments": user_comments,
    }
    prompt = regenerate_prompt(settings) if generated_article else generate_prompt(settings)
    print(prompt)

    if ai_model in ["OpenAI GPT 3.5", "OpenAI GPT 4"]:
        response = openai.ChatCompletion.create(
            # Empty string (blank textbox) becomes None so the library's own
            # key configuration still applies.
            api_key=api_key or None,
            model="gpt-4" if ai_model == "OpenAI GPT 4" else "gpt-3.5-turbo",
            messages=[
                {
                    "role": "system",
                    "content": "You are a professional content writer with expertise in various fields.",
                },
                {"role": "user", "content": prompt},
            ],
            max_tokens=3000,
            n=1,
            stop=None,
            temperature=0.7,
        )
        article = response.choices[0].message.content.strip()
    else:
        article = generate(prompt, ai_model, api_key)
    return clean_text(article)
def humanize(
    text: str,
    model: str,
    temperature: float = 1.2,
    repetition_penalty: float = 1,
    top_k: int = 50,
    length_penalty: float = 1,
) -> str:
    """Paraphrase ``text`` with ``paraphrase_text`` and then grammar-correct it.

    All sampling parameters are forwarded unchanged; ``model`` is passed as
    ``model_name``. The paraphrase is post-processed by
    ``format_and_correct_language_check``.
    """
    paraphrase_kwargs = {
        "text": text,
        "model_name": model,
        "temperature": temperature,
        "repetition_penalty": repetition_penalty,
        "top_k": top_k,
        "length_penalty": length_penalty,
    }
    paraphrased = paraphrase_text(**paraphrase_kwargs)
    return format_and_correct_language_check(paraphrased)
def update_visibility_api(model: str):
    """Return a Gradio update that is visible only for the OpenAI model choices."""
    needs_api_key = model in ["OpenAI GPT 3.5", "OpenAI GPT 4"]
    return gr.update(visible=needs_api_key)
def format_references(text: str) -> str:
    """Separate the article body from its reference list and renumber the list.

    A line equal to "references" or "references:" (case-insensitive, ignoring
    surrounding whitespace) starts the reference section. Lines before it are
    the article body and are joined with a blank line between them. Non-blank
    lines after it are references: any leading ``[n]`` marker is dropped and
    they are renumbered ``[1]``, ``[2]``, ... in order.

    Fixes over the previous behaviour: blank lines in the reference section are
    no longer numbered as empty references, the doubled space after the number
    is gone, and no "References:" header is appended when there are no
    references (e.g. for the highlighted HTML, which has no newlines).

    Args:
        text: Article text, possibly containing a references section.

    Returns:
        The body, followed by ``"\\n\\nReferences:\\n"`` and the numbered
        references when at least one reference was found.
    """
    article_lines = []
    references = []
    in_references = False
    for line in text.split("\n"):
        stripped = line.strip()
        if stripped.lower() in ("references", "references:"):
            in_references = True
            continue
        if not in_references:
            article_lines.append(line)
        elif stripped:
            references.append(stripped)

    body = "\n\n".join(article_lines)
    if not references:
        return body

    formatted_refs = []
    for number, ref in enumerate(references, 1):
        # Drop an existing "[n]" marker together with the whitespace after it.
        ref_text = re.sub(r"^\[\d+\]\s*", "", ref)
        formatted_refs.append(f"[{number}] {ref_text}\n")
    return body + "\n\nReferences:\n" + "\n".join(formatted_refs)
def generate_and_format(
    topic,
    keywords,
    article_length,
    format,
    writing_style,
    tone,
    user_category,
    depth_of_content,
    structure,
    references,
    num_examples,
    conclusion_type,
    ai_model,
    api_key,
    year_from,
    month_from,
    day_from,
    year_to,
    month_to,
    day_to,
    domains_to_skip,
    generated_article: str = None,
    user_comments: str = None,
):
    """Gradio callback: build the date filter, generate the article, format references.

    The from/to date parts are turned into dates with ``build_date`` and
    combined into a ``date:r:<from>:<to>`` filter string for the search. All
    other arguments are forwarded to ``generate_article``.

    Returns:
        The generated article after ``format_references``.
    """
    date_from = build_date(year_from, month_from, day_from)
    date_to = build_date(year_to, month_to, day_to)
    sorted_date = f"date:r:{date_from}:{date_to}"
    # Pass by keyword: generate_article's parameter order is
    # (..., ai_model, sorted_date, domains_to_skip, api_key, ...), so the old
    # positional call put api_key into sorted_date, sorted_date into
    # domains_to_skip and domains_to_skip into api_key.
    article = generate_article(
        topic=topic,
        keywords=keywords,
        article_length=article_length,
        format=format,
        writing_style=writing_style,
        tone=tone,
        user_category=user_category,
        depth_of_content=depth_of_content,
        structure=structure,
        references=references,
        num_examples=num_examples,
        conclusion_type=conclusion_type,
        ai_model=ai_model,
        sorted_date=sorted_date,
        domains_to_skip=domains_to_skip,
        api_key=api_key,
        generated_article=generated_article,
        user_comments=user_comments,
    )
    return format_references(article)
def create_interface():
    """Build the Gradio Blocks UI for the article writer and return it.

    The left column holds the article configuration, search options, model
    selection and humanizer settings; the right column holds the generated
    article, the AI-detection controls/results and the humanized output.
    Button and change events are wired to the module-level callbacks.

    NOTE(review): the source this was reconstructed from had lost its
    indentation, so the exact nesting of widgets inside Row/Group containers is
    a best-effort reconstruction - confirm the layout.

    Returns:
        The ``gr.Blocks`` application (not yet launched).
    """
    with gr.Blocks(
        theme=gr.themes.Default(
            primary_hue=gr.themes.colors.pink, secondary_hue=gr.themes.colors.yellow, neutral_hue=gr.themes.colors.gray
        ),
        css="""
.input-highlight-pink block_label {background-color: #008080}
""",
    ) as demo:
        # Today's date as [day, month name, year], used as the "To" defaults.
        today = date.today()
        d1 = today.strftime("%d/%B/%Y")
        d1 = d1.split("/")

        gr.Markdown("# Polygraf AI Content Writer", elem_classes="text-center text-3xl mb-6")
        with gr.Row():
            with gr.Column(scale=2):
                with gr.Group():
                    gr.Markdown("## Article Configuration", elem_classes="text-xl mb-4")
                    input_topic = gr.Textbox(
                        label="Topic",
                        placeholder="Enter the main topic of your article",
                        elem_classes="input-highlight-pink",
                    )
                    input_keywords = gr.Textbox(
                        label="Keywords",
                        placeholder="Enter comma-separated keywords",
                        elem_classes="input-highlight-yellow",
                    )
                    with gr.Row():
                        input_format = gr.Dropdown(
                            choices=[
                                "Article",
                                "Essay",
                                "Blog post",
                                "Report",
                                "Research paper",
                                "News article",
                                "White paper",
                            ],
                            value="Article",
                            label="Format",
                            elem_classes="input-highlight-turquoise",
                        )
                        input_length = gr.Slider(
                            minimum=50,
                            maximum=5000,
                            step=50,
                            value=300,
                            label="Article Length",
                            elem_classes="input-highlight-pink",
                        )
                    with gr.Row():
                        input_writing_style = gr.Dropdown(
                            choices=[
                                "Formal",
                                "Informal",
                                "Technical",
                                "Conversational",
                                "Journalistic",
                                "Academic",
                                "Creative",
                            ],
                            value="Formal",
                            label="Writing Style",
                            elem_classes="input-highlight-yellow",
                        )
                        input_tone = gr.Dropdown(
                            choices=["Friendly", "Professional", "Neutral", "Enthusiastic", "Skeptical", "Humorous"],
                            value="Professional",
                            label="Tone",
                            elem_classes="input-highlight-turquoise",
                        )
                    input_user_category = gr.Dropdown(
                        choices=[
                            "Students",
                            "Professionals",
                            "Researchers",
                            "General Public",
                            "Policymakers",
                            "Entrepreneurs",
                        ],
                        value="General Public",
                        label="Target Audience",
                        elem_classes="input-highlight-pink",
                    )
                    input_depth = gr.Dropdown(
                        choices=[
                            "Surface-level overview",
                            "Moderate analysis",
                            "In-depth research",
                            "Comprehensive study",
                        ],
                        value="Moderate analysis",
                        label="Depth of Content",
                        elem_classes="input-highlight-yellow",
                    )
                    input_structure = gr.Dropdown(
                        choices=[
                            "Introduction, Body, Conclusion",
                            "Abstract, Introduction, Methods, Results, Discussion, Conclusion",
                            "Executive Summary, Problem Statement, Analysis, Recommendations, Conclusion",
                            "Introduction, Literature Review, Methodology, Findings, Analysis, Conclusion",
                        ],
                        value="Introduction, Body, Conclusion",
                        label="Structure",
                        elem_classes="input-highlight-turquoise",
                    )
                    input_references = gr.Dropdown(
                        choices=[
                            "Academic journals",
                            "Industry reports",
                            "Government publications",
                            "News outlets",
                            "Expert interviews",
                            "Case studies",
                        ],
                        value="News outlets",
                        label="References",
                        elem_classes="input-highlight-pink",
                    )
                    input_num_examples = gr.Dropdown(
                        choices=["1-2", "3-4", "5+"],
                        value="1-2",
                        label="Number of Examples/Case Studies",
                        elem_classes="input-highlight-yellow",
                    )
                    input_conclusion = gr.Dropdown(
                        choices=["Summary", "Call to Action", "Future Outlook", "Thought-provoking Question"],
                        value="Call to Action",
                        label="Conclusion Type",
                        elem_classes="input-highlight-turquoise",
                    )

                gr.Markdown("# Search Options", elem_classes="text-center text-3xl mb-6")
                with gr.Group():
                    with gr.Row():
                        month_from = gr.Dropdown(
                            choices=months,
                            label="From Month",
                            value="January",
                            interactive=True,
                        )
                        day_from = gr.Textbox(label="From Day", value="01")
                        year_from = gr.Textbox(label="From Year", value="2000")
                    with gr.Row():
                        month_to = gr.Dropdown(
                            choices=months,
                            label="To Month",
                            value=d1[1],
                            interactive=True,
                        )
                        day_to = gr.Textbox(label="To Day", value=d1[0])
                        year_to = gr.Textbox(label="To Year", value=d1[2])
                    with gr.Row():
                        domains_to_skip = gr.Dropdown(
                            domain_list,
                            multiselect=True,
                            label="Domain To Skip",
                        )

                with gr.Group():
                    gr.Markdown("## AI Model Configuration", elem_classes="text-xl mb-4")
                    ai_generator = gr.Dropdown(
                        choices=["Llama 3", "Groq", "Mistral", "Gemma", "OpenAI GPT 3.5", "OpenAI GPT 4"],
                        value="Llama 3",
                        label="AI Model",
                        elem_classes="input-highlight-pink",
                    )
                    # The API-key box is only shown for the OpenAI models.
                    input_api = gr.Textbox(label="API Key", visible=False)
                    ai_generator.change(update_visibility_api, ai_generator, input_api)

                generate_btn = gr.Button("Generate Article", variant="primary")

                with gr.Accordion("Advanced Humanizer Settings", open=False):
                    with gr.Row():
                        model_dropdown = gr.Radio(
                            choices=[
                                "Base Model",
                                "Large Model",
                                "XL Model",
                                # "XL Law Model",
                                # "XL Marketing Model",
                                # "XL Child Style Model",
                            ],
                            value="Large Model",
                            label="Humanizer Model Version",
                        )
                    with gr.Row():
                        temperature_slider = gr.Slider(
                            minimum=0.5, maximum=2.0, step=0.1, value=1.3, label="Temperature"
                        )
                        top_k_slider = gr.Slider(minimum=0, maximum=300, step=25, value=50, label="Top k")
                    with gr.Row():
                        repetition_penalty_slider = gr.Slider(
                            minimum=1.0, maximum=2.0, step=0.1, value=1, label="Repetition Penalty"
                        )
                        length_penalty_slider = gr.Slider(
                            minimum=0.0, maximum=2.0, step=0.1, value=1.0, label="Length Penalty"
                        )

            with gr.Column(scale=3):
                output_article = gr.Textbox(label="Generated Article", lines=20)
                ai_comments = gr.Textbox(
                    label="Add comments to help edit generated text", interactive=True, visible=False
                )
                regenerate_btn = gr.Button("Regenerate Article", variant="primary", visible=False)
                # The default must be one of the choices: the old value
                # "Polygraf AI" was not, and was forwarded as a model name that
                # does not exist in the `models` table.
                ai_detector_dropdown = gr.Radio(
                    choices=ai_check_options, label="Select AI Detector", value=ai_check_options[0]
                )
                ai_check_btn = gr.Button("AI Check")
                with gr.Accordion("AI Detection Results", open=True):
                    ai_check_result = gr.Label(label="AI Check Result")
                    highlighted_text = gr.HTML(label="Sentence Breakdown", visible=False)
                humanize_btn = gr.Button("Humanize")
                # humanized_output = gr.Textbox(label="Humanized Article", lines=20, elem_classes=["custom-textbox"])
                humanized_output = gr.Markdown(label="Humanized Article", value="\n\n\n\n", render=True)
                copy_to_input_btn = gr.Button("Copy to Input for AI Check")

        def become_visible(text):
            """Show the target component only when ``text`` is non-empty."""
            return gr.update(visible=bool(text))

        def highlight_visible(text):
            """Show the sentence breakdown only for "Polygraf..." detectors."""
            # `text` may be None when no detector is selected.
            return gr.update(visible=bool(text) and text.startswith("Polygraf"))

        ai_detector_dropdown.change(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
        output_article.change(become_visible, inputs=output_article, outputs=ai_comments)
        # NOTE(review): visibility of the regenerate button is driven by the
        # article text, not by the comments box that triggers the event -
        # confirm this is intended.
        ai_comments.change(become_visible, inputs=output_article, outputs=regenerate_btn)
        ai_check_btn.click(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)

        # Inputs shared by "Generate" and "Regenerate", in the positional order
        # expected by generate_and_format.
        article_inputs = [
            input_topic,
            input_keywords,
            input_length,
            input_format,
            input_writing_style,
            input_tone,
            input_user_category,
            input_depth,
            input_structure,
            input_references,
            input_num_examples,
            input_conclusion,
            ai_generator,
            input_api,
            year_from,
            month_from,
            day_from,
            year_to,
            month_to,
            day_to,
            domains_to_skip,
        ]
        generate_btn.click(
            fn=generate_and_format,
            inputs=article_inputs,
            outputs=[output_article],
        )
        # Regeneration additionally passes the current article and the comments.
        regenerate_btn.click(
            fn=generate_and_format,
            inputs=article_inputs + [output_article, ai_comments],
            outputs=[output_article],
        )
        ai_check_btn.click(
            fn=ai_check,
            inputs=[output_article, ai_detector_dropdown],
            outputs=[ai_check_result, highlighted_text],
        )
        humanize_btn.click(
            fn=humanize,
            inputs=[
                output_article,
                model_dropdown,
                temperature_slider,
                repetition_penalty_slider,
                top_k_slider,
                length_penalty_slider,
            ],
            outputs=[humanized_output],
        )
        copy_to_input_btn.click(
            fn=copy_to_input,
            inputs=[humanized_output],
            outputs=[output_article],
        )
    return demo
if __name__ == "__main__":
    # Script entry point: build the UI and serve it on all network interfaces.
    # Alternative launch kept for reference:
    # demo.launch(server_name="0.0.0.0", share=True, server_port=7890)
    demo = create_interface()
    demo.launch(server_name="0.0.0.0")