Spaces:

polygraf-ai
/

article_writer

Runtime error

App Files Files Community

article_writer / app.py

minko186

changed default values of some fields

c85110b 6 months ago

raw

history blame

28.2 kB

	import openai
	import gradio as gr
	from typing import Dict, List
	import re
	from humanize import paraphrase_text
	from ai_generate import generate
	import requests
	import language_tool_python
	import torch
	from gradio_client import Client
	from transformers import GPT2LMHeadModel, GPT2TokenizerFast
	from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
	from scipy.special import softmax
	from collections import defaultdict
	import nltk
	from utils import remove_special_characters
	from plagiarism import google_search, months, domain_list, build_date
	from datetime import date

	# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
	device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
	print(f"Using device: {device}")

	# Display name of each detector -> Hugging Face checkpoint it is loaded from.
	_DETECTOR_CHECKPOINTS = {
	    "Polygraf AI Watson (Base Model)": "polygraf-ai/bc-roberta-openai-2sent",
	    "Polygraf AI Sherlock (Advanced Model)": "polygraf-ai/bc_combined_3sent",
	}

	# Sequence-classification models, loaded at import time and moved to `device`.
	models = {
	    label: AutoModelForSequenceClassification.from_pretrained(checkpoint).to(device)
	    for label, checkpoint in _DETECTOR_CHECKPOINTS.items()
	}
	# Matching tokenizers, keyed by the same display names as `models`.
	tokenizers = {
	    label: AutoTokenizer.from_pretrained(checkpoint)
	    for label, checkpoint in _DETECTOR_CHECKPOINTS.items()
	}


	def to_device(model):
	    """Move ``model`` onto the module-level ``device`` and return the result."""
	    moved = model.to(device)
	    return moved


	def copy_to_input(text):
	    """Return ``text`` unchanged.

	    Identity callback: whatever the input component holds is handed straight
	    to the output component.
	    """
	    return text


	def remove_bracketed_numbers(text):
	    """Strip one leading ``[N]`` citation marker from ``text``.

	    Only a marker at the very start of the string (optionally preceded by
	    whitespace) is removed; bracketed numbers elsewhere in the text are kept.
	    The whitespace that follows the marker is removed with it, so
	    ``"[3] Title"`` becomes ``"Title"`` rather than ``" Title"``.

	    Args:
	        text: A single reference line.

	    Returns:
	        ``text`` without its leading marker, or ``text`` unchanged when it
	        does not start with one.
	    """
	    return re.sub(r"^\s*\[\d+\]\s*", "", text)


	def clean_text(text: str) -> str:
	    """Normalise whitespace and sentence capitalisation, paragraph by paragraph.

	    The text is split on blank lines (``"\\n\\n"``). Inside each paragraph every
	    run of whitespace collapses to one space, and a lowercase ASCII letter that
	    follows ``". "`` is upper-cased. The space after the period is preserved
	    (it used to be swallowed, gluing sentences together as ``"one.Two"``).

	    Args:
	        text: Raw text, paragraphs separated by blank lines.

	    Returns:
	        The cleaned paragraphs joined with a single newline.
	    """
	    cleaned_paragraphs = []
	    for paragraph in text.split("\n\n"):
	        cleaned = re.sub(r"\s+", " ", paragraph).strip()
	        # The lookbehind covers ". " so only the letter itself is replaced.
	        cleaned = re.sub(r"(?<=\. )[a-z]", lambda m: m.group(0).upper(), cleaned)
	        cleaned_paragraphs.append(cleaned)
	    return "\n".join(cleaned_paragraphs)


	def format_and_correct(text: str) -> str:
	    """Have the "Groq" generator fix formatting, grammar and spelling of ``text``.

	    The reply from ``generate`` is passed through ``clean_text`` before being
	    returned.
	    """
	    prompt = f"""
	Please correct the formatting, grammar, and spelling errors in the following text without changing its content significantly. Ensure proper paragraph breaks and maintain the original content:
	{text}
	"""
	    return clean_text(generate(prompt, "Groq", None))


	def format_and_correct_para(text: str) -> str:
	    """Correct ``text`` one line at a time with ``format_and_correct``.

	    The input is split on single newlines, each piece is corrected in order,
	    and the corrected pieces are joined with a blank line between them.
	    """
	    return "\n\n".join(format_and_correct(piece) for piece in text.split("\n"))


	def format_and_correct_language_check(text: str) -> str:
	    """Return ``text`` as corrected by a fresh ``en-US`` LanguageTool instance."""
	    checker = language_tool_python.LanguageTool("en-US")
	    corrected = checker.correct(text)
	    return corrected


	def predict(model, tokenizer, text):
	    """Classify ``text`` with ``model`` and return HUMAN/AI probabilities.

	    The text is first passed through ``remove_special_characters``, then
	    tokenized to exactly 256 tokens (padded or truncated). The logits are
	    soft-maxed; index 0 is reported as "HUMAN" and index 1 as "AI".

	    Args:
	        model: Sequence-classification model already placed on ``device``.
	        tokenizer: Tokenizer matching ``model``.
	        text: Text to classify.

	    Returns:
	        ``{"HUMAN": p_human, "AI": p_ai}``.
	    """
	    max_tokens = 256
	    cleaned = remove_special_characters(text)
	    model.eval()
	    with torch.no_grad():
	        encoded = tokenizer(
	            cleaned,
	            padding="max_length",
	            truncation=True,
	            max_length=max_tokens,
	            return_tensors="pt",
	        ).to(device)
	        logits = model(**encoded).logits
	    probabilities = softmax(logits.detach().cpu().numpy(), 1)[0]
	    return {"HUMAN": probabilities[0], "AI": probabilities[1]}


	def ai_generated_test(text, model="Polygraf AI Watson (Base Model)"):
	    """Score ``text`` with the detector registered under ``model``.

	    Args:
	        text: Text to classify.
	        model: Key into the module-level ``models`` / ``tokenizers`` dicts.
	            The default is now a key that exists there; the previous default
	            ("BC Original") was not registered and always raised ``KeyError``.

	    Returns:
	        The ``{"HUMAN": ..., "AI": ...}`` dict produced by ``predict``.

	    Raises:
	        KeyError: If ``model`` is not a registered detector name.
	    """
	    return predict(models[model], tokenizers[model], text)


	def process_text(text, model="Polygraf AI Watson (Base Model)"):
	    """Score ``text`` sentence by sentence and build the highlighted HTML view.

	    Every window of up to three consecutive sentences is classified with
	    ``ai_generated_test``. A sentence's score is the mean "AI" probability of
	    all windows that contain it; sentences scoring 0.65 or more are wrapped in
	    a red ``<span>``.

	    Args:
	        text: Article text; non-blank lines are treated as paragraphs.
	        model: Key into ``models`` / ``tokenizers``. The default is now a key
	            that exists there (the old default "BC Original" raised
	            ``KeyError``).

	    Returns:
	        ``(overall, html)``. ``overall`` is ``{"HUMAN": 1 - s, "AI": s}`` with
	        ``s`` the mean sentence score, or both values ``0.0`` when no sentence
	        could be scored (previously a ``ZeroDivisionError``). ``html`` is the
	        paragraphs joined by ``<br><br>`` and passed through
	        ``format_references``.
	    """
	    sentences = nltk.sent_tokenize(text)
	    num_sentences = len(sentences)
	    scores = defaultdict(list)

	    # Slide a 3-sentence window over the text; each window's score is credited
	    # to every sentence inside it.
	    for start in range(num_sentences):
	        window = " ".join(sentences[start : start + 3])
	        if not window:
	            continue
	        ai_score = ai_generated_test(window, model)["AI"]
	        for idx in range(start, min(start + 3, num_sentences)):
	            scores[idx].append(ai_score)

	    overall_scores = []
	    colored_paragraphs = []
	    sentence_idx = 0
	    for paragraph in (p for p in text.split("\n") if p.strip()):
	        colored_sentences = []
	        for sentence in nltk.sent_tokenize(paragraph):
	            # Per-paragraph tokenization can yield more sentences than the
	            # whole-text pass above; such sentences have no score and are kept
	            # as plain text instead of being dropped from the output.
	            sentence_scores = scores.get(sentence_idx)
	            sentence_idx += 1
	            if not sentence_scores:
	                colored_sentences.append(sentence)
	                continue
	            avg_score = sum(sentence_scores) / len(sentence_scores)
	            overall_scores.append(avg_score)
	            # NOTE(review): sentences are inserted into HTML unescaped —
	            # confirm the article text can never carry markup.
	            if avg_score >= 0.65:
	                colored_sentences.append(f"<span style='background-color:red;'>{sentence}</span>")
	            else:
	                colored_sentences.append(sentence)
	        combined_sentences = " ".join(colored_sentences)
	        print(combined_sentences)
	        colored_paragraphs.append(combined_sentences)

	    if overall_scores:
	        ai_overall = sum(overall_scores) / len(overall_scores)
	        overall_score = {"HUMAN": 1 - ai_overall, "AI": ai_overall}
	    else:
	        # Nothing was scored (e.g. empty input): report no signal either way.
	        overall_score = {"HUMAN": 0.0, "AI": 0.0}
	    return overall_score, format_references("<br><br>".join(colored_paragraphs))


	# Detector names offered by the "Select AI Detector" radio in the UI; each
	# entry is also a key of the module-level `models` and `tokenizers` dicts.
	ai_check_options = [
	    "Polygraf AI Watson (Base Model)",
	    "Polygraf AI Sherlock (Advanced Model)",
	]


	def ai_generated_test_sapling(text: str) -> Dict:
	    """Score ``text`` with the Sapling AI-detection HTTP API.

	    Args:
	        text: Text to classify.

	    Returns:
	        ``{"AI": score, "HUMAN": 1 - score}`` where ``score`` is the ``"score"``
	        field of the JSON response.

	    Raises:
	        requests.RequestException: On connection failure, timeout, or a
	            non-2xx HTTP status.
	        KeyError: If the response JSON has no ``"score"`` field.
	    """
	    import os  # local import keeps this change self-contained

	    # NOTE(review): the fallback key is committed to the repository and should
	    # be rotated; set SAPLING_API_KEY in the environment to override it.
	    api_key = os.environ.get("SAPLING_API_KEY", "60L9BPSVPIIOEZM0CD1DQWRBPJIUR7SB")
	    response = requests.post(
	        "https://api.sapling.ai/api/v1/aidetect",
	        json={"key": api_key, "text": f"{text}"},
	        timeout=30,  # never hang the UI on an unresponsive endpoint
	    )
	    # Surface HTTP errors directly instead of failing later on a missing key.
	    response.raise_for_status()
	    score = response.json()["score"]  # parse the body once
	    return {"AI": score, "HUMAN": 1 - score}


	class GPT2PPL:
	    """Perplexity scorer built on the pretrained ``"gpt2"`` language model."""

	    def __init__(self):
	        """Load the GPT-2 model (moved to ``device``) and its fast tokenizer."""
	        self.device = device
	        self.model = to_device(GPT2LMHeadModel.from_pretrained("gpt2"))
	        self.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

	    def __call__(self, text):
	        """Compute the perplexity of ``text`` under the model.

	        The token sequence is processed in strides of 512 tokens; each stride
	        is scored with up to ``n_positions`` tokens of context, and the summed
	        negative log-likelihood is divided by the sequence length before
	        exponentiation.

	        Args:
	            text: Text to score.

	        Returns:
	            ``{"AI": ppl, "HUMAN": 1 - ppl}``.
	            NOTE(review): ``ppl`` is a raw perplexity, not a probability, so
	            "HUMAN" is usually negative — confirm this is what callers expect.

	        Raises:
	            ValueError: If ``text`` tokenizes to zero tokens.
	        """
	        encodings = self.tokenizer(text, return_tensors="pt")
	        # Read the ids by key. The previous code rebuilt `encodings` as a plain
	        # dict and then used `.input_ids`, which raises AttributeError.
	        all_ids = encodings["input_ids"].to(self.device)
	        seq_len = all_ids.size(1)
	        if seq_len == 0:
	            raise ValueError("text produced no tokens; perplexity is undefined")

	        max_length = self.model.config.n_positions
	        stride = 512

	        nlls = []
	        for i in range(0, seq_len, stride):
	            begin_loc = max(i + stride - max_length, 0)
	            end_loc = min(i + stride, seq_len)
	            trg_len = end_loc - i
	            input_ids = all_ids[:, begin_loc:end_loc]
	            target_ids = input_ids.clone()
	            # Only the last `trg_len` tokens are scored; the context tokens
	            # before them get the label -100 so the loss skips them.
	            target_ids[:, :-trg_len] = -100

	            with torch.no_grad():
	                outputs = self.model(input_ids, labels=target_ids)
	                neg_log_likelihood = outputs.loss * trg_len

	            nlls.append(neg_log_likelihood)

	        # After the final stride end_loc == seq_len, so this is the same divisor.
	        ppl = torch.exp(torch.stack(nlls).sum() / seq_len)
	        return {"AI": float(ppl), "HUMAN": 1 - float(ppl)}


	def ai_generated_test_gptzero(text):
	    """Score ``text`` with a freshly constructed ``GPT2PPL`` scorer.

	    Returns:
	        ``(scores, None)``: the scorer's result dict, which is also printed,
	        and ``None`` in place of a highlighted-text value.
	    """
	    scores = GPT2PPL()(text)
	    print(scores)
	    return scores, None


	def highlighter_polygraf(text, model="Polygraf AI Watson (Base Model)"):
	    """Run ``process_text`` on ``text`` with the given detector and return its result."""
	    return process_text(text, model)


	def ai_check(text: str, option: str):
	    """Run the AI detector named by ``option`` on ``text``.

	    Args:
	        text: Text to analyse.
	        option: ``"Sapling AI"``, ``"GPTZero"``, or a Polygraf detector name;
	            any other value is forwarded to ``highlighter_polygraf``.

	    Returns:
	        A ``(scores, highlighted)`` pair in every case, matching the two
	        outputs the UI binds to this callback. ``highlighted`` is ``None`` for
	        detectors without a sentence breakdown. The Sapling branch used to
	        return the bare scores dict, unlike the other branches.
	    """
	    if option == "Sapling AI":
	        return ai_generated_test_sapling(text), None
	    if option == "GPTZero":
	        return ai_generated_test_gptzero(text)
	    # Polygraf detectors and unrecognised names share one code path.
	    return highlighter_polygraf(text, option)


	def generate_prompt(settings: Dict[str, str]) -> str:
	    """Build the prompt that asks the model for a brand-new article.

	    Args:
	        settings: Article configuration. ``"sources"`` maps URL to page text;
	            each page is trimmed and cut to 2000 characters. ``"structure"``,
	            ``"keywords"`` and ``"references"`` are iterables of strings that
	            are joined with ``", "``.

	    Returns:
	        The complete prompt text.
	    """
	    source_blocks = []
	    for url, content in settings["sources"].items():
	        source_blocks.append(f"{url.strip()}: \n{content.strip()[:2000]}")
	    content_string = "\n".join(source_blocks)

	    prompt = f"""
	Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.

	Style and Tone:
	- Writing style: {settings['writing_style']}
	- Tone: {settings['tone']}
	- Target audience: {settings['user_category']}

	Content:
	- Depth: {settings['depth_of_content']}
	- Structure: {', '.join(settings['structure'])}

	Keywords to incorporate:
	{', '.join(settings['keywords'])}

	Additional requirements:
	- Include {settings['num_examples']} relevant examples or case studies
	- Incorporate data or statistics from {', '.join(settings['references'])}
	- End with a {settings['conclusion_type']} conclusion
	- Add a "References" section at the end with at least 3 credible sources, formatted as [1], [2], etc.
	- Do not make any headline, title bold.

	Use the content here from the URLs I've found for you:
	{content_string}

	Ensure proper paragraph breaks for better readability.
	Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
	"""
	    return prompt


	def regenerate_prompt(settings: Dict[str, str]) -> str:
	    """Build the prompt that asks the model to revise an existing article.

	    Args:
	        settings: Must provide ``"generated_article"`` (the text to edit),
	            ``"user_comments"`` (the requested changes) and ``"sources"``
	            (URL -> page text; each page is trimmed and cut to 2000
	            characters).

	    Returns:
	        The complete prompt text.
	    """
	    source_blocks = []
	    for url, content in settings["sources"].items():
	        source_blocks.append(f"{url.strip()}: \n{content.strip()[:2000]}")
	    content_string = "\n".join(source_blocks)

	    prompt = f"""
	"{settings['generated_article']}"

	Edit the given text based on user comments.

	Comments:
	- {settings['user_comments']}
	- The original content should not be changed. Make minor modifications based on user comments above.
	- Keep the references the same as the given text in the same format.
	- Do not make any headline, title bold.
	Use the content here from the URLs I've found for you:
	{content_string}

	Ensure proper paragraph breaks for better readability.
	Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
	"""
	    return prompt


	def generate_article(
	    topic: str,
	    keywords: str,
	    article_length: str,
	    format: str,
	    writing_style: str,
	    tone: str,
	    user_category: str,
	    depth_of_content: str,
	    structure: str,
	    references: str,
	    num_examples: str,
	    conclusion_type: str,
	    ai_model: str,
	    sorted_date,
	    domains_to_skip,
	    api_key: str = None,
	    generated_article: str = None,
	    user_comments: str = None,
	) -> str:
	    """Search for sources, build the prompt and generate (or revise) an article.

	    Args:
	        topic: Subject of the article; also the query given to ``google_search``.
	        keywords: Comma-separated keywords.
	        article_length: Target word count.
	        format: Kind of document to write (e.g. "Article").
	        writing_style, tone, user_category, depth_of_content: Inserted into the
	            prompt as given.
	        structure: Comma-separated section names.
	        references: Comma-separated reference source types.
	        num_examples: Number of examples or case studies to request.
	        conclusion_type: Kind of conclusion to request.
	        ai_model: "OpenAI GPT 3.5" / "OpenAI GPT 4" use the OpenAI chat API;
	            any other value is forwarded to ``generate``.
	        sorted_date: Date filter forwarded to ``google_search``.
	        domains_to_skip: Domains forwarded to ``google_search``.
	        api_key: API key for the chosen model. It is now also passed to the
	            OpenAI call, which previously ignored it.
	        generated_article: Existing article text; when truthy the revision
	            prompt is used instead of the from-scratch prompt.
	        user_comments: Requested edits for the revision prompt.

	    Returns:
	        The generated article after ``clean_text``.
	    """
	    url_content = google_search(topic, sorted_date, domains_to_skip)
	    settings = {
	        "topic": topic,
	        "keywords": [k.strip() for k in keywords.split(",")],
	        "article_length": article_length,
	        "format": format,
	        "writing_style": writing_style,
	        "tone": tone,
	        "user_category": user_category,
	        "depth_of_content": depth_of_content,
	        "structure": [s.strip() for s in structure.split(",")],
	        "references": [r.strip() for r in references.split(",")],
	        "num_examples": num_examples,
	        "conclusion_type": conclusion_type,
	        "sources": url_content,
	        "generated_article": generated_article,
	        "user_comments": user_comments,
	    }

	    prompt = regenerate_prompt(settings) if generated_article else generate_prompt(settings)

	    print(prompt)
	    if ai_model in ["OpenAI GPT 3.5", "OpenAI GPT 4"]:
	        response = openai.ChatCompletion.create(
	            # An empty key becomes None so the library falls back to its
	            # globally configured key.
	            api_key=api_key or None,
	            model="gpt-4" if ai_model == "OpenAI GPT 4" else "gpt-3.5-turbo",
	            messages=[
	                {
	                    "role": "system",
	                    "content": "You are a professional content writer with expertise in various fields.",
	                },
	                {"role": "user", "content": prompt},
	            ],
	            max_tokens=3000,
	            n=1,
	            stop=None,
	            temperature=0.7,
	        )
	        article = response.choices[0].message.content.strip()
	    else:
	        article = generate(prompt, ai_model, api_key)

	    return clean_text(article)


	def humanize(
	    text: str,
	    model: str,
	    temperature: float = 1.2,
	    repetition_penalty: float = 1,
	    top_k: int = 50,
	    length_penalty: float = 1,
	) -> str:
	    """Paraphrase ``text`` with the chosen humanizer model, then grammar-check it.

	    The sampling options are forwarded unchanged to ``paraphrase_text``; its
	    output is returned after ``format_and_correct_language_check``.
	    """
	    sampling_options = {
	        "temperature": temperature,
	        "repetition_penalty": repetition_penalty,
	        "top_k": top_k,
	        "length_penalty": length_penalty,
	    }
	    paraphrased = paraphrase_text(text=text, model_name=model, **sampling_options)
	    return format_and_correct_language_check(paraphrased)


	def update_visibility_api(model: str):
	    """Return a Gradio update that shows its target only for the OpenAI models."""
	    needs_key = model in ["OpenAI GPT 3.5", "OpenAI GPT 4"]
	    return gr.update(visible=needs_key)


	def format_references(text: str) -> str:
	    """Separate the article body from its reference list and renumber the list.

	    Everything after the first line that reads "References" or "References:"
	    (case-insensitive, surrounding whitespace ignored) is treated as the
	    reference list. Body lines are joined with a blank line between them.

	    Changes from the earlier behaviour:
	    - blank lines in the reference list are skipped instead of becoming empty
	      numbered entries;
	    - the whitespace left after removing an old ``[N]`` marker is trimmed, so
	      entries read ``[1] Title`` rather than ``[1]  Title``;
	    - no "References:" header is appended when there are no references.

	    Args:
	        text: Article text, one paragraph per line.

	    Returns:
	        The body, followed by a "References:" section numbered from 1 when at
	        least one reference was found.
	    """
	    article_text = []
	    references = []
	    in_references = False

	    for line in text.split("\n"):
	        stripped = line.strip()
	        if stripped.lower() in ("references", "references:"):
	            in_references = True
	            continue
	        if not in_references:
	            article_text.append(line)
	        elif stripped:
	            references.append(stripped)

	    body = "\n\n".join(article_text)
	    if not references:
	        return body

	    formatted_refs = [
	        f"[{number}] {remove_bracketed_numbers(ref).strip()}\n"
	        for number, ref in enumerate(references, 1)
	    ]
	    return body + "\n\nReferences:\n" + "\n".join(formatted_refs)


	def generate_and_format(
	    topic,
	    keywords,
	    article_length,
	    format,
	    writing_style,
	    tone,
	    user_category,
	    depth_of_content,
	    structure,
	    references,
	    num_examples,
	    conclusion_type,
	    ai_model,
	    api_key,
	    year_from,
	    month_from,
	    day_from,
	    year_to,
	    month_to,
	    day_to,
	    domains_to_skip,
	    generated_article: str = None,
	    user_comments: str = None,
	):
	    """Generate (or revise) an article and format its reference section.

	    The from/to date parts are turned into dates with ``build_date`` and
	    combined into a ``"date:r:<from>:<to>"`` filter for the source search.
	    All arguments are passed to ``generate_article`` by keyword: its parameter
	    order (``sorted_date, domains_to_skip, api_key``) differs from this
	    function's, and the earlier positional call put the API key in the date
	    slot, the date filter in the domain slot and the domain list in the key
	    slot.

	    Returns:
	        The article text after ``format_references``.
	    """
	    date_from = build_date(year_from, month_from, day_from)
	    date_to = build_date(year_to, month_to, day_to)
	    sorted_date = f"date:r:{date_from}:{date_to}"
	    article = generate_article(
	        topic=topic,
	        keywords=keywords,
	        article_length=article_length,
	        format=format,
	        writing_style=writing_style,
	        tone=tone,
	        user_category=user_category,
	        depth_of_content=depth_of_content,
	        structure=structure,
	        references=references,
	        num_examples=num_examples,
	        conclusion_type=conclusion_type,
	        ai_model=ai_model,
	        sorted_date=sorted_date,
	        domains_to_skip=domains_to_skip,
	        api_key=api_key,
	        generated_article=generated_article,
	        user_comments=user_comments,
	    )
	    return format_references(article)


	def create_interface():
	    """Build the Gradio Blocks UI for the article writer and return it.

	    The left column holds the article, search, model and humanizer settings;
	    the right column holds the generated article, the AI-detector controls
	    and the humanized output. All event handlers are wired here.

	    Fixes relative to the earlier version:
	    - the detector radio defaults to the first entry of ``ai_check_options``
	      (the old default "Polygraf AI" was not one of its choices);
	    - ``highlight_visible`` tolerates an empty selection;
	    - the generate/regenerate buttons share one input list.

	    NOTE(review): the nesting of the layout containers was reconstructed from
	    a copy of this file whose indentation had been lost — confirm the
	    Row/Group grouping against the intended layout.

	    Returns:
	        The configured ``gr.Blocks`` instance (not yet launched).
	    """
	    with gr.Blocks(
	        theme=gr.themes.Default(
	            primary_hue=gr.themes.colors.pink, secondary_hue=gr.themes.colors.yellow, neutral_hue=gr.themes.colors.gray
	        ),
	        css="""
	.input-highlight-pink block_label {background-color: #008080}
	""",
	    ) as demo:
	        # Today's date as [day, month name, year]: defaults for the "to" fields.
	        today = date.today()
	        d1 = today.strftime("%d/%B/%Y").split("/")
	        gr.Markdown("# Polygraf AI Content Writer", elem_classes="text-center text-3xl mb-6")

	        with gr.Row():
	            with gr.Column(scale=2):
	                with gr.Group():
	                    gr.Markdown("## Article Configuration", elem_classes="text-xl mb-4")
	                    input_topic = gr.Textbox(
	                        label="Topic",
	                        placeholder="Enter the main topic of your article",
	                        elem_classes="input-highlight-pink",
	                    )
	                    input_keywords = gr.Textbox(
	                        label="Keywords",
	                        placeholder="Enter comma-separated keywords",
	                        elem_classes="input-highlight-yellow",
	                    )

	                    with gr.Row():
	                        input_format = gr.Dropdown(
	                            choices=[
	                                "Article",
	                                "Essay",
	                                "Blog post",
	                                "Report",
	                                "Research paper",
	                                "News article",
	                                "White paper",
	                            ],
	                            value="Article",
	                            label="Format",
	                            elem_classes="input-highlight-turquoise",
	                        )

	                        input_length = gr.Slider(
	                            minimum=50,
	                            maximum=5000,
	                            step=50,
	                            value=300,
	                            label="Article Length",
	                            elem_classes="input-highlight-pink",
	                        )

	                    with gr.Row():
	                        input_writing_style = gr.Dropdown(
	                            choices=[
	                                "Formal",
	                                "Informal",
	                                "Technical",
	                                "Conversational",
	                                "Journalistic",
	                                "Academic",
	                                "Creative",
	                            ],
	                            value="Formal",
	                            label="Writing Style",
	                            elem_classes="input-highlight-yellow",
	                        )
	                        input_tone = gr.Dropdown(
	                            choices=["Friendly", "Professional", "Neutral", "Enthusiastic", "Skeptical", "Humorous"],
	                            value="Professional",
	                            label="Tone",
	                            elem_classes="input-highlight-turquoise",
	                        )

	                    input_user_category = gr.Dropdown(
	                        choices=[
	                            "Students",
	                            "Professionals",
	                            "Researchers",
	                            "General Public",
	                            "Policymakers",
	                            "Entrepreneurs",
	                        ],
	                        value="General Public",
	                        label="Target Audience",
	                        elem_classes="input-highlight-pink",
	                    )
	                    input_depth = gr.Dropdown(
	                        choices=[
	                            "Surface-level overview",
	                            "Moderate analysis",
	                            "In-depth research",
	                            "Comprehensive study",
	                        ],
	                        value="Moderate analysis",
	                        label="Depth of Content",
	                        elem_classes="input-highlight-yellow",
	                    )
	                    input_structure = gr.Dropdown(
	                        choices=[
	                            "Introduction, Body, Conclusion",
	                            "Abstract, Introduction, Methods, Results, Discussion, Conclusion",
	                            "Executive Summary, Problem Statement, Analysis, Recommendations, Conclusion",
	                            "Introduction, Literature Review, Methodology, Findings, Analysis, Conclusion",
	                        ],
	                        value="Introduction, Body, Conclusion",
	                        label="Structure",
	                        elem_classes="input-highlight-turquoise",
	                    )
	                    input_references = gr.Dropdown(
	                        choices=[
	                            "Academic journals",
	                            "Industry reports",
	                            "Government publications",
	                            "News outlets",
	                            "Expert interviews",
	                            "Case studies",
	                        ],
	                        value="News outlets",
	                        label="References",
	                        elem_classes="input-highlight-pink",
	                    )
	                    input_num_examples = gr.Dropdown(
	                        choices=["1-2", "3-4", "5+"],
	                        value="1-2",
	                        label="Number of Examples/Case Studies",
	                        elem_classes="input-highlight-yellow",
	                    )
	                    input_conclusion = gr.Dropdown(
	                        choices=["Summary", "Call to Action", "Future Outlook", "Thought-provoking Question"],
	                        value="Call to Action",
	                        label="Conclusion Type",
	                        elem_classes="input-highlight-turquoise",
	                    )
	                gr.Markdown("# Search Options", elem_classes="text-center text-3xl mb-6")
	                with gr.Group():
	                    with gr.Row():
	                        month_from = gr.Dropdown(
	                            choices=months,
	                            label="From Month",
	                            value="January",
	                            interactive=True,
	                        )
	                        day_from = gr.Textbox(label="From Day", value="01")
	                        year_from = gr.Textbox(label="From Year", value="2000")

	                    with gr.Row():
	                        month_to = gr.Dropdown(
	                            choices=months,
	                            label="To Month",
	                            value=d1[1],
	                            interactive=True,
	                        )
	                        day_to = gr.Textbox(label="To Day", value=d1[0])
	                        year_to = gr.Textbox(label="To Year", value=d1[2])

	                    with gr.Row():
	                        domains_to_skip = gr.Dropdown(
	                            domain_list,
	                            multiselect=True,
	                            label="Domain To Skip",
	                        )

	                with gr.Group():
	                    gr.Markdown("## AI Model Configuration", elem_classes="text-xl mb-4")
	                    ai_generator = gr.Dropdown(
	                        choices=["Llama 3", "Groq", "Mistral", "Gemma", "OpenAI GPT 3.5", "OpenAI GPT 4"],
	                        value="Llama 3",
	                        label="AI Model",
	                        elem_classes="input-highlight-pink",
	                    )
	                    # The key box is shown only while an OpenAI model is selected.
	                    input_api = gr.Textbox(label="API Key", visible=False)
	                    ai_generator.change(update_visibility_api, ai_generator, input_api)

	                generate_btn = gr.Button("Generate Article", variant="primary")

	                with gr.Accordion("Advanced Humanizer Settings", open=False):
	                    with gr.Row():
	                        model_dropdown = gr.Radio(
	                            choices=[
	                                "Base Model",
	                                "Large Model",
	                                "XL Model",
	                                # "XL Law Model",
	                                # "XL Marketing Model",
	                                # "XL Child Style Model",
	                            ],
	                            value="Large Model",
	                            label="Humanizer Model Version",
	                        )
	                    with gr.Row():
	                        temperature_slider = gr.Slider(
	                            minimum=0.5, maximum=2.0, step=0.1, value=1.3, label="Temperature"
	                        )
	                        top_k_slider = gr.Slider(minimum=0, maximum=300, step=25, value=50, label="Top k")
	                    with gr.Row():
	                        repetition_penalty_slider = gr.Slider(
	                            minimum=1.0, maximum=2.0, step=0.1, value=1, label="Repetition Penalty"
	                        )
	                        length_penalty_slider = gr.Slider(
	                            minimum=0.0, maximum=2.0, step=0.1, value=1.0, label="Length Penalty"
	                        )

	            with gr.Column(scale=3):
	                output_article = gr.Textbox(label="Generated Article", lines=20)
	                ai_comments = gr.Textbox(
	                    label="Add comments to help edit generated text", interactive=True, visible=False
	                )
	                regenerate_btn = gr.Button("Regenerate Article", variant="primary", visible=False)
	                # The default must be one of the offered choices; each choice is
	                # also a key of `models`, which the detector looks up.
	                ai_detector_dropdown = gr.Radio(
	                    choices=ai_check_options, label="Select AI Detector", value=ai_check_options[0]
	                )
	                ai_check_btn = gr.Button("AI Check")

	                with gr.Accordion("AI Detection Results", open=True):
	                    ai_check_result = gr.Label(label="AI Check Result")
	                    highlighted_text = gr.HTML(label="Sentence Breakdown", visible=False)
	                humanize_btn = gr.Button("Humanize")
	                # humanized_output = gr.Textbox(label="Humanized Article", lines=20, elem_classes=["custom-textbox"])
	                humanized_output = gr.Markdown(label="Humanized Article", value="\n\n\n\n", render=True)
	                copy_to_input_btn = gr.Button("Copy to Input for AI Check")

	        def become_visible(text):
	            """Show the target component only while ``text`` is non-empty."""
	            return gr.update(visible=bool(text))

	        def highlight_visible(text):
	            """Show the sentence breakdown only for Polygraf detectors."""
	            # `text` may be None/empty when nothing is selected.
	            return gr.update(visible=bool(text) and text.startswith("Polygraf"))

	        ai_detector_dropdown.change(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
	        output_article.change(become_visible, inputs=output_article, outputs=ai_comments)
	        ai_comments.change(become_visible, inputs=output_article, outputs=regenerate_btn)
	        ai_check_btn.click(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)

	        # Inputs shared by "Generate" and "Regenerate", in the parameter order
	        # of generate_and_format.
	        article_inputs = [
	            input_topic,
	            input_keywords,
	            input_length,
	            input_format,
	            input_writing_style,
	            input_tone,
	            input_user_category,
	            input_depth,
	            input_structure,
	            input_references,
	            input_num_examples,
	            input_conclusion,
	            ai_generator,
	            input_api,
	            year_from,
	            month_from,
	            day_from,
	            year_to,
	            month_to,
	            day_to,
	            domains_to_skip,
	        ]

	        generate_btn.click(
	            fn=generate_and_format,
	            inputs=article_inputs,
	            outputs=[output_article],
	        )

	        # Regeneration additionally sends the current article and the comments.
	        regenerate_btn.click(
	            fn=generate_and_format,
	            inputs=article_inputs + [output_article, ai_comments],
	            outputs=[output_article],
	        )

	        ai_check_btn.click(
	            fn=ai_check,
	            inputs=[output_article, ai_detector_dropdown],
	            outputs=[ai_check_result, highlighted_text],
	        )

	        humanize_btn.click(
	            fn=humanize,
	            inputs=[
	                output_article,
	                model_dropdown,
	                temperature_slider,
	                repetition_penalty_slider,
	                top_k_slider,
	                length_penalty_slider,
	            ],
	            outputs=[humanized_output],
	        )

	        copy_to_input_btn.click(
	            fn=copy_to_input,
	            inputs=[humanized_output],
	            outputs=[output_article],
	        )

	    return demo


	# Build the UI and start the Gradio server when this file is run directly.
	if __name__ == "__main__":
	    demo = create_interface()
	    # demo.launch(server_name="0.0.0.0", share=True, server_port=7890)
	    # Bind to 0.0.0.0 so the server accepts connections on every interface.
	    demo.launch(server_name="0.0.0.0")