"""Gradio app: article generation, AI-content detection and humanizing.

Run in the background with::

    nohup python3 app.py &
"""

import html
import re
from collections import defaultdict
from datetime import date, datetime
from typing import Dict, Optional

import gradio as gr
import language_tool_python
import nltk
import numpy as np
import requests  # noqa: F401  (kept: imported by the original file)
import torch
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline  # noqa: F401
from transformers import GPT2LMHeadModel, GPT2TokenizerFast  # noqa: F401

from ai_generate import generate
from google_search import build_date, domain_list, google_search, months
from humanize import device, paraphrase_text
from utils import remove_special_characters, split_text_allow_complete_sentences_nltk

print(f"Using device: {device}")

# Display name -> Hugging Face model id of each binary (human vs. AI) detector.
# Models, tokenizers and the UI choices are all derived from this one table so
# they cannot drift apart.
BC_MODEL_PATHS = {
    "Polygraf AI (Base Model)": "polygraf-ai/bc-roberta-openai-2sent",
    "Polygraf AI (Advanced Model)": "polygraf-ai/bc_combined_3sent",
}

models = {
    name: AutoModelForSequenceClassification.from_pretrained(path).to(device)
    for name, path in BC_MODEL_PATHS.items()
}
tokenizers = {name: AutoTokenizer.from_pretrained(path) for name, path in BC_MODEL_PATHS.items()}

ai_check_options = list(BC_MODEL_PATHS)
DEFAULT_DETECTOR = ai_check_options[0]

# Thresholds on the averaged per-sentence AI probability, and the background
# colour used to highlight sentences at or above each threshold.
HIGH_AI_THRESHOLD = 0.70
MEDIUM_AI_THRESHOLD = 0.55
HIGH_AI_COLOR = "#FF7979"
MEDIUM_AI_COLOR = "#FFD07E"

# grammar correction tool
tool = language_tool_python.LanguageTool("en-US")


def to_device(model):
    """Move ``model`` to the device selected by the ``humanize`` module."""
    return model.to(device)


def copy_to_input(text):
    """Return ``text`` unchanged (identity callback for UI wiring)."""
    return text


def remove_bracketed_numbers(text):
    """Strip a leading ``[<digits>]`` marker from ``text``, if present."""
    return re.sub(r"^\[\d+\]", "", text)


def clean_text(text: str) -> str:
    """Normalize whitespace per paragraph and capitalize sentence starts.

    Paragraphs are split on blank lines (``"\\n\\n"``), inner whitespace runs
    are collapsed to a single space, and the cleaned paragraphs are joined
    with a single newline.
    """
    cleaned_paragraphs = []
    for paragraph in text.split("\n\n"):
        cleaned = re.sub(r"\s+", " ", paragraph).strip()
        # Capitalize a lowercase letter that follows ". ". The space lives in
        # the look-behind so it is preserved; matching it as part of the
        # pattern would glue the sentences together ("end.Next").
        cleaned = re.sub(r"(?<=\. )([a-z])", lambda m: m.group(1).upper(), cleaned)
        cleaned_paragraphs.append(cleaned)
    return "\n".join(cleaned_paragraphs)


def format_references(text: str) -> str:
    """Return ``text`` with its reference list renumbered and re-formatted."""
    body, references = split_text_from_refs(text)
    return body + references


def split_text_from_refs(text: str, sep="\n"):
    """Split an article into its body and a formatted reference list.

    Args:
        text: Article text, optionally containing a "References:" section.
        sep: Separator used when assembling the returned strings.

    Returns:
        ``(body, formatted_refs)``. ``body`` is the non-empty body lines
        joined by two separators. ``formatted_refs`` is ``""`` when no
        reference entries were found; otherwise it is a "References:" header
        followed by entries numbered ``[1]``, ``[2]``, ...
    """
    index_pattern = re.compile(r"\[(\d+)\]")
    references = []
    article_text = []
    in_references = False

    for line in text.split("\n"):
        if line == "":
            continue
        lowered = line.strip().lower()
        # A line that is only the header switches mode and carries no entry.
        if lowered in ("references", "references:"):
            in_references = True
            continue
        if lowered.startswith("references:"):
            in_references = True
        header = re.search(r"[Rr]eferences:", line)
        if header:
            # Header and entries share a line: keep only what follows it.
            in_references = True
            line = line[header.end() :]

        if not in_references:
            article_text.append(line.strip())
            continue

        # Splitting on "[n]" also yields the captured digits; drop those and
        # any leftover header fragment.
        for piece in index_pattern.split(line):
            stripped = piece.strip()
            if stripped and not piece.isdigit() and not stripped.lower().startswith("references:"):
                references.append(stripped)

    if references:
        numbered = [
            f"[{i}] {remove_bracketed_numbers(ref)}{sep}" for i, ref in enumerate(references, 1)
        ]
        formatted_refs = f"{sep}{sep}References:{sep}{sep}" + sep.join(numbered)
    else:
        formatted_refs = ""

    body = f"{sep}{sep}".join(article_text)
    return body, formatted_refs


def ends_with_references(text):
    """Return True when ``text`` ends with a bare "References:" header.

    The check is case-insensitive and ignores surrounding whitespace. It is
    anchored to the end of the whole text (no ``re.MULTILINE``), so an article
    whose header is followed by actual entries does not match.
    """
    pattern = re.compile(r"\breferences:\s*$", re.IGNORECASE)
    return bool(pattern.search(text.strip()))


def format_and_correct_language_check(text: str) -> str:
    """Return ``text`` after LanguageTool (en-US) auto-correction."""
    return tool.correct(text)


def predict(model, tokenizer, text):
    """Classify ``text`` as human- or AI-written.

    The text is passed through ``remove_special_characters`` and tokenized
    with padding/truncation to 256 tokens.

    Returns:
        ``{"HUMAN": p0, "AI": p1}`` where ``p0``/``p1`` are the softmax of the
        model's first two logits, as plain floats.
    """
    text = remove_special_characters(text)
    bc_token_size = 256
    with torch.no_grad():
        model.eval()
        tokens = tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=bc_token_size,
            return_tensors="pt",
        ).to(device)
        output = model(**tokens)
        probs = softmax(output.logits.detach().cpu().numpy(), 1)[0]
    return {"HUMAN": float(probs[0]), "AI": float(probs[1])}


def ai_generated_test(text, model=DEFAULT_DETECTOR):
    """Run the detector registered under ``model`` (a key of ``models``)."""
    return predict(models[model], tokenizers[model], text)


def _highlight_sentence(sentence, avg_score):
    """Return ``sentence`` as HTML, highlighted according to ``avg_score``."""
    safe = html.escape(sentence)
    if avg_score >= HIGH_AI_THRESHOLD:
        return f"<span style='background-color:{HIGH_AI_COLOR};'>{safe}</span>"
    if avg_score >= MEDIUM_AI_THRESHOLD:
        return f"<span style='background-color:{MEDIUM_AI_COLOR};'>{safe}</span>"
    return safe


def detection_polygraf(text, model=DEFAULT_DETECTOR):
    """Score ``text`` sentence by sentence and build highlighted HTML.

    Every window of up to three consecutive sentences is scored, and each
    sentence's score is the mean of the windows that contain it.

    Args:
        text: Plain text to analyse.
        model: Key of ``models``/``tokenizers`` selecting the detector.

    Returns:
        ``(overall_score, html_text)``. ``overall_score`` is
        ``{"HUMAN": 1 - s, "AI": s}`` with ``s`` the mean sentence score
        (``0.0`` when nothing could be scored, e.g. empty text).
        ``html_text`` is the escaped text with paragraphs separated by
        ``<br><br>`` and suspicious sentences wrapped in coloured spans.
    """
    sentences = nltk.sent_tokenize(text)
    num_sentences = len(sentences)
    scores = defaultdict(list)

    # Score each window of 3 sentences and credit every sentence in it.
    for start in range(num_sentences):
        chunk = " ".join(sentences[start : start + 3])
        if chunk:
            score = ai_generated_test(chunk, model)["AI"]
            for j in range(start, min(start + 3, num_sentences)):
                scores[j].append(score)

    # Re-tokenize per paragraph so paragraph breaks survive in the output;
    # ``index`` tracks the running sentence position used for ``scores``.
    overall_scores = []
    colored_paragraphs = []
    index = 0
    for paragraph in (p for p in text.split("\n") if p.strip()):
        colored_sentences = []
        for sentence in nltk.sent_tokenize(paragraph):
            sentence_scores = scores[index]
            if sentence_scores:
                avg_score = sum(sentence_scores) / len(sentence_scores)
                colored_sentences.append(_highlight_sentence(sentence, avg_score))
                overall_scores.append(avg_score)
            else:
                # No score available: keep the text, just do not highlight it.
                colored_sentences.append(html.escape(sentence))
            index += 1
        colored_paragraphs.append(" ".join(colored_sentences))

    # Guard against empty input, which used to raise ZeroDivisionError.
    ai_score = sum(overall_scores) / len(overall_scores) if overall_scores else 0.0
    overall_score = {"HUMAN": 1 - ai_score, "AI": ai_score}
    return overall_score, "<br><br>".join(colored_paragraphs)


MC_TOKEN_SIZE = 256
TEXT_MC_MODEL_PATH = "polygraf-ai/mc-model"
MC_LABEL_MAP = ["OpenAI GPT", "Mistral", "CLAUDE", "Gemini", "Grammar Enhancer"]
text_mc_tokenizer = AutoTokenizer.from_pretrained(TEXT_MC_MODEL_PATH)
text_mc_model = AutoModelForSequenceClassification.from_pretrained(TEXT_MC_MODEL_PATH).to(device)


def predict_mc(text):
    """Return the softmax-normalized class scores of the creator model."""
    with torch.no_grad():
        text_mc_model.eval()
        tokens = text_mc_tokenizer(
            text,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=MC_TOKEN_SIZE,
        ).to(device)
        output = text_mc_model(**tokens)
        output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
    return output_norm


def predict_mc_scores(input, bc_score):
    """Estimate which generator produced the text, scaled by the AI score.

    Args:
        input: Text to attribute.
        bc_score: Binary detector result; only ``bc_score["HUMAN"]`` is read.

    Returns:
        Mapping of upper-cased ``MC_LABEL_MAP`` labels to the mean class
        score over all segments multiplied by ``1 - bc_score["HUMAN"]``.
        Empty when that factor is below 0.01 or there are no segments.
    """
    # Segment once; the original tokenized the text a second time only to
    # obtain the segment count.
    segments_mc = split_text_allow_complete_sentences_nltk(
        input, type_det="mc", tokenizer=text_mc_tokenizer
    )
    if not segments_mc:
        return {}

    mc_scores = [predict_mc(remove_special_characters(segment)) for segment in segments_mc]
    average_mc_scores = np.mean(np.array(mc_scores), axis=0).tolist()

    sum_prob = 1 - bc_score["HUMAN"]
    mc_score = {
        label.upper(): score * sum_prob for score, label in zip(average_mc_scores, MC_LABEL_MAP)
    }
    print("MC Score:", mc_score)
    if sum_prob < 0.01:
        mc_score = {}
    return mc_score


def highlighter_polygraf(text, model="Polygraf AI (Base Model)"):
    """Run detection on the article body and re-attach its references.

    Returns:
        ``(score, html_text, mc_score)``: the binary detector score, the
        highlighted HTML (body plus escaped reference list) and the creator
        scores from ``predict_mc_scores``.
    """
    body, references = split_text_from_refs(text)
    score, text = detection_polygraf(text=body, model=model)
    mc_score = predict_mc_scores(body, score)  # mc score
    # The result is rendered as HTML, so newlines must become <br>.
    text = text + html.escape(references).replace("\n", "<br>")
    return score, text, mc_score


def ai_check(text: str, option: str):
    """Run the detector named ``option`` on ``text``.

    Every available option is handled by ``highlighter_polygraf``.
    """
    return highlighter_polygraf(text, option)


def generate_prompt(settings: Dict[str, str]) -> str:
    """Build the LLM prompt for a new article from the UI ``settings``."""
    prompt = f"""
    I am a {settings['role']}

    Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.

    Context:
    - {settings['context']}

    Style and Tone:
    - Writing style: {settings['writing_style']}
    - Tone: {settings['tone']}
    - Target audience: {settings['user_category']}

    Content:
    - Depth: {settings['depth_of_content']}
    - Structure: {', '.join(settings['structure'])}

    Keywords to incorporate:
    {', '.join(settings['keywords'])}

    Additional requirements:
    - Don't start with "Here is a...", start with the requested text directly
    - Include {settings['num_examples']} relevant examples or case studies
    - Incorporate data or statistics from {', '.join(settings['references'])}
    - End with a {settings['conclusion_type']} conclusion
    - Add a "References" section in the format "References:" on a new line at the end with at least 3 credible detailed sources, formatted as [1], [2], etc. with each source on their own line
    - Do not repeat sources
    - Do not make any headline, title bold.

    Ensure proper paragraph breaks for better readability.
    Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
    """
    return prompt


def regenerate_prompt(settings: Dict[str, str]) -> str:
    """Build the LLM prompt for editing an existing article per user comments."""
    prompt = f"""
    I am a {settings['role']}

    "{settings['generated_article']}"

    Edit the given text based on user comments.
    User Comments:
    - {settings['user_comments']}

    Requirements:
    - Don't start with "Here is a...", start with the requested text directly
    - The original content should not be changed. Make minor modifications based on user comments above.
    - Keep the references the same as the given text in the same format.
    - Do not make any headline, title bold.

    Context:
    - {settings['context']}

    Ensure proper paragraph breaks for better readability.
    Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
    """
    return prompt


def generate_article(
    input_role: str,
    topic: str,
    context: str,
    keywords: str,
    article_length: str,
    format: str,
    writing_style: str,
    tone: str,
    user_category: str,
    depth_of_content: str,
    structure: str,
    references: str,
    num_examples: str,
    conclusion_type: str,
    ai_model: str,
    content_string: str,
    url_content: Optional[str] = None,
    api_key: Optional[str] = None,
    pdf_file_input: Optional[list[str]] = None,
    generated_article: Optional[str] = None,
    user_comments: Optional[str] = None,
) -> str:
    """Generate (or, when ``generated_article`` is set, regenerate) an article.

    ``keywords``, ``structure`` and ``references`` are comma-separated
    strings. The assembled prompt is sent to ``generate`` and the reply is
    normalized with ``clean_text``.
    """
    settings = {
        "role": input_role,
        "topic": topic,
        "context": context,
        "keywords": [k.strip() for k in keywords.split(",")],
        "article_length": article_length,
        "format": format,
        "writing_style": writing_style,
        "tone": tone,
        "user_category": user_category,
        "depth_of_content": depth_of_content,
        "structure": [s.strip() for s in structure.split(",")],
        "references": [r.strip() for r in references.split(",")],
        "num_examples": num_examples,
        "conclusion_type": conclusion_type,
        "sources": content_string,
        "generated_article": generated_article,
        "user_comments": user_comments,
    }
    # An existing article means the user asked for an edit, not a fresh text.
    prompt = regenerate_prompt(settings) if generated_article else generate_prompt(settings)
    print("Generated Prompt...\n", prompt)

    article = generate(
        prompt=prompt,
        topic=topic,
        model=ai_model,
        url_content=url_content,
        path=pdf_file_input,
        temperature=1,
        max_length=2048,
        api_key=api_key,
        sys_message="",
    )
    return clean_text(article)


def get_history(history):
    """Return ``history`` unchanged (copies the state into the chat view)."""
    return history


def clear_history():
    """Return empty values for the history state and the history display."""
    return [], []


def humanize(
    text: str,
    model: str,
    temperature: float = 1.2,
    repetition_penalty: float = 1,
    top_k: int = 50,
    length_penalty: float = 1,
    history=None,
) -> tuple[str, list]:
    """Paraphrase the article body, keep its references, fix the grammar.

    Returns:
        ``(corrected_text, history)`` where ``history`` has one new
        ``(label, corrected_text)`` entry appended. A fresh list is used when
        ``history`` is ``None``.
    """
    print("Humanizing text...")
    history = [] if history is None else history
    body, references = split_text_from_refs(text)
    result = paraphrase_text(
        text=body,
        model_name=model,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        top_k=top_k,
        length_penalty=length_penalty,
    )
    corrected_text = format_and_correct_language_check(result + references)

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    history.append((f"Humanized Text | {timestamp}\nInput: {model}", corrected_text))
    return corrected_text, history


def update_visibility_api(model: str):
    """Show the API-key field only for the listed OpenAI models."""
    return gr.update(visible=model in ["OpenAI GPT 3.5", "OpenAI GPT 4"])


# Function to update the default selected structure based on the selected format
def update_structure(format_choice):
    """Return the default "Structure" value for the chosen format."""
    # Short-form formats default to "Plain Text".
    plain_text_formats = [
        "TikTok Video Content",
        "Instagram Video Content",
        "LinkedIn post",
        "X (Twitter) post",
        "Facebook post",
        "Email",
    ]
    if format_choice in plain_text_formats:
        return gr.update(value="Plain Text", interactive=True)
    return gr.update(value="Introduction, Body, Conclusion", interactive=True)


def generate_and_format(
    input_role,
    topic,
    context,
    keywords,
    article_length,
    format,
    writing_style,
    tone,
    user_category,
    depth_of_content,
    structure,
    references,
    num_examples,
    conclusion_type,
    google_search_check,
    year_from,
    month_from,
    day_from,
    year_to,
    month_to,
    day_to,
    domains_to_include,
    include_sites,
    exclude_sites,
    pdf_file_input,
    history=None,
    ai_model="OpenAI GPT 4o",
    api_key=None,
    generated_article: Optional[str] = None,
    user_comments: Optional[str] = None,
):
    """Optionally search the web, generate the article and format references.

    Returns:
        ``(reference_formatted, history)`` where ``history`` has one new
        ``(label, article)`` entry appended. A fresh list is used when
        ``history`` is ``None``.
    """
    history = [] if history is None else history
    content_string = ""
    url_content = None
    if google_search_check:
        date_from = build_date(year_from, month_from, day_from)
        date_to = build_date(year_to, month_to, day_to)
        sorted_date = f"date:r:{date_from}:{date_to}"
        final_query = topic
        if include_sites:
            site_queries = [f"site:{site.strip()}" for site in include_sites.split(",")]
            final_query += " " + " OR ".join(site_queries)
        if exclude_sites:
            exclude_queries = [f"-site:{site.strip()}" for site in exclude_sites.split(",")]
            final_query += " " + " ".join(exclude_queries)
        print(f"Google Search Query: {final_query}")
        url_content = google_search(final_query, sorted_date, domains_to_include)
        # Cap each page at 2500 characters of content.
        content_string = "\n".join(
            f"{url.strip()}: \n{content.strip()[:2500]}" for url, content in url_content.items()
        )
        content_string = (
            "Use the trusted information here from the URLs and add them as References:\n"
            + content_string
        )

    topic_context = topic + ", " + context
    article = generate_article(
        input_role,
        topic_context,
        context,
        keywords,
        article_length,
        format,
        writing_style,
        tone,
        user_category,
        depth_of_content,
        structure,
        references,
        num_examples,
        conclusion_type,
        ai_model,
        content_string,
        url_content,
        api_key,
        pdf_file_input,
        generated_article,
        user_comments,
    )
    # The model produced a "References:" header with nothing under it: fill
    # it with the URLs that were actually searched.
    if ends_with_references(article) and url_content is not None:
        for url in url_content:
            article += f"\n{url}"

    reference_formatted = format_references(article)

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    history.append((f"Generated Text | {timestamp}\nInput: {topic}", reference_formatted))
    return reference_formatted, history


def regenerate_and_format(*args):
    """Adapter for the "Regenerate Article" button.

    Expects the same positional inputs as the "Generate Article" button
    followed by the current article and the user's comments, and forwards the
    last two by keyword so they cannot land in ``ai_model``/``api_key``.
    """
    *base_args, generated_article, user_comments = args
    return generate_and_format(
        *base_args, generated_article=generated_article, user_comments=user_comments
    )


def create_interface():
    """Build the Gradio Blocks UI, wire its events and return it."""
    with gr.Blocks(
        theme=gr.themes.Default(
            primary_hue=gr.themes.colors.pink,
            secondary_hue=gr.themes.colors.yellow,
            neutral_hue=gr.themes.colors.gray,
        ),
        css="""
            .input-highlight-pink block_label {background-color: #008080}
            """,
    ) as demo:
        history = gr.State([])
        today = date.today()
        # dd/Month/YYYY -> [day, month name, year]; default "to" date.
        d1 = today.strftime("%d/%B/%Y")
        d1 = d1.split("/")
        gr.Markdown("# Polygraf AI Content Writer", elem_classes="text-center text-3xl mb-6")

        with gr.Row():
            with gr.Column(scale=2):
                with gr.Group():
                    gr.Markdown("## Article Configuration", elem_classes="text-xl mb-4")
                    input_role = gr.Textbox(label="I am a", placeholder="Enter your role", value="Student")
                    input_topic = gr.Textbox(
                        label="Topic",
                        placeholder="Enter the main topic of your article",
                        elem_classes="input-highlight-pink",
                    )
                    input_context = gr.Textbox(
                        label="Context",
                        placeholder="Provide some context for your topic",
                        elem_classes="input-highlight-pink",
                    )
                    input_keywords = gr.Textbox(
                        label="Keywords",
                        placeholder="Enter comma-separated keywords",
                        elem_classes="input-highlight-yellow",
                    )

                    with gr.Row():
                        input_format = gr.Dropdown(
                            choices=[
                                "Article",
                                "Essay",
                                "Blog post",
                                "Report",
                                "Research paper",
                                "News article",
                                "White paper",
                                "Email",
                                "LinkedIn post",
                                "X (Twitter) post",
                                "Instagram Video Content",
                                "TikTok Video Content",
                                "Facebook post",
                            ],
                            value="Article",
                            label="Format",
                            elem_classes="input-highlight-turquoise",
                        )

                        input_length = gr.Slider(
                            minimum=50,
                            maximum=5000,
                            step=50,
                            value=300,
                            label="Article Length",
                            elem_classes="input-highlight-pink",
                        )

                    with gr.Row():
                        input_writing_style = gr.Dropdown(
                            choices=[
                                "Formal",
                                "Informal",
                                "Technical",
                                "Conversational",
                                "Journalistic",
                                "Academic",
                                "Creative",
                            ],
                            value="Formal",
                            label="Writing Style",
                            elem_classes="input-highlight-yellow",
                        )
                        input_tone = gr.Dropdown(
                            choices=["Friendly", "Professional", "Neutral", "Enthusiastic", "Skeptical", "Humorous"],
                            value="Professional",
                            label="Tone",
                            elem_classes="input-highlight-turquoise",
                        )

                    input_user_category = gr.Dropdown(
                        choices=[
                            "Students",
                            "Professionals",
                            "Researchers",
                            "General Public",
                            "Policymakers",
                            "Entrepreneurs",
                        ],
                        value="General Public",
                        label="Target Audience",
                        elem_classes="input-highlight-pink",
                    )
                    input_depth = gr.Dropdown(
                        choices=[
                            "Surface-level overview",
                            "Moderate analysis",
                            "In-depth research",
                            "Comprehensive study",
                        ],
                        value="Moderate analysis",
                        label="Depth of Content",
                        elem_classes="input-highlight-yellow",
                    )
                    input_structure = gr.Dropdown(
                        choices=[
                            "Introduction, Body, Conclusion",
                            "Abstract, Introduction, Methods, Results, Discussion, Conclusion",
                            "Executive Summary, Problem Statement, Analysis, Recommendations, Conclusion",
                            "Introduction, Literature Review, Methodology, Findings, Analysis, Conclusion",
                            "Plain Text",
                        ],
                        value="Introduction, Body, Conclusion",
                        label="Structure",
                        elem_classes="input-highlight-turquoise",
                        interactive=True,
                    )
                    input_references = gr.Dropdown(
                        choices=[
                            "Academic journals",
                            "Industry reports",
                            "Government publications",
                            "News outlets",
                            "Expert interviews",
                            "Case studies",
                        ],
                        value="News outlets",
                        label="References",
                        elem_classes="input-highlight-pink",
                    )
                    input_num_examples = gr.Dropdown(
                        choices=["1-2", "3-4", "5+"],
                        value="1-2",
                        label="Number of Examples/Case Studies",
                        elem_classes="input-highlight-yellow",
                    )
                    input_conclusion = gr.Dropdown(
                        choices=["Summary", "Call to Action", "Future Outlook", "Thought-provoking Question"],
                        value="Call to Action",
                        label="Conclusion Type",
                        elem_classes="input-highlight-turquoise",
                    )
                    gr.Markdown("# Search Options", elem_classes="text-center text-3xl mb-6")
                    google_default = False
                    with gr.Row():
                        google_search_check = gr.Checkbox(
                            label="Enable Internet Search For Recent Sources", value=google_default
                        )
                    with gr.Group(visible=google_default) as search_options:
                        with gr.Row():
                            include_sites = gr.Textbox(
                                label="Include Specific Websites",
                                placeholder="Enter comma-separated keywords",
                                elem_classes="input-highlight-yellow",
                            )
                        with gr.Row():
                            exclude_sites = gr.Textbox(
                                label="Exclude Specific Websites",
                                placeholder="Enter comma-separated keywords",
                                elem_classes="input-highlight-yellow",
                            )
                        with gr.Row():
                            domains_to_include = gr.Dropdown(
                                domain_list,
                                value=domain_list,
                                multiselect=True,
                                label="Domains To Include",
                            )
                        with gr.Row():
                            month_from = gr.Dropdown(
                                choices=months,
                                label="From Month",
                                value="January",
                                interactive=True,
                            )
                            day_from = gr.Textbox(label="From Day", value="01")
                            year_from = gr.Textbox(label="From Year", value="2000")

                        with gr.Row():
                            month_to = gr.Dropdown(
                                choices=months,
                                label="To Month",
                                value=d1[1],
                                interactive=True,
                            )
                            day_to = gr.Textbox(label="To Day", value=d1[0])
                            year_to = gr.Textbox(label="To Year", value=d1[2])

                    gr.Markdown("# Add Optional PDF Files with Information", elem_classes="text-center text-3xl mb-6")
                    pdf_file_input = gr.File(label="Upload PDF(s)", file_count="multiple", file_types=[".pdf"])

                # NOTE: HIDE AI MODEL SELECTION
                # with gr.Group():
                #     gr.Markdown("## AI Model Configuration", elem_classes="text-xl mb-4")
                #     ai_generator = gr.Dropdown(
                #         choices=[
                #             "OpenAI GPT 4",
                #             "OpenAI GPT 4o",
                #             "OpenAI GPT 4o Mini",
                #             "Claude Sonnet 3.5",
                #             "Gemini 1.5 Pro",
                #             "LLaMA 3",
                #         ],
                #         value="OpenAI GPT 4o Mini",
                #         label="AI Model",
                #         elem_classes="input-highlight-pink",
                #     )
                #     input_api = gr.Textbox(label="API Key", visible=False)
                #     ai_generator.change(update_visibility_api, ai_generator, input_api)

                generate_btn = gr.Button("Generate Article", variant="primary")

                with gr.Accordion("Advanced Humanizer Settings", open=False):
                    with gr.Row():
                        model_dropdown = gr.Radio(
                            choices=[
                                "Base Model",
                                "Large Model",
                                "XL Model",
                            ],
                            value="XL Model",
                            label="Humanizer Model Version",
                        )
                    with gr.Row():
                        temperature_slider = gr.Slider(
                            minimum=0.5, maximum=2.0, step=0.1, value=1.1, label="Temperature"
                        )
                        top_k_slider = gr.Slider(minimum=0, maximum=300, step=25, value=40, label="Top k")
                    with gr.Row():
                        repetition_penalty_slider = gr.Slider(
                            minimum=1.0, maximum=2.0, step=0.1, value=1, label="Repetition Penalty"
                        )
                        length_penalty_slider = gr.Slider(
                            minimum=0.0, maximum=2.0, step=0.1, value=1.0, label="Length Penalty"
                        )

            with gr.Column(scale=3):
                with gr.Tab("Text Generator"):
                    output_article = gr.Textbox(label="Generated Article", lines=20)
                    ai_comments = gr.Textbox(
                        label="Add comments to help edit generated text", interactive=True, visible=False
                    )
                    regenerate_btn = gr.Button("Regenerate Article", variant="primary", visible=False)
                    # The default must be one of the choices: it is passed
                    # straight to ai_check, which looks it up in ``models``.
                    ai_detector_dropdown = gr.Radio(
                        choices=ai_check_options, label="Select AI Detector", value=DEFAULT_DETECTOR
                    )
                    ai_check_btn = gr.Button("AI Check")

                    with gr.Accordion("AI Detection Results", open=True):
                        ai_check_result = gr.Label(label="AI Check Result")
                        mc_check_result = gr.Label(label="Creator Check Result")
                        highlighted_text = gr.HTML(label="Sentence Breakdown", visible=False)
                    humanize_btn = gr.Button("Humanize")
                    # humanized_output = gr.Markdown(label="Humanized Article", value="\n\n\n\n", render=True)
                    # copy_to_input_btn = gr.Button("Copy to Input for AI Check")

                with gr.Tab("History"):
                    history_chat = gr.Chatbot(label="Generation History", height=1000)
                    clear_history_btn = gr.Button("Clear History")
                    clear_history_btn.click(clear_history, outputs=[history, history_chat])
                    # NOTE: REMOVED REFRESH BUTTON
                    # refresh_button = gr.Button("Refresh History")
                    # refresh_button.click(get_history, outputs=history_chat)

        def regenerate_visible(text):
            """Show the target component only when ``text`` is non-empty."""
            return gr.update(visible=bool(text))

        def highlight_visible(text):
            """Show the sentence breakdown for Polygraf detectors only."""
            return gr.update(visible=bool(text) and text.startswith("Polygraf"))

        def search_visible(toggle):
            """Show the search options only while internet search is enabled."""
            return gr.update(visible=bool(toggle))

        google_search_check.change(search_visible, inputs=google_search_check, outputs=search_options)
        ai_detector_dropdown.change(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
        output_article.change(regenerate_visible, inputs=output_article, outputs=ai_comments)
        ai_comments.change(regenerate_visible, inputs=output_article, outputs=regenerate_btn)
        ai_check_btn.click(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
        input_format.change(fn=update_structure, inputs=input_format, outputs=input_structure)

        # Positional inputs of generate_and_format, in signature order.
        generation_inputs = [
            input_role,
            input_topic,
            input_context,
            input_keywords,
            input_length,
            input_format,
            input_writing_style,
            input_tone,
            input_user_category,
            input_depth,
            input_structure,
            input_references,
            input_num_examples,
            input_conclusion,
            # ai_generator,
            # input_api,
            google_search_check,
            year_from,
            month_from,
            day_from,
            year_to,
            month_to,
            day_to,
            domains_to_include,
            include_sites,
            exclude_sites,
            pdf_file_input,
            history,
        ]

        # The history view is refreshed with .then() so it runs after the
        # handler has updated the ``history`` state; a second, independent
        # click listener could read the state before the update.
        generate_btn.click(
            fn=generate_and_format,
            inputs=generation_inputs,
            outputs=[output_article, history],
        ).then(get_history, inputs=[history], outputs=[history_chat])

        regenerate_btn.click(
            fn=regenerate_and_format,
            inputs=generation_inputs + [output_article, ai_comments],
            outputs=[output_article, history],
        ).then(get_history, inputs=[history], outputs=[history_chat])

        ai_check_btn.click(
            fn=ai_check,
            inputs=[output_article, ai_detector_dropdown],
            outputs=[ai_check_result, highlighted_text, mc_check_result],
        )

        humanize_btn.click(
            fn=humanize,
            inputs=[
                output_article,
                model_dropdown,
                temperature_slider,
                repetition_penalty_slider,
                top_k_slider,
                length_penalty_slider,
                history,
            ],
            outputs=[output_article, history],
        ).then(get_history, inputs=[history], outputs=[history_chat])

    return demo


if __name__ == "__main__":
    demo = create_interface()
    # demo.queue(
    #     max_size=8,
    #     default_concurrency_limit=8,
    # ).launch(server_name="0.0.0.0", share=True, server_port=7890)
    demo.launch(server_name="0.0.0.0")