Commit 5534eb0
eljanmahammadli committed
Parent(s): d09cdf3

Added MC model to UI and removed some unnecessary code
app.py
CHANGED
@@ -12,10 +12,11 @@ from scipy.special import softmax
 import language_tool_python
 import nltk
 import torch
+import numpy as np
 from transformers import GPT2LMHeadModel, GPT2TokenizerFast
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 
-from utils import remove_special_characters
+from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
 from google_search import google_search, months, domain_list, build_date
 from humanize import paraphrase_text, device
 from ai_generate import generate
@@ -196,65 +197,64 @@ ai_check_options = [
 ]
 
 
-
-
-class GPT2PPL:
-    def __init__(self):
-        self.device = device
-        self.model = to_device(GPT2LMHeadModel.from_pretrained("gpt2"))
-        self.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
-
-    def __call__(self, text):
-        encodings = self.tokenizer(text, return_tensors="pt")
-        encodings = {k: v.to(self.device) for k, v in encodings.items()}
-        max_length = self.model.config.n_positions
-        stride = 512
-        seq_len = encodings.input_ids.size(1)
-
-            nlls.append(neg_log_likelihood)
-
-    gptzero_model = GPT2PPL()
-    result = gptzero_model(text)
-    return result, None
+MC_TOKEN_SIZE = 256
+TEXT_MC_MODEL_PATH = "polygraf-ai/mc-model"
+MC_LABEL_MAP = ["OpenAI GPT", "Mistral", "CLAUDE", "Gemini", "Grammar Enhancer"]
+text_mc_tokenizer = AutoTokenizer.from_pretrained(TEXT_MC_MODEL_PATH)
+text_mc_model = AutoModelForSequenceClassification.from_pretrained(TEXT_MC_MODEL_PATH).to(device)
+
+
+def predict_mc(text):
+    with torch.no_grad():
+        text_mc_model.eval()
+        tokens = text_mc_tokenizer(
+            text,
+            padding="max_length",
+            truncation=True,
+            return_tensors="pt",
+            max_length=MC_TOKEN_SIZE,
+        ).to(device)
+        output = text_mc_model(**tokens)
+        output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
+        return output_norm
+
+
+def predict_mc_scores(input, bc_score):
+    mc_scores = []
+    segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc", tokenizer=text_mc_tokenizer)
+    samples_len_mc = len(split_text_allow_complete_sentences_nltk(input, type_det="mc", tokenizer=text_mc_tokenizer))
+    for i in range(samples_len_mc):
+        cleaned_text_mc = remove_special_characters(segments_mc[i])
+        mc_score = predict_mc(cleaned_text_mc)
+        mc_scores.append(mc_score)
+    mc_scores_array = np.array(mc_scores)
+    average_mc_scores = np.mean(mc_scores_array, axis=0)
+    mc_score_list = average_mc_scores.tolist()
+    mc_score = {}
+    for score, label in zip(mc_score_list, MC_LABEL_MAP):
+        mc_score[label.upper()] = score
+
+    sum_prob = 1 - bc_score["HUMAN"]
+    for key, value in mc_score.items():
+        mc_score[key] = value * sum_prob
+    print("MC Score:", mc_score)
+    if sum_prob < 0.01:
+        mc_score = {}
+
+    return mc_score
 
 
 def highlighter_polygraf(text, model="Polygraf AI (Base Model)"):
     body, references = split_text_from_refs(text)
     score, text = detection_polygraf(text=body, model=model)
+    mc_score = predict_mc_scores(body, score)  # mc score
     text = text + references.replace("\n", "<br>")
-    return score, text
+    return score, text, mc_score
 
 
 def ai_check(text: str, option: str):
     if option.startswith("Polygraf AI"):
         return highlighter_polygraf(text, option)
-    elif option == "Sapling AI":
-        return ai_generated_test_sapling(text)
-    elif option == "GPTZero":
-        return ai_generated_test_gptzero(text)
     else:
         return highlighter_polygraf(text, option)
 
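The weighting step in predict_mc_scores scales the per-creator probabilities by the probability mass the binary detector assigned to AI, so the creator breakdown never claims more than the detector's AI share. A minimal sketch of that arithmetic with made-up numbers (the real values come from detection_polygraf and predict_mc; the "AI" key below is illustrative, the code itself only relies on "HUMAN"):

bc_score = {"HUMAN": 0.30, "AI": 0.70}        # hypothetical binary-detector output
mc_probs = [0.50, 0.20, 0.15, 0.10, 0.05]     # hypothetical softmax over the five MC_LABEL_MAP classes

sum_prob = 1 - bc_score["HUMAN"]              # 0.70, the AI share
mc_score = {
    label.upper(): p * sum_prob
    for label, p in zip(["OpenAI GPT", "Mistral", "CLAUDE", "Gemini", "Grammar Enhancer"], mc_probs)
}
# mc_score now sums to 0.70, matching the AI share of the binary score.
# If sum_prob < 0.01 the dict is cleared, so near-human text shows no creator breakdown.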
@@ -788,6 +788,7 @@ def create_interface():
 
         with gr.Accordion("AI Detection Results", open=True):
             ai_check_result = gr.Label(label="AI Check Result")
+            mc_check_result = gr.Label(label="Creator Check Result")
             highlighted_text = gr.HTML(label="Sentence Breakdown", visible=False)
             humanize_btn = gr.Button("Humanize")
             # humanized_output = gr.Markdown(label="Humanized Article", value="\n\n\n\n", render=True)
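A gr.Label fed a dict of label-to-confidence pairs renders one bar per key, which is how the new mc_check_result component displays the dict returned by predict_mc_scores. A small self-contained sketch with illustrative values (not the app's actual components):

import gradio as gr

demo = gr.Interface(
    fn=lambda _text: {"OPENAI GPT": 0.42, "MISTRAL": 0.18, "CLAUDE": 0.07, "GEMINI": 0.02, "GRAMMAR ENHANCER": 0.01},
    inputs=gr.Textbox(label="Text"),
    outputs=gr.Label(label="Creator Check Result"),  # dict values render as confidence bars
)
# demo.launch()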
@@ -904,7 +905,7 @@ def create_interface():
     ai_check_btn.click(
         fn=ai_check,
         inputs=[output_article, ai_detector_dropdown],
-        outputs=[ai_check_result, highlighted_text],
+        outputs=[ai_check_result, highlighted_text, mc_check_result],
     )
 
     humanize_btn.click(
utils.py
CHANGED
@@ -1,5 +1,7 @@
 import re
 from unidecode import unidecode
+from nltk import sent_tokenize
+
 # from transformers import AutoTokenizer
 # import yaml
 # import fitz
@@ -7,33 +9,39 @@ from unidecode import unidecode
 # from bs4 import BeautifulSoup
 # from collections import defaultdict
 
+
 def remove_accents(input_str):
     text_no_accents = unidecode(input_str)
     return text_no_accents
 
+
 def remove_special_characters(text):
-    text = re.sub(r
-    emoji_pattern = re.compile(
-
-        "
-
-    text =
-    text = re.sub(r
-    text = re.sub(r'\s
+    text = re.sub(r"https?://\S+|www\.\S+", "", text)
+    emoji_pattern = re.compile(
+        "["
+        "\U0001F600-\U0001F64F"  # emoticons
+        "\U0001F300-\U0001F5FF"  # symbols & pictographs
+        "\U0001F680-\U0001F6FF"  # transport & map symbols
+        "\U0001F700-\U0001F77F"  # alchemical symbols
+        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
+        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
+        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
+        "\U0001FA00-\U0001FA6F"  # Chess Symbols
+        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
+        "\U00002702-\U000027B0"  # Dingbats
+        "\U000024C2-\U0001F251"
+        "]+",
+        flags=re.UNICODE,
+    )
+    text = emoji_pattern.sub("", text)
+    text = re.sub(r"#\w+", "", text)
+    text = re.sub(r'[^\w\s\d.,!?\'"()-;]', "", text)
+    text = re.sub(r"\s+([.,!?;])", r"\1", text)
+    text = re.sub(r"([.,!?;])(\S)", r"\1 \2", text)
+    text = re.sub(r"\s+", " ", text).strip()
     return text
 
+
 def remove_special_characters_2(text):
     pattern = r"[^a-zA-Z0-9 ]+"
     text = re.sub(pattern, "", text)
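In effect, the rewritten remove_special_characters strips URLs, emoji, and hashtags, tidies spacing around punctuation, and collapses whitespace before each segment is scored. A quick illustrative call; the expected output was traced by hand from the regexes above, so treat it as approximate:

from utils import remove_special_characters

cleaned = remove_special_characters("Visit https://example.com 🚀 #launch   now !")
print(cleaned)  # expected: "Visit now!"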
@@ -41,5 +49,56 @@ def remove_special_characters_2(text):
 
 
 def split_into_sentences(text):
-    sentences = re.split(r
-    return sentences
+    sentences = re.split(r"(?<=[.!?]) +", text)
+    return sentences
+
+
+def get_token_length(tokenizer, sentence):
+    return len(tokenizer.tokenize(sentence))
+
+
+MC_TOKEN_SIZE = 256
+BC_TOKEN_SIZE = 333
+
+
+def split_text_allow_complete_sentences_nltk(text, type_det="bc", tokenizer=None):
+    sentences = sent_tokenize(text)
+    chunks = []
+    current_chunk = []
+    current_length = 0
+    if type_det == "bc":
+        max_tokens = BC_TOKEN_SIZE
+    elif type_det == "mc":
+        max_tokens = MC_TOKEN_SIZE
+    elif type_det == "quillbot":
+        max_tokens = 256
+
+    def add_sentence_to_chunk(sentence):
+        nonlocal current_chunk, current_length
+        sentence_length = get_token_length(tokenizer, sentence)
+        if current_length + sentence_length > max_tokens:
+            chunks.append((current_chunk, current_length))
+            current_chunk = []
+            current_length = 0
+        current_chunk.append(sentence)
+        current_length += sentence_length
+
+    for sentence in sentences:
+        add_sentence_to_chunk(sentence)
+    if current_chunk:
+        chunks.append((current_chunk, current_length))
+    adjusted_chunks = []
+    while chunks:
+        chunk = chunks.pop(0)
+        if len(chunks) > 0 and chunk[1] < max_tokens / 2:
+            next_chunk = chunks.pop(0)
+            combined_length = chunk[1] + next_chunk[1]
+            if combined_length <= max_tokens:
+                adjusted_chunks.append((chunk[0] + next_chunk[0], combined_length))
+            else:
+                adjusted_chunks.append(chunk)
+                chunks.insert(0, next_chunk)
+        else:
+            adjusted_chunks.append(chunk)
+    result_chunks = [" ".join(chunk[0]) for chunk in adjusted_chunks]
+    return result_chunks
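split_text_allow_complete_sentences_nltk packs whole sentences into chunks that stay under the detector's token budget (333 tokens for the binary "bc" detector, 256 for the multi-class "mc" one) and merges a less-than-half-full chunk into its neighbour when the pair still fits. A hedged usage sketch, assuming the polygraf-ai/mc-model tokenizer is accessible and the NLTK punkt data is installed:

import nltk
from transformers import AutoTokenizer
from utils import split_text_allow_complete_sentences_nltk

nltk.download("punkt")  # sent_tokenize needs the punkt sentence tokenizer

tokenizer = AutoTokenizer.from_pretrained("polygraf-ai/mc-model")
text = "Long article text goes here. " * 100  # toy input; the app passes the generated article body

chunks = split_text_allow_complete_sentences_nltk(text, type_det="mc", tokenizer=tokenizer)
for i, chunk in enumerate(chunks):
    print(i, len(tokenizer.tokenize(chunk)))  # each count should stay at or under 256 for this input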