eljanmahammadli committed on
Commit 5534eb0 · 1 Parent(s): d09cdf3

Added MC model to UI and removed some unnecessary code

Files changed (2)
  1. app.py +45 -44
  2. utils.py +81 -22
app.py CHANGED
@@ -12,10 +12,11 @@ from scipy.special import softmax
 import language_tool_python
 import nltk
 import torch
+import numpy as np
 from transformers import GPT2LMHeadModel, GPT2TokenizerFast
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 
-from utils import remove_special_characters
+from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
 from google_search import google_search, months, domain_list, build_date
 from humanize import paraphrase_text, device
 from ai_generate import generate
@@ -196,65 +197,64 @@ ai_check_options = [
 ]
 
 
-def ai_generated_test_sapling(text: str) -> Dict:
-    response = requests.post(
-        "https://api.sapling.ai/api/v1/aidetect", json={"key": "60L9BPSVPIIOEZM0CD1DQWRBPJIUR7SB", "text": f"{text}"}
-    )
-    return {"AI": response.json()["score"], "HUMAN": 1 - response.json()["score"]}
-
-
-class GPT2PPL:
-    def __init__(self):
-        self.device = device
-        self.model = to_device(GPT2LMHeadModel.from_pretrained("gpt2"))
-        self.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
-
-    def __call__(self, text):
-        encodings = self.tokenizer(text, return_tensors="pt")
-        encodings = {k: v.to(self.device) for k, v in encodings.items()}
-        max_length = self.model.config.n_positions
-        stride = 512
-        seq_len = encodings.input_ids.size(1)
-
-        nlls = []
-        for i in range(0, seq_len, stride):
-            begin_loc = max(i + stride - max_length, 0)
-            end_loc = min(i + stride, seq_len)
-            trg_len = end_loc - i
-            input_ids = encodings.input_ids[:, begin_loc:end_loc].to(self.device)
-            target_ids = input_ids.clone()
-            target_ids[:, :-trg_len] = -100
-
-            with torch.no_grad():
-                outputs = self.model(input_ids, labels=target_ids)
-                neg_log_likelihood = outputs.loss * trg_len
-
-            nlls.append(neg_log_likelihood)
-
-        ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
-        return {"AI": float(ppl), "HUMAN": 1 - float(ppl)}
-
-
-def ai_generated_test_gptzero(text):
-    gptzero_model = GPT2PPL()
-    result = gptzero_model(text)
-    return result, None
-
-
+MC_TOKEN_SIZE = 256
+TEXT_MC_MODEL_PATH = "polygraf-ai/mc-model"
+MC_LABEL_MAP = ["OpenAI GPT", "Mistral", "CLAUDE", "Gemini", "Grammar Enhancer"]
+text_mc_tokenizer = AutoTokenizer.from_pretrained(TEXT_MC_MODEL_PATH)
+text_mc_model = AutoModelForSequenceClassification.from_pretrained(TEXT_MC_MODEL_PATH).to(device)
+
+
+def predict_mc(text):
+    with torch.no_grad():
+        text_mc_model.eval()
+        tokens = text_mc_tokenizer(
+            text,
+            padding="max_length",
+            truncation=True,
+            return_tensors="pt",
+            max_length=MC_TOKEN_SIZE,
+        ).to(device)
+        output = text_mc_model(**tokens)
+        output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
+    return output_norm
+
+
+def predict_mc_scores(input, bc_score):
+    mc_scores = []
+    segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc", tokenizer=text_mc_tokenizer)
+    samples_len_mc = len(split_text_allow_complete_sentences_nltk(input, type_det="mc", tokenizer=text_mc_tokenizer))
+    for i in range(samples_len_mc):
+        cleaned_text_mc = remove_special_characters(segments_mc[i])
+        mc_score = predict_mc(cleaned_text_mc)
+        mc_scores.append(mc_score)
+    mc_scores_array = np.array(mc_scores)
+    average_mc_scores = np.mean(mc_scores_array, axis=0)
+    mc_score_list = average_mc_scores.tolist()
+    mc_score = {}
+    for score, label in zip(mc_score_list, MC_LABEL_MAP):
+        mc_score[label.upper()] = score
+
+    sum_prob = 1 - bc_score["HUMAN"]
+    for key, value in mc_score.items():
+        mc_score[key] = value * sum_prob
+    print("MC Score:", mc_score)
+    if sum_prob < 0.01:
+        mc_score = {}
+
+    return mc_score
+
+
 def highlighter_polygraf(text, model="Polygraf AI (Base Model)"):
     body, references = split_text_from_refs(text)
     score, text = detection_polygraf(text=body, model=model)
+    mc_score = predict_mc_scores(body, score)  # mc score
     text = text + references.replace("\n", "<br>")
-    return score, text
+    return score, text, mc_score
 
 
 def ai_check(text: str, option: str):
     if option.startswith("Polygraf AI"):
         return highlighter_polygraf(text, option)
-    elif option == "Sapling AI":
-        return ai_generated_test_sapling(text)
-    elif option == "GPTZero":
-        return ai_generated_test_gptzero(text)
     else:
         return highlighter_polygraf(text, option)
 
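For orientation, here is a minimal smoke-test sketch of the new multi-class path, assuming the module-level model above loads, with an invented `sample_text` and a placeholder binary-detector dict in the shape `detection_polygraf` returns:

```python
# Hypothetical example, not part of the commit.
sample_text = "Large language models can draft entire articles in seconds."
bc_score = {"AI": 0.83, "HUMAN": 0.17}  # placeholder binary AI/HUMAN scores

probs = predict_mc(sample_text)                       # softmax over the five MC_LABEL_MAP classes
mc_score = predict_mc_scores(sample_text, bc_score)

print(dict(zip(MC_LABEL_MAP, probs)))                 # raw per-creator probabilities
print(mc_score)                                       # same probabilities rescaled by 1 - bc_score["HUMAN"]
```

`predict_mc_scores` returns an empty dict when the binary detector is essentially certain the text is human (`1 - bc_score["HUMAN"] < 0.01`), so the UI shows no creator breakdown in that case.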
 
@@ -788,6 +788,7 @@ def create_interface():
 
     with gr.Accordion("AI Detection Results", open=True):
         ai_check_result = gr.Label(label="AI Check Result")
+        mc_check_result = gr.Label(label="Creator Check Result")
         highlighted_text = gr.HTML(label="Sentence Breakdown", visible=False)
     humanize_btn = gr.Button("Humanize")
     # humanized_output = gr.Markdown(label="Humanized Article", value="\n\n\n\n", render=True)
@@ -904,7 +905,7 @@ def create_interface():
     ai_check_btn.click(
         fn=ai_check,
         inputs=[output_article, ai_detector_dropdown],
-        outputs=[ai_check_result, highlighted_text],
+        outputs=[ai_check_result, highlighted_text, mc_check_result],
     )
 
     humanize_btn.click(
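`gr.Label` accepts a dict of label-to-confidence pairs, which is why `ai_check` can hand the MC dict straight to the new `mc_check_result` component. A standalone sketch of that behaviour, independent of this app (names and scores invented):

```python
import gradio as gr

def fake_check(text):
    bc = {"AI": 0.8, "HUMAN": 0.2}                           # placeholder binary result
    mc = {"OPENAI GPT": 0.5, "MISTRAL": 0.2, "CLAUDE": 0.1}  # placeholder creator result
    return bc, mc

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Text")
    bc_out = gr.Label(label="AI Check Result")
    mc_out = gr.Label(label="Creator Check Result")
    gr.Button("Check").click(fn=fake_check, inputs=inp, outputs=[bc_out, mc_out])

# demo.launch()
```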
 
utils.py CHANGED
@@ -1,5 +1,7 @@
 import re
 from unidecode import unidecode
+from nltk import sent_tokenize
+
 # from transformers import AutoTokenizer
 # import yaml
 # import fitz
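`sent_tokenize` relies on NLTK's `punkt` tokenizer data; if the runtime image does not already bundle it, a one-time download is needed (a reminder, not part of this commit):

```python
import nltk

nltk.download("punkt")  # no-op if the sentence tokenizer data is already installed
```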
@@ -7,33 +9,39 @@ from unidecode import unidecode
 # from bs4 import BeautifulSoup
 # from collections import defaultdict
 
+
 def remove_accents(input_str):
     text_no_accents = unidecode(input_str)
     return text_no_accents
 
+
 def remove_special_characters(text):
-    text = re.sub(r'https?://\S+|www\.\S+', '', text)
-    emoji_pattern = re.compile("["
-        u"\U0001F600-\U0001F64F"  # emoticons
-        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
-        u"\U0001F680-\U0001F6FF"  # transport & map symbols
-        u"\U0001F700-\U0001F77F"  # alchemical symbols
-        u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
-        u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
-        u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
-        u"\U0001FA00-\U0001FA6F"  # Chess Symbols
-        u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
-        u"\U00002702-\U000027B0"  # Dingbats
-        u"\U000024C2-\U0001F251"
-        "]+", flags=re.UNICODE)
-    text = emoji_pattern.sub('', text)
-    text = re.sub(r'#\w+', '', text)
-    text = re.sub(r'[^\w\s\d.,!?\'"()-;]', '', text)
-    text = re.sub(r'\s+([.,!?;])', r'\1', text)
-    text = re.sub(r'([.,!?;])(\S)', r'\1 \2', text)
-    text = re.sub(r'\s+', ' ', text).strip()
+    text = re.sub(r"https?://\S+|www\.\S+", "", text)
+    emoji_pattern = re.compile(
+        "["
+        "\U0001F600-\U0001F64F"  # emoticons
+        "\U0001F300-\U0001F5FF"  # symbols & pictographs
+        "\U0001F680-\U0001F6FF"  # transport & map symbols
+        "\U0001F700-\U0001F77F"  # alchemical symbols
+        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
+        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
+        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
+        "\U0001FA00-\U0001FA6F"  # Chess Symbols
+        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
+        "\U00002702-\U000027B0"  # Dingbats
+        "\U000024C2-\U0001F251"
+        "]+",
+        flags=re.UNICODE,
+    )
+    text = emoji_pattern.sub("", text)
+    text = re.sub(r"#\w+", "", text)
+    text = re.sub(r'[^\w\s\d.,!?\'"()-;]', "", text)
+    text = re.sub(r"\s+([.,!?;])", r"\1", text)
+    text = re.sub(r"([.,!?;])(\S)", r"\1 \2", text)
+    text = re.sub(r"\s+", " ", text).strip()
     return text
 
+
 def remove_special_characters_2(text):
     pattern = r"[^a-zA-Z0-9 ]+"
     text = re.sub(pattern, "", text)
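For reference, the reformatted `remove_special_characters` is behaviour-preserving; a quick illustrative call with an invented sample string (the expected output follows from the regex steps above):

```python
sample = "Visit https://example.com 🚀 #launch   It is live ,finally ."
print(remove_special_characters(sample))
# -> "Visit It is live, finally."
```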
@@ -41,5 +49,56 @@ def remove_special_characters_2(text):
 
 
 def split_into_sentences(text):
-    sentences = re.split(r'(?<=[.!?]) +', text)
-    return sentences
+    sentences = re.split(r"(?<=[.!?]) +", text)
+    return sentences
+
+
+def get_token_length(tokenizer, sentence):
+    return len(tokenizer.tokenize(sentence))
+
+
+MC_TOKEN_SIZE = 256
+BC_TOKEN_SIZE = 333
+
+
+def split_text_allow_complete_sentences_nltk(text, type_det="bc", tokenizer=None):
+    sentences = sent_tokenize(text)
+    chunks = []
+    current_chunk = []
+    current_length = 0
+    if type_det == "bc":
+        max_tokens = BC_TOKEN_SIZE
+    elif type_det == "mc":
+        max_tokens = MC_TOKEN_SIZE
+    elif type_det == "quillbot":
+        max_tokens = 256
+
+    def add_sentence_to_chunk(sentence):
+        nonlocal current_chunk, current_length
+        sentence_length = get_token_length(tokenizer, sentence)
+        if current_length + sentence_length > max_tokens:
+            chunks.append((current_chunk, current_length))
+            current_chunk = []
+            current_length = 0
+        current_chunk.append(sentence)
+        current_length += sentence_length
+
+    for sentence in sentences:
+        add_sentence_to_chunk(sentence)
+    if current_chunk:
+        chunks.append((current_chunk, current_length))
+    adjusted_chunks = []
+    while chunks:
+        chunk = chunks.pop(0)
+        if len(chunks) > 0 and chunk[1] < max_tokens / 2:
+            next_chunk = chunks.pop(0)
+            combined_length = chunk[1] + next_chunk[1]
+            if combined_length <= max_tokens:
+                adjusted_chunks.append((chunk[0] + next_chunk[0], combined_length))
+            else:
+                adjusted_chunks.append(chunk)
+                chunks.insert(0, next_chunk)
+        else:
+            adjusted_chunks.append(chunk)
+    result_chunks = [" ".join(chunk[0]) for chunk in adjusted_chunks]
+    return result_chunks
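A minimal usage sketch of the new chunker; the tokenizer checkpoint here is only an example, the app passes its own MC tokenizer:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")  # example checkpoint; any HF tokenizer works

long_text = " ".join(["This is a filler sentence used to make the input long enough."] * 60)
chunks = split_text_allow_complete_sentences_nltk(long_text, type_det="mc", tokenizer=tok)

for i, chunk in enumerate(chunks):
    print(i, len(tok.tokenize(chunk)))  # each chunk stays within the 256-token MC budget here
```

Note that sentences are never split, so a single sentence longer than the token budget still ends up as its own oversized chunk.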