eljanmahammadli committed · Commit a54c1ef · 1 Parent(s): b72ef7f

speed up humanizer: batch generation

Files changed: app.py (+3 -1) · humanize.py (+67 -59)
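The speedup comes from batching the generation step: instead of one generate() call per sentence, humanize.py now pads a whole batch of sentences and runs them through the model in a single forward pass (the new paraphrase_sentences() in the diff below). A minimal sketch of the before/after idea, using a small public checkpoint so it runs anywhere; the checkpoint and example sentences are illustrative, not from the commit:

from transformers import T5ForConditionalGeneration, T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")

sentences = ["The cat sat on the mat.", "It began to rain heavily."]
prompts = ["Please paraphrase this sentence: " + s for s in sentences]

# Before: one forward pass per sentence.
slow = [
    tokenizer.decode(
        model.generate(**tokenizer(p, return_tensors="pt"), max_length=128)[0],
        skip_special_tokens=True,
    )
    for p in prompts
]

# After: pad the batch and decode everything in one generate() call.
batch = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True)
fast = tokenizer.batch_decode(model.generate(**batch, max_length=128), skip_special_tokens=True)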
app.py CHANGED
@@ -35,6 +35,9 @@ tokenizers = {
     "Polygraf AI (Advanced Model)": AutoTokenizer.from_pretrained("polygraf-ai/bc_combined_3sent"),
 }
 
+# grammar correction tool
+tool = language_tool_python.LanguageTool("en-US")
+
 
 # Function to move model to the appropriate device
 def to_device(model):
@@ -99,7 +102,6 @@ def ends_with_references(text):
 
 
 def format_and_correct_language_check(text: str) -> str:
-    tool = language_tool_python.LanguageTool("en-US")
     return tool.correct(text)
 
 
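The app.py side is a one-line hoist with an outsized effect: language_tool_python.LanguageTool("en-US") starts a local LanguageTool server when it is constructed, so creating it inside format_and_correct_language_check() paid that startup cost on every call. The commit builds the tool once at module scope and reuses it, as in this condensed view of the resulting code:

import language_tool_python

# Built once at import time; construction starts a local LanguageTool server.
tool = language_tool_python.LanguageTool("en-US")

def format_and_correct_language_check(text: str) -> str:
    # Reuses the long-lived server instead of starting a new one per call.
    return tool.correct(text)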
humanize.py CHANGED
@@ -1,8 +1,10 @@
+import gc
 import torch
 from nltk import sent_tokenize
 import nltk
 from tqdm import tqdm
 import gradio as gr
+from peft import PeftModel
 from transformers import T5ForConditionalGeneration, T5Tokenizer
 
 nltk.download("punkt")
@@ -18,35 +20,46 @@ else:
     print("CUDA is not available. Using CPU instead.")
     device = torch.device("cpu")
 
+batch_size = 64
 
 # Configuration for models and their adapters
 model_config = {
     "Base Model": "polygraf-ai/poly-humanizer-base",
     "Large Model": "polygraf-ai/poly-humanizer-large",
-    "XL Model": {
-        "path": "google/flan-t5-xl",
-        "adapters": {
-            "XL Model Adapter": "polygraf-ai/poly-humanizer-XL-adapter",
-            # "XL Law Model Adapter": "polygraf-ai/poly-humanizer-XL-law-adapter",
-            # "XL Marketing Model Adapter": "polygraf-ai/marketing-cleaned-13K-grad-acum-4-full",
-            # "XL Child Style Model Adapter": "polygraf-ai/poly-humanizer-XL-children-adapter-checkpoint-4000",
-        },
-    },
+    "XL Model": "polygraf-ai/poly-humanizer-XL-adapter",
 }
 
 # cache the base models, tokenizers, and adapters
+# initialize model and tokenizer
 models, tokenizers = {}, {}
-for name, …
-    …
+for name, path in model_config.items():
+    if name == "XL Model":
+        model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", torch_dtype=torch.bfloat16).to(device)
+        model = PeftModel.from_pretrained(model, path, torch_dtype=torch.bfloat16, is_trainable=False)
+        model = model.merge_and_unload()
+        models[name] = model
+        tokenizers[name] = T5Tokenizer.from_pretrained("google/flan-t5-xl")
+    else:
+        model = T5ForConditionalGeneration.from_pretrained(path, torch_dtype=torch.bfloat16).to(device)
+        models[name] = model
+        tokenizers[name] = T5Tokenizer.from_pretrained(path)
+    print(f"Loaded model: {name}, Num. params: {model.num_parameters()}")
+
+
+def paraphrase_sentences(model, tokenizer, sentences, temperature, repetition_penalty, top_k, length_penalty):
+    inputs = ["Please paraphrase this sentence: " + sentence for sentence in sentences]
+    inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(model.device)
+    outputs = model.generate(
+        **inputs,
+        do_sample=True,
+        temperature=temperature,
+        repetition_penalty=repetition_penalty,
+        max_length=128,
+        top_k=top_k,
+        length_penalty=length_penalty,
+    )
+    answers = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
+    return answers
 
 
 def paraphrase_text(
@@ -58,51 +71,46 @@ def paraphrase_text(
     top_k=50,
     length_penalty=1.0,
 ):
+    """
+    Optimization here is to feed all sentences at once to the model.
+    Paragraphs are stored as a number of sentences per paragraph.
+    """
     progress(0, desc="Starting to Humanize")
     progress(0.05)
-    # …
-    …
-        tokenizer, model = tokenizers["XL Model"], models["XL Model"]
-        # set the adapter if it's not already set
-        if model.active_adapters() != [f"{model_name} Adapter"]:
-            model.set_adapter(f"{model_name} Adapter")
-            print(f"Using adapter: {model_name} Adapter")
-    else:
-        tokenizer = tokenizers[model_name]
-        model = models[model_name]
+    # Select the model, tokenizer, and adapter
+    tokenizer = tokenizers[model_name]
+    model = models[model_name].to(device)
 
-    # Split the text into paragraphs
+    # Split the text into paragraphs and then into sentences
     paragraphs = text.split("\n")
-    …
+    all_sentences = []
+    sentences_per_paragraph = []
 
-    for paragraph in …
-        # paraphrase each chunk of text
+    for paragraph in paragraphs:
         sentences = sent_tokenize(paragraph)
-        …
-        humanized_paragraphs.append(combined_paraphrase)
+        sentences_per_paragraph.append(len(sentences))
+        all_sentences.extend(sentences)
+
+    # Process all sentences in batches
+    paraphrased_sentences = []
+    for i in range(0, len(all_sentences), batch_size):
+        batch_sentences = all_sentences[i : i + batch_size]
+        paraphrased_batch = paraphrase_sentences(
+            model, tokenizer, batch_sentences, temperature, repetition_penalty, top_k, length_penalty
+        )
+        paraphrased_sentences.extend(paraphrased_batch)
+
+    # Clear memory
+    torch.cuda.empty_cache()
+    gc.collect()
+
+    # Reconstruct paragraphs
+    humanized_paragraphs = []
+    sentence_index = 0
+    for num_sentences in sentences_per_paragraph:
+        humanized_paragraph = " ".join(paraphrased_sentences[sentence_index : sentence_index + num_sentences])
+        humanized_paragraphs.append(humanized_paragraph)
+        sentence_index += num_sentences
 
     humanized_text = "\n".join(humanized_paragraphs)
     return humanized_text
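Two details of the humanize.py rewrite are easy to miss. First, the XL model no longer keeps its LoRA adapter separate: PeftModel.from_pretrained() attaches the adapter to the flan-t5-xl base and merge_and_unload() folds the adapter weights into the base model, so generation is a plain T5 forward pass and the old per-request active_adapters()/set_adapter() switching disappears. A condensed sketch of that loading path, using the same peft calls as the diff:

import torch
from peft import PeftModel
from transformers import T5ForConditionalGeneration, T5Tokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Attach the LoRA adapter to the base model, then bake its weights in
# so no adapter dispatch happens at inference time.
base = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", torch_dtype=torch.bfloat16).to(device)
peft_model = PeftModel.from_pretrained(base, "polygraf-ai/poly-humanizer-XL-adapter", is_trainable=False)
model = peft_model.merge_and_unload()
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")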
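Second, batching across paragraph boundaries needs bookkeeping to reassemble the output: every sentence goes into one flat list, and sentences_per_paragraph records how many sentences each paragraph contributed so the paraphrased stream can be re-split in order. A self-contained sketch of that rebuild step; rebuild() is a hypothetical helper name, the diff inlines this logic:

def rebuild(paraphrased_sentences, sentences_per_paragraph):
    # Walk the flat list of outputs, slicing off each paragraph's share.
    humanized_paragraphs = []
    index = 0
    for count in sentences_per_paragraph:
        humanized_paragraphs.append(" ".join(paraphrased_sentences[index : index + count]))
        index += count
    return "\n".join(humanized_paragraphs)

# Toy check: two paragraphs that contributed 2 and 1 sentences.
assert rebuild(["A.", "B.", "C."], [2, 1]) == "A. B.\nC."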