eljanmahammadli committed
Commit a54c1ef · 1 Parent(s): b72ef7f

speed up humanizer: batch generation

Files changed (2)
  1. app.py +3 -1
  2. humanize.py +67 -59
app.py CHANGED
@@ -35,6 +35,9 @@ tokenizers = {
     "Polygraf AI (Advanced Model)": AutoTokenizer.from_pretrained("polygraf-ai/bc_combined_3sent"),
 }
 
+# grammar correction tool
+tool = language_tool_python.LanguageTool("en-US")
+
 
 # Function to move model to the appropriate device
 def to_device(model):
@@ -99,7 +102,6 @@ def ends_with_references(text):
 
 
 def format_and_correct_language_check(text: str) -> str:
-    tool = language_tool_python.LanguageTool("en-US")
     return tool.correct(text)
 
 
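The app.py change is a small but real speedup on its own: language_tool_python.LanguageTool starts a local LanguageTool server when constructed, and the old code constructed it inside format_and_correct_language_check, paying that startup cost on every call. Hoisting the instance to module scope pays it once at import. A minimal standalone sketch of the pattern (the _old suffix is illustrative, not from the commit):

import language_tool_python

# Before: a fresh LanguageTool (and its backing server) was created per call.
def format_and_correct_language_check_old(text: str) -> str:
    tool = language_tool_python.LanguageTool("en-US")
    return tool.correct(text)

# After: one module-level instance, reused by every call.
tool = language_tool_python.LanguageTool("en-US")

def format_and_correct_language_check(text: str) -> str:
    return tool.correct(text)
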
humanize.py CHANGED
@@ -1,8 +1,10 @@
+import gc
 import torch
 from nltk import sent_tokenize
 import nltk
 from tqdm import tqdm
 import gradio as gr
+from peft import PeftModel
 from transformers import T5ForConditionalGeneration, T5Tokenizer
 
 nltk.download("punkt")
@@ -18,35 +20,46 @@ else:
     print("CUDA is not available. Using CPU instead.")
     device = torch.device("cpu")
 
+batch_size = 64
 
 # Configuration for models and their adapters
 model_config = {
     "Base Model": "polygraf-ai/poly-humanizer-base",
     "Large Model": "polygraf-ai/poly-humanizer-large",
-    "XL Model": {
-        "path": "google/flan-t5-xl",
-        "adapters": {
-            "XL Model Adapter": "polygraf-ai/poly-humanizer-XL-adapter",
-            # "XL Law Model Adapter": "polygraf-ai/poly-humanizer-XL-law-adapter",
-            # "XL Marketing Model Adapter": "polygraf-ai/marketing-cleaned-13K-grad-acum-4-full",
-            # "XL Child Style Model Adapter": "polygraf-ai/poly-humanizer-XL-children-adapter-checkpoint-4000",
-        },
-    },
+    "XL Model": "polygraf-ai/poly-humanizer-XL-adapter",
 }
 
 # cache the base models, tokenizers, and adapters
+# initialize model and tokenizer
 models, tokenizers = {}, {}
-for name, config in model_config.items():
-    path = config if isinstance(config, str) else config["path"]
-    # initialize model and tokenizer
-    model = T5ForConditionalGeneration.from_pretrained(path, torch_dtype=torch.bfloat16).to(device)
-    models[name] = model
-    tokenizers[name] = T5Tokenizer.from_pretrained(path)
-    # load all avalable adapters, each being additional roughly 150M parameters
-    if isinstance(config, dict) and "adapters" in config:
-        for adapter_name, adapter_path in config["adapters"].items():
-            model.load_adapter(adapter_path, adapter_name=adapter_name)
-            print(f"Loaded adapter: {adapter_name}, Num. params: {model.num_parameters()}")
+for name, path in model_config.items():
+    if name == "XL Model":
+        model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", torch_dtype=torch.bfloat16).to(device)
+        model = PeftModel.from_pretrained(model, path, torch_dtype=torch.bfloat16, is_trainable=False)
+        model = model.merge_and_unload()
+        models[name] = model
+        tokenizers[name] = T5Tokenizer.from_pretrained("google/flan-t5-xl")
+    else:
+        model = T5ForConditionalGeneration.from_pretrained(path, torch_dtype=torch.bfloat16).to(device)
+        models[name] = model
+        tokenizers[name] = T5Tokenizer.from_pretrained(path)
+    print(f"Loaded model: {name}, Num. params: {model.num_parameters()}")
+
+
+def paraphrase_sentences(model, tokenizer, sentences, temperature, repetition_penalty, top_k, length_penalty):
+    inputs = ["Please paraphrase this sentence: " + sentence for sentence in sentences]
+    inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(model.device)
+    outputs = model.generate(
+        **inputs,
+        do_sample=True,
+        temperature=temperature,
+        repetition_penalty=repetition_penalty,
+        max_length=128,
+        top_k=top_k,
+        length_penalty=length_penalty,
+    )
+    answers = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
+    return answers
 
 
 def paraphrase_text(
@@ -58,51 +71,46 @@ def paraphrase_text(
     top_k=50,
     length_penalty=1.0,
 ):
+    """
+    Optimization here is to feed all sentences at once to the model.
+    Paragraphs are stored as a number of sentences per paragraph.
+    """
     progress(0, desc="Starting to Humanize")
     progress(0.05)
-    # select the model, tokenizer and adapter
-    if "XL" in model_name:  # dynamic adapter load/unload for XL models
-        # all adapter models use the XL model as the base
-        tokenizer, model = tokenizers["XL Model"], models["XL Model"]
-        # set the adapter if it's not already set
-        if model.active_adapters() != [f"{model_name} Adapter"]:
-            model.set_adapter(f"{model_name} Adapter")
-            print(f"Using adapter: {model_name} Adapter")
-    else:
-        tokenizer = tokenizers[model_name]
-        model = models[model_name]
+    # Select the model, tokenizer, and adapter
+    tokenizer = tokenizers[model_name]
+    model = models[model_name].to(device)
 
-    # Split the text into paragraphs
+    # Split the text into paragraphs and then into sentences
     paragraphs = text.split("\n")
-    humanized_paragraphs = []
+    all_sentences = []
+    sentences_per_paragraph = []
 
-    for paragraph in progress.tqdm(paragraphs, desc="Humanizing"):
-        # paraphrase each chunk of text
+    for paragraph in paragraphs:
         sentences = sent_tokenize(paragraph)
-        paraphrases = []
-        for sentence in sentences:
-            sentence = sentence.strip()
-            if len(sentence) == 0:
-                continue
-            inputs = tokenizer(
-                "Please paraphrase this sentence: " + sentence,
-                return_tensors="pt",
-            ).to(device)
-            outputs = model.generate(
-                **inputs,
-                do_sample=True,
-                temperature=temperature,
-                repetition_penalty=repetition_penalty,
-                max_length=128,
-                top_k=top_k,
-                length_penalty=length_penalty,
-            )
-            paraphrased_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
-            paraphrases.append(paraphrased_sentence)
-            print(f"\nOriginal: {sentence}")
-            print(f"Paraphrased: {paraphrased_sentence}")
-        combined_paraphrase = " ".join(paraphrases)
-        humanized_paragraphs.append(combined_paraphrase)
+        sentences_per_paragraph.append(len(sentences))
+        all_sentences.extend(sentences)
+
+    # Process all sentences in batches
+    paraphrased_sentences = []
+    for i in range(0, len(all_sentences), batch_size):
+        batch_sentences = all_sentences[i : i + batch_size]
+        paraphrased_batch = paraphrase_sentences(
+            model, tokenizer, batch_sentences, temperature, repetition_penalty, top_k, length_penalty
+        )
+        paraphrased_sentences.extend(paraphrased_batch)
+
+    # Clear memory
+    torch.cuda.empty_cache()
+    gc.collect()
+
+    # Reconstruct paragraphs
+    humanized_paragraphs = []
+    sentence_index = 0
+    for num_sentences in sentences_per_paragraph:
+        humanized_paragraph = " ".join(paraphrased_sentences[sentence_index : sentence_index + num_sentences])
+        humanized_paragraphs.append(humanized_paragraph)
+        sentence_index += num_sentences
 
     humanized_text = "\n".join(humanized_paragraphs)
     return humanized_text
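
The heart of the speedup is the flatten → batch → rebuild pattern in paraphrase_text: per-paragraph sentence counts are recorded before flattening, the flat list is fed to the model batch_size sentences at a time, and the counts are then used as offsets to restore paragraph boundaries. A self-contained sketch of just that bookkeeping, with a stand-in paraphrase_batch callable (the helper name and toy check are illustrative, not the commit's API):

import nltk
from nltk import sent_tokenize

nltk.download("punkt", quiet=True)

def flatten_batch_rebuild(text, paraphrase_batch, batch_size=64):
    paragraphs = text.split("\n")
    all_sentences, counts = [], []
    for paragraph in paragraphs:
        sentences = sent_tokenize(paragraph)
        counts.append(len(sentences))      # remember each paragraph's share
        all_sentences.extend(sentences)

    out = []                               # one model call per batch, not per sentence
    for i in range(0, len(all_sentences), batch_size):
        out.extend(paraphrase_batch(all_sentences[i : i + batch_size]))

    rebuilt, idx = [], 0                   # slice outputs back into paragraphs
    for n in counts:
        rebuilt.append(" ".join(out[idx : idx + n]))
        idx += n
    return "\n".join(rebuilt)

# Round-trip check with an identity "model": paragraph boundaries survive.
assert flatten_batch_rebuild("One. Two.\nThree.", lambda xs: xs) == "One. Two.\nThree."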
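One aside on paraphrase_sentences: the per-row decode loop is interchangeable with the tokenizer's built-in batch helper, which does the same thing in a single call:

# Equivalent to: [tokenizer.decode(o, skip_special_tokens=True) for o in outputs]
answers = tokenizer.batch_decode(outputs, skip_special_tokens=True)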
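The loading side changed as well: instead of keeping the XL adapter swappable at request time (the removed active_adapters()/set_adapter() branch), the LoRA weights are now folded into the base model once at startup with PEFT's merge_and_unload(), so generation runs on a plain T5ForConditionalGeneration with no adapter dispatch in the forward pass. The trade-off is that the commented-out law/marketing/children adapters can no longer be hot-swapped. The pattern, restated standalone (model and adapter IDs copied from the diff):

import torch
from peft import PeftModel
from transformers import T5ForConditionalGeneration

# Attach the LoRA adapter for inference only, then fold its deltas into the
# base weights and drop the PEFT wrapper; the result is an ordinary T5 model.
base = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", torch_dtype=torch.bfloat16)
peft_model = PeftModel.from_pretrained(base, "polygraf-ai/poly-humanizer-XL-adapter", is_trainable=False)
model = peft_model.merge_and_unload()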