minko186 committed
Commit 9d1ac35 · verified · 1 Parent(s): b900928

Create humanize.py

Files changed (1):
  1. humanize.py +93 -0
humanize.py ADDED
@@ -0,0 +1,93 @@
+ import torch
+ from nltk import sent_tokenize
+ import nltk
+ from tqdm import tqdm
+ from transformers import T5ForConditionalGeneration, T5Tokenizer
+
+ nltk.download("punkt")
+ # autodetect the available device
+ GPU_IDX = 1  # which GPU to use
+ if torch.cuda.is_available():
+     num_gpus = torch.cuda.device_count()
+     print(f"Number of available GPUs: {num_gpus}")
+     assert GPU_IDX < num_gpus, f"GPU index {GPU_IDX} not available."
+     device = torch.device(f"cuda:{GPU_IDX}")
+     print(f"Using GPU: {GPU_IDX}")
+ else:
+     print("CUDA is not available. Using CPU instead.")
+     device = torch.device("cpu")
+
+ # Configuration for models and their adapters
+ model_config = {
+     "Base Model": "polygraf-ai/poly-humanizer-base",
+     "Large Model": "polygraf-ai/poly-humanizer-large",
+     "XL Model": {
+         "path": "google/flan-t5-xl",
+         "adapters": {
+             "XL Model Adapter": "polygraf-ai/poly-humanizer-XL-adapter",
+             "XL Law Model Adapter": "polygraf-ai/poly-humanizer-XL-law-adapter",
+             "XL Marketing Model Adapter": "polygraf-ai/marketing-cleaned-13K-grad-acum-4-full",
+             "XL Child Style Model Adapter": "polygraf-ai/poly-humanizer-XL-children-adapter-checkpoint-4000",
+         },
+     },
+ }
+
+ # cache the base models, tokenizers, and adapters
+ models, tokenizers = {}, {}
+ for name, config in model_config.items():
+     path = config if isinstance(config, str) else config["path"]
+     # initialize model and tokenizer
+     model = T5ForConditionalGeneration.from_pretrained(path, torch_dtype=torch.bfloat16).to(device)
+     models[name] = model
+     tokenizers[name] = T5Tokenizer.from_pretrained(path)
+     # load all available adapters; each adds roughly 150M parameters
+     if isinstance(config, dict) and "adapters" in config:
+         for adapter_name, adapter_path in config["adapters"].items():
+             model.load_adapter(adapter_path, adapter_name=adapter_name)
+             print(f"Loaded adapter: {adapter_name}, Num. params: {model.num_parameters()}")
+
+
+ def paraphrase_text(
+     text,
+     model_name="Base Model",
+     temperature=1.2,
+     repetition_penalty=1.0,
+     top_k=50,
+     length_penalty=1.0,
+ ):
+     # select the model, tokenizer, and adapter
+     if "XL" in model_name:  # dynamic adapter load/unload for XL models
+         # all adapter models use the XL model as the base
+         tokenizer, model = tokenizers["XL Model"], models["XL Model"]
+         # set the adapter if it's not already active
+         if model.active_adapters() != [f"{model_name} Adapter"]:
+             model.set_adapter(f"{model_name} Adapter")
+             print(f"Using adapter: {model_name} Adapter")
+     else:
+         tokenizer = tokenizers[model_name]
+         model = models[model_name]
+
+     # paraphrase the text sentence by sentence
+     sentences = sent_tokenize(text)  # sentence boundary detection
+     paraphrases = []
+     for sentence in tqdm(sentences):
+         sentence = sentence.strip()
+         if len(sentence) == 0:
+             continue
+         inputs = tokenizer("Please paraphrase this sentence: " + sentence, return_tensors="pt").to(device)
+         outputs = model.generate(
+             **inputs,
+             do_sample=True,
+             temperature=temperature,
+             repetition_penalty=repetition_penalty,
+             max_length=128,
+             top_k=top_k,
+             length_penalty=length_penalty,
+         )
+         paraphrased_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
+         paraphrases.append(paraphrased_sentence)
+         print(f"\nOriginal: {sentence}")
+         print(f"Paraphrased: {paraphrased_sentence}")
+
+     combined_paraphrase = " ".join(paraphrases)
+     return combined_paraphrase
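
For reference, a minimal usage sketch (not part of the commit): it assumes the script above has already run, so that paraphrase_text and the cached models are in scope. The sample text and sampling parameters below are illustrative only.

if __name__ == "__main__":
    sample = (
        "Large language models often produce stiff, formulaic prose. "
        "Paraphrasing sentence by sentence can make it read more naturally."
    )
    # "Base Model" uses the smallest checkpoint; passing e.g. "XL Law Model"
    # would instead route through the shared flan-t5-xl weights and activate
    # the corresponding adapter via set_adapter().
    result = paraphrase_text(
        sample,
        model_name="Base Model",
        temperature=1.2,
        top_k=50,
    )
    print(result)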