minko186 committed
Commit 42f072b · verified · 1 Parent(s): 686033f

Update humanize.py

Files changed (1)
  1. humanize.py +86 -86
humanize.py CHANGED
@@ -1,93 +1,93 @@
- # import torch
- # from nltk import sent_tokenize
- # import nltk
- # from tqdm import tqdm
- # from transformers import T5ForConditionalGeneration, T5Tokenizer

- # nltk.download("punkt")
- # # autodetect the available device
- # GPU_IDX = 1  # which GPU to use
- # if torch.cuda.is_available():
- #     num_gpus = torch.cuda.device_count()
- #     print(f"Number of available GPUs: {num_gpus}")
- #     assert GPU_IDX < num_gpus, f"GPU index {GPU_IDX} not available."
- #     device = torch.device(f"cuda:{GPU_IDX}")
- #     print(f"Using GPU: {GPU_IDX}")
- # else:
- #     print("CUDA is not available. Using CPU instead.")
- #     device = torch.device("cpu")

- # # Configuration for models and their adapters
- # model_config = {
- #     "Base Model": "polygraf-ai/poly-humanizer-base",
- #     "Large Model": "polygraf-ai/poly-humanizer-large",
- #     # "XL Model": {
- #     #     "path": "google/flan-t5-xl",
- #     #     "adapters": {
- #     #         "XL Model Adapter": "polygraf-ai/poly-humanizer-XL-adapter",
- #     #         "XL Law Model Adapter": "polygraf-ai/poly-humanizer-XL-law-adapter",
- #     #         "XL Marketing Model Adapter": "polygraf-ai/marketing-cleaned-13K-grad-acum-4-full",
- #     #         "XL Child Style Model Adapter": "polygraf-ai/poly-humanizer-XL-children-adapter-checkpoint-4000",
- #     #     },
- #     # },
- # }

- # # cache the base models, tokenizers, and adapters
- # models, tokenizers = {}, {}
- # for name, config in model_config.items():
- #     path = config if isinstance(config, str) else config["path"]
- #     # initialize model and tokenizer
- #     model = T5ForConditionalGeneration.from_pretrained(path, torch_dtype=torch.bfloat16).to(device)
- #     models[name] = model
- #     tokenizers[name] = T5Tokenizer.from_pretrained(path)
- #     # load all available adapters, each adding roughly 150M parameters
- #     if isinstance(config, dict) and "adapters" in config:
- #         for adapter_name, adapter_path in config["adapters"].items():
- #             model.load_adapter(adapter_path, adapter_name=adapter_name)
- #             print(f"Loaded adapter: {adapter_name}, Num. params: {model.num_parameters()}")


- # def paraphrase_text(
- #     text,
- #     model_name="Base Model",
- #     temperature=1.2,
- #     repetition_penalty=1.0,
- #     top_k=50,
- #     length_penalty=1.0,
- # ):
- #     # select the model, tokenizer, and adapter
- #     if "XL" in model_name:  # dynamic adapter load/unload for XL models
- #         # all adapter models use the XL model as the base
- #         tokenizer, model = tokenizers["XL Model"], models["XL Model"]
- #         # set the adapter if it's not already set
- #         if model.active_adapters() != [f"{model_name} Adapter"]:
- #             model.set_adapter(f"{model_name} Adapter")
- #             print(f"Using adapter: {model_name} Adapter")
- #     else:
- #         tokenizer = tokenizers[model_name]
- #         model = models[model_name]

- #     # paraphrase the text sentence by sentence
- #     sentences = sent_tokenize(text)  # sentence boundary detection
- #     paraphrases = []
- #     for sentence in tqdm(sentences):
- #         sentence = sentence.strip()
- #         if len(sentence) == 0:
- #             continue
- #         inputs = tokenizer("Please paraphrase this sentence: " + sentence, return_tensors="pt").to(device)
- #         outputs = model.generate(
- #             **inputs,
- #             do_sample=True,
- #             temperature=temperature,
- #             repetition_penalty=repetition_penalty,
- #             max_length=128,
- #             top_k=top_k,
- #             length_penalty=length_penalty,
- #         )
- #         paraphrased_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
- #         paraphrases.append(paraphrased_sentence)
- #         print(f"\nOriginal: {sentence}")
- #         print(f"Paraphrased: {paraphrased_sentence}")

- #     combined_paraphrase = " ".join(paraphrases)
- #     return combined_paraphrase
+ import torch
+ from nltk import sent_tokenize
+ import nltk
+ from tqdm import tqdm
+ from transformers import T5ForConditionalGeneration, T5Tokenizer

+ nltk.download("punkt")
+ # autodetect the available device
+ GPU_IDX = 1  # which GPU to use
+ if torch.cuda.is_available():
+     num_gpus = torch.cuda.device_count()
+     print(f"Number of available GPUs: {num_gpus}")
+     assert GPU_IDX < num_gpus, f"GPU index {GPU_IDX} not available."
+     device = torch.device(f"cuda:{GPU_IDX}")
+     print(f"Using GPU: {GPU_IDX}")
+ else:
+     print("CUDA is not available. Using CPU instead.")
+     device = torch.device("cpu")

+ # Configuration for models and their adapters
+ model_config = {
+     "Base Model": "polygraf-ai/poly-humanizer-base",
+     "Large Model": "polygraf-ai/poly-humanizer-large",
+     "XL Model": {
+         "path": "google/flan-t5-xl",
+         "adapters": {
+             "XL Model Adapter": "polygraf-ai/poly-humanizer-XL-adapter",
+             "XL Law Model Adapter": "polygraf-ai/poly-humanizer-XL-law-adapter",
+             "XL Marketing Model Adapter": "polygraf-ai/marketing-cleaned-13K-grad-acum-4-full",
+             "XL Child Style Model Adapter": "polygraf-ai/poly-humanizer-XL-children-adapter-checkpoint-4000",
+         },
+     },
+ }

+ # cache the base models, tokenizers, and adapters
+ models, tokenizers = {}, {}
+ for name, config in model_config.items():
+     path = config if isinstance(config, str) else config["path"]
+     # initialize model and tokenizer
+     model = T5ForConditionalGeneration.from_pretrained(path, torch_dtype=torch.bfloat16).to(device)
+     models[name] = model
+     tokenizers[name] = T5Tokenizer.from_pretrained(path)
+     # load all available adapters, each adding roughly 150M parameters
+     if isinstance(config, dict) and "adapters" in config:
+         for adapter_name, adapter_path in config["adapters"].items():
+             model.load_adapter(adapter_path, adapter_name=adapter_name)
+             print(f"Loaded adapter: {adapter_name}, Num. params: {model.num_parameters()}")


+ def paraphrase_text(
+     text,
+     model_name="Base Model",
+     temperature=1.2,
+     repetition_penalty=1.0,
+     top_k=50,
+     length_penalty=1.0,
+ ):
+     # select the model, tokenizer, and adapter
+     if "XL" in model_name:  # dynamic adapter load/unload for XL models
+         # all adapter models use the XL model as the base
+         tokenizer, model = tokenizers["XL Model"], models["XL Model"]
+         # set the adapter if it's not already set
+         if model.active_adapters() != [f"{model_name} Adapter"]:
+             model.set_adapter(f"{model_name} Adapter")
+             print(f"Using adapter: {model_name} Adapter")
+     else:
+         tokenizer = tokenizers[model_name]
+         model = models[model_name]

+     # paraphrase the text sentence by sentence
+     sentences = sent_tokenize(text)  # sentence boundary detection
+     paraphrases = []
+     for sentence in tqdm(sentences):
+         sentence = sentence.strip()
+         if len(sentence) == 0:
+             continue
+         inputs = tokenizer("Please paraphrase this sentence: " + sentence, return_tensors="pt").to(device)
+         outputs = model.generate(
+             **inputs,
+             do_sample=True,
+             temperature=temperature,
+             repetition_penalty=repetition_penalty,
+             max_length=128,
+             top_k=top_k,
+             length_penalty=length_penalty,
+         )
+         paraphrased_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
+         paraphrases.append(paraphrased_sentence)
+         print(f"\nOriginal: {sentence}")
+         print(f"Paraphrased: {paraphrased_sentence}")

+     combined_paraphrase = " ".join(paraphrases)
+     return combined_paraphrase
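
For reference, a minimal usage sketch of the now-active module (not part of the commit; the sample text is illustrative, and note that importing humanize runs the module-level code above, i.e. the nltk download, device selection, and model loading happen at import time):

    from humanize import paraphrase_text

    sample = (
        "Large language models can produce fluent text. "
        "Paraphrasing it sentence by sentence changes the surface form."
    )

    # default path: the cached "Base Model" with the default sampling parameters
    rewritten = paraphrase_text(sample, model_name="Base Model", temperature=1.2)
    print(rewritten)

    # an XL variant routes through the shared flan-t5-xl base and swaps in
    # the matching adapter ("XL Law Model" -> "XL Law Model Adapter")
    legal_rewrite = paraphrase_text(sample, model_name="XL Law Model")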