minko186 committed
Commit 19de1c9 · verified · 1 Parent(s): 7b0b548

Update humanize.py

Files changed (1): humanize.py (+86 -86)
humanize.py CHANGED
@@ -1,93 +1,93 @@
- import torch
- from nltk import sent_tokenize
- import nltk
- from tqdm import tqdm
- from transformers import T5ForConditionalGeneration, T5Tokenizer

- nltk.download("punkt")
- # autodetect the available device
- GPU_IDX = 1  # which GPU to use
- if torch.cuda.is_available():
-     num_gpus = torch.cuda.device_count()
-     print(f"Number of available GPUs: {num_gpus}")
-     assert GPU_IDX < num_gpus, f"GPU index {GPU_IDX} not available."
-     device = torch.device(f"cuda:{GPU_IDX}")
-     print(f"Using GPU: {GPU_IDX}")
- else:
-     print("CUDA is not available. Using CPU instead.")
-     device = torch.device("cpu")

- # Configuration for models and their adapters
- model_config = {
-     "Base Model": "polygraf-ai/poly-humanizer-base",
-     "Large Model": "polygraf-ai/poly-humanizer-large",
-     # "XL Model": {
-     #     "path": "google/flan-t5-xl",
-     #     "adapters": {
-     #         "XL Model Adapter": "polygraf-ai/poly-humanizer-XL-adapter",
-     #         "XL Law Model Adapter": "polygraf-ai/poly-humanizer-XL-law-adapter",
-     #         "XL Marketing Model Adapter": "polygraf-ai/marketing-cleaned-13K-grad-acum-4-full",
-     #         "XL Child Style Model Adapter": "polygraf-ai/poly-humanizer-XL-children-adapter-checkpoint-4000",
-     #     },
-     # },
- }

- # cache the base models, tokenizers, and adapters
- models, tokenizers = {}, {}
- for name, config in model_config.items():
-     path = config if isinstance(config, str) else config["path"]
-     # initialize model and tokenizer
-     model = T5ForConditionalGeneration.from_pretrained(path, torch_dtype=torch.bfloat16).to(device)
-     models[name] = model
-     tokenizers[name] = T5Tokenizer.from_pretrained(path)
-     # load all available adapters, each adding roughly 150M parameters
-     if isinstance(config, dict) and "adapters" in config:
-         for adapter_name, adapter_path in config["adapters"].items():
-             model.load_adapter(adapter_path, adapter_name=adapter_name)
-             print(f"Loaded adapter: {adapter_name}, Num. params: {model.num_parameters()}")


- def paraphrase_text(
-     text,
-     model_name="Base Model",
-     temperature=1.2,
-     repetition_penalty=1.0,
-     top_k=50,
-     length_penalty=1.0,
- ):
-     # select the model, tokenizer and adapter
-     if "XL" in model_name:  # dynamic adapter load/unload for XL models
-         # all adapter models use the XL model as the base
-         tokenizer, model = tokenizers["XL Model"], models["XL Model"]
-         # set the adapter if it's not already set
-         if model.active_adapters() != [f"{model_name} Adapter"]:
-             model.set_adapter(f"{model_name} Adapter")
-             print(f"Using adapter: {model_name} Adapter")
-     else:
-         tokenizer = tokenizers[model_name]
-         model = models[model_name]

-     # paraphrase each chunk of text
-     sentences = sent_tokenize(text)  # sentence boundary detection
-     paraphrases = []
-     for sentence in tqdm(sentences):
-         sentence = sentence.strip()
-         if len(sentence) == 0:
-             continue
-         inputs = tokenizer("Please paraphrase this sentence: " + sentence, return_tensors="pt").to(device)
-         outputs = model.generate(
-             **inputs,
-             do_sample=True,
-             temperature=temperature,
-             repetition_penalty=repetition_penalty,
-             max_length=128,
-             top_k=top_k,
-             length_penalty=length_penalty,
-         )
-         paraphrased_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
-         paraphrases.append(paraphrased_sentence)
-         print(f"\nOriginal: {sentence}")
-         print(f"Paraphrased: {paraphrased_sentence}")

-     combined_paraphrase = " ".join(paraphrases)
-     return combined_paraphrase
 
+ # import torch
+ # from nltk import sent_tokenize
+ # import nltk
+ # from tqdm import tqdm
+ # from transformers import T5ForConditionalGeneration, T5Tokenizer

+ # nltk.download("punkt")
+ # # autodetect the available device
+ # GPU_IDX = 1  # which GPU to use
+ # if torch.cuda.is_available():
+ #     num_gpus = torch.cuda.device_count()
+ #     print(f"Number of available GPUs: {num_gpus}")
+ #     assert GPU_IDX < num_gpus, f"GPU index {GPU_IDX} not available."
+ #     device = torch.device(f"cuda:{GPU_IDX}")
+ #     print(f"Using GPU: {GPU_IDX}")
+ # else:
+ #     print("CUDA is not available. Using CPU instead.")
+ #     device = torch.device("cpu")

+ # # Configuration for models and their adapters
+ # model_config = {
+ #     "Base Model": "polygraf-ai/poly-humanizer-base",
+ #     "Large Model": "polygraf-ai/poly-humanizer-large",
+ #     # "XL Model": {
+ #     #     "path": "google/flan-t5-xl",
+ #     #     "adapters": {
+ #     #         "XL Model Adapter": "polygraf-ai/poly-humanizer-XL-adapter",
+ #     #         "XL Law Model Adapter": "polygraf-ai/poly-humanizer-XL-law-adapter",
+ #     #         "XL Marketing Model Adapter": "polygraf-ai/marketing-cleaned-13K-grad-acum-4-full",
+ #     #         "XL Child Style Model Adapter": "polygraf-ai/poly-humanizer-XL-children-adapter-checkpoint-4000",
+ #     #     },
+ #     # },
+ # }

+ # # cache the base models, tokenizers, and adapters
+ # models, tokenizers = {}, {}
+ # for name, config in model_config.items():
+ #     path = config if isinstance(config, str) else config["path"]
+ #     # initialize model and tokenizer
+ #     model = T5ForConditionalGeneration.from_pretrained(path, torch_dtype=torch.bfloat16).to(device)
+ #     models[name] = model
+ #     tokenizers[name] = T5Tokenizer.from_pretrained(path)
+ #     # load all available adapters, each adding roughly 150M parameters
+ #     if isinstance(config, dict) and "adapters" in config:
+ #         for adapter_name, adapter_path in config["adapters"].items():
+ #             model.load_adapter(adapter_path, adapter_name=adapter_name)
+ #             print(f"Loaded adapter: {adapter_name}, Num. params: {model.num_parameters()}")


+ # def paraphrase_text(
+ #     text,
+ #     model_name="Base Model",
+ #     temperature=1.2,
+ #     repetition_penalty=1.0,
+ #     top_k=50,
+ #     length_penalty=1.0,
+ # ):
+ #     # select the model, tokenizer and adapter
+ #     if "XL" in model_name:  # dynamic adapter load/unload for XL models
+ #         # all adapter models use the XL model as the base
+ #         tokenizer, model = tokenizers["XL Model"], models["XL Model"]
+ #         # set the adapter if it's not already set
+ #         if model.active_adapters() != [f"{model_name} Adapter"]:
+ #             model.set_adapter(f"{model_name} Adapter")
+ #             print(f"Using adapter: {model_name} Adapter")
+ #     else:
+ #         tokenizer = tokenizers[model_name]
+ #         model = models[model_name]

+ #     # paraphrase each chunk of text
+ #     sentences = sent_tokenize(text)  # sentence boundary detection
+ #     paraphrases = []
+ #     for sentence in tqdm(sentences):
+ #         sentence = sentence.strip()
+ #         if len(sentence) == 0:
+ #             continue
+ #         inputs = tokenizer("Please paraphrase this sentence: " + sentence, return_tensors="pt").to(device)
+ #         outputs = model.generate(
+ #             **inputs,
+ #             do_sample=True,
+ #             temperature=temperature,
+ #             repetition_penalty=repetition_penalty,
+ #             max_length=128,
+ #             top_k=top_k,
+ #             length_penalty=length_penalty,
+ #         )
+ #         paraphrased_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
+ #         paraphrases.append(paraphrased_sentence)
+ #         print(f"\nOriginal: {sentence}")
+ #         print(f"Paraphrased: {paraphrased_sentence}")

+ #     combined_paraphrase = " ".join(paraphrases)
+ #     return combined_paraphrase