# article_writer/humanize.py
# import torch
# from nltk import sent_tokenize
# import nltk
# from tqdm import tqdm
# from transformers import T5ForConditionalGeneration, T5Tokenizer
# nltk.download("punkt")
# # autodetect the available device
# GPU_IDX = 1 # which GPU to use
# if torch.cuda.is_available():
#     num_gpus = torch.cuda.device_count()
#     print(f"Number of available GPUs: {num_gpus}")
#     assert GPU_IDX < num_gpus, f"GPU index {GPU_IDX} not available."
#     device = torch.device(f"cuda:{GPU_IDX}")
#     print(f"Using GPU: {GPU_IDX}")
# else:
#     print("CUDA is not available. Using CPU instead.")
#     device = torch.device("cpu")
# # Configuration for models and their adapters
# model_config = {
# "Base Model": "polygraf-ai/poly-humanizer-base",
# "Large Model": "polygraf-ai/poly-humanizer-large",
# # "XL Model": {
# # "path": "google/flan-t5-xl",
# # "adapters": {
# # "XL Model Adapter": "polygraf-ai/poly-humanizer-XL-adapter",
# # "XL Law Model Adapter": "polygraf-ai/poly-humanizer-XL-law-adapter",
# # "XL Marketing Model Adapter": "polygraf-ai/marketing-cleaned-13K-grad-acum-4-full",
# # "XL Child Style Model Adapter": "polygraf-ai/poly-humanizer-XL-children-adapter-checkpoint-4000",
# # },
# # },
# }
# # cache the base models, tokenizers, and adapters
# models, tokenizers = {}, {}
# for name, config in model_config.items():
#     path = config if isinstance(config, str) else config["path"]
#     # initialize model and tokenizer
#     model = T5ForConditionalGeneration.from_pretrained(path, torch_dtype=torch.bfloat16).to(device)
#     models[name] = model
#     tokenizers[name] = T5Tokenizer.from_pretrained(path)
#     # load all available adapters, each adding roughly 150M parameters
#     if isinstance(config, dict) and "adapters" in config:
#         for adapter_name, adapter_path in config["adapters"].items():
#             model.load_adapter(adapter_path, adapter_name=adapter_name)
#             print(f"Loaded adapter: {adapter_name}, Num. params: {model.num_parameters()}")
# def paraphrase_text(
#     text,
#     model_name="Base Model",
#     temperature=1.2,
#     repetition_penalty=1.0,
#     top_k=50,
#     length_penalty=1.0,
# ):
#     # select the model, tokenizer and adapter
#     if "XL" in model_name:  # dynamic adapter load/unload for XL models
#         # all adapter models use the XL model as the base
#         tokenizer, model = tokenizers["XL Model"], models["XL Model"]
#         # set the adapter if it's not already set
#         if model.active_adapters() != [f"{model_name} Adapter"]:
#             model.set_adapter(f"{model_name} Adapter")
#             print(f"Using adapter: {model_name} Adapter")
#     else:
#         tokenizer = tokenizers[model_name]
#         model = models[model_name]
#     # paraphrase each chunk of text
#     sentences = sent_tokenize(text)  # sentence boundary detection
#     paraphrases = []
#     for sentence in tqdm(sentences):
#         sentence = sentence.strip()
#         if len(sentence) == 0:
#             continue
#         inputs = tokenizer("Please paraphrase this sentence: " + sentence, return_tensors="pt").to(device)
#         outputs = model.generate(
#             **inputs,
#             do_sample=True,
#             temperature=temperature,
#             repetition_penalty=repetition_penalty,
#             max_length=128,
#             top_k=top_k,
#             length_penalty=length_penalty,
#         )
#         paraphrased_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
#         paraphrases.append(paraphrased_sentence)
#         print(f"\nOriginal: {sentence}")
#         print(f"Paraphrased: {paraphrased_sentence}")
#     combined_paraphrase = " ".join(paraphrases)
#     return combined_paraphrase