# article_writer/humanize.py
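"""Humanizer backend for the article_writer Gradio app.

Loads the polygraf-ai T5 paraphraser checkpoints listed in ``model_config``
and exposes ``paraphrase_text``, which rewrites input text sentence by
sentence in batches.
"""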
import gc
import torch
from nltk import sent_tokenize
import nltk
from tqdm import tqdm
import gradio as gr
from transformers import T5ForConditionalGeneration, T5Tokenizer
nltk.download("punkt")
# autodetect the available device
GPU_IDX = 1 # which GPU to use
if torch.cuda.is_available():
num_gpus = torch.cuda.device_count()
print(f"Number of available GPUs: {num_gpus}")
assert GPU_IDX < num_gpus, f"GPU index {GPU_IDX} not available."
device = torch.device(f"cuda:{GPU_IDX}")
print(f"Using GPU: {GPU_IDX}")
else:
print("CUDA is not available. Using CPU instead.")
device = torch.device("cpu")
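# Sentences per generate() call; lower this if you run into GPU out-of-memory errors.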
batch_size = 64
# Configuration for models and their adapters
model_config = {
"Base Model": "polygraf-ai/poly-humanizer-base",
"Large Model": "polygraf-ai/poly-humanizer-large",
"XL Model": "polygraf-ai/poly-humanizer-XL-merged-v2",
}
# Cache the models and tokenizers; the XL checkpoint already has its adapter
# merged into the base weights, so no PeftModel wrapping is required.
models, tokenizers = {}, {}
for name, path in model_config.items():
model = T5ForConditionalGeneration.from_pretrained(path, torch_dtype=torch.bfloat16).to(device)
tokenizers[name] = T5Tokenizer.from_pretrained(path)
models[name] = model
print(f"Loaded model: {name}, Num. params: {model.num_parameters()}")
def paraphrase_sentences(model, tokenizer, sentences, temperature, repetition_penalty, top_k, length_penalty):
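    """Paraphrase a batch of sentences with a single generate() call.

    Each sentence is wrapped in a "Please paraphrase this sentence:" prompt,
    the batch is tokenized with padding, and the sampled outputs are decoded
    back to plain text, one paraphrase per input sentence.
    """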
inputs = ["Please paraphrase this sentence: " + sentence for sentence in sentences]
inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(model.device)
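    # Sampling-based decoding; the knobs below trade off diversity vs. faithfulness.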
outputs = model.generate(
**inputs,
do_sample=True,
temperature=temperature,
repetition_penalty=repetition_penalty,
max_length=128,
top_k=top_k,
length_penalty=length_penalty,
)
answers = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
return answers
def paraphrase_text(
text,
progress=gr.Progress(),
model_name="Base Model",
temperature=1.2,
repetition_penalty=1.0,
top_k=50,
length_penalty=1.0,
):
"""
Optimization here is to feed all sentences at once to the model.
Paragraphs are stored as a number of sentences per paragraph.
"""
progress(0, desc="Starting to Humanize")
    # Look up the requested model and tokenizer
tokenizer = tokenizers[model_name]
model = models[model_name].to(device)
# Split the text into paragraphs and then into sentences
paragraphs = text.split("\n")
all_sentences = []
sentences_per_paragraph = []
for paragraph in paragraphs:
sentences = sent_tokenize(paragraph)
sentences_per_paragraph.append(len(sentences))
all_sentences.extend(sentences)
# Process all sentences in batches
paraphrased_sentences = []
for i in progress.tqdm(range(0, len(all_sentences), batch_size)):
batch_sentences = all_sentences[i : i + batch_size]
paraphrased_batch = paraphrase_sentences(
model, tokenizer, batch_sentences, temperature, repetition_penalty, top_k, length_penalty
)
paraphrased_sentences.extend(paraphrased_batch)
        # Release cached GPU memory between batches to limit peak usage
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()
# Reconstruct paragraphs
humanized_paragraphs = []
sentence_index = 0
for num_sentences in sentences_per_paragraph:
humanized_paragraph = " ".join(paraphrased_sentences[sentence_index : sentence_index + num_sentences])
humanized_paragraphs.append(humanized_paragraph)
sentence_index += num_sentences
humanized_text = "\n".join(humanized_paragraphs)
return humanized_text
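

# A minimal smoke test, assuming the checkpoints above loaded successfully.
# It exercises paraphrase_sentences() directly because, unlike
# paraphrase_text(), it does not depend on a live Gradio progress context.
if __name__ == "__main__":
    demo_sentences = [
        "The quick brown fox jumps over the lazy dog.",
        "Paraphrasing models can rewrite a sentence in many different styles.",
    ]
    results = paraphrase_sentences(
        models["Base Model"],
        tokenizers["Base Model"],
        demo_sentences,
        temperature=1.2,
        repetition_penalty=1.0,
        top_k=50,
        length_penalty=1.0,
    )
    for original, rewritten in zip(demo_sentences, results):
        print(f"{original}\n -> {rewritten}")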