import gc
import torch
import nltk
from nltk import sent_tokenize
import gradio as gr
from transformers import T5ForConditionalGeneration, T5Tokenizer
import language_tool_python
import re

nltk.download("punkt")

GPU_IDX = 1  # which GPU to use, starts from 0
BATCH_SIZE = 64  # number of sentences to process in one batch

# autodetect the available device
if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    print(f"Number of available GPUs: {num_gpus}")
    assert GPU_IDX < num_gpus, f"GPU index {GPU_IDX} not available."
    device = torch.device(f"cuda:{GPU_IDX}")
    print(f"Using GPU: {GPU_IDX}")
else:
    print("CUDA is not available. Using CPU instead.")
    device = torch.device("cpu")

# ----------------------------
# load encoder-decoder (sequence to sequence) language model
# seq2seq = "polygraf-ai/poly-humanizer-XL-merged-v2"
# seq2seq_model = T5ForConditionalGeneration.from_pretrained(seq2seq, torch_dtype=torch.bfloat16).to(device)
# seq2seq_tokenizer = T5Tokenizer.from_pretrained(seq2seq)
# print(f"Loaded model: {seq2seq}, Num. params: {seq2seq_model.num_parameters()}")
seq2seq_model = None
seq2seq_tokenizer = None
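# NOTE: with the seq2seq model left commented out, the "Standard Model" branch in
# humanize_text receives None for both model and tokenizer and will fail; only the
# decoder-only "Advanced Model (Beta)" path is usable as written.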
# ----------------------------
# load decoder-only (causal) language model
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

# can only use GPU 0 when using unsloth FastLanguageModel
max_seq_length = 2048  # any length can be chosen since RoPE scaling is used
dtype = None  # None for auto-detection; float16 for Tesla T4/V100, bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage
dec_only = "polygraf-ai/phi-3-mini-rank-128"
dec_only_model, dec_only_tokenizer = FastLanguageModel.from_pretrained(
    model_name=dec_only,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    device_map="cuda:0",
)
FastLanguageModel.for_inference(dec_only_model)  # native 2x faster inference
print(f"Loaded model: {dec_only}, Num. params: {dec_only_model.num_parameters()}")


# grammar correction tool
tool = language_tool_python.LanguageTool("en-US")


def format_and_correct_language_check(text: str) -> str:
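    """Apply LanguageTool's suggested grammar and spelling corrections to `text`."""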
    return tool.correct(text)


def extract_citations(text):
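    """Return the numeric IDs of citation markers of the form <N> found in `text`."""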
    citations = re.findall(r"<(\d+)>", text)
    return [int(citation) for citation in citations]


def remove_citations(text):
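    """Strip citation markers such as <3> or [3] from `text`."""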
    text = re.sub(r"<\d+>", "", text)
    text = re.sub(r"[\d+]", "", text)
    return text


def humanize_batch_seq2seq(
    model,
    tokenizer,
    sentences,
    temperature,
    repetition_penalty,
    top_k,
    length_penalty,
):
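    """Paraphrase a batch of sentences in a single generate() call on the seq2seq (T5) model."""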
    inputs = ["Please paraphrase this sentence: " + sentence for sentence in sentences]
    inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(model.device)
    outputs = model.generate(
        **inputs,
        do_sample=True,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        max_length=128,
        top_k=top_k,
        length_penalty=length_penalty,
    )
    answers = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return answers


def humanize_batch_decoder_only(
    model,
    tokenizer,
    sentences,
    temperature,
    repetition_penalty,
    top_k,
    length_penalty,
):
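    """Paraphrase sentences one at a time with the decoder-only chat model, using the phi-3 chat template."""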
    pre_prompt = "As a humanizer model, your task is to rewrite the following sentence to make it more human-like. Return only the paraphrased sentence. \n\n"
    # Construct one chat message per input sentence
    messages_batch = [{"from": "human", "value": f"{pre_prompt}{sentence}"} for sentence in sentences]
    # Initialize the tokenizer with the chat template
    tokenizer = get_chat_template(
        tokenizer,
        chat_template="phi-3",
        mapping={
            "role": "from",
            "content": "value",
            "user": "human",
            "assistant": "gpt",
        },  # ShareGPT style
    )

    # Enable native 2x faster inference
    FastLanguageModel.for_inference(model)
    # Initialize an empty list to store responses
    responses = []
    # Process each message individually
    for message in messages_batch:
        # Apply the chat template to the individual message
        inputs = tokenizer.apply_chat_template(
            [message],  # Wrap the message in a list
            tokenize=True,
            add_generation_prompt=True,  # Must add for generation
            return_tensors="pt",
        ).to("cuda")
        # Generate the response for the individual message
        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=1024,
            use_cache=True,
            do_sample=True,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            top_k=top_k,
            length_penalty=length_penalty,
        )
        # Decode the output and store it
        decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=False)
        responses.append(decoded_output[0])

    # Extract the paraphrased sentence from each raw chat response
    generated_sentences = []
    for idx, response in enumerate(responses):
        generated_sentence = response.split("<|assistant|>")[1].split("<|end|>")[0].strip()
        generated_sentences.append(generated_sentence)
        print(sentences[idx])
        print(generated_sentence)
        print()

    return generated_sentences


def humanize_text(
    text,
    progress=gr.Progress(),
    model_name="Standard Model",
    temperature=1.2,
    repetition_penalty=1.0,
    top_k=50,
    length_penalty=1.0,
):
    """
    Optimization here is to feed all sentences at once to the model.
    Paragraphs are stored as a number of sentences per paragraph.
    """
    progress(0, desc="Starting to Humanize")
    # Map model names to their respective processing functions
    model_map = {
        "Standard Model": humanize_batch_seq2seq,
        "Advanced Model (Beta)": humanize_batch_decoder_only,
    }
    assert model_name in model_map, f"Invalid model name: {model_name}"
    process_function = model_map[model_name]

    # Split the text into paragraphs and then into sentences
    paragraphs = text.split("\n")
    all_sentences = []
    sentences_per_paragraph = []
    citations_per_paragraph = []
    for paragraph in paragraphs:
        citations_per_paragraph.append(extract_citations(paragraph))
        paragraph = remove_citations(paragraph)
        sentences = sent_tokenize(paragraph)
        sentences_per_paragraph.append(len(sentences))
        all_sentences.extend(sentences)

    # Process all sentences in batches
    paraphrased_sentences = []
    current_batch_size = BATCH_SIZE
    i = 0

    while i < len(all_sentences):
        try:
            batch_sentences = all_sentences[i : i + current_batch_size]

            # Call the selected processing function
            paraphrased_batch = process_function(
                (seq2seq_model if model_name == "Standard Model" else dec_only_model),
                (seq2seq_tokenizer if model_name == "Standard Model" else dec_only_tokenizer),
                batch_sentences,
                temperature,
                repetition_penalty,
                top_k,
                length_penalty,
            )

            paraphrased_sentences.extend(paraphrased_batch)
            i += current_batch_size  # Move to the next batch
            torch.cuda.empty_cache()
            gc.collect()
            progress(i / len(all_sentences))

        except RuntimeError as e:
            if "out of memory" in str(e):
                # Reduce the batch size by half and retry
                current_batch_size = max(1, current_batch_size // 2)
                print(f"Out of memory, reducing batch size to {current_batch_size}. Retrying...")
                torch.cuda.empty_cache()
                gc.collect()
            else:
                raise e

    # Reconstruct paragraphs
    humanized_paragraphs = []
    sentence_index = 0
    for num_sentences in sentences_per_paragraph:
        humanized_paragraph = " ".join(paraphrased_sentences[sentence_index : sentence_index + num_sentences])
        humanized_paragraphs.append(humanized_paragraph)
        sentence_index += num_sentences
    for i, paragraph in enumerate(humanized_paragraphs):
        citation_texts = [f"<{cid}>" for cid in citations_per_paragraph[i]]
        humanized_paragraphs[i] = paragraph + " " + "".join(citation_texts)
    humanized_text = "\n\n".join(humanized_paragraphs)
    return humanized_text
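

# ---------------------------------------------------------------------------
# Usage sketch (assumption, not part of the original app wiring): a minimal
# Gradio front end for humanize_text. Component labels and defaults below are
# illustrative; "Standard Model" is omitted because seq2seq_model is None, so
# only the decoder-only model is offered here.
if __name__ == "__main__":

    def run(text, model_name, progress=gr.Progress()):
        # Pass the Gradio-injected progress tracker through to humanize_text.
        return humanize_text(text, progress, model_name=model_name)

    demo = gr.Interface(
        fn=run,
        inputs=[
            gr.Textbox(lines=10, label="Input text"),
            gr.Dropdown(
                choices=["Advanced Model (Beta)"],
                value="Advanced Model (Beta)",
                label="Model",
            ),
        ],
        outputs=gr.Textbox(lines=10, label="Humanized text"),
    )
    demo.queue().launch()  # queue so progress updates stream to the client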