minko186 committed
Commit 42f072b · verified · 1 Parent(s): 686033f

Update humanize.py

Files changed (1)
  1. humanize.py +86 -86
humanize.py CHANGED
@@ -1,93 +1,93 @@
- # import torch
- # from nltk import sent_tokenize
- # import nltk
- # from tqdm import tqdm
- # from transformers import T5ForConditionalGeneration, T5Tokenizer

- # nltk.download("punkt")
- # # autodetect the available device
- # GPU_IDX = 1  # which GPU to use
- # if torch.cuda.is_available():
- #     num_gpus = torch.cuda.device_count()
- #     print(f"Number of available GPUs: {num_gpus}")
- #     assert GPU_IDX < num_gpus, f"GPU index {GPU_IDX} not available."
- #     device = torch.device(f"cuda:{GPU_IDX}")
- #     print(f"Using GPU: {GPU_IDX}")
- # else:
- #     print("CUDA is not available. Using CPU instead.")
- #     device = torch.device("cpu")

- # # Configuration for models and their adapters
- # model_config = {
- #     "Base Model": "polygraf-ai/poly-humanizer-base",
- #     "Large Model": "polygraf-ai/poly-humanizer-large",
- #     # "XL Model": {
- #     #     "path": "google/flan-t5-xl",
- #     #     "adapters": {
- #     #         "XL Model Adapter": "polygraf-ai/poly-humanizer-XL-adapter",
- #     #         "XL Law Model Adapter": "polygraf-ai/poly-humanizer-XL-law-adapter",
- #     #         "XL Marketing Model Adapter": "polygraf-ai/marketing-cleaned-13K-grad-acum-4-full",
- #     #         "XL Child Style Model Adapter": "polygraf-ai/poly-humanizer-XL-children-adapter-checkpoint-4000",
- #     #     },
- #     # },
- # }

- # # cache the base models, tokenizers, and adapters
- # models, tokenizers = {}, {}
- # for name, config in model_config.items():
- #     path = config if isinstance(config, str) else config["path"]
- #     # initialize model and tokenizer
- #     model = T5ForConditionalGeneration.from_pretrained(path, torch_dtype=torch.bfloat16).to(device)
- #     models[name] = model
- #     tokenizers[name] = T5Tokenizer.from_pretrained(path)
- #     # load all available adapters, each adding roughly 150M parameters
- #     if isinstance(config, dict) and "adapters" in config:
- #         for adapter_name, adapter_path in config["adapters"].items():
- #             model.load_adapter(adapter_path, adapter_name=adapter_name)
- #             print(f"Loaded adapter: {adapter_name}, Num. params: {model.num_parameters()}")


- # def paraphrase_text(
- #     text,
- #     model_name="Base Model",
- #     temperature=1.2,
- #     repetition_penalty=1.0,
- #     top_k=50,
- #     length_penalty=1.0,
- # ):
- #     # select the model, tokenizer, and adapter
- #     if "XL" in model_name:  # dynamic adapter load/unload for XL models
- #         # all adapter models use the XL model as the base
- #         tokenizer, model = tokenizers["XL Model"], models["XL Model"]
- #         # set the adapter if it's not already set
- #         if model.active_adapters() != [f"{model_name} Adapter"]:
- #             model.set_adapter(f"{model_name} Adapter")
- #             print(f"Using adapter: {model_name} Adapter")
- #     else:
- #         tokenizer = tokenizers[model_name]
- #         model = models[model_name]

- #     # paraphrase the text sentence by sentence
- #     sentences = sent_tokenize(text)  # sentence boundary detection
- #     paraphrases = []
- #     for sentence in tqdm(sentences):
- #         sentence = sentence.strip()
- #         if len(sentence) == 0:
- #             continue
- #         inputs = tokenizer("Please paraphrase this sentence: " + sentence, return_tensors="pt").to(device)
- #         outputs = model.generate(
- #             **inputs,
- #             do_sample=True,
- #             temperature=temperature,
- #             repetition_penalty=repetition_penalty,
- #             max_length=128,
- #             top_k=top_k,
- #             length_penalty=length_penalty,
- #         )
- #         paraphrased_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
- #         paraphrases.append(paraphrased_sentence)
- #         print(f"\nOriginal: {sentence}")
- #         print(f"Paraphrased: {paraphrased_sentence}")

- #     combined_paraphrase = " ".join(paraphrases)
- #     return combined_paraphrase
+ import torch
+ from nltk import sent_tokenize
+ import nltk
+ from tqdm import tqdm
+ from transformers import T5ForConditionalGeneration, T5Tokenizer

+ nltk.download("punkt")
+ # autodetect the available device
+ GPU_IDX = 1  # which GPU to use
+ if torch.cuda.is_available():
+     num_gpus = torch.cuda.device_count()
+     print(f"Number of available GPUs: {num_gpus}")
+     assert GPU_IDX < num_gpus, f"GPU index {GPU_IDX} not available."
+     device = torch.device(f"cuda:{GPU_IDX}")
+     print(f"Using GPU: {GPU_IDX}")
+ else:
+     print("CUDA is not available. Using CPU instead.")
+     device = torch.device("cpu")

+ # Configuration for models and their adapters
+ model_config = {
+     "Base Model": "polygraf-ai/poly-humanizer-base",
+     "Large Model": "polygraf-ai/poly-humanizer-large",
+     "XL Model": {
+         "path": "google/flan-t5-xl",
+         "adapters": {
+             "XL Model Adapter": "polygraf-ai/poly-humanizer-XL-adapter",
+             "XL Law Model Adapter": "polygraf-ai/poly-humanizer-XL-law-adapter",
+             "XL Marketing Model Adapter": "polygraf-ai/marketing-cleaned-13K-grad-acum-4-full",
+             "XL Child Style Model Adapter": "polygraf-ai/poly-humanizer-XL-children-adapter-checkpoint-4000",
+         },
+     },
+ }

+ # cache the base models, tokenizers, and adapters
+ models, tokenizers = {}, {}
+ for name, config in model_config.items():
+     path = config if isinstance(config, str) else config["path"]
+     # initialize model and tokenizer
+     model = T5ForConditionalGeneration.from_pretrained(path, torch_dtype=torch.bfloat16).to(device)
+     models[name] = model
+     tokenizers[name] = T5Tokenizer.from_pretrained(path)
+     # load all available adapters, each adding roughly 150M parameters
+     if isinstance(config, dict) and "adapters" in config:
+         for adapter_name, adapter_path in config["adapters"].items():
+             model.load_adapter(adapter_path, adapter_name=adapter_name)
+             print(f"Loaded adapter: {adapter_name}, Num. params: {model.num_parameters()}")


+ def paraphrase_text(
+     text,
+     model_name="Base Model",
+     temperature=1.2,
+     repetition_penalty=1.0,
+     top_k=50,
+     length_penalty=1.0,
+ ):
+     # select the model, tokenizer, and adapter
+     if "XL" in model_name:  # dynamic adapter load/unload for XL models
+         # all adapter models use the XL model as the base
+         tokenizer, model = tokenizers["XL Model"], models["XL Model"]
+         # set the adapter if it's not already set
+         if model.active_adapters() != [f"{model_name} Adapter"]:
+             model.set_adapter(f"{model_name} Adapter")
+             print(f"Using adapter: {model_name} Adapter")
+     else:
+         tokenizer = tokenizers[model_name]
+         model = models[model_name]

+     # paraphrase the text sentence by sentence
+     sentences = sent_tokenize(text)  # sentence boundary detection
+     paraphrases = []
+     for sentence in tqdm(sentences):
+         sentence = sentence.strip()
+         if len(sentence) == 0:
+             continue
+         inputs = tokenizer("Please paraphrase this sentence: " + sentence, return_tensors="pt").to(device)
+         outputs = model.generate(
+             **inputs,
+             do_sample=True,
+             temperature=temperature,
+             repetition_penalty=repetition_penalty,
+             max_length=128,
+             top_k=top_k,
+             length_penalty=length_penalty,
+         )
+         paraphrased_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
+         paraphrases.append(paraphrased_sentence)
+         print(f"\nOriginal: {sentence}")
+         print(f"Paraphrased: {paraphrased_sentence}")

+     combined_paraphrase = " ".join(paraphrases)
+     return combined_paraphrase
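
For reference, a minimal usage sketch of the now-active module (not part of the commit; the sample text is illustrative, and note that importing humanize runs the module-level code above, i.e. the nltk download, device selection, and model loading happen at import time):

    from humanize import paraphrase_text

    sample = (
        "Large language models can produce fluent text. "
        "Paraphrasing it sentence by sentence changes the surface form."
    )

    # default path: the cached "Base Model" with the default sampling parameters
    rewritten = paraphrase_text(sample, model_name="Base Model", temperature=1.2)
    print(rewritten)

    # an XL variant routes through the shared flan-t5-xl base and swaps in
    # the matching adapter ("XL Law Model" -> "XL Law Model Adapter")
    legal_rewrite = paraphrase_text(sample, model_name="XL Law Model")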