minko186 committed
Commit 19de1c9 · verified · 1 Parent(s): 7b0b548

Update humanize.py

Files changed (1): humanize.py (+86 -86)
humanize.py CHANGED
@@ -1,93 +1,93 @@
- import torch
- from nltk import sent_tokenize
- import nltk
- from tqdm import tqdm
- from transformers import T5ForConditionalGeneration, T5Tokenizer

- nltk.download("punkt")
- # autodetect the available device
- GPU_IDX = 1  # which GPU to use
- if torch.cuda.is_available():
-     num_gpus = torch.cuda.device_count()
-     print(f"Number of available GPUs: {num_gpus}")
-     assert GPU_IDX < num_gpus, f"GPU index {GPU_IDX} not available."
-     device = torch.device(f"cuda:{GPU_IDX}")
-     print(f"Using GPU: {GPU_IDX}")
- else:
-     print("CUDA is not available. Using CPU instead.")
-     device = torch.device("cpu")

- # Configuration for models and their adapters
- model_config = {
-     "Base Model": "polygraf-ai/poly-humanizer-base",
-     "Large Model": "polygraf-ai/poly-humanizer-large",
-     # "XL Model": {
-     #     "path": "google/flan-t5-xl",
-     #     "adapters": {
-     #         "XL Model Adapter": "polygraf-ai/poly-humanizer-XL-adapter",
-     #         "XL Law Model Adapter": "polygraf-ai/poly-humanizer-XL-law-adapter",
-     #         "XL Marketing Model Adapter": "polygraf-ai/marketing-cleaned-13K-grad-acum-4-full",
-     #         "XL Child Style Model Adapter": "polygraf-ai/poly-humanizer-XL-children-adapter-checkpoint-4000",
-     #     },
-     # },
- }

- # cache the base models, tokenizers, and adapters
- models, tokenizers = {}, {}
- for name, config in model_config.items():
-     path = config if isinstance(config, str) else config["path"]
-     # initialize model and tokenizer
-     model = T5ForConditionalGeneration.from_pretrained(path, torch_dtype=torch.bfloat16).to(device)
-     models[name] = model
-     tokenizers[name] = T5Tokenizer.from_pretrained(path)
-     # load all available adapters, each adding roughly 150M parameters
-     if isinstance(config, dict) and "adapters" in config:
-         for adapter_name, adapter_path in config["adapters"].items():
-             model.load_adapter(adapter_path, adapter_name=adapter_name)
-             print(f"Loaded adapter: {adapter_name}, Num. params: {model.num_parameters()}")


- def paraphrase_text(
-     text,
-     model_name="Base Model",
-     temperature=1.2,
-     repetition_penalty=1.0,
-     top_k=50,
-     length_penalty=1.0,
- ):
-     # select the model, tokenizer and adapter
-     if "XL" in model_name:  # dynamic adapter load/unload for XL models
-         # all adapter models use the XL model as the base
-         tokenizer, model = tokenizers["XL Model"], models["XL Model"]
-         # set the adapter if it's not already set
-         if model.active_adapters() != [f"{model_name} Adapter"]:
-             model.set_adapter(f"{model_name} Adapter")
-             print(f"Using adapter: {model_name} Adapter")
-     else:
-         tokenizer = tokenizers[model_name]
-         model = models[model_name]

-     # paraphrase each chunk of text
-     sentences = sent_tokenize(text)  # sentence boundary detection
-     paraphrases = []
-     for sentence in tqdm(sentences):
-         sentence = sentence.strip()
-         if len(sentence) == 0:
-             continue
-         inputs = tokenizer("Please paraphrase this sentence: " + sentence, return_tensors="pt").to(device)
-         outputs = model.generate(
-             **inputs,
-             do_sample=True,
-             temperature=temperature,
-             repetition_penalty=repetition_penalty,
-             max_length=128,
-             top_k=top_k,
-             length_penalty=length_penalty,
-         )
-         paraphrased_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
-         paraphrases.append(paraphrased_sentence)
-         print(f"\nOriginal: {sentence}")
-         print(f"Paraphrased: {paraphrased_sentence}")

-     combined_paraphrase = " ".join(paraphrases)
-     return combined_paraphrase
 
+ # import torch
+ # from nltk import sent_tokenize
+ # import nltk
+ # from tqdm import tqdm
+ # from transformers import T5ForConditionalGeneration, T5Tokenizer

+ # nltk.download("punkt")
+ # # autodetect the available device
+ # GPU_IDX = 1  # which GPU to use
+ # if torch.cuda.is_available():
+ #     num_gpus = torch.cuda.device_count()
+ #     print(f"Number of available GPUs: {num_gpus}")
+ #     assert GPU_IDX < num_gpus, f"GPU index {GPU_IDX} not available."
+ #     device = torch.device(f"cuda:{GPU_IDX}")
+ #     print(f"Using GPU: {GPU_IDX}")
+ # else:
+ #     print("CUDA is not available. Using CPU instead.")
+ #     device = torch.device("cpu")

+ # # Configuration for models and their adapters
+ # model_config = {
+ #     "Base Model": "polygraf-ai/poly-humanizer-base",
+ #     "Large Model": "polygraf-ai/poly-humanizer-large",
+ #     # "XL Model": {
+ #     #     "path": "google/flan-t5-xl",
+ #     #     "adapters": {
+ #     #         "XL Model Adapter": "polygraf-ai/poly-humanizer-XL-adapter",
+ #     #         "XL Law Model Adapter": "polygraf-ai/poly-humanizer-XL-law-adapter",
+ #     #         "XL Marketing Model Adapter": "polygraf-ai/marketing-cleaned-13K-grad-acum-4-full",
+ #     #         "XL Child Style Model Adapter": "polygraf-ai/poly-humanizer-XL-children-adapter-checkpoint-4000",
+ #     #     },
+ #     # },
+ # }

+ # # cache the base models, tokenizers, and adapters
+ # models, tokenizers = {}, {}
+ # for name, config in model_config.items():
+ #     path = config if isinstance(config, str) else config["path"]
+ #     # initialize model and tokenizer
+ #     model = T5ForConditionalGeneration.from_pretrained(path, torch_dtype=torch.bfloat16).to(device)
+ #     models[name] = model
+ #     tokenizers[name] = T5Tokenizer.from_pretrained(path)
+ #     # load all available adapters, each adding roughly 150M parameters
+ #     if isinstance(config, dict) and "adapters" in config:
+ #         for adapter_name, adapter_path in config["adapters"].items():
+ #             model.load_adapter(adapter_path, adapter_name=adapter_name)
+ #             print(f"Loaded adapter: {adapter_name}, Num. params: {model.num_parameters()}")


+ # def paraphrase_text(
+ #     text,
+ #     model_name="Base Model",
+ #     temperature=1.2,
+ #     repetition_penalty=1.0,
+ #     top_k=50,
+ #     length_penalty=1.0,
+ # ):
+ #     # select the model, tokenizer and adapter
+ #     if "XL" in model_name:  # dynamic adapter load/unload for XL models
+ #         # all adapter models use the XL model as the base
+ #         tokenizer, model = tokenizers["XL Model"], models["XL Model"]
+ #         # set the adapter if it's not already set
+ #         if model.active_adapters() != [f"{model_name} Adapter"]:
+ #             model.set_adapter(f"{model_name} Adapter")
+ #             print(f"Using adapter: {model_name} Adapter")
+ #     else:
+ #         tokenizer = tokenizers[model_name]
+ #         model = models[model_name]

+ #     # paraphrase each chunk of text
+ #     sentences = sent_tokenize(text)  # sentence boundary detection
+ #     paraphrases = []
+ #     for sentence in tqdm(sentences):
+ #         sentence = sentence.strip()
+ #         if len(sentence) == 0:
+ #             continue
+ #         inputs = tokenizer("Please paraphrase this sentence: " + sentence, return_tensors="pt").to(device)
+ #         outputs = model.generate(
+ #             **inputs,
+ #             do_sample=True,
+ #             temperature=temperature,
+ #             repetition_penalty=repetition_penalty,
+ #             max_length=128,
+ #             top_k=top_k,
+ #             length_penalty=length_penalty,
+ #         )
+ #         paraphrased_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
+ #         paraphrases.append(paraphrased_sentence)
+ #         print(f"\nOriginal: {sentence}")
+ #         print(f"Paraphrased: {paraphrased_sentence}")

+ #     combined_paraphrase = " ".join(paraphrases)
+ #     return combined_paraphrase