Data Management Plans
Collection
This collection contains data and models useful for working with Data Management Plans.
•
8 items
•
Updated
PEFT weights for Qwen/Qwen2.5-14B-Instruct, fine-tuned for the task of generating Data Management Plans.
Model loading:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
# Hub identifiers: the base checkpoint and the PEFT adapter applied on top of it.
BASE_MODEL_NAME = 'Qwen/Qwen2.5-14B-Instruct'
PEFT_MODEL_NAME = 'frnka/qwen14b-forward-peft'

# The tokenizer is taken from the base checkpoint, not from the adapter repo.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    # Device placement is delegated to the loader; do not move the model
    # manually afterwards.
    device_map="auto",
    torch_dtype=torch.float16,
    # NOTE(review): the two flags below are forwarded as extra kwargs at load
    # time. Code that only needs the generated token ids does not use attention
    # weights -- confirm they are required before keeping them.
    output_attentions=True,
    return_dict_in_generate=True,
)

# No trailing .cuda(): the base model has already been placed by
# device_map="auto". Forcing it onto a single GPU is redundant when it fits
# there and can warn or fail when the weights were sharded or offloaded.
model = PeftModel.from_pretrained(base_model, PEFT_MODEL_NAME)
And inference:
def message_generic() -> str:
    """Return the generic system prompt for the DMP expert role.

    The prompt asks for a sentence preceding a Data Management Plan snippet.
    It deliberately ends with a trailing space so that callers can append
    further instructions directly (see ``message_specific``).
    """
    # Plain literals: there is nothing to interpolate, so no f-prefix.
    return ("You are Data management plan expert. "
            "Please generate a sentence preceding the following Data Management Plan snippet. ")
def message_specific(topic):
    """Return the generic prompt extended with a hint about *topic*.

    The hint is appended verbatim after the text produced by
    ``message_generic()``, with *topic* wrapped in single quotes.
    """
    topic_hint = f"You may talk about '{topic}'"
    return message_generic() + topic_hint
# Topics the system prompt steers the generation towards: the first one is the
# primary topic, the second is offered as a follow-up.
topic_to_talk_about = "How will the data be stored?"
topic_to_talk_about_2 = "How will the data be backed up?"
# Placeholder DMP text; it is sent as the user turn and printed in front of
# the generated continuation below.
context = "Some part of a DMP that we want to generate the previous sentence for."

messages = [
    {"role": "system",
     "content": "You are Data management plan expert. "
                "Please generate the rest of the data management plan. "
                f"You may talk about '{topic_to_talk_about}'. If the text already talks about it, "
                f"you may then move to other topics such as '{topic_to_talk_about_2}'"},
    {"role": "user", "content": context},
]

with torch.no_grad():
    tokenized = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    )
    # Send the inputs to wherever the model lives instead of hard-coding CUDA.
    device = model.device
    input_ids = tokenized['input_ids'].to(device)
    output = model.generate(
        input_ids,
        attention_mask=tokenized['attention_mask'].to(device),
        max_new_tokens=700,
        num_return_sequences=1,
        do_sample=True,
        temperature=1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        use_cache=True,
    )

# generate() returns either a tensor of token ids or, when
# return_dict_in_generate is in effect, an output object exposing them as
# `.sequences`. Indexing such an object with [0] yields the whole batch, so the
# prompt-length slice would silently come out empty -- normalise first.
sequences = getattr(output, "sequences", output)
# Keep only the newly generated tokens: drop the prompt prefix.
prompt_length = input_ids.shape[1]
answer_ids = sequences[0][prompt_length:]
generated_text = tokenizer.decode(answer_ids, skip_special_tokens=True)
print(context + generated_text)
Base model
Qwen/Qwen2.5-14B