from transformers import TextStreamer
from unsloth import FastLanguageModel

max_seq_length = 2048
dtype = None          # None lets Unsloth auto-detect (float16 on T4/V100, bfloat16 on Ampere+)
load_in_4bit = False  # keep the merged weights in full precision

alpaca_prompt = """Provide a helpful and informative response to the following prompt.

### Prompt:
{}

### Response:
{}"""

prompt = "What is your base model?"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="merged_tinyllama_base_model",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Switch the model into Unsloth's optimized inference mode
FastLanguageModel.for_inference(model)

# Fill the prompt slot; leave the response slot empty for the model to complete.
# Only a device move is needed here: casting the integer token IDs to a float
# dtype would corrupt them.
inputs = tokenizer(
    [alpaca_prompt.format(prompt, "")],
    return_tensors="pt",
).to("cuda")

# Stream decoded tokens to stdout as they are generated
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=2000)
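
# Optional sketch (an addition, not part of the original script): capture the
# full response as a string instead of streaming it, then print only the text
# after the template's "### Response:" marker.
outputs = model.generate(**inputs, max_new_tokens=2000)
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print(response.split("### Response:")[-1].strip())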