Does not run

#3
by sdyy - opened

from transformers import LlamaTokenizer, AutoModelForCausalLM, AutoTokenizer, AutoModel, StopStringCriteria, StoppingCriteriaList
import torch

# Load the tokenizer and model

# Hugging Face Hub repository id of the checkpoint to load.
repo_name = "nvidia/Hymba-1.5B-Base"

# trust_remote_code=True allows transformers to execute the custom Python
# code shipped inside the model repository.
tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_name, trust_remote_code=True)
# Move the model to the CUDA device, then cast its weights to bfloat16.
model = model.cuda().to(torch.bfloat16)

# Chat with Hymba

# Read one line from standard input to use as the prompt.
prompt = input()
# Tokenize the prompt into PyTorch tensors and move them to the CUDA device.
inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
# NOTE(review): do_sample=False makes temperature=0.7 unused; transformers
# emits a UserWarning for this combination (set do_sample=True or drop
# temperature).
# NOTE(review): this is the call that raises "FlashAttention only supports
# Ampere GPUs or newer" in the traceback reported below.
outputs = model.generate(**inputs, max_length=64, do_sample=False, temperature=0.7, use_cache=True)
# Decode only the part of the first output sequence that follows the first
# input_ids.shape[1] positions -- presumably the echoed prompt tokens.
response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

print(f"Model response: {response}")

/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning:
The secret HF_TOKEN does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
warnings.warn(
generation_config.json: 100%
 154/154 [00:00<00:00, 4.57kB/s]
hi
/usr/local/lib/python3.10/dist-packages/transformers/generation/configuration_utils.py:617: UserWarning: do_sample is set to False. However, temperature is set to 0.7 -- this flag is only used in sample-based generation modes. You should set do_sample=True or unset temperature.
warnings.warn(

RuntimeError Traceback (most recent call last)
in <cell line: 14>()
12 prompt = input()
13 inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
---> 14 outputs = model.generate(**inputs, max_length=64, do_sample=False, temperature=0.7, use_cache=True)
15 response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
16

32 frames
/usr/local/lib/python3.10/dist-packages/flash_attn/flash_attn_interface.py in _flash_attn_forward(q, k, v, dropout_p, softmax_scale, causal, window_size_left, window_size_right, softcap, alibi_slopes, return_softmax)
89 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
90 q, k, v = [maybe_contiguous(x) for x in (q, k, v)]
---> 91 out, softmax_lse, S_dmask, rng_state = flash_attn_cuda.fwd(
92 q,
93 k,

RuntimeError: FlashAttention only supports Ampere GPUs or newer.

Environment: Google Colab with a T4 GPU (Turing architecture, which is older than Ampere).

Can it be run without Flash Attention?

Sign up or log in to comment