Using the Accelerate API to train models on multiple GPUs

#28
by ajash - opened

I have installed flash attention & rotary emb:
pip install flash-attn==2.1.1 --no-build-isolation
pip install git+https://github.com/HazyResearch/flash-attention.git@v2.1.1#subdirectory=csrc/rotary

The machine has 4 A100 GPUs. While using the Accelerate API I get the following error:

Traceback (most recent call last):
File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 38, in
from flash_attn.flash_attn_interface import (
File "/home/paperspace/.local/lib/python3.9/site-packages/flash_attn/init.py", line 3, in
from flash_attn.flash_attn_interface import (
File "/home/paperspace/.local/lib/python3.9/site-packages/flash_attn/flash_attn_interface.py", line 7, in
import flash_attn_2_cuda as flash_attn_cuda
ImportError: libtorch_cuda_cpp.so: cannot open shared object file: No such file or directory

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
File "/home/paperspace/DigitalSynapse/models/run_models.py", line 76, in
run_model()
File "/home/paperspace/DigitalSynapse/models/run_models.py", line 66, in run_model
model = rv_model.AmznCausalLMTrainingMultiGPU(
File "/home/paperspace/DigitalSynapse/models/reviews_model.py", line 24, in init
model = AutoModelForCausalLM.from_pretrained(
File "/home/paperspace/.local/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 480, in from_pretrained
model_class = get_class_from_dynamic_module(
File "/home/paperspace/.local/lib/python3.9/site-packages/transformers/dynamic_module_utils.py", line 443, in get_class_from_dynamic_module
return get_class_in_module(class_name, final_module.replace(".py", ""))
File "/home/paperspace/.local/lib/python3.9/site-packages/transformers/dynamic_module_utils.py", line 164, in get_class_in_module
module = importlib.import_module(module_path)
File "/usr/lib/python3.9/importlib/init.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "", line 1030, in _gcd_import
File "", line 1007, in _find_and_load
File "", line 986, in _find_and_load_unlocked
File "", line 680, in _load_unlocked
File "", line 850, in exec_module
File "", line 228, in _call_with_frames_removed
File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 49, in
raise ImportError('Please install Flash Attention: pip install flash-attn --no-build-isolation')
ImportError: Please install Flash Attention: pip install flash-attn --no-build-isolation
Traceback (most recent call last):
File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 38, in
from flash_attn.flash_attn_interface import (
File "/home/paperspace/.local/lib/python3.9/site-packages/flash_attn/init.py", line 3, in
from flash_attn.flash_attn_interface import (
File "/home/paperspace/.local/lib/python3.9/site-packages/flash_attn/flash_attn_interface.py", line 7, in
import flash_attn_2_cuda as flash_attn_cuda
ImportError: libtorch_cuda_cpp.so: cannot open shared object file: No such file or directory

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
File "/home/paperspace/DigitalSynapse/models/run_models.py", line 76, in
run_model()
File "/home/paperspace/DigitalSynapse/models/run_models.py", line 66, in run_model
model = rv_model.AmznCausalLMTrainingMultiGPU(
File "/home/paperspace/DigitalSynapse/models/reviews_model.py", line 24, in init
model = AutoModelForCausalLM.from_pretrained(
File "/home/paperspace/.local/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 480, in from_pretrained
model_class = get_class_from_dynamic_module(
File "/home/paperspace/.local/lib/python3.9/site-packages/transformers/dynamic_module_utils.py", line 443, in get_class_from_dynamic_module
return get_class_in_module(class_name, final_module.replace(".py", ""))
File "/home/paperspace/.local/lib/python3.9/site-packages/transformers/dynamic_module_utils.py", line 164, in get_class_in_module
module = importlib.import_module(module_path)
File "/usr/lib/python3.9/importlib/init.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "", line 1030, in _gcd_import
File "", line 1007, in _find_and_load
File "", line 986, in _find_and_load_unlocked
File "", line 680, in _load_unlocked
File "", line 850, in exec_module
File "", line 228, in _call_with_frames_removed
File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 49, in
raise ImportError('Please install Flash Attention: pip install flash-attn --no-build-isolation')
ImportError: Please install Flash Attention: pip install flash-attn --no-build-isolation
Traceback (most recent call last):
File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 38, in
from flash_attn.flash_attn_interface import (
File "/home/paperspace/.local/lib/python3.9/site-packages/flash_attn/init.py", line 3, in
from flash_attn.flash_attn_interface import (
File "/home/paperspace/.local/lib/python3.9/site-packages/flash_attn/flash_attn_interface.py", line 7, in
import flash_attn_2_cuda as flash_attn_cuda
ImportError: libtorch_cuda_cpp.so: cannot open shared object file: No such file or directory

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
File "/home/paperspace/DigitalSynapse/models/run_models.py", line 76, in
run_model()
File "/home/paperspace/DigitalSynapse/models/run_models.py", line 66, in run_model
model = rv_model.AmznCausalLMTrainingMultiGPU(
File "/home/paperspace/DigitalSynapse/models/reviews_model.py", line 24, in init
model = AutoModelForCausalLM.from_pretrained(
File "/home/paperspace/.local/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 480, in from_pretrained
model_class = get_class_from_dynamic_module(
File "/home/paperspace/.local/lib/python3.9/site-packages/transformers/dynamic_module_utils.py", line 443, in get_class_from_dynamic_module
return get_class_in_module(class_name, final_module.replace(".py", ""))
File "/home/paperspace/.local/lib/python3.9/site-packages/transformers/dynamic_module_utils.py", line 164, in get_class_in_module
module = importlib.import_module(module_path)
File "/usr/lib/python3.9/importlib/init.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "", line 1030, in _gcd_import
File "", line 1007, in _find_and_load
File "", line 986, in _find_and_load_unlocked
File "", line 680, in _load_unlocked
File "", line 850, in exec_module
File "", line 228, in _call_with_frames_removed
File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 49, in
raise ImportError('Please install Flash Attention: pip install flash-attn --no-build-isolation')
ImportError: Please install Flash Attention: pip install flash-attn --no-build-isolation
Traceback (most recent call last):
File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 38, in
from flash_attn.flash_attn_interface import (
File "/home/paperspace/.local/lib/python3.9/site-packages/flash_attn/init.py", line 3, in
from flash_attn.flash_attn_interface import (
File "/home/paperspace/.local/lib/python3.9/site-packages/flash_attn/flash_attn_interface.py", line 7, in
import flash_attn_2_cuda as flash_attn_cuda
ImportError: libtorch_cuda_cpp.so: cannot open shared object file: No such file or directory

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
File "/home/paperspace/DigitalSynapse/models/run_models.py", line 76, in
run_model()
File "/home/paperspace/DigitalSynapse/models/run_models.py", line 66, in run_model
model = rv_model.AmznCausalLMTrainingMultiGPU(
File "/home/paperspace/DigitalSynapse/models/reviews_model.py", line 24, in init
model = AutoModelForCausalLM.from_pretrained(
File "/home/paperspace/.local/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 480, in from_pretrained
model_class = get_class_from_dynamic_module(
File "/home/paperspace/.local/lib/python3.9/site-packages/transformers/dynamic_module_utils.py", line 443, in get_class_from_dynamic_module
return get_class_in_module(class_name, final_module.replace(".py", ""))
File "/home/paperspace/.local/lib/python3.9/site-packages/transformers/dynamic_module_utils.py", line 164, in get_class_in_module
module = importlib.import_module(module_path)
File "/usr/lib/python3.9/importlib/init.py", line 127, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "", line 1030, in _gcd_import
File "", line 1007, in _find_and_load
File "", line 986, in _find_and_load_unlocked
File "", line 680, in _load_unlocked
File "", line 850, in exec_module
File "", line 228, in _call_with_frames_removed
File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 49, in
raise ImportError('Please install Flash Attention: pip install flash-attn --no-build-isolation')
ImportError: Please install Flash Attention: pip install flash-attn --no-build-isolation
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 11162) of binary: /usr/bin/python3.9
Traceback (most recent call last):
File "/home/paperspace/.local/bin/accelerate", line 8, in
sys.exit(main())
File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/commands/accelerate_cli.py", line 45, in main
args.func(args)
File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/commands/launch.py", line 970, in launch_command
multi_gpu_launcher(args)
File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/commands/launch.py", line 646, in multi_gpu_launcher
distrib_run.run(args)
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 134, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

ajash changed discussion title from Using Accelerate API to train the model. to Using Accelerate API to train model multiple GPUs
ajash changed discussion title from Using Accelerate API to train model multiple GPUs to Using the Accelerate API to train models on multiple GPUs
Together org

Hi @ajash, it looks like this is an issue arising from incompatible versions of PyTorch / CUDA / flash_attn. Can you provide more details about your setup? What versions do you have installed?

nvcc --version (cuda version)
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Jun__8_16:49:14_PDT_2022
Cuda compilation tools, release 11.7, V11.7.99
Build cuda_11.7.r11.7/compiler.31442593_0

python3 -c "import torch; print(torch.__version__)" (pytorch version)
2.0.1+cu117

pip freeze | grep flash-attn
flash-attn==2.1.1
flash attention was installed with: pip install flash-attn --no-build-isolation

Together org

OK, the versions seem to be compatible. It might be that installing flash-attn with pip install ... points to a different Python environment, since according to the stack trace it can't find the library. Can you try to install flash attention using python3 -m pip install flash-attn --no-build-isolation?

I uninstalled flash-attn and then installed it again... that seems to have worked. I am now getting another error: RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0!

I have printed out the entire stack trace here:

Traceback (most recent call last):
Traceback (most recent call last):
File "/home/paperspace/DigitalSynapse/models/run_models.py", line 81, in
File "/home/paperspace/DigitalSynapse/models/run_models.py", line 81, in
run_model()run_model()

File "/home/paperspace/DigitalSynapse/models/run_models.py", line 77, in run_model
File "/home/paperspace/DigitalSynapse/models/run_models.py", line 77, in run_model
model.train_model(gradient_accum_steps=args.batch_size,model.train_model(gradient_accum_steps=args.batch_size,

File "/home/paperspace/DigitalSynapse/models/reviews_model.py", line 103, in train_model
File "/home/paperspace/DigitalSynapse/models/reviews_model.py", line 103, in train_model
outputs = model(**batch)
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
outputs = model(**batch)
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1156, in forward
return forward_call(*args, **kwargs)
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1156, in forward
Traceback (most recent call last):
File "/home/paperspace/DigitalSynapse/models/run_models.py", line 81, in
output = self._run_ddp_forward(*inputs, **kwargs)
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1110, in _run_ddp_forward
output = self._run_ddp_forward(*inputs, **kwargs)
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1110, in _run_ddp_forward
run_model()
File "/home/paperspace/DigitalSynapse/models/run_models.py", line 77, in run_model
model.train_model(gradient_accum_steps=args.batch_size,
File "/home/paperspace/DigitalSynapse/models/reviews_model.py", line 103, in train_model
return module_to_run(*inputs[0], **kwargs[0]) # type: ignore[index]
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return module_to_run(*inputs[0], **kwargs[0]) # type: ignore[index]
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
outputs = model(**batch)
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/paperspace/.local/lib/python3.9/site-packages/peft/peft_model.py", line 922, in forward
return forward_call(*args, **kwargs)
File "/home/paperspace/.local/lib/python3.9/site-packages/peft/peft_model.py", line 922, in forward
return forward_call(*args, **kwargs)
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1156, in forward
return self.base_model(
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return self.base_model(
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
output = self._run_ddp_forward(*inputs, **kwargs)
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1110, in _run_ddp_forward
return forward_call(*args, **kwargs)
File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
return forward_call(*args, **kwargs)
File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
return module_to_run(*inputs[0], **kwargs[0]) # type: ignore[index]
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
output = old_forward(*args, **kwargs)
File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 812, in forward
output = old_forward(*args, **kwargs)
File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 812, in forward
outputs = self.model(
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)outputs = self.model(

File "/home/paperspace/.local/lib/python3.9/site-packages/peft/peft_model.py", line 922, in forward
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 656, in forward
return self.base_model(
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 656, in forward
inputs_embeds = self.embed_tokens(input_ids)
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl
inputs_embeds = self.embed_tokens(input_ids)
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl
return forward_call(*args, **kwargs)
File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 812, in forward
result = forward_call(*args, **kwargs)
File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
result = forward_call(*args, **kwargs)
File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/sparse.py", line 162, in forward
output = old_forward(*args, **kwargs)
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/sparse.py", line 162, in forward
outputs = self.model(
return F.embedding( File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl

File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/functional.py", line 2210, in embedding
return F.embedding(
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/functional.py", line 2210, in embedding
return forward_call(*args, **kwargs)
File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 656, in forward
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError : inputs_embeds = self.embed_tokens(input_ids)return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:3 and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)
result = forward_call(*args, **kwargs)
File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/sparse.py", line 162, in forward
return F.embedding(
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/functional.py", line 2210, in embedding
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:2 and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)
/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py:350: UserWarning: operator() profile_node %34 : int[] = prim::profile_ivalue(%32)
does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
kv = repeat_kv(kv, self.num_key_value_groups)
Traceback (most recent call last):
File "/home/paperspace/DigitalSynapse/models/run_models.py", line 81, in
run_model()
File "/home/paperspace/DigitalSynapse/models/run_models.py", line 77, in run_model
model.train_model(gradient_accum_steps=args.batch_size,
File "/home/paperspace/DigitalSynapse/models/reviews_model.py", line 103, in train_model
outputs = model(**batch)
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1156, in forward
output = self._run_ddp_forward(*inputs, **kwargs)
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1110, in _run_ddp_forward
return module_to_run(*inputs[0], **kwargs[0]) # type: ignore[index]
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/paperspace/.local/lib/python3.9/site-packages/peft/peft_model.py", line 922, in forward
return self.base_model(
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 812, in forward
outputs = self.model(
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 687, in forward
layer_outputs = torch.utils.checkpoint.checkpoint(
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/utils/checkpoint.py", line 249, in checkpoint
return CheckpointFunction.apply(function, preserve, *args)
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/autograd/function.py", line 506, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/utils/checkpoint.py", line 107, in forward
outputs = run_function(*args)
File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 683, in custom_forward
return module(*inputs, output_attentions, None)
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 444, in forward
hidden_states = self.input_layernorm(hidden_states)
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 88, in forward
return rmsnorm_func(hidden_states, self.weight, self.variance_epsilon)
File "/home/paperspace/.cache/huggingface/modules/transformers_modules/togethercomputer/LLaMA-2-7B-32K/e6c58dac682e6c33b0e8fa0923ac5c79b76047c6/modeling_flash_llama.py", line 70, in rmsnorm_func
hidden_states = hidden_states * torch.rsqrt(variance + variance_epsilon)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0!
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 5654) of binary: /usr/bin/python3.9
Traceback (most recent call last):
File "/home/paperspace/.local/bin/accelerate", line 8, in
sys.exit(main())
File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/commands/accelerate_cli.py", line 45, in main
args.func(args)
File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/commands/launch.py", line 970, in launch_command
multi_gpu_launcher(args)
File "/home/paperspace/.local/lib/python3.9/site-packages/accelerate/commands/launch.py", line 646, in multi_gpu_launcher
distrib_run.run(args)
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 134, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/paperspace/.local/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

run_models.py FAILED

Failures:
[1]:
time : 2023-09-28_03:41:06
host : psvxc2krd
rank : 1 (local_rank: 1)
exitcode : 1 (pid: 5655)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2023-09-28_03:41:06
host : psvxc2krd
rank : 2 (local_rank: 2)
exitcode : 1 (pid: 5656)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
time : 2023-09-28_03:41:06
host : psvxc2krd
rank : 3 (local_rank: 3)
exitcode : 1 (pid: 5657)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

Root Cause (first observed failure):
[0]:
time : 2023-09-28_03:41:06
host : psvxc2krd
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 5654)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

After looking at the stack trace a bit more, it feels like the layernorm is complaining: https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/blob/08639a72e17836184096ae6a7e2766f2a34c3e36/modeling_flash_llama.py#L444
Is it because of model sharding? Is the output of the same layer sharded across devices, so that it is causing a problem?

Great that the installation now works! Re the new error -- it's hard to say without seeing the code, but I think your hunch is correct. What does your setup look like, and how are you distributing the model to different devices?

My code is very basic:

Start of code

def train_model(self, gradient_accum_steps, model_hub_loc):
    """Train ``self.model`` on ``self.dataset`` using Hugging Face Accelerate.

    Builds a causal-LM dataloader (batch size 1), an AdamW optimizer and a
    linear learning-rate schedule, hands them to ``Accelerator.prepare`` and
    runs a single pass over the dataloader with gradient accumulation.

    Args:
        gradient_accum_steps: Forwarded to ``Accelerator`` as
            ``gradient_accumulation_steps``.
        model_hub_loc: Hub location passed to ``push_to_hub`` after
            training. If falsy, nothing is pushed.

    Returns:
        None. When ``self.is_debug_mode`` is set, the device placement of
        the prepared model is printed and no training is performed.
    """
    # Multi-GPU implementation relies on Accelerate.
    accelerator = Accelerator(gradient_accumulation_steps=gradient_accum_steps)
    device = accelerator.device
    model = self.model
    # NOTE(review): the reported "cuda:N and cuda:0" mismatch suggests the
    # weights are pinned to cuda:0 in every process (e.g. the model was
    # loaded with a device_map). If so, `.to(device)` here will not fix
    # it; load the model onto this process's device instead -- confirm in
    # the code that constructs `self.model`.
    model.train().to(device)
    dataset = self.dataset.with_format("torch")
    dataloader = DataLoader(
        dataset,
        collate_fn=DataCollatorForLanguageModeling(
            self.data_processor.tokenizer, mlm=False
        ),
        batch_size=1,
    )
    # TODO: replace this hard-coded optimizer with self.set_optimizer().
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=10000,
    )
    # Accelerator-specific wrapping of every training object.
    model, optimizer, dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, dataloader, lr_scheduler
    )

    if self.is_debug_mode:
        # Only report where the parameters live; there is no model training.
        self.print_model_device_placement(model)
        return

    for i, batch in enumerate(dataloader):
        # Gradient accumulation is handled by the context manager, so the
        # optimizer/scheduler calls below are issued on every iteration.
        with accelerator.accumulate(model):
            outputs = model(**batch)
            loss = outputs[0]
            # Used instead of loss.backward().
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            if i % 100 == 0:  # Poor man's logging
                print(f"loss: {loss}, steps: {i}")

    if model_hub_loc:
        # The prepared model may be wrapped (e.g. in DistributedDataParallel),
        # which has no `push_to_hub`; unwrap it first. Upload once, from the
        # main process, after every process has finished training.
        accelerator.wait_for_everyone()
        if accelerator.is_main_process:
            accelerator.unwrap_model(model).push_to_hub(model_hub_loc)

I have the sharding info as well. Pasting it below:

module.base_model.model.model.embed_tokens.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.q_proj.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.k_proj.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.v_proj.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.o_proj.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight -> cuda:0
module.base_model.model.model.layers.0.mlp.gate_proj.weight -> cuda:0
module.base_model.model.model.layers.0.mlp.up_proj.weight -> cuda:0
module.base_model.model.model.layers.0.mlp.down_proj.weight -> cuda:0
module.base_model.model.model.layers.0.input_layernorm.weight -> cuda:0
module.base_model.model.model.layers.0.post_attention_layernorm.weight -> cuda:0
module.base_model.model.model.layers.1.self_attn.q_proj.weight -> cuda:0
module.base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight -> cuda:0
module.base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight -> cuda:0
module.base_model.model.model.layers.1.self_attn.k_proj.weight -> cuda:0
module.base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight -> cuda:0
module.base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight -> cuda:0
module.base_model.model.model.layers.1.self_attn.v_proj.weight -> cuda:0
module.base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight -> cuda:0
module.base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight -> cuda:0
module.base_model.model.model.layers.1.self_attn.o_proj.weight -> cuda:0
module.base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight -> cuda:0
module.base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight -> cuda:0module.base_model.model.model.embed_tokens.weight -> cuda:1
module.base_model.model.model.embed_tokens.weight -> cuda:2
module.base_model.model.model.layers.1.mlp.gate_proj.weight -> cuda:0

module.base_model.model.model.layers.0.self_attn.q_proj.weight -> cuda:1module.base_model.model.model.embed_tokens.weight -> cuda:3
module.base_model.model.model.layers.1.mlp.up_proj.weight -> cuda:0module.base_model.model.model.layers.0.self_attn.q_proj.weight -> cuda:2

module.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight -> cuda:1

module.base_model.model.model.layers.1.mlp.down_proj.weight -> cuda:0module.base_model.model.model.layers.0.self_attn.q_proj.weight -> cuda:3
module.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight -> cuda:1module.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight -> cuda:2

module.base_model.model.model.layers.1.input_layernorm.weight -> cuda:0

module.base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight -> cuda:3module.base_model.model.model.layers.0.self_attn.k_proj.weight -> cuda:1module.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight -> cuda:2
module.base_model.model.model.layers.1.post_attention_layernorm.weight -> cuda:0

module.base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight -> cuda:3module.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight -> cuda:1
module.base_model.model.model.layers.0.self_attn.k_proj.weight -> cuda:2module.base_model.model.model.layers.2.self_attn.q_proj.weight -> cuda:0

module.base_model.model.model.layers.0.self_attn.k_proj.weight -> cuda:3module.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight -> cuda:1
module.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight -> cuda:2
module.base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight -> cuda:3
module.base_model.model.model.layers.0.self_attn.v_proj.weight -> cuda:1
module.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight -> cuda:2
module.base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight -> cuda:3
module.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight -> cuda:1
module.base_model.model.model.layers.0.self_attn.v_proj.weight -> cuda:2
module.base_model.model.model.layers.2.self_attn.k_proj.weight -> cuda:0module.base_model.model.model.layers.0.self_attn.v_proj.weight -> cuda:3

module.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight -> cuda:1
module.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight -> cuda:2
module.base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight -> cuda:3
module.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight -> cuda:2module.base_model.model.model.layers.0.self_attn.o_proj.weight -> cuda:1

module.base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight -> cuda:3
module.base_model.model.model.layers.0.self_attn.o_proj.weight -> cuda:2
module.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight -> cuda:1
module.base_model.model.model.layers.2.self_attn.v_proj.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.o_proj.weight -> cuda:3
module.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight -> cuda:2
module.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight -> cuda:1
module.base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight -> cuda:0
module.base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight -> cuda:3
module.base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight -> cuda:2

would love some help...

Sign up or log in to comment