Resolving "RuntimeError: CUDA error: invalid device ordinal"
- Causes of the error and how to fix it
Traceback (most recent call last): File "/share3/home/zhangyuhan/code/myMANet/codes/eval.py", line 70, in <module> evaluate(test_loader, model) File "/share3/home/zhangyuhan/code/myMANet/codes/eval.py", line 57, in evaluate img_pred = model(torch.cat((blur,event_inT),dim=1)) File "/share3/home/zhangyuhan/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl return forward_call(*input, **kwargs) File "/share3/home/zhangyuhan/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 158, in forward inputs, kwargs = self. scatter(inputs, kwargs, self. device_ids) File "/share3/home/zhangyuhan/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 175, in scatter return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) File "/share3/home/zhangyuhan/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 44, in scatter_kwargs inputs = scatter(inputs, target_gpus, dim) if inputs else [] File "/share3/home/zhangyuhan/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 36, in scatter res = scatter_map(inputs) File "/share3/home/zhangyuhan/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 23, in scatter_map return list(zip(*map(scatter_map, obj))) File "/share3/home/zhangyuhan/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 19, in scatter_map return Scatter.apply(target_gpus, None, dim, obj) File "/share3/home/zhangyuhan/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/parallel/_functions.py", line 96, in forward outputs = comm.scatter(input, target_gpus, chunk_sizes, ctx.dim, streams) File "/share3/home/zhangyuhan/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/parallel/comm.py", line 189, in scatter return tuple(torch._C._scatter(tensor, devices, chunk_sizes, 
dim, streams)) RuntimeError: CUDA error: invalid device ordinal CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. For debugging consider passing CUDA_LAUNCH_BLOCKING=1. (vid2e) zhangyuhan@server-04:~/code/myMANet/codes$ python eval.py Evaluating: 0%| | 0/611 [00:00<?, ?it/s] Traceback (most recent call last): File "/share3/home/zhangyuhan/code/myMANet/codes/eval.py", line 70, in <module> evaluate(test_loader, model) File "/share3/home/zhangyuhan/code/myMANet/codes/eval.py", line 57, in evaluate img_pred = model(torch.cat((blur,event_inT),dim=1)) File "/share3/home/zhangyuhan/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl return forward_call(*input, **kwargs) File "/share3/home/zhangyuhan/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 158, in forward inputs, kwargs = self. scatter(inputs, kwargs, self. device_ids) File "/share3/home/zhangyuhan/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 175, in scatter return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) File "/share3/home/zhangyuhan/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 44, in scatter_kwargs inputs = scatter(inputs, target_gpus, dim) if inputs else [] File "/share3/home/zhangyuhan/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 36, in scatter res = scatter_map(inputs) File "/share3/home/zhangyuhan/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 23, in scatter_map return list(zip(*map(scatter_map, obj))) File "/share3/home/zhangyuhan/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 19, in scatter_map return Scatter.apply(target_gpus, None, dim, obj) File 
"/share3/home/zhangyuhan/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/parallel/_functions.py", line 96, in forward outputs = comm.scatter(input, target_gpus, chunk_sizes, ctx.dim, streams) File "/share3/home/zhangyuhan/miniconda3/envs/vid2e/lib/python3.9/site-packages/torch/nn/parallel/comm.py", line 189, in scatter return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams)) RuntimeError: CUDA error: invalid device ordinal CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Error reasons and solutions
Because DataParallel was used during training, and the checkpoint was saved with torch.save using the following code:
def save_checkpoint(epoch, model, optimizer):
    """
    Save a model checkpoint (problematic version).

    NOTE(review): this pickles the *entire* model and optimizer objects.
    When the model is wrapped in nn.DataParallel, the wrapper's device_ids
    are pickled too, which triggers "CUDA error: invalid device ordinal"
    when the checkpoint is later loaded on a machine with fewer/different
    GPUs. Prefer saving state_dicts (see save_checkpoint_dict).

    :param epoch: epoch number
    :param model: model (here the full object is stored — the bug)
    :param optimizer: optimizer (full object, same problem)
    """
    import torch
    state = {'epoch': epoch, 'model': model, 'optimizer': optimizer}
    filename = 'checkpoint_myMANet_blur5.pth.tar'
    torch.save(state, filename)
The code above saves the whole model object directly. Change it to save only the model's parameters instead. Note: model.module.state_dict() must be used, not model.state_dict(), because the model is wrapped in nn.DataParallel. Related link: problems loading a model trained with nn.DataParallel in PyTorch.
change to:
def save_checkpoint_dict(epoch, model, optimizer):
    """
    Save a model checkpoint as state_dicts (fixed version).

    Stores only parameter tensors, so the checkpoint can be loaded on any
    machine regardless of GPU configuration. `model.module` is used because
    the model is assumed to be wrapped in nn.DataParallel — TODO confirm at
    the call site.

    :param epoch: epoch number
    :param model: nn.DataParallel-wrapped model
    :param optimizer: optimizer
    """
    import torch
    state = {'epoch': epoch,
             'model': model.module.state_dict(),
             'optimizer': optimizer.state_dict()}
    filename = 'checkpoint_dict_myMANet_blur5.pth.tar'
    torch.save(state, filename)
Use it as follows:
test.py
# Evaluation setup: load a state-dict checkpoint instead of a pickled model.
os.environ['CUDA_VISIBLE_DEVICES'] = "1"  # choose GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model checkpoint that is to be evaluated.
# map_location='cpu' avoids the "invalid device ordinal" error when the
# checkpoint was written on a machine with a different GPU layout.
checkpoint = r'checkpoint_dict_myMANet_blur5.pth.tar'
checkpoint = torch.load(checkpoint, map_location=torch.device('cpu'))

# Build the model first, then restore its parameters.
# (In the original flattened snippet this line appeared commented out, which
# would leave `model` undefined before load_state_dict — it must run.)
model = MANet_s3(ev_lr_nc=7, in_nc=32, out_nc=1,
                 pca_path=r'./pca_matrix_aniso21_15_x4.pth')
model.load_state_dict(checkpoint['model'])
# The line below is only for checkpoints written with torch.save(model):
# model = checkpoint['model']

model = model.to(device)
# Switch to eval mode
model.eval()
train.py (not verified whether it works)
# Training setup: build model/optimizer, optionally resume from a
# state-dict checkpoint, then move to GPU and wrap in DataParallel.
model = MANet_s3(ev_lr_nc=7, in_nc=32, out_nc=1,
                 pca_path=r'/share3/home/zhangyuhan/code/myMANet/codes/pca_matrix_aniso21_15_x4.pth')
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

if checkpoint is None:
    start_epoch = 0
else:
    # map_location='cpu' so the checkpoint loads regardless of which GPUs
    # were visible when it was saved.
    checkpoint = torch.load(checkpoint, map_location=torch.device('cpu'))
    start_epoch = checkpoint['epoch'] + 1
    print('\nLoaded checkpoint from epoch %d.\n' % start_epoch)
    # Restore parameters into the *bare* model, before the DataParallel
    # wrap below — the checkpoint holds module-level state_dicts.
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    # Old approach (saving whole objects) — do not use:
    # model = checkpoint['model']
    # optimizer = checkpoint['optimizer']

model = model.cuda()
model = torch.nn.DataParallel(model)