pytorch RuntimeError: CUDA error: device-side assert triggered


























I have a notebook on Google Colab that fails with the following error:



---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
93 exception = e
---> 94 raise e
95 finally: cb_handler.on_train_end(exception)

/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
83 xb, yb = cb_handler.on_batch_begin(xb, yb)
---> 84 loss = loss_batch(model, xb, yb, loss_func, opt, cb_handler)
85 if cb_handler.on_batch_end(loss): break

/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in loss_batch(model, xb, yb, loss_func, opt, cb_handler)
24 if opt is not None:
---> 25 loss = cb_handler.on_backward_begin(loss)
26 loss.backward()

/usr/local/lib/python3.6/dist-packages/fastai/callback.py in on_backward_begin(self, loss)
223 for cb in self.callbacks:
--> 224 a = cb.on_backward_begin(**self.state_dict)
225 if a is not None: self.state_dict['last_loss'] = a

/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in on_backward_begin(self, smooth_loss, **kwargs)
266 if self.pbar is not None and hasattr(self.pbar,'child'):
--> 267 self.pbar.child.comment = f'{smooth_loss:.4f}'
268

/usr/local/lib/python3.6/dist-packages/torch/tensor.py in __format__(self, format_spec)
377 if self.dim() == 0:
--> 378 return self.item().__format__(format_spec)
379 return object.__format__(self, format_spec)

RuntimeError: CUDA error: device-side assert triggered

During handling of the above exception, another exception occurred:

RuntimeError Traceback (most recent call last)
<ipython-input-33-dd390b1c8108> in <module>()
----> 1 lr_find(learn)
2 learn.recorder.plot()

/usr/local/lib/python3.6/dist-packages/fastai/train.py in lr_find(learn, start_lr, end_lr, num_it, stop_div, **kwargs)
26 cb = LRFinder(learn, start_lr, end_lr, num_it, stop_div)
27 a = int(np.ceil(num_it/len(learn.data.train_dl)))
---> 28 learn.fit(a, start_lr, callbacks=[cb], **kwargs)
29
30 def to_fp16(learn:Learner, loss_scale:float=512., flat_master:bool=False)->Learner:

/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in fit(self, epochs, lr, wd, callbacks)
160 callbacks = [cb(self) for cb in self.callback_fns] + listify(callbacks)
161 fit(epochs, self.model, self.loss_func, opt=self.opt, data=self.data, metrics=self.metrics,
--> 162 callbacks=self.callbacks+callbacks)
163
164 def create_opt(self, lr:Floats, wd:Floats=0.)->None:

/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in fit(epochs, model, loss_func, opt, data, callbacks, metrics)
93 exception = e
94 raise e
---> 95 finally: cb_handler.on_train_end(exception)
96
97 loss_func_name2activ = {'cross_entropy_loss': partial(F.softmax, dim=1), 'nll_loss': torch.exp, 'poisson_nll_loss': torch.exp,

/usr/local/lib/python3.6/dist-packages/fastai/callback.py in on_train_end(self, exception)
254 def on_train_end(self, exception:Union[bool,Exception])->None:
255 "Handle end of training, `exception` is an `Exception` or False if no exceptions during training."
--> 256 self('train_end', exception=exception)
257
258 class AverageMetric(Callback):

/usr/local/lib/python3.6/dist-packages/fastai/callback.py in __call__(self, cb_name, call_mets, **kwargs)
185 "Call through to all of the `CallbakHandler` functions."
186 if call_mets: [getattr(met, f'on_{cb_name}')(**self.state_dict, **kwargs) for met in self.metrics]
--> 187 return [getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs) for cb in self.callbacks]
188
189 def on_train_begin(self, epochs:int, pbar:PBar, metrics:MetricFuncList)->None:

/usr/local/lib/python3.6/dist-packages/fastai/callback.py in <listcomp>(.0)
185 "Call through to all of the `CallbakHandler` functions."
186 if call_mets: [getattr(met, f'on_{cb_name}')(**self.state_dict, **kwargs) for met in self.metrics]
--> 187 return [getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs) for cb in self.callbacks]
188
189 def on_train_begin(self, epochs:int, pbar:PBar, metrics:MetricFuncList)->None:

/usr/local/lib/python3.6/dist-packages/fastai/callbacks/lr_finder.py in on_train_end(self, **kwargs)
45 # restore the valid_dl we turned of on `__init__`
46 self.data.valid_dl = self.valid_dl
---> 47 self.learn.load('tmp')
48 if hasattr(self.learn.model, 'reset'): self.learn.model.reset()
49 print('LR Finder complete, type {learner_name}.recorder.plot() to see the graph.')

/usr/local/lib/python3.6/dist-packages/fastai/basic_train.py in load(self, name, device)
202 "Load model `name` from `self.model_dir` using `device`, defaulting to `self.data.device`."
203 if device is None: device = self.data.device
--> 204 self.model.load_state_dict(torch.load(self.path/self.model_dir/f'{name}.pth', map_location=device))
205 return self
206

/usr/local/lib/python3.6/dist-packages/torch/serialization.py in load(f, map_location, pickle_module)
356 f = open(f, 'rb')
357 try:
--> 358 return _load(f, map_location, pickle_module)
359 finally:
360 if new_fd:

/usr/local/lib/python3.6/dist-packages/torch/serialization.py in _load(f, map_location, pickle_module)
527 unpickler = pickle_module.Unpickler(f)
528 unpickler.persistent_load = persistent_load
--> 529 result = unpickler.load()
530
531 deserialized_storage_keys = pickle_module.load(f)

/usr/local/lib/python3.6/dist-packages/torch/serialization.py in persistent_load(saved_id)
493 if root_key not in deserialized_objects:
494 deserialized_objects[root_key] = restore_location(
--> 495 data_type(size), location)
496 storage = deserialized_objects[root_key]
497 if view_metadata is not None:

/usr/local/lib/python3.6/dist-packages/torch/serialization.py in restore_location(storage, location)
376 elif isinstance(map_location, torch.device):
377 def restore_location(storage, location):
--> 378 return default_restore_location(storage, str(map_location))
379 else:
380 def restore_location(storage, location):

/usr/local/lib/python3.6/dist-packages/torch/serialization.py in default_restore_location(storage, location)
102 def default_restore_location(storage, location):
103 for _, _, fn in _package_registry:
--> 104 result = fn(storage, location)
105 if result is not None:
106 return result

/usr/local/lib/python3.6/dist-packages/torch/serialization.py in _cuda_deserialize(obj, location)
84 'to an existing device.'.format(
85 device, torch.cuda.device_count()))
---> 86 return obj.cuda(device)
87
88

/usr/local/lib/python3.6/dist-packages/torch/_utils.py in _cuda(self, device, non_blocking, **kwargs)
74 else:
75 new_type = getattr(torch.cuda, self.__class__.__name__)
---> 76 return new_type(self.size()).copy_(self, non_blocking)
77
78

RuntimeError: cuda runtime error (59) : device-side assert triggered at /pytorch/aten/src/THC/generic/THCTensorCopy.cpp:20


There is no information about the real cause. I tried to get a useful stack trace by forcing CUDA launches to run synchronously (as suggested here), using a cell like this:



!export CUDA_LAUNCH_BLOCKING=1


But this does not seem to work; I still get the same error.



Is there another way that works with Google Colab?










      python python-3.x computer-vision pytorch google-colaboratory






asked Nov 12 at 18:58 by bachr
























          1 Answer


















!export FOO=blah is usually not useful to run in a notebook, because ! means "run the following command in a sub-shell", so the effect of the statement is gone by the time the ! returns.
You might have more success storing your Python code in a file and then executing that file in a sub-shell:



          In one cell:



          %%writefile foo.py
          [...your code...]


          In the next cell:



          !export CUDA_LAUNCH_BLOCKING=1; python3 foo.py


          (or s/python3/python2/ if you're writing py2)
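For concreteness, here is a minimal sketch of those two cells, using a hypothetical script name (debug_lr_find.py) and the failing call from the traceback above; the script body is a placeholder for the notebook's own setup code, not something from the original post. The point is that the export and the python3 invocation run in the same sub-shell, so CUDA_LAUNCH_BLOCKING=1 is actually set for the script and the device-side assert should surface at the real failing operation:

In the first cell:

%%writefile debug_lr_find.py
# ... the notebook's imports and data/learn setup go here, unchanged ...
lr_find(learn)  # the call that currently triggers the device-side assert

In the second cell:

!export CUDA_LAUNCH_BLOCKING=1; python3 debug_lr_find.py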






answered Nov 15 at 17:39 by Ami F
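As an editorial aside (not part of the answer above): another option that often works without leaving the notebook is to set the variable from Python before anything touches the GPU, since os.environ modifies the environment of the running kernel process and CUDA reads CUDA_LAUNCH_BLOCKING when it initializes. This only helps if it runs before the first CUDA call, so on an already-running session the runtime has to be restarted first. A minimal sketch:

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # must run before torch touches CUDA in this kernel
# now import torch/fastai and re-run the failing cell; the traceback should point at the real op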
