PyTorch runs much slower than TensorFlow even for a shallow CNN
The input data is EEG data that was converted into PSD (power spectral density). Its shape is [103600, 59, 51], where 103600 is the number of samples, i.e. the total number of samples in one epoch. The data is loaded into memory.
The model I used is very simple; however, training one epoch took 50 s with PyTorch 0.4 on a GTX 1080, whereas inference on the validation set took only 0.3 s.
I also tested this model with TensorFlow 1.2.0, where the time was 8 s. I have no idea whether I did something wrong.
So I ran some tests. This image shows the running time.
enter image description here
where
time_to_tensor is the time for inputs, targets = torch.tensor(inputs, dtype=torch.float), torch.tensor(targets, dtype=torch.int64)
time_to_cuda for inputs, targets = inputs.to(self.device), targets.to(self.device)
time_model for out = self.model(inputs)
time_criterion for loss = self.criterion(out, targets)
time_backward for loss.backward(), and time_step for self.optimizer.step()
It seems that the tensor.to(self.device) (or tensor.cuda()) operation took most of the time. So I tried moving all the data to the GPU at once, using
self.X_train = torch.tensor(self.X_train, dtype=torch.float).to(self.device)
self.y_train = torch.tensor(self.y_train, dtype=torch.int64).to(self.device)
and commented out
inputs, targets = torch.tensor(inputs, dtype=torch.float), torch.tensor(targets, dtype=torch.int64)
inputs, targets = inputs.to(self.device), targets.to(self.device)
However, I got the running times shown in the following image.enter image description here. The time required for an epoch was still about 50 s, but the time attributed to each operation changed. I am confused.
Could someone help me to find the problem? Thanks.
The code is below.
This is the model I used:
import torch
import torch.nn as nn
import torch.nn.functional as F
class TxtCNN4(nn.Module):
    """Three stacked conv stages whose max-pooled outputs are concatenated and classified.

    Every stage is conv -> batch norm -> ReLU with a height-3 kernel along the
    time axis. The first kernel also spans the full ``psd_lenght`` width, so
    the feature-map width is 1 afterwards. Each stage is max-pooled over its
    whole remaining time axis; the three pooled vectors are concatenated,
    passed through dropout and two linear layers, and returned as
    ``n_classes`` raw scores.
    """

    def __init__(self, n_classes=6, time_steps=59, psd_lenght=51, filter_numbers=128):
        super(TxtCNN4, self).__init__()
        self.n_classes = n_classes
        self.time_steps = time_steps
        self.psd_lenght = psd_lenght
        self.filters = filter_numbers

        narrow = self.filters // 2   # channels produced by the first stage
        wide = self.filters          # channels produced by stages two and three

        # Layers are created in the same order as before so that parameter
        # initialisation and state_dict ordering are unchanged.
        # Each height-3 convolution without padding shortens the time axis by
        # 2, and each pool window covers exactly what is left of that axis.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=narrow,
                               kernel_size=(3, psd_lenght), stride=1, padding=0, bias=False)
        self.bn1 = nn.BatchNorm2d(narrow, momentum=0.05)
        self.pool1 = nn.MaxPool2d(kernel_size=(self.time_steps - 3 + 1, 1))

        self.conv2 = nn.Conv2d(in_channels=narrow, out_channels=wide,
                               kernel_size=(3, 1), stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(wide, momentum=0.05)
        self.pool2 = nn.MaxPool2d(kernel_size=(self.time_steps - 5 + 1, 1))

        self.conv3 = nn.Conv2d(in_channels=wide, out_channels=wide,
                               kernel_size=(3, 1), stride=1, padding=0, bias=False)
        self.bn3 = nn.BatchNorm2d(wide, momentum=0.05)
        self.pool3 = nn.MaxPool2d(kernel_size=(self.time_steps - 7 + 1, 1))

        self.fc = nn.Linear(in_features=2 * wide + narrow, out_features=128)
        self.cls = nn.Linear(in_features=128, out_features=self.n_classes)
        self.criterion = nn.CrossEntropyLoss(weight=None)

    def forward(self, inputs):
        """Return class scores of shape ``(batch, n_classes)``.

        ``inputs`` is reshaped to ``(-1, time_steps, psd_lenght, 1)`` and then
        permuted so the trailing singleton becomes the channel axis.
        """
        feats = inputs.view(-1, self.time_steps, self.psd_lenght, 1).permute(0, 3, 1, 2)

        stages = (
            (self.conv1, self.bn1, self.pool1),
            (self.conv2, self.bn2, self.pool2),
            (self.conv3, self.bn3, self.pool3),
        )
        pooled = []
        for conv, norm, pool in stages:
            feats = F.relu(norm(conv(feats)), inplace=True)
            pooled.append(pool(feats))

        x = torch.cat(pooled, dim=1)
        x = x.view(-1, self.fc.in_features)
        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu(self.fc(x), inplace=True)
        return self.cls(x)
This is the main Python file for training the model:
# Run configuration (module-level constants).
file_path = '/mnt/disk2/wy/SLEEP_EDF/'  # passed to load_data() by train_all_subject
batch_size = 32*4*2  # mini-batch size (256) used by Trainer.train and Trainer.validation
dropout_rate = 0.5  # NOTE(review): not referenced in the visible code; TxtCNN4.forward hard-codes p=0.5
nb_classes = 5  # NOTE(review): not referenced in the visible code; TxtCNN4() is built with its default n_classes=6 - confirm intent
max_epochs = 100  # default of train_all_subject(num_epochs=...) and the denominator in progress messages
early_stop_epoch = 10  # NOTE(review): not referenced in the visible code
learning_rate = 1e-3  # learning rate given to the Adam optimizer in Trainer
model_type='TxtCNN4'  # 'TxtCNN4' selects TxtCNN4; any other value makes Trainer build DeepSleepNet
filter_numbers = 128  # NOTE(review): not referenced in the visible code; TxtCNN4() uses its own default
fs = 100 # Sampling frequency (presumably Hz - TODO confirm)
n_Channels = 2  # NOTE(review): not referenced in the visible code
n_Samples = 30*100  # NOTE(review): not referenced in the visible code
load_flag = False  # NOTE(review): not referenced in the visible code
model_path = ''  # NOTE(review): not referenced in the visible code
def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    """Yield ``(inputs, targets)`` mini-batches taken along the first axis.

    Args:
        inputs: Array exposing ``.shape`` and supporting slice and
            index-array indexing.
        targets: Sequence of the same length as ``inputs``, indexed the
            same way.
        batchsize: Maximum number of samples per batch. The last batch is
            shorter when the length is not a multiple of ``batchsize``.
        shuffle: If true, samples are visited in a random order drawn with
            ``np.random.shuffle``; otherwise in their stored order.

    Yields:
        Tuples ``(inputs[excerpt], targets[excerpt])``.
    """
    input_len = inputs.shape[0]
    assert input_len == len(targets)
    if shuffle:
        indices = np.arange(input_len)
        np.random.shuffle(indices)
    for start_idx in range(0, input_len, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        # The posted version computed `excerpt` and then dropped it, so the
        # function returned None and callers could not iterate over it.
        yield inputs[excerpt], targets[excerpt]
class Trainer():
    """Train and validate one model on a single train/val/test split.

    The constructor splits the data with ``reformatInput``, standardises all
    three splits with the training split's mean and standard deviation,
    builds the model selected by ``model_type``, and creates an Adam
    optimizer plus one ``SummaryWriter`` each for train, val and test logs.
    """

    def __init__(self, X_inputs, labels, fold, subj_id, log_path, model_type=model_type, args=None):
        """Prepare data, model, optimizer and log writers.

        Args:
            X_inputs: Input data, forwarded to ``reformatInput``.
            labels: Labels, forwarded to ``reformatInput``.
            fold: Split description, forwarded to ``reformatInput``.
            subj_id: Identifier used in the log directory name.
            log_path: Log directory prefix. It is concatenated with
                ``model_type``, so it must be a string.
            model_type: ``'TxtCNN4'`` builds ``TxtCNN4``; any other value
                builds ``DeepSleepNet``.
            args: Unused.
        """
        self.subj_id = subj_id
        self.model_type = model_type
        self.writer_root_path = os.path.abspath(
            os.path.join(
                os.path.curdir,
                log_path+'_'+model_type,
                model_type + '_' + str(subj_id)))
        self.train_writer = SummaryWriter(os.path.join(self.writer_root_path, 'train'))
        self.val_writer = SummaryWriter(os.path.join(self.writer_root_path, 'val'))
        self.test_writer = SummaryWriter(os.path.join(self.writer_root_path, 'test'))
        self.curr_epcoh = 0
        self.curr_iter = 0
        self.train_end_flag = False
        (self.X_train, self.y_train), (self.X_val, self.y_val), (self.X_test, self.y_test) = reformatInput(X_inputs, labels, fold)
        print('Test set label and BiLi:\t', np.unique(self.y_test, return_counts=True))
        # Standardise every split with statistics of the training split only.
        X_mean = self.X_train.mean()
        X_std = self.X_train.std()
        self.X_train = (self.X_train - X_mean)/X_std
        self.X_val = (self.X_val - X_mean)/X_std
        self.X_test = (self.X_test - X_mean)/X_std
        if model_type == 'TxtCNN4':
            model = TxtCNN4()
        else:
            model = DeepSleepNet()
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model = model.to(self.device)
        self.criterion = nn.CrossEntropyLoss(weight=None)
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

    def _clock(self):
        """Return ``time.time()`` after pending CUDA work has finished.

        CUDA kernels are launched asynchronously. Without a synchronize, the
        time of one call is billed to whichever later call happens to block,
        which makes the per-phase totals printed by ``train`` misleading.
        The synchronize adds a small overhead per measurement.
        """
        if self.device.type == 'cuda':
            torch.cuda.synchronize()
        return time.time()

    def train(self, epoch):
        """Run one pass over the training split and print per-phase timings.

        Accuracy, loss and learning rate are written to the train writer on
        every 20th iteration.

        Args:
            epoch: Epoch index, stored in ``self.curr_epcoh`` and shown in
                the progress line.
        """
        train_loss = AvgMeter()
        train_acc = AvgMeter()
        start_time = time.time()
        time_to_tensor = 0
        time_to_cuda = 0
        time_model = 0
        time_criterion = 0
        time_backward = 0
        time_step = 0
        time_writer = 0
        self.curr_epcoh = epoch
        self.model.train()
        for (inputs, targets) in iterate_minibatches(self.X_train, self.y_train, batch_size, shuffle=False):
            self.curr_iter += 1

            time_1 = self._clock()
            inputs, targets = torch.tensor(inputs, dtype=torch.float), torch.tensor(targets, dtype=torch.int64)
            time_to_tensor += (self._clock() - time_1)

            time_1 = self._clock()
            inputs, targets = inputs.to(self.device), targets.to(self.device)
            time_to_cuda += (self._clock() - time_1)

            # zero the parameter gradients
            self.optimizer.zero_grad()

            # forward + backward + optimize
            time_1 = self._clock()
            out = self.model(inputs)
            time_model += (self._clock() - time_1)

            time_1 = self._clock()
            loss = self.criterion(out, targets)
            time_criterion += (self._clock() - time_1)

            time_1 = self._clock()
            loss.backward()
            time_backward += (self._clock() - time_1)

            time_1 = self._clock()
            self.optimizer.step()
            time_step += (self._clock() - time_1)

            if self.curr_iter % 20 == 0:
                time_1 = self._clock()
                pred = out.max(dim=1)[1].cpu().numpy()
                gt = targets.cpu().numpy()
                acc = np.mean(pred==gt)
                _loss = loss.detach().cpu().numpy()
                train_acc.update(acc)
                train_loss.update(_loss)
                # `curr_lr` was never defined in the posted code; read the
                # live value from the optimizer's first parameter group.
                curr_lr = self.optimizer.param_groups[0]['lr']
                self.train_writer.add_scalar('learning_rate', curr_lr, self.curr_iter)
                self.train_writer.add_scalar('acc', acc, self.curr_iter)
                self.train_writer.add_scalar('loss', _loss, self.curr_iter)
                time_writer += (self._clock() - time_1)

        # Tab escapes restored: the posted strings had lost their backslashes.
        fmt_str = "Train\tEpoch [{:d}/{:d}] train_Loss: {:.4f}\ttrain_Acc: {:.2f}\tTime per Epoch: {:.4f}"
        print_str = fmt_str.format(self.curr_epcoh,
                                   max_epochs,
                                   train_loss.avg,
                                   train_acc.avg*100,
                                   time.time()-start_time)
        print(print_str)
        print('time_to_tensor: ',time_to_tensor)
        print('time_to_cuda: ',time_to_cuda)
        print('time_model: ',time_model)
        print('time_criterion: ',time_criterion)
        print('time_backward: ',time_backward)
        print('time_step: ',time_step)
        print('time_writer: ',time_writer)
        print(len(self.y_train))
        train_acc.reset()
        train_loss.reset()

    def validation(self):
        """Evaluate on the validation split, then print and log the metrics.

        Computes mean loss, accuracy, Cohen's kappa and macro-averaged
        recall, and writes them to the val writer at the current iteration.
        """
        self.model.eval()
        with torch.no_grad():
            # val set
            loss_gather = AvgMeter()
            pred_gather = PreGather()
            tran_time = 0
            for (inputs, targets) in iterate_minibatches(self.X_val, self.y_val, batch_size, shuffle=False):
                time_1 = self._clock()
                inputs, targets = torch.tensor(inputs, dtype=torch.float), torch.tensor(targets, dtype=torch.int64)
                inputs, targets = inputs.to(self.device), targets.to(self.device)
                tran_time += (self._clock() - time_1)
                out = self.model(inputs)
                _loss = self.criterion(out, targets).detach().cpu().numpy()
                pred = out.max(dim=1)[1].cpu().numpy()
                loss_gather.update(_loss)
                pred_gather.update(pred)
        val_loss = loss_gather.avg
        pred = pred_gather.pred
        val_acc = np.mean(pred==self.y_val)
        val_kappa = cohen_kappa_score(self.y_val, pred)
        val_BCA = recall_score(self.y_val, pred, average='macro')
        loss_gather.reset()
        pred_gather.reset()
        # Then we print the results for this epoch:
        fmt_str = "VAL \tEpoch [{:d}/{:d}] val_Loss: {:.4f}\tval_Acc: {:.2f}\tval_kappa: {:.2f}\tval_BCA: {:.2f}"
        print_str = fmt_str.format(self.curr_epcoh,
                                   max_epochs,
                                   val_loss,
                                   val_acc*100,
                                   val_kappa*100,
                                   val_BCA*100)
        print(print_str)
        print(tran_time)
        print(len(self.y_val))
        self.val_writer.add_scalar('acc', val_acc, self.curr_iter)
        self.val_writer.add_scalar('loss', val_loss, self.curr_iter)
        self.val_writer.add_scalar('kappa', val_kappa, self.curr_iter)
        self.val_writer.add_scalar('bca', val_BCA, self.curr_iter)
def train_all_subject(num_epochs=max_epochs, log_path=None):
    """Run leave-one-subject-out training with one fresh ``Trainer`` per subject.

    Args:
        num_epochs: Exclusive upper bound on the epoch index for each subject.
        log_path: Log directory prefix handed to ``Trainer``. ``Trainer``
            concatenates it with the model type, so a string must be passed;
            the default ``None`` is not usable as-is.
    """
    # Leave-Subject-Out cross validation
    subj_nums, fold_pairs, EEGs, labels = load_data(file_path, subj_nums=38, channel=1)
    for subj_id in range(subj_nums):
        # Tab escapes restored: the posted string had lost its backslashes.
        print('The subj_id', subj_id, '\t\t Training the ' + model_type + ' Model...')
        trainer = Trainer(EEGs, labels, fold_pairs[subj_id], subj_id, log_path)
        # Use the caller's epoch budget. The posted code looped to the global
        # max_epochs and silently ignored num_epochs.
        for epoch in range(trainer.curr_epcoh, num_epochs):
            if trainer.train_end_flag is not False:
                break
            trainer.train(epoch)
            trainer.validation()
            print('-'*50)
python tensorflow pytorch
add a comment |
The input data is an EEG data which was converted into PSD. And its shape is [103600, 59, 51], where 103600 is the number of samples, i.e. the total samples for an epoch. The data was loaded into memory.
The model I used is very simple; however, training one epoch took 50 s with PyTorch 0.4 on a GTX 1080, whereas inference on the validation set took only 0.3 s.
I also tested this model using tensorflow 1.2.0 the time was 8s. I have no idea if I got it wrong.
So I made some test. This img shows the time for runing.
enter image description here
where
time_to_tensor is the time for inputs, targets = torch.tensor(inputs, dtype=torch.float), torch.tensor(targets, dtype=torch.int64)
time_to_cuda for inputs, targets = inputs.to(self.device), targets.to(self.device)
time_model for out = self.model(inputs)
time_criterion for loss = self.criterion(out, targets)
time_backward for loss.backward(), and time_step for self.optimizer.step()
It seems the operation for tensor.to(self.device) (or tensor.cuda()) wasted much time. So I tried to move the data to GPU at once, using
self.X_train = torch.tensor(self.X_train, dtype=torch.float).to(self.device)
self.y_train = torch.tensor(self.y_train, dtype=torch.int64).to(self.device)
and commented
inputs, targets = torch.tensor(inputs, dtype=torch.float), torch.tensor(targets, dtype=torch.int64)
inputs, targets = inputs.to(self.device), targets.to(self.device)
However I got the running time in the following img.enter image description here. The time required for an epoch still was about 50s, and time for each operation changed. I am confused.
Could someone help me to find the problem? Thanks.
Code was here.
It's the model I used
import torch
import torch.nn as nn
import torch.nn.functional as F
class TxtCNN4(nn.Module):
    """Three stacked conv stages whose max-pooled outputs are concatenated and classified.

    Every stage is conv -> batch norm -> ReLU with a height-3 kernel along the
    time axis. The first kernel also spans the full ``psd_lenght`` width, so
    the feature-map width is 1 afterwards. Each stage is max-pooled over its
    whole remaining time axis; the three pooled vectors are concatenated,
    passed through dropout and two linear layers, and returned as
    ``n_classes`` raw scores.
    """

    def __init__(self, n_classes=6, time_steps=59, psd_lenght=51, filter_numbers=128):
        super(TxtCNN4, self).__init__()
        self.n_classes = n_classes
        self.time_steps = time_steps
        self.psd_lenght = psd_lenght
        self.filters = filter_numbers

        narrow = self.filters // 2   # channels produced by the first stage
        wide = self.filters          # channels produced by stages two and three

        # Layers are created in the same order as before so that parameter
        # initialisation and state_dict ordering are unchanged.
        # Each height-3 convolution without padding shortens the time axis by
        # 2, and each pool window covers exactly what is left of that axis.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=narrow,
                               kernel_size=(3, psd_lenght), stride=1, padding=0, bias=False)
        self.bn1 = nn.BatchNorm2d(narrow, momentum=0.05)
        self.pool1 = nn.MaxPool2d(kernel_size=(self.time_steps - 3 + 1, 1))

        self.conv2 = nn.Conv2d(in_channels=narrow, out_channels=wide,
                               kernel_size=(3, 1), stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(wide, momentum=0.05)
        self.pool2 = nn.MaxPool2d(kernel_size=(self.time_steps - 5 + 1, 1))

        self.conv3 = nn.Conv2d(in_channels=wide, out_channels=wide,
                               kernel_size=(3, 1), stride=1, padding=0, bias=False)
        self.bn3 = nn.BatchNorm2d(wide, momentum=0.05)
        self.pool3 = nn.MaxPool2d(kernel_size=(self.time_steps - 7 + 1, 1))

        self.fc = nn.Linear(in_features=2 * wide + narrow, out_features=128)
        self.cls = nn.Linear(in_features=128, out_features=self.n_classes)
        self.criterion = nn.CrossEntropyLoss(weight=None)

    def forward(self, inputs):
        """Return class scores of shape ``(batch, n_classes)``.

        ``inputs`` is reshaped to ``(-1, time_steps, psd_lenght, 1)`` and then
        permuted so the trailing singleton becomes the channel axis.
        """
        feats = inputs.view(-1, self.time_steps, self.psd_lenght, 1).permute(0, 3, 1, 2)

        stages = (
            (self.conv1, self.bn1, self.pool1),
            (self.conv2, self.bn2, self.pool2),
            (self.conv3, self.bn3, self.pool3),
        )
        pooled = []
        for conv, norm, pool in stages:
            feats = F.relu(norm(conv(feats)), inplace=True)
            pooled.append(pool(feats))

        x = torch.cat(pooled, dim=1)
        x = x.view(-1, self.fc.in_features)
        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu(self.fc(x), inplace=True)
        return self.cls(x)
It's the main python file for traning the model
# Run configuration (module-level constants).
file_path = '/mnt/disk2/wy/SLEEP_EDF/'  # passed to load_data() by train_all_subject
batch_size = 32*4*2  # mini-batch size (256) used by Trainer.train and Trainer.validation
dropout_rate = 0.5  # NOTE(review): not referenced in the visible code; TxtCNN4.forward hard-codes p=0.5
nb_classes = 5  # NOTE(review): not referenced in the visible code; TxtCNN4() is built with its default n_classes=6 - confirm intent
max_epochs = 100  # default of train_all_subject(num_epochs=...) and the denominator in progress messages
early_stop_epoch = 10  # NOTE(review): not referenced in the visible code
learning_rate = 1e-3  # learning rate given to the Adam optimizer in Trainer
model_type='TxtCNN4'  # 'TxtCNN4' selects TxtCNN4; any other value makes Trainer build DeepSleepNet
filter_numbers = 128  # NOTE(review): not referenced in the visible code; TxtCNN4() uses its own default
fs = 100 # Sampling frequency (presumably Hz - TODO confirm)
n_Channels = 2  # NOTE(review): not referenced in the visible code
n_Samples = 30*100  # NOTE(review): not referenced in the visible code
load_flag = False  # NOTE(review): not referenced in the visible code
model_path = ''  # NOTE(review): not referenced in the visible code
def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    """Yield ``(inputs, targets)`` mini-batches taken along the first axis.

    Args:
        inputs: Array exposing ``.shape`` and supporting slice and
            index-array indexing.
        targets: Sequence of the same length as ``inputs``, indexed the
            same way.
        batchsize: Maximum number of samples per batch. The last batch is
            shorter when the length is not a multiple of ``batchsize``.
        shuffle: If true, samples are visited in a random order drawn with
            ``np.random.shuffle``; otherwise in their stored order.

    Yields:
        Tuples ``(inputs[excerpt], targets[excerpt])``.
    """
    input_len = inputs.shape[0]
    assert input_len == len(targets)
    if shuffle:
        indices = np.arange(input_len)
        np.random.shuffle(indices)
    for start_idx in range(0, input_len, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        # The posted version computed `excerpt` and then dropped it, so the
        # function returned None and callers could not iterate over it.
        yield inputs[excerpt], targets[excerpt]
class Trainer():
    """Train and validate one model on a single train/val/test split.

    The constructor splits the data with ``reformatInput``, standardises all
    three splits with the training split's mean and standard deviation,
    builds the model selected by ``model_type``, and creates an Adam
    optimizer plus one ``SummaryWriter`` each for train, val and test logs.
    """

    def __init__(self, X_inputs, labels, fold, subj_id, log_path, model_type=model_type, args=None):
        """Prepare data, model, optimizer and log writers.

        Args:
            X_inputs: Input data, forwarded to ``reformatInput``.
            labels: Labels, forwarded to ``reformatInput``.
            fold: Split description, forwarded to ``reformatInput``.
            subj_id: Identifier used in the log directory name.
            log_path: Log directory prefix. It is concatenated with
                ``model_type``, so it must be a string.
            model_type: ``'TxtCNN4'`` builds ``TxtCNN4``; any other value
                builds ``DeepSleepNet``.
            args: Unused.
        """
        self.subj_id = subj_id
        self.model_type = model_type
        self.writer_root_path = os.path.abspath(
            os.path.join(
                os.path.curdir,
                log_path+'_'+model_type,
                model_type + '_' + str(subj_id)))
        self.train_writer = SummaryWriter(os.path.join(self.writer_root_path, 'train'))
        self.val_writer = SummaryWriter(os.path.join(self.writer_root_path, 'val'))
        self.test_writer = SummaryWriter(os.path.join(self.writer_root_path, 'test'))
        self.curr_epcoh = 0
        self.curr_iter = 0
        self.train_end_flag = False
        (self.X_train, self.y_train), (self.X_val, self.y_val), (self.X_test, self.y_test) = reformatInput(X_inputs, labels, fold)
        print('Test set label and BiLi:\t', np.unique(self.y_test, return_counts=True))
        # Standardise every split with statistics of the training split only.
        X_mean = self.X_train.mean()
        X_std = self.X_train.std()
        self.X_train = (self.X_train - X_mean)/X_std
        self.X_val = (self.X_val - X_mean)/X_std
        self.X_test = (self.X_test - X_mean)/X_std
        if model_type == 'TxtCNN4':
            model = TxtCNN4()
        else:
            model = DeepSleepNet()
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model = model.to(self.device)
        self.criterion = nn.CrossEntropyLoss(weight=None)
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

    def _clock(self):
        """Return ``time.time()`` after pending CUDA work has finished.

        CUDA kernels are launched asynchronously. Without a synchronize, the
        time of one call is billed to whichever later call happens to block,
        which makes the per-phase totals printed by ``train`` misleading.
        The synchronize adds a small overhead per measurement.
        """
        if self.device.type == 'cuda':
            torch.cuda.synchronize()
        return time.time()

    def train(self, epoch):
        """Run one pass over the training split and print per-phase timings.

        Accuracy, loss and learning rate are written to the train writer on
        every 20th iteration.

        Args:
            epoch: Epoch index, stored in ``self.curr_epcoh`` and shown in
                the progress line.
        """
        train_loss = AvgMeter()
        train_acc = AvgMeter()
        start_time = time.time()
        time_to_tensor = 0
        time_to_cuda = 0
        time_model = 0
        time_criterion = 0
        time_backward = 0
        time_step = 0
        time_writer = 0
        self.curr_epcoh = epoch
        self.model.train()
        for (inputs, targets) in iterate_minibatches(self.X_train, self.y_train, batch_size, shuffle=False):
            self.curr_iter += 1

            time_1 = self._clock()
            inputs, targets = torch.tensor(inputs, dtype=torch.float), torch.tensor(targets, dtype=torch.int64)
            time_to_tensor += (self._clock() - time_1)

            time_1 = self._clock()
            inputs, targets = inputs.to(self.device), targets.to(self.device)
            time_to_cuda += (self._clock() - time_1)

            # zero the parameter gradients
            self.optimizer.zero_grad()

            # forward + backward + optimize
            time_1 = self._clock()
            out = self.model(inputs)
            time_model += (self._clock() - time_1)

            time_1 = self._clock()
            loss = self.criterion(out, targets)
            time_criterion += (self._clock() - time_1)

            time_1 = self._clock()
            loss.backward()
            time_backward += (self._clock() - time_1)

            time_1 = self._clock()
            self.optimizer.step()
            time_step += (self._clock() - time_1)

            if self.curr_iter % 20 == 0:
                time_1 = self._clock()
                pred = out.max(dim=1)[1].cpu().numpy()
                gt = targets.cpu().numpy()
                acc = np.mean(pred==gt)
                _loss = loss.detach().cpu().numpy()
                train_acc.update(acc)
                train_loss.update(_loss)
                # `curr_lr` was never defined in the posted code; read the
                # live value from the optimizer's first parameter group.
                curr_lr = self.optimizer.param_groups[0]['lr']
                self.train_writer.add_scalar('learning_rate', curr_lr, self.curr_iter)
                self.train_writer.add_scalar('acc', acc, self.curr_iter)
                self.train_writer.add_scalar('loss', _loss, self.curr_iter)
                time_writer += (self._clock() - time_1)

        # Tab escapes restored: the posted strings had lost their backslashes.
        fmt_str = "Train\tEpoch [{:d}/{:d}] train_Loss: {:.4f}\ttrain_Acc: {:.2f}\tTime per Epoch: {:.4f}"
        print_str = fmt_str.format(self.curr_epcoh,
                                   max_epochs,
                                   train_loss.avg,
                                   train_acc.avg*100,
                                   time.time()-start_time)
        print(print_str)
        print('time_to_tensor: ',time_to_tensor)
        print('time_to_cuda: ',time_to_cuda)
        print('time_model: ',time_model)
        print('time_criterion: ',time_criterion)
        print('time_backward: ',time_backward)
        print('time_step: ',time_step)
        print('time_writer: ',time_writer)
        print(len(self.y_train))
        train_acc.reset()
        train_loss.reset()

    def validation(self):
        """Evaluate on the validation split, then print and log the metrics.

        Computes mean loss, accuracy, Cohen's kappa and macro-averaged
        recall, and writes them to the val writer at the current iteration.
        """
        self.model.eval()
        with torch.no_grad():
            # val set
            loss_gather = AvgMeter()
            pred_gather = PreGather()
            tran_time = 0
            for (inputs, targets) in iterate_minibatches(self.X_val, self.y_val, batch_size, shuffle=False):
                time_1 = self._clock()
                inputs, targets = torch.tensor(inputs, dtype=torch.float), torch.tensor(targets, dtype=torch.int64)
                inputs, targets = inputs.to(self.device), targets.to(self.device)
                tran_time += (self._clock() - time_1)
                out = self.model(inputs)
                _loss = self.criterion(out, targets).detach().cpu().numpy()
                pred = out.max(dim=1)[1].cpu().numpy()
                loss_gather.update(_loss)
                pred_gather.update(pred)
        val_loss = loss_gather.avg
        pred = pred_gather.pred
        val_acc = np.mean(pred==self.y_val)
        val_kappa = cohen_kappa_score(self.y_val, pred)
        val_BCA = recall_score(self.y_val, pred, average='macro')
        loss_gather.reset()
        pred_gather.reset()
        # Then we print the results for this epoch:
        fmt_str = "VAL \tEpoch [{:d}/{:d}] val_Loss: {:.4f}\tval_Acc: {:.2f}\tval_kappa: {:.2f}\tval_BCA: {:.2f}"
        print_str = fmt_str.format(self.curr_epcoh,
                                   max_epochs,
                                   val_loss,
                                   val_acc*100,
                                   val_kappa*100,
                                   val_BCA*100)
        print(print_str)
        print(tran_time)
        print(len(self.y_val))
        self.val_writer.add_scalar('acc', val_acc, self.curr_iter)
        self.val_writer.add_scalar('loss', val_loss, self.curr_iter)
        self.val_writer.add_scalar('kappa', val_kappa, self.curr_iter)
        self.val_writer.add_scalar('bca', val_BCA, self.curr_iter)
def train_all_subject(num_epochs=max_epochs, log_path=None):
    """Run leave-one-subject-out training with one fresh ``Trainer`` per subject.

    Args:
        num_epochs: Exclusive upper bound on the epoch index for each subject.
        log_path: Log directory prefix handed to ``Trainer``. ``Trainer``
            concatenates it with the model type, so a string must be passed;
            the default ``None`` is not usable as-is.
    """
    # Leave-Subject-Out cross validation
    subj_nums, fold_pairs, EEGs, labels = load_data(file_path, subj_nums=38, channel=1)
    for subj_id in range(subj_nums):
        # Tab escapes restored: the posted string had lost its backslashes.
        print('The subj_id', subj_id, '\t\t Training the ' + model_type + ' Model...')
        trainer = Trainer(EEGs, labels, fold_pairs[subj_id], subj_id, log_path)
        # Use the caller's epoch budget. The posted code looped to the global
        # max_epochs and silently ignored num_epochs.
        for epoch in range(trainer.curr_epcoh, num_epochs):
            if trainer.train_end_flag is not False:
                break
            trainer.train(epoch)
            trainer.validation()
            print('-'*50)
python tensorflow pytorch
It would be easier to respond to this if you used generated training data for this test, so that someone else could reproduce the results and track down potential inefficiencies. Without being able to test it, it is difficult to say. That said, I wonder why the times for backward and step are really low in the first picture but extremely high in the second picture. That is a bit weird. You may also try torch.tensor(..., device=device), as this is generally (sometimes much) faster than using .to(device). No detour over CPU memory.
– blue-phoenox
Nov 12 at 12:17
Thanks for your reply. It might be hard to generate suitable data, and it would require adding more code. Actually, I also tried torch.tensor(..., device=self.device). There was no essential difference in the training time required.
– user9724030
Nov 13 at 2:22
add a comment |
The input data is an EEG data which was converted into PSD. And its shape is [103600, 59, 51], where 103600 is the number of samples, i.e. the total samples for an epoch. The data was loaded into memory.
The model I used is very simple; however, training one epoch took 50 s with PyTorch 0.4 on a GTX 1080, whereas inference on the validation set took only 0.3 s.
I also tested this model using tensorflow 1.2.0 the time was 8s. I have no idea if I got it wrong.
So I made some test. This img shows the time for runing.
enter image description here
where
time_to_tensor is the time for inputs, targets = torch.tensor(inputs, dtype=torch.float), torch.tensor(targets, dtype=torch.int64)
time_to_cuda for inputs, targets = inputs.to(self.device), targets.to(self.device)
time_model for out = self.model(inputs)
time_criterion for loss = self.criterion(out, targets)
time_backward for loss.backward(), and time_step for self.optimizer.step()
It seems the operation for tensor.to(self.device) (or tensor.cuda()) wasted much time. So I tried to move the data to GPU at once, using
self.X_train = torch.tensor(self.X_train, dtype=torch.float).to(self.device)
self.y_train = torch.tensor(self.y_train, dtype=torch.int64).to(self.device)
and commented
inputs, targets = torch.tensor(inputs, dtype=torch.float), torch.tensor(targets, dtype=torch.int64)
inputs, targets = inputs.to(self.device), targets.to(self.device)
However I got the running time in the following img.enter image description here. The time required for an epoch still was about 50s, and time for each operation changed. I am confused.
Could someone help me to find the problem? Thanks.
Code was here.
It's the model I used
import torch
import torch.nn as nn
import torch.nn.functional as F
class TxtCNN4(nn.Module):
    """Three stacked conv stages whose max-pooled outputs are concatenated and classified.

    Every stage is conv -> batch norm -> ReLU with a height-3 kernel along the
    time axis. The first kernel also spans the full ``psd_lenght`` width, so
    the feature-map width is 1 afterwards. Each stage is max-pooled over its
    whole remaining time axis; the three pooled vectors are concatenated,
    passed through dropout and two linear layers, and returned as
    ``n_classes`` raw scores.
    """

    def __init__(self, n_classes=6, time_steps=59, psd_lenght=51, filter_numbers=128):
        super(TxtCNN4, self).__init__()
        self.n_classes = n_classes
        self.time_steps = time_steps
        self.psd_lenght = psd_lenght
        self.filters = filter_numbers

        narrow = self.filters // 2   # channels produced by the first stage
        wide = self.filters          # channels produced by stages two and three

        # Layers are created in the same order as before so that parameter
        # initialisation and state_dict ordering are unchanged.
        # Each height-3 convolution without padding shortens the time axis by
        # 2, and each pool window covers exactly what is left of that axis.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=narrow,
                               kernel_size=(3, psd_lenght), stride=1, padding=0, bias=False)
        self.bn1 = nn.BatchNorm2d(narrow, momentum=0.05)
        self.pool1 = nn.MaxPool2d(kernel_size=(self.time_steps - 3 + 1, 1))

        self.conv2 = nn.Conv2d(in_channels=narrow, out_channels=wide,
                               kernel_size=(3, 1), stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(wide, momentum=0.05)
        self.pool2 = nn.MaxPool2d(kernel_size=(self.time_steps - 5 + 1, 1))

        self.conv3 = nn.Conv2d(in_channels=wide, out_channels=wide,
                               kernel_size=(3, 1), stride=1, padding=0, bias=False)
        self.bn3 = nn.BatchNorm2d(wide, momentum=0.05)
        self.pool3 = nn.MaxPool2d(kernel_size=(self.time_steps - 7 + 1, 1))

        self.fc = nn.Linear(in_features=2 * wide + narrow, out_features=128)
        self.cls = nn.Linear(in_features=128, out_features=self.n_classes)
        self.criterion = nn.CrossEntropyLoss(weight=None)

    def forward(self, inputs):
        """Return class scores of shape ``(batch, n_classes)``.

        ``inputs`` is reshaped to ``(-1, time_steps, psd_lenght, 1)`` and then
        permuted so the trailing singleton becomes the channel axis.
        """
        feats = inputs.view(-1, self.time_steps, self.psd_lenght, 1).permute(0, 3, 1, 2)

        stages = (
            (self.conv1, self.bn1, self.pool1),
            (self.conv2, self.bn2, self.pool2),
            (self.conv3, self.bn3, self.pool3),
        )
        pooled = []
        for conv, norm, pool in stages:
            feats = F.relu(norm(conv(feats)), inplace=True)
            pooled.append(pool(feats))

        x = torch.cat(pooled, dim=1)
        x = x.view(-1, self.fc.in_features)
        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu(self.fc(x), inplace=True)
        return self.cls(x)
It's the main python file for traning the model
# Run configuration (module-level constants).
file_path = '/mnt/disk2/wy/SLEEP_EDF/'  # passed to load_data() by train_all_subject
batch_size = 32*4*2  # mini-batch size (256) used by Trainer.train and Trainer.validation
dropout_rate = 0.5  # NOTE(review): not referenced in the visible code; TxtCNN4.forward hard-codes p=0.5
nb_classes = 5  # NOTE(review): not referenced in the visible code; TxtCNN4() is built with its default n_classes=6 - confirm intent
max_epochs = 100  # default of train_all_subject(num_epochs=...) and the denominator in progress messages
early_stop_epoch = 10  # NOTE(review): not referenced in the visible code
learning_rate = 1e-3  # learning rate given to the Adam optimizer in Trainer
model_type='TxtCNN4'  # 'TxtCNN4' selects TxtCNN4; any other value makes Trainer build DeepSleepNet
filter_numbers = 128  # NOTE(review): not referenced in the visible code; TxtCNN4() uses its own default
fs = 100 # Sampling frequency (presumably Hz - TODO confirm)
n_Channels = 2  # NOTE(review): not referenced in the visible code
n_Samples = 30*100  # NOTE(review): not referenced in the visible code
load_flag = False  # NOTE(review): not referenced in the visible code
model_path = ''  # NOTE(review): not referenced in the visible code
def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    """Yield ``(inputs, targets)`` mini-batches taken along the first axis.

    Args:
        inputs: Array exposing ``.shape`` and supporting slice and
            index-array indexing.
        targets: Sequence of the same length as ``inputs``, indexed the
            same way.
        batchsize: Maximum number of samples per batch. The last batch is
            shorter when the length is not a multiple of ``batchsize``.
        shuffle: If true, samples are visited in a random order drawn with
            ``np.random.shuffle``; otherwise in their stored order.

    Yields:
        Tuples ``(inputs[excerpt], targets[excerpt])``.
    """
    input_len = inputs.shape[0]
    assert input_len == len(targets)
    if shuffle:
        indices = np.arange(input_len)
        np.random.shuffle(indices)
    for start_idx in range(0, input_len, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        # The posted version computed `excerpt` and then dropped it, so the
        # function returned None and callers could not iterate over it.
        yield inputs[excerpt], targets[excerpt]
class Trainer():
    """Train and validate one model on a single train/val/test split.

    The constructor splits the data with ``reformatInput``, standardises all
    three splits with the training split's mean and standard deviation,
    builds the model selected by ``model_type``, and creates an Adam
    optimizer plus one ``SummaryWriter`` each for train, val and test logs.
    """

    def __init__(self, X_inputs, labels, fold, subj_id, log_path, model_type=model_type, args=None):
        """Prepare data, model, optimizer and log writers.

        Args:
            X_inputs: Input data, forwarded to ``reformatInput``.
            labels: Labels, forwarded to ``reformatInput``.
            fold: Split description, forwarded to ``reformatInput``.
            subj_id: Identifier used in the log directory name.
            log_path: Log directory prefix. It is concatenated with
                ``model_type``, so it must be a string.
            model_type: ``'TxtCNN4'`` builds ``TxtCNN4``; any other value
                builds ``DeepSleepNet``.
            args: Unused.
        """
        self.subj_id = subj_id
        self.model_type = model_type
        self.writer_root_path = os.path.abspath(
            os.path.join(
                os.path.curdir,
                log_path+'_'+model_type,
                model_type + '_' + str(subj_id)))
        self.train_writer = SummaryWriter(os.path.join(self.writer_root_path, 'train'))
        self.val_writer = SummaryWriter(os.path.join(self.writer_root_path, 'val'))
        self.test_writer = SummaryWriter(os.path.join(self.writer_root_path, 'test'))
        self.curr_epcoh = 0
        self.curr_iter = 0
        self.train_end_flag = False
        (self.X_train, self.y_train), (self.X_val, self.y_val), (self.X_test, self.y_test) = reformatInput(X_inputs, labels, fold)
        print('Test set label and BiLi:\t', np.unique(self.y_test, return_counts=True))
        # Standardise every split with statistics of the training split only.
        X_mean = self.X_train.mean()
        X_std = self.X_train.std()
        self.X_train = (self.X_train - X_mean)/X_std
        self.X_val = (self.X_val - X_mean)/X_std
        self.X_test = (self.X_test - X_mean)/X_std
        if model_type == 'TxtCNN4':
            model = TxtCNN4()
        else:
            model = DeepSleepNet()
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model = model.to(self.device)
        self.criterion = nn.CrossEntropyLoss(weight=None)
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

    def _clock(self):
        """Return ``time.time()`` after pending CUDA work has finished.

        CUDA kernels are launched asynchronously. Without a synchronize, the
        time of one call is billed to whichever later call happens to block,
        which makes the per-phase totals printed by ``train`` misleading.
        The synchronize adds a small overhead per measurement.
        """
        if self.device.type == 'cuda':
            torch.cuda.synchronize()
        return time.time()

    def train(self, epoch):
        """Run one pass over the training split and print per-phase timings.

        Accuracy, loss and learning rate are written to the train writer on
        every 20th iteration.

        Args:
            epoch: Epoch index, stored in ``self.curr_epcoh`` and shown in
                the progress line.
        """
        train_loss = AvgMeter()
        train_acc = AvgMeter()
        start_time = time.time()
        time_to_tensor = 0
        time_to_cuda = 0
        time_model = 0
        time_criterion = 0
        time_backward = 0
        time_step = 0
        time_writer = 0
        self.curr_epcoh = epoch
        self.model.train()
        for (inputs, targets) in iterate_minibatches(self.X_train, self.y_train, batch_size, shuffle=False):
            self.curr_iter += 1

            time_1 = self._clock()
            inputs, targets = torch.tensor(inputs, dtype=torch.float), torch.tensor(targets, dtype=torch.int64)
            time_to_tensor += (self._clock() - time_1)

            time_1 = self._clock()
            inputs, targets = inputs.to(self.device), targets.to(self.device)
            time_to_cuda += (self._clock() - time_1)

            # zero the parameter gradients
            self.optimizer.zero_grad()

            # forward + backward + optimize
            time_1 = self._clock()
            out = self.model(inputs)
            time_model += (self._clock() - time_1)

            time_1 = self._clock()
            loss = self.criterion(out, targets)
            time_criterion += (self._clock() - time_1)

            time_1 = self._clock()
            loss.backward()
            time_backward += (self._clock() - time_1)

            time_1 = self._clock()
            self.optimizer.step()
            time_step += (self._clock() - time_1)

            if self.curr_iter % 20 == 0:
                time_1 = self._clock()
                pred = out.max(dim=1)[1].cpu().numpy()
                gt = targets.cpu().numpy()
                acc = np.mean(pred==gt)
                _loss = loss.detach().cpu().numpy()
                train_acc.update(acc)
                train_loss.update(_loss)
                # `curr_lr` was never defined in the posted code; read the
                # live value from the optimizer's first parameter group.
                curr_lr = self.optimizer.param_groups[0]['lr']
                self.train_writer.add_scalar('learning_rate', curr_lr, self.curr_iter)
                self.train_writer.add_scalar('acc', acc, self.curr_iter)
                self.train_writer.add_scalar('loss', _loss, self.curr_iter)
                time_writer += (self._clock() - time_1)

        # Tab escapes restored: the posted strings had lost their backslashes.
        fmt_str = "Train\tEpoch [{:d}/{:d}] train_Loss: {:.4f}\ttrain_Acc: {:.2f}\tTime per Epoch: {:.4f}"
        print_str = fmt_str.format(self.curr_epcoh,
                                   max_epochs,
                                   train_loss.avg,
                                   train_acc.avg*100,
                                   time.time()-start_time)
        print(print_str)
        print('time_to_tensor: ',time_to_tensor)
        print('time_to_cuda: ',time_to_cuda)
        print('time_model: ',time_model)
        print('time_criterion: ',time_criterion)
        print('time_backward: ',time_backward)
        print('time_step: ',time_step)
        print('time_writer: ',time_writer)
        print(len(self.y_train))
        train_acc.reset()
        train_loss.reset()

    def validation(self):
        """Evaluate on the validation split, then print and log the metrics.

        Computes mean loss, accuracy, Cohen's kappa and macro-averaged
        recall, and writes them to the val writer at the current iteration.
        """
        self.model.eval()
        with torch.no_grad():
            # val set
            loss_gather = AvgMeter()
            pred_gather = PreGather()
            tran_time = 0
            for (inputs, targets) in iterate_minibatches(self.X_val, self.y_val, batch_size, shuffle=False):
                time_1 = self._clock()
                inputs, targets = torch.tensor(inputs, dtype=torch.float), torch.tensor(targets, dtype=torch.int64)
                inputs, targets = inputs.to(self.device), targets.to(self.device)
                tran_time += (self._clock() - time_1)
                out = self.model(inputs)
                _loss = self.criterion(out, targets).detach().cpu().numpy()
                pred = out.max(dim=1)[1].cpu().numpy()
                loss_gather.update(_loss)
                pred_gather.update(pred)
        val_loss = loss_gather.avg
        pred = pred_gather.pred
        val_acc = np.mean(pred==self.y_val)
        val_kappa = cohen_kappa_score(self.y_val, pred)
        val_BCA = recall_score(self.y_val, pred, average='macro')
        loss_gather.reset()
        pred_gather.reset()
        # Then we print the results for this epoch:
        fmt_str = "VAL \tEpoch [{:d}/{:d}] val_Loss: {:.4f}\tval_Acc: {:.2f}\tval_kappa: {:.2f}\tval_BCA: {:.2f}"
        print_str = fmt_str.format(self.curr_epcoh,
                                   max_epochs,
                                   val_loss,
                                   val_acc*100,
                                   val_kappa*100,
                                   val_BCA*100)
        print(print_str)
        print(tran_time)
        print(len(self.y_val))
        self.val_writer.add_scalar('acc', val_acc, self.curr_iter)
        self.val_writer.add_scalar('loss', val_loss, self.curr_iter)
        self.val_writer.add_scalar('kappa', val_kappa, self.curr_iter)
        self.val_writer.add_scalar('bca', val_BCA, self.curr_iter)
def train_all_subject(num_epochs=max_epochs, log_path=None):
    """Run leave-one-subject-out training with one fresh ``Trainer`` per subject.

    Args:
        num_epochs: Exclusive upper bound on the epoch index for each subject.
        log_path: Log directory prefix handed to ``Trainer``. ``Trainer``
            concatenates it with the model type, so a string must be passed;
            the default ``None`` is not usable as-is.
    """
    # Leave-Subject-Out cross validation
    subj_nums, fold_pairs, EEGs, labels = load_data(file_path, subj_nums=38, channel=1)
    for subj_id in range(subj_nums):
        # Tab escapes restored: the posted string had lost its backslashes.
        print('The subj_id', subj_id, '\t\t Training the ' + model_type + ' Model...')
        trainer = Trainer(EEGs, labels, fold_pairs[subj_id], subj_id, log_path)
        # Use the caller's epoch budget. The posted code looped to the global
        # max_epochs and silently ignored num_epochs.
        for epoch in range(trainer.curr_epcoh, num_epochs):
            if trainer.train_end_flag is not False:
                break
            trainer.train(epoch)
            trainer.validation()
            print('-'*50)
python tensorflow pytorch
The input data is an EEG data which was converted into PSD. And its shape is [103600, 59, 51], where 103600 is the number of samples, i.e. the total samples for an epoch. The data was loaded into memory.
The model I used is very simple; however, training one epoch took 50 s with PyTorch 0.4 on a GTX 1080, whereas inference on the validation set took only 0.3 s.
I also tested this model using tensorflow 1.2.0 the time was 8s. I have no idea if I got it wrong.
So I made some test. This img shows the time for runing.
enter image description here
where
time_to_tensor is the time for inputs, targets = torch.tensor(inputs, dtype=torch.float), torch.tensor(targets, dtype=torch.int64)
time_to_cuda for inputs, targets = inputs.to(self.device), targets.to(self.device)
time_model for out = self.model(inputs)
time_criterion for loss = self.criterion(out, targets)
time_backward for loss.backward(), and time_step for self.optimizer.step()
It seems the operation for tensor.to(self.device) (or tensor.cuda()) wasted much time. So I tried to move the data to GPU at once, using
self.X_train = torch.tensor(self.X_train, dtype=torch.float).to(self.device)
self.y_train = torch.tensor(self.y_train, dtype=torch.int64).to(self.device)
and commented
inputs, targets = torch.tensor(inputs, dtype=torch.float), torch.tensor(targets, dtype=torch.int64)
inputs, targets = inputs.to(self.device), targets.to(self.device)
However I got the running time in the following img.enter image description here. The time required for an epoch still was about 50s, and time for each operation changed. I am confused.
Could someone help me to find the problem? Thanks.
Code was here.
It's the model I used
import torch
import torch.nn as nn
import torch.nn.functional as F
class TxtCNN4(nn.Module):
    """Three stacked conv stages whose max-pooled outputs are concatenated and classified.

    Every stage is conv -> batch norm -> ReLU with a height-3 kernel along the
    time axis. The first kernel also spans the full ``psd_lenght`` width, so
    the feature-map width is 1 afterwards. Each stage is max-pooled over its
    whole remaining time axis; the three pooled vectors are concatenated,
    passed through dropout and two linear layers, and returned as
    ``n_classes`` raw scores.
    """

    def __init__(self, n_classes=6, time_steps=59, psd_lenght=51, filter_numbers=128):
        super(TxtCNN4, self).__init__()
        self.n_classes = n_classes
        self.time_steps = time_steps
        self.psd_lenght = psd_lenght
        self.filters = filter_numbers

        narrow = self.filters // 2   # channels produced by the first stage
        wide = self.filters          # channels produced by stages two and three

        # Layers are created in the same order as before so that parameter
        # initialisation and state_dict ordering are unchanged.
        # Each height-3 convolution without padding shortens the time axis by
        # 2, and each pool window covers exactly what is left of that axis.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=narrow,
                               kernel_size=(3, psd_lenght), stride=1, padding=0, bias=False)
        self.bn1 = nn.BatchNorm2d(narrow, momentum=0.05)
        self.pool1 = nn.MaxPool2d(kernel_size=(self.time_steps - 3 + 1, 1))

        self.conv2 = nn.Conv2d(in_channels=narrow, out_channels=wide,
                               kernel_size=(3, 1), stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(wide, momentum=0.05)
        self.pool2 = nn.MaxPool2d(kernel_size=(self.time_steps - 5 + 1, 1))

        self.conv3 = nn.Conv2d(in_channels=wide, out_channels=wide,
                               kernel_size=(3, 1), stride=1, padding=0, bias=False)
        self.bn3 = nn.BatchNorm2d(wide, momentum=0.05)
        self.pool3 = nn.MaxPool2d(kernel_size=(self.time_steps - 7 + 1, 1))

        self.fc = nn.Linear(in_features=2 * wide + narrow, out_features=128)
        self.cls = nn.Linear(in_features=128, out_features=self.n_classes)
        self.criterion = nn.CrossEntropyLoss(weight=None)

    def forward(self, inputs):
        """Return class scores of shape ``(batch, n_classes)``.

        ``inputs`` is reshaped to ``(-1, time_steps, psd_lenght, 1)`` and then
        permuted so the trailing singleton becomes the channel axis.
        """
        feats = inputs.view(-1, self.time_steps, self.psd_lenght, 1).permute(0, 3, 1, 2)

        stages = (
            (self.conv1, self.bn1, self.pool1),
            (self.conv2, self.bn2, self.pool2),
            (self.conv3, self.bn3, self.pool3),
        )
        pooled = []
        for conv, norm, pool in stages:
            feats = F.relu(norm(conv(feats)), inplace=True)
            pooled.append(pool(feats))

        x = torch.cat(pooled, dim=1)
        x = x.view(-1, self.fc.in_features)
        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu(self.fc(x), inplace=True)
        return self.cls(x)
It's the main Python file for training the model
# ---- data ---------------------------------------------------------------
file_path = '/mnt/disk2/wy/SLEEP_EDF/'  # dataset root handed to load_data()
fs = 100  # sampling frequency
n_Channels = 2
n_Samples = 30 * 100  # presumably 30 s of signal at fs = 100 — confirm
nb_classes = 5

# ---- model --------------------------------------------------------------
model_type = 'TxtCNN4'  # selects the network built by Trainer
filter_numbers = 128
dropout_rate = 0.5

# ---- optimisation -------------------------------------------------------
batch_size = 32 * 4 * 2  # mini-batch size used for training and validation
learning_rate = 1e-3  # Adam learning rate
max_epochs = 100
early_stop_epoch = 10

# ---- checkpointing ------------------------------------------------------
load_flag = False
model_path = ''
def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    """Yield successive ``(inputs, targets)`` mini-batches.

    Args:
        inputs: array-like with a ``shape`` attribute; batches are taken
            along its first axis.
        targets: sequence with the same length as ``inputs.shape[0]``.
        batchsize: maximum number of samples per batch.  The final batch is
            shorter when the sample count is not a multiple of ``batchsize``.
        shuffle: when true, samples are drawn in a random order produced by
            ``np.random.shuffle``; otherwise batches are contiguous slices.

    Yields:
        Tuples ``(inputs[excerpt], targets[excerpt])``.

    Raises:
        AssertionError: if ``inputs`` and ``targets`` differ in length.
    """
    input_len = inputs.shape[0]
    assert input_len == len(targets)
    if shuffle:
        indices = np.arange(input_len)
        np.random.shuffle(indices)
    for start_idx in range(0, input_len, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        # Without this yield the function returns None and cannot be
        # iterated by the training / validation loops.
        yield inputs[excerpt], targets[excerpt]
class Trainer():
    """Train and validate one model on a single cross-validation fold.

    The constructor splits the data with ``reformatInput``, normalises all
    three splits with the mean and standard deviation of the training split,
    builds the model, loss and Adam optimizer, and opens three
    ``SummaryWriter`` instances (train / val / test) under a directory derived
    from ``log_path``, ``model_type`` and ``subj_id``.
    """

    def __init__(self, X_inputs, labels, fold, subj_id, log_path, model_type=model_type, args=None):
        """Prepare data, model, optimizer and log writers.

        Args:
            X_inputs: input samples, forwarded to ``reformatInput``.
            labels: labels, forwarded to ``reformatInput``.
            fold: fold description, forwarded to ``reformatInput``.
            subj_id: identifier used in the log directory name.
            log_path: string prefix of the log directory; it is concatenated
                with ``'_' + model_type``, so it must be a ``str``.
            model_type: ``'TxtCNN4'`` builds ``TxtCNN4``; any other value
                builds ``DeepSleepNet``.
            args: unused.
        """
        self.subj_id = subj_id
        self.model_type = model_type
        self.writer_root_path = os.path.abspath(
            os.path.join(
                os.path.curdir,
                log_path+'_'+model_type,
                model_type + '_' + str(subj_id)))
        self.train_writer = SummaryWriter(os.path.join(self.writer_root_path, 'train'))
        self.val_writer = SummaryWriter(os.path.join(self.writer_root_path, 'val'))
        self.test_writer = SummaryWriter(os.path.join(self.writer_root_path, 'test'))

        self.curr_epcoh = 0
        self.curr_iter = 0
        self.train_end_flag = False

        (self.X_train, self.y_train), (self.X_val, self.y_val), (self.X_test, self.y_test) = reformatInput(X_inputs, labels, fold)
        print('Test set label and BiLi:\t', np.unique(self.y_test, return_counts=True))

        # Normalise every split with the statistics of the training split.
        X_mean = self.X_train.mean()
        X_std = self.X_train.std()
        self.X_train = (self.X_train - X_mean)/X_std
        self.X_val = (self.X_val - X_mean)/X_std
        self.X_test = (self.X_test - X_mean)/X_std

        # NOTE(review): TxtCNN4() is built with its own default n_classes;
        # the module-level nb_classes is not passed — confirm this is intended.
        if model_type == 'TxtCNN4':
            model = TxtCNN4()
        else:
            model = DeepSleepNet()

        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model = model.to(self.device)
        self.criterion = nn.CrossEntropyLoss(weight=None)
        # The training arrays stay on the host; train() converts and moves
        # each mini-batch to self.device.
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

    def _timestamp(self):
        """Return ``time.time()`` after pending CUDA work has completed.

        CUDA kernels are launched asynchronously, so reading the wall clock
        without synchronising attributes GPU time to whichever later call
        happens to block.  On a CPU device this is plain ``time.time()``.
        """
        if self.device.type == 'cuda':
            torch.cuda.synchronize()
        return time.time()

    def train(self, epoch):
        """Run one training epoch and print loss, accuracy and timings.

        Args:
            epoch: index of the epoch; stored in ``self.curr_epcoh``.

        Accuracy and loss are sampled and written to ``self.train_writer``
        every 20 iterations.  Each per-section timer is read through
        ``_timestamp`` so that it reflects completed GPU work.
        """
        train_loss = AvgMeter()
        train_acc = AvgMeter()
        start_time = time.time()
        time_to_tensor = 0
        time_to_cuda = 0
        time_model = 0
        time_criterion = 0
        time_backward = 0
        time_step = 0
        time_writer = 0

        self.curr_epcoh = epoch
        self.model.train()
        for (inputs, targets) in iterate_minibatches(self.X_train, self.y_train, batch_size, shuffle=False):
            self.curr_iter += 1

            time_1 = self._timestamp()
            inputs, targets = torch.tensor(inputs, dtype=torch.float), torch.tensor(targets, dtype=torch.int64)
            time_to_tensor += (self._timestamp() - time_1)

            time_1 = self._timestamp()
            inputs, targets = inputs.to(self.device), targets.to(self.device)
            time_to_cuda += (self._timestamp() - time_1)

            # zero the parameter gradients
            self.optimizer.zero_grad()

            # forward + backward + optimize
            time_1 = self._timestamp()
            out = self.model(inputs)
            time_model += (self._timestamp() - time_1)

            time_1 = self._timestamp()
            loss = self.criterion(out, targets)
            time_criterion += (self._timestamp() - time_1)

            time_1 = self._timestamp()
            loss.backward()
            time_backward += (self._timestamp() - time_1)

            time_1 = self._timestamp()
            self.optimizer.step()
            time_step += (self._timestamp() - time_1)

            if self.curr_iter % 20 == 0:
                time_1 = self._timestamp()
                pred = out.max(dim=1)[1].cpu().numpy()
                gt = targets.cpu().numpy()
                acc = np.mean(pred==gt)
                _loss = loss.detach().cpu().numpy()
                train_acc.update(acc)
                train_loss.update(_loss)
                # The learning rate is read from the optimizer; the original
                # referenced an undefined name here.
                curr_lr = self.optimizer.param_groups[0]['lr']
                self.train_writer.add_scalar('learning_rate', curr_lr, self.curr_iter)
                self.train_writer.add_scalar('acc', acc, self.curr_iter)
                self.train_writer.add_scalar('loss', _loss, self.curr_iter)
                time_writer += (self._timestamp() - time_1)

        fmt_str = "Train\tEpoch [{:d}/{:d}] train_Loss: {:.4f}\ttrain_Acc: {:.2f}\tTime per Epoch: {:.4f}"
        print_str = fmt_str.format(self.curr_epcoh,
                                   max_epochs,
                                   train_loss.avg,
                                   train_acc.avg*100,
                                   self._timestamp()-start_time)
        print(print_str)
        print('time_to_tensor: ',time_to_tensor)
        print('time_to_cuda: ',time_to_cuda)
        print('time_model: ',time_model)
        print('time_criterion: ',time_criterion)
        print('time_backward: ',time_backward)
        print('time_step: ',time_step)
        print('time_writer: ',time_writer)
        print(len(self.y_train))
        train_acc.reset()
        train_loss.reset()

    def validation(self):
        """Evaluate the model on the validation split and log the metrics.

        Computes the average loss, accuracy, Cohen's kappa and macro-averaged
        recall over ``self.X_val`` / ``self.y_val``, prints them, and writes
        them to ``self.val_writer`` at step ``self.curr_iter``.
        """
        self.model.eval()
        with torch.no_grad():
            # val set
            loss_gather = AvgMeter()
            pred_gather = PreGather()
            tran_time = 0
            for (inputs, targets) in iterate_minibatches(self.X_val, self.y_val, batch_size, shuffle=False):
                # Every iteration ends with .cpu() copies, so no GPU work is
                # pending when this transfer timer starts.
                time_1 = time.time()
                inputs, targets = torch.tensor(inputs, dtype=torch.float), torch.tensor(targets, dtype=torch.int64)
                inputs, targets = inputs.to(self.device), targets.to(self.device)
                tran_time += (time.time() - time_1)

                out = self.model(inputs)
                _loss = self.criterion(out, targets).detach().cpu().numpy()
                pred = out.max(dim=1)[1].cpu().numpy()
                loss_gather.update(_loss)
                pred_gather.update(pred)

            val_loss = loss_gather.avg
            pred = pred_gather.pred
            val_acc = np.mean(pred==self.y_val)
            val_kappa = cohen_kappa_score(self.y_val, pred)
            val_BCA = recall_score(self.y_val, pred, average='macro')
            loss_gather.reset()
            pred_gather.reset()

        # Then we print the results for this epoch:
        fmt_str = "VAL \tEpoch [{:d}/{:d}] val_Loss: {:.4f}\tval_Acc: {:.2f}\tval_kappa: {:.2f}\tval_BCA: {:.2f}"
        print_str = fmt_str.format(self.curr_epcoh,
                                   max_epochs,
                                   val_loss,
                                   val_acc*100,
                                   val_kappa*100,
                                   val_BCA*100)
        print(print_str)
        print(tran_time)
        print(len(self.y_val))
        self.val_writer.add_scalar('acc', val_acc, self.curr_iter)
        self.val_writer.add_scalar('loss', val_loss, self.curr_iter)
        self.val_writer.add_scalar('kappa', val_kappa, self.curr_iter)
        self.val_writer.add_scalar('bca', val_BCA, self.curr_iter)
def train_all_subject(num_epochs=max_epochs, log_path=None):
    """Train one model per subject (leave-subject-out cross validation).

    Args:
        num_epochs: upper bound on the number of epochs run per subject.
            Defaults to the module-level ``max_epochs``.  The progress lines
            printed by ``Trainer`` still show ``max_epochs`` as denominator.
        log_path: log directory prefix forwarded to ``Trainer``, which
            concatenates it with a string — pass a ``str``; the ``None``
            default is not usable as is.

    For every subject a fresh ``Trainer`` is built on that subject's fold and
    alternates ``train`` / ``validation`` until ``num_epochs`` is reached or
    the trainer sets ``train_end_flag``.
    """
    # Leave-Subject-Out cross validation
    subj_nums, fold_pairs, EEGs, labels = load_data(file_path, subj_nums=38, channel=1)
    for subj_id in range(subj_nums):
        print('The subj_id', subj_id, '\t\t Training the ' + model_type + ' Model...')
        trainer = Trainer(EEGs, labels, fold_pairs[subj_id], subj_id, log_path)
        # Honour the num_epochs argument instead of the global max_epochs.
        for epoch in range(trainer.curr_epcoh, num_epochs):
            if trainer.train_end_flag is not False:
                break
            trainer.train(epoch)
            trainer.validation()
            print('-'*50)
python tensorflow pytorch
python tensorflow pytorch
edited Nov 13 at 8:27
asked Nov 12 at 11:34
user9724030
12
12
It would be easier to respond to this if you used generated training data for this test, so that someone else would be able to reproduce the results in order to track down potential inefficiencies. Without being able to test, it is difficult to say. That said, I wonder why the times for backward and step are really low in the first picture, but extremely high in the second picture. That is a bit weird. You may also try torch.tensor(..., device=device) as this is generally (sometimes much) faster than using .to(device). No detour over CPU memory.
– blue-phoenox
Nov 12 at 12:17
Thanks for your reply. It might be hard to generate suitable data, and more code is needed to add. Actually, I also tried torch.tensor(..., device=self.device). There is no essential difference in the training time required.
– user9724030
Nov 13 at 2:22
add a comment |
It would be easier to respond to this if you used generated training data for this test, so that someone else would be able to reproduce the results in order to track down potential inefficiencies. Without being able to test, it is difficult to say. That said, I wonder why the times for backward and step are really low in the first picture, but extremely high in the second picture. That is a bit weird. You may also try torch.tensor(..., device=device) as this is generally (sometimes much) faster than using .to(device). No detour over CPU memory.
– blue-phoenox
Nov 12 at 12:17
Thanks for your reply. It might be hard to generate suitable data, and more code is needed to add. Actually, I also tried torch.tensor(..., device=self.device). There is no essential difference in the training time required.
– user9724030
Nov 13 at 2:22
It would be easier to respond on this if you would use generated training data for this test, so that someone else would be able to reproduce the results in order to track down potential inefficiencies. Without being able to test it is difficult to say. That said, I wonder why time for backward and step are really low in the first picture, but extremely high in the second picture. That is a bit weird. You may also try
torch.tensor(..., device=device) as this is generally (sometimes much) faster than using .to(device). No detour over CPU memory.– blue-phoenox
Nov 12 at 12:17
It would be easier to respond on this if you would use generated training data for this test, so that someone else would be able to reproduce the results in order to track down potential inefficiencies. Without being able to test it is difficult to say. That said, I wonder why time for backward and step are really low in the first picture, but extremely high in the second picture. That is a bit weird. You may also try
torch.tensor(..., device=device) as this is generally (sometimes much) faster than using .to(device). No detour over CPU memory.– blue-phoenox
Nov 12 at 12:17
Thanks for your reply. It might be hard to generate suitable data, and more code is needed to add. Actually, I also tried torch.tensor(..., device=self.device). There is no essential difference in the training time required.
– user9724030
Nov 13 at 2:22
Thanks for your reply. It might be hard to generate suitable data, and more code is needed to add. Actually, I also tried torch.tensor(..., device=self.device). There is no essential difference in the training time required.
– user9724030
Nov 13 at 2:22
add a comment |
active
oldest
votes
Your Answer
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);
StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
}
});
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53261322%2fpytorch-runs-much-slower-than-tensorflow-even-for-a-shallow-cnn%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
Thanks for contributing an answer to Stack Overflow!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Some of your past answers have not been well-received, and you're in danger of being blocked from answering.
Please pay close attention to the following guidance:
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53261322%2fpytorch-runs-much-slower-than-tensorflow-even-for-a-shallow-cnn%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
It would be easier to respond on this if you would use generated training data for this test, so that someone else would be able to reproduce the results in order to track down potential inefficiencies. Without being able to test it is difficult to say. That said, I wonder why time for backward and step are really low in the first picture, but extremely high in the second picture. That is a bit weird. You may also try
torch.tensor(..., device=device) as this is generally (sometimes much) faster than using .to(device). No detour over CPU memory. – blue-phoenox
Nov 12 at 12:17
Thanks for your reply. It might be hard to generate suitable data, and more code is needed to add. Actually, I also tried torch.tensor(..., device=self.device). There is no essential difference in the training time required.
– user9724030
Nov 13 at 2:22