PyTorch: Sequential usage, loss function, backpropagation and optimizer

Article directory

    • nn.Sequential
    • Building a small practice model
    • Loss function and backpropagation
    • Optimizer

nn.Sequential

nn.Sequential is an ordered container. The modules used to build a neural network are added to the nn.Sequential() container in the order in which they are passed to the constructor.

import torch.nn as nn
from collections import OrderedDict
# Using Sequential to create a small model. When `model` is run,
# input will first be passed to `Conv2d(1,20,5)`. The output of
# `Conv2d(1,20,5)` will be used as the input to the first
# `ReLU`; the output of the first `ReLU` will become the input
# for `Conv2d(20,64,5)`. Finally, the output of
# `Conv2d(20,64,5)` will be used as input to the second `ReLU`
model = nn.Sequential(
          nn.Conv2d(1,20,5),
          nn.ReLU(),
          nn.Conv2d(20,64,5),
          nn.ReLU()
        )

# Using Sequential with OrderedDict. This is functionally the
# same as the above code
model = nn.Sequential(OrderedDict([
          ('conv1', nn.Conv2d(1,20,5)),
          ('relu1', nn.ReLU()),
          ('conv2', nn.Conv2d(20,64,5)),
          ('relu2', nn.ReLU())
        ]))
print(model)
Sequential(
  (conv1): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (relu1): ReLU()
  (conv2): Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
  (relu2): ReLU()
)

Building a small practice model

Still taking the CIFAR-10 model as an example:

  1. The input image is 3-channel 32×32
  2. Convolutional layer (5×5 convolution kernel)
  3. Max pooling layer (2×2 pooling kernel)
  4. Convolutional layer (5×5 convolution kernel)
  5. Max pooling layer (2×2 pooling kernel)
  6. Convolutional layer (5×5 convolution kernel)
  7. Max pooling layer (2×2 pooling kernel)
  8. Flatten
  9. Fully connected layers
  10. The final output size is 10

Based on the above introduction, PyTorch will be used to build and implement the CIFAR-10 model structure.

Parameter description: in_channels, out_channels, and kernel_size can be read directly from the input, the feature maps, and the convolution kernel, while stride and padding need to be calculated from the output-size formula.

The calculation formula of the specific feature map size is as follows:
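The formula below is the standard output-size relation for nn.Conv2d (as stated in the PyTorch documentation), shown here with a worked calculation using the numbers from this example:

$$H_{out} = \left\lfloor \frac{H_{in} + 2 \times padding - dilation \times (kernel\_size - 1) - 1}{stride} + 1 \right\rfloor$$

For the first convolution, $H_{in} = 32$, $kernel\_size = 5$, $dilation = 1$; choosing $stride = 1$ and requiring $H_{out} = 32$ gives $32 = (32 + 2 \times padding - 4 - 1) + 1$, so $padding = 2$.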

inputs: 3@32×32 (a 3-channel 32×32 image), 5×5 kernels -> Feature maps: 32@32×32

That is, after a convolution with 32 kernels of size 3@5×5, the spatial size does not change (the number of output channels equals the number of convolution kernels, and the number of channels of each kernel equals the number of input channels).

Using the above formula, the required stride and padding can be calculated.

The stride in the convolutional layer defaults to 1

The stride in the pooling layer defaults to the size of kernel_size

import torch
import torch.nn as nn
import torchvision
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
class BS(nn.Module):

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3,
                               out_channels=32,
                               kernel_size=5,
                               stride=1,
                               padding=2) #stride and padding are calculated
        self.maxpool1 = nn.MaxPool2d(kernel_size=2)
        self.conv2 = nn.Conv2d(in_channels=32,
                               out_channels=32,
                               kernel_size=5,
                               stride=1,
                               padding=2)
        self.maxpool2 = nn.MaxPool2d(kernel_size=2)
        self.conv3 = nn.Conv2d(in_channels=32,
                               out_channels=64,
                               kernel_size=5,
                               padding=2)
        self.maxpool3 = nn.MaxPool2d(kernel_size=2)
        self.flatten = nn.Flatten() # becomes 64*4*4=1024
        self.linear1 = nn.Linear(in_features=1024, out_features=64)
        self.linear2 = nn.Linear(in_features=64, out_features=10)
        
        
    def forward(self,x):
        x = self.conv1(x)
        x = self.maxpool1(x)
        x = self.conv2(x)
        x = self.maxpool2(x)
        x = self.conv3(x)
        x = self.maxpool3(x)
        x = self.flatten(x)
        x = self.linear1(x)
        x = self.linear2(x)
        return x
    
bs = BS()
bs
BS(
  (conv1): Conv2d(3, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
  (maxpool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
  (maxpool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
  (maxpool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear1): Linear(in_features=1024, out_features=64, bias=True)
  (linear2): Linear(in_features=64, out_features=10, bias=True)
)
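
As a quick sanity check (a minimal sketch, reusing the bs instance above with a CIFAR-10-sized dummy batch), the output shape can be confirmed:

# A batch of 64 dummy CIFAR-10 images: 3 channels, 32x32
x = torch.ones((64, 3, 32, 32))
y = bs(x)
print(y.shape)  # torch.Size([64, 10])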

Using Sequential to simplify the code and display the model in TensorBoard

The add_graph function is used to add a PyTorch model graph to TensorBoard. With it, the model's computation graph can be displayed visually, making it easier for others to understand the model's structure and workflow.

add_graph(model, input_to_model, strip_default_attributes=True)
  • model: The PyTorch model to add.
  • input_to_model: Input data used to generate model graphs.
  • strip_default_attributes: Whether to delete the default attributes in the model, the default is True.
class BS(nn.Module):

    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Conv2d(in_channels=3,
                               out_channels=32,
                               kernel_size=5,
                               stride=1,
                               padding=2), #stride and padding are calculated
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(in_channels=32,
                                   out_channels=32,
                                   kernel_size=5,
                                   stride=1,
                                   padding=2),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(in_channels=32,
                                   out_channels=64,
                                   kernel_size=5,
                                   padding=2),
            nn.MaxPool2d(kernel_size=2),
            nn.Flatten(), #becomes 64*4*4=1024
            nn.Linear(in_features=1024, out_features=64),
            nn.Linear(in_features=64, out_features=10),
        )
    
    def forward(self,x):
        x = self.model(x)
        return x
    
bs = BS()
print(bs)
BS(
  (model): Sequential(
    (0): Conv2d(3, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (2): Conv2d(32, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Flatten(start_dim=1, end_dim=-1)
    (7): Linear(in_features=1024, out_features=64, bias=True)
    (8): Linear(in_features=64, out_features=10, bias=True)
  )
)
# Display in tensorboard
input_ = torch.ones((64,3,32,32))
writer = SummaryWriter(".logs")
writer.add_graph(bs, input_) # defined model, data
writer.close()
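
After running the code above, the graph can be viewed by launching TensorBoard against the same log directory, e.g. tensorboard --logdir=.logs (assuming TensorBoard is installed), and opening the graph view in the browser.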

The network structure graph visualized in TensorBoard looks as follows:

Loss function and backpropagation

The loss function calculates the error between the model's actual output and the target output, and backpropagation then updates the model's weights and parameters to reduce the error between the predicted output and the target output. The loss function serves two purposes:

  • Calculate the difference between the actual output and the target output
  • Provide a basis for updating the parameters (backpropagation)

The loss functions used by different models are generally different.

MAE Mean Absolute Error

torch.nn.L1Loss(size_average=None, reduce=None, reduction='mean')
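
With reduction='mean', this is the standard mean-absolute-error definition over all N elements (with reduction='sum', the division by N is dropped):

$$\ell(x, y) = \frac{1}{N} \sum_{n=1}^{N} |x_n - y_n|$$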

import torch
import torch.nn as nn
# Instantiate
criterion1 = nn.L1Loss(reduction='mean')#mean
criterion2 = nn.L1Loss(reduction="sum")#sum
output = torch.tensor([1.0, 2.0, 3.0])#or dtype=torch.float32
target = torch.tensor([2.0, 2.0, 2.0])#or dtype=torch.float32
# Average loss value
loss = criterion1(output, target)
print(loss) # Output: tensor(0.6667)
# Error sum
loss1 = criterion2(output,target)
print(loss1) # tensor(2.)
tensor(0.6667)
tensor(2.)
loss = nn.L1Loss()
input = torch.randn(3, 5, requires_grad=True)
target = torch.randn(3, 5)
output = loss(input, target)
output.backward()
output
tensor(1.0721, grad_fn=<MeanBackward0>)

MSE Mean-Square Error

torch.nn.MSELoss(size_average=None, reduce=None, reduction='mean')
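
With reduction='mean', this is the standard mean-squared-error definition (with reduction='sum', the division by N is dropped):

$$\ell(x, y) = \frac{1}{N} \sum_{n=1}^{N} (x_n - y_n)^2$$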

import torch
import torch.nn as nn
# Instantiate
criterion1 = nn.MSELoss(reduction='mean')
criterion2 = nn.MSELoss(reduction="sum")
output = torch.tensor([1, 2, 3],dtype=torch.float32)
target = torch.tensor([1, 2, 5],dtype=torch.float32)
# Average loss value
loss = criterion1(output, target)
print(loss) # Output: tensor(1.3333)
# Error sum
loss1 = criterion2(output,target)
print(loss1) # tensor(4.)
tensor(1.3333)
tensor(4.)

Cross entropy loss CrossEntropyLoss

torch.nn.CrossEntropyLoss(weight=None,size_average=None,ignore_index=-100,reduce=None,reduction='mean',label_smoothing=0.0)

The weight argument is particularly useful when you have an unbalanced training set.
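
For a single sample with raw scores x (logits) and a target class, without class weights, the loss is the standard cross-entropy of the softmax of x:

$$\ell(x, class) = -x_{class} + \log\left(\sum_{j} \exp(x_j)\right) = -\log\left(\frac{\exp(x_{class})}{\sum_{j} \exp(x_j)}\right)$$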

import torch
import torch.nn as nn

# Set up a three-class classification problem. Assume the raw scores (logits) are: person 0.1, dog 0.2, cat 0.3.
x = torch.tensor([0.1, 0.2, 0.3])
print(x)
y = torch.tensor([1]) # Let the target label be 1, i.e. the dog class (score 0.2)
x = torch.reshape(x, (1, 3)) # tensor([[0.1000, 0.2000, 0.3000]]): batch size 1, 3 classes
print(x)
print(y)
# Instantiate the object
loss_cross = nn.CrossEntropyLoss()
# Calculation results
result_cross = loss_cross(x, y)
print(result_cross)
tensor([0.1000, 0.2000, 0.3000])
tensor([[0.1000, 0.2000, 0.3000]])
tensor([1])
tensor(1.1019)
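
A minimal check of this value against the formula above (using only the numbers from this example):

# Manual cross-entropy for target class 1:
# loss = -x[class] + log(sum_j exp(x[j]))
scores = torch.tensor([0.1, 0.2, 0.3])
manual = -scores[1] + torch.log(torch.exp(scores).sum())
print(manual) # tensor(1.1019), matching result_cross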
import torch
import torch.nn as nn
import torchvision
from torch.utils.data import DataLoader

# Prepare data set
dataset = torchvision.datasets.CIFAR10(root="dataset",train=False,transform=torchvision.transforms.ToTensor(),download=True)
#Dataset loader
dataloader = DataLoader(dataset, batch_size=1)
"""
The input image is 3-channel 32×32,
After successively passing through the convolution layer (5×5 convolution kernel),
Max pooling layer (2×2 pooling kernel),
Convolutional layer (5×5 convolution kernel),
Max pooling layer (2×2 pooling kernel),
Convolutional layer (5×5 convolution kernel),
Max pooling layer (2×2 pooling kernel),
Flatten,
Fully connected layer processing,
The final output size is 10
"""

# Build neural network
class BS(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Conv2d(in_channels=3,
                      out_channels=32,
                      kernel_size=5,
                      stride=1,
                      padding=2), #stride and padding are calculated
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(in_channels=32,
                      out_channels=32,
                      kernel_size=5,
                      stride=1,
                      padding=2),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(in_channels=32,
                      out_channels=64,
                      kernel_size=5,
                      padding=2),
            nn.MaxPool2d(kernel_size=2),
            nn.Flatten(), #becomes 64*4*4=1024
            nn.Linear(in_features=1024, out_features=64),
            nn.Linear(in_features=64, out_features=10),
        )

    def forward(self, x):
        x = self.model(x)
        return x


# Instantiate
bs = BS()
loss = torch.nn.CrossEntropyLoss()
# Calculate the CrossEntropyLoss loss function for each image
# Use the loss function loss to calculate the cross-entropy loss between the prediction result and the target label

for inputs,labels in dataloader:
    outputs = bs(inputs)
    result = loss(outputs,labels)
    print(result)


tensor(2.3497, grad_fn=<NllLossBackward0>)
tensor(2.2470, grad_fn=<NllLossBackward0>)
tensor(2.2408, grad_fn=<NllLossBackward0>)
tensor(2.2437, grad_fn=<NllLossBackward0>)
tensor(2.3121, grad_fn=<NllLossBackward0>)
........
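
The backpropagation step itself is one extra call: invoking .backward() on the loss fills the .grad attribute of every parameter, which an optimizer can then use. A minimal sketch, reusing bs, loss, and dataloader from above:

inputs, labels = next(iter(dataloader))
outputs = bs(inputs)
result = loss(outputs, labels)
result.backward() # backpropagation: compute gradients of the loss w.r.t. all parameters
print(bs.model[0].weight.grad.shape) # gradients of the first Conv2d layer: torch.Size([32, 3, 5, 5])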

Optimizer

An optimizer is a tool used to update the parameters of a neural network.

It adjusts the model's parameters based on the computed gradients of the loss function, in order to minimize the loss and improve the performance of the model.

Common optimizers include: SGD, Adam

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

model.parameters() is used to obtain the learnable parameters of the model

lr is the learning rate, i.e. the step size of each parameter update.

In each training batch, the following operations need to be performed:

  1. Input training data into the model and perform forward propagation

  2. Calculate loss based on loss function

  3. Call the zero_grad() method of the optimizer to clear the previous gradient.

  4. Call the backward() method to perform backpropagation and calculate the gradient

  5. Call the step() method of the optimizer to update the model parameters

The pseudocode is as follows (it is not runnable as-is):

import torch
import torch.optim as optim

# Step 1: Define the model
model = ...
# Step 2: Define the optimizer
optimizer = optim.SGD(model.parameters(), lr=0.01)
# Step 3: Define loss function
criterion = ...
# Step 4: Training loop
for inputs, labels in dataloader:
    # forward propagation
    outputs = model(inputs)

    # Calculate loss
    loss = criterion(outputs, labels)

    # Clear gradient
    optimizer.zero_grad()

    # Backpropagation to get the gradient
    loss.backward()

    # Update parameters and optimize based on gradients
    optimizer.step()

In the above pseudocode, SGD is used as the optimizer with lr=0.01. The loss function should be chosen according to the specific task, e.g. torch.nn.CrossEntropyLoss or torch.nn.MSELoss. Taking CIFAR-10 as an example:

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torch.utils.data import DataLoader

dataset = torchvision.datasets.CIFAR10(root="dataset", train=False, transform=torchvision.transforms.ToTensor(),download=True)
dataloader = DataLoader(dataset,batch_size=1)
class BS(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Conv2d(in_channels=3,
                      out_channels=32,
                      kernel_size=5,
                      stride=1,
                      padding=2), #stride and padding are calculated
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(in_channels=32,
                      out_channels=32,
                      kernel_size=5,
                      stride=1,
                      padding=2),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(in_channels=32,
                      out_channels=64,
                      kernel_size=5,
                      padding=2),
            nn.MaxPool2d(kernel_size=2),
            nn.Flatten(), #becomes 64*4*4=1024
            nn.Linear(in_features=1024, out_features=64),
            nn.Linear(in_features=64, out_features=10),
        )

    def forward(self, x):
        x = self.model(x)
        return x


model = BS() #define model
optimizer = optim.SGD(model.parameters(), lr=0.01) #Define optimizer SGD
criterion = nn.CrossEntropyLoss() #Define the loss function, cross entropy loss function

'''Loop over the dataset once, i.e. perform a single epoch of training'''
for inputs, labels in dataloader:
    # forward propagation
    outputs = model(inputs)
    # Calculate loss
    loss = criterion(outputs, labels)
    # Clear gradient
    optimizer.zero_grad()
    # Backpropagation
    loss.backward()
    # Update parameters
    optimizer.step()
    # Print the loss after the optimizer step
    print(loss)
    
"""Training cycle 20 times"""
# for epoch in range(20):
#running_loss = 0.0
# for inputs, labels in dataloader:
# # Forward propagation
# outputs = model(inputs)
# # Calculate loss
# loss = criterion(outputs,labels)
# # Clear gradient
# optimizer.zero_grad()
# # Backpropagation
# loss.backward()
# # Update parameters
# optimizer.step()
# # Print the results after the optimizer
# running_loss = running_loss + loss
# print(running_loss)
Files already downloaded and verified
tensor(2.3942, grad_fn=<NllLossBackward0>)
tensor(2.2891, grad_fn=<NllLossBackward0>)
tensor(2.2345, grad_fn=<NllLossBackward0>)
tensor(2.2888, grad_fn=<NllLossBackward0>)
tensor(2.2786, grad_fn=<NllLossBackward0>)
........