1. Introduction
This post reproduces the LeNet-5 model in PyTorch and uses it to recognize your own handwritten digit images.
Building a model with the PyTorch framework is relatively simple, but you will still run into many problems. There is plenty of material on the Internet, and the ways of building the model are all similar. When I tried to build the model myself, I hit many problems in both training and detection, and asking others for advice on problems like these was of little use. I originally reproduced a project from GitHub; only after setting up the environment did I discover that it required a GPU. I tried to run it on a CPU, but it failed. To reproduce it, I rented a machine on the AutoDL platform and built it again; the process is recorded here. The steps below are for a GPU; for the CPU version you need to modify the source code yourself. My goal is to train my own model and deploy it on an RK3568, so I train and test it first to lay the foundation for the later deployment.
2. Environment
3. Construction
1. Create a virtual environment
conda create -n LeNet5_env python==3.8
2. Install pytorch
PyTorch
Following the official PyTorch installation instructions, the CUDA 11.0 (cu110) GPU build is installed here; other builds (for example the CPU-only one) can be installed in the same way. The installation command is:
pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html -i https://pypi.tuna.tsinghua.edu.cn/simple
You also need to install some other libraries
pip install matplotlib -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install opencv-python -i https://pypi.tuna.tsinghua.edu.cn/simple
3. Data set download
http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Copy each of the above addresses into a web browser to download the corresponding file.
After downloading, save it to the data/MNIST/raw directory.
4. Training code
The training model has four files: LeNet5.py; myDatast.py; readMnist.py; train.py
The file LeNet5.py is the network layer model
train.py
"""Train LeNet-5 on the MNIST training set and save the weights to lenet5.pth."""
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from readMnist import load_train_images, load_train_labels
from myDatast import Mnist
from LeNet5 import LeNet5

train_images = load_train_images()
train_labels = load_train_labels()
trainData = Mnist(train_images, train_labels)
train_data = DataLoader(dataset=trainData, batch_size=1, shuffle=True)

# Use the GPU when one is available, otherwise fall back to the CPU so the
# script also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

lenet5 = LeNet5().to(device)
lossFun = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=lenet5.parameters(), lr=1e-4)

Epochs = 100
L = len(train_data)  # number of batches per epoch
for epoch in range(Epochs):
    for i, (img, label) in enumerate(train_data):
        img = img.float().to(device)
        # CrossEntropyLoss expects integer class indices as targets.
        label = label.long().to(device)

        # Call the module itself (not .forward) so registered hooks run.
        output = lenet5(img)
        loss = lossFun(output, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        step = epoch * L + i + 1  # global iteration counter, 1-based
        if step % 100 == 0:
            print('epoch:{}, iter:{},loss:{:.6f}'.format(epoch + 1, step, loss.item()))

    # Overwrite the checkpoint after every epoch so an interrupted run still
    # leaves the most recent weights on disk.
    torch.save(lenet5.state_dict(), 'lenet5.pth')
**LeNet5.py**
import torch.nn as nn


class LeNet5(nn.Module):
    """LeNet-5 style network: two conv/pool stages followed by three linear layers.

    ``fc1`` expects ``16 * 4 * 4`` input features, which is what the two
    unpadded 5x5 convolutions and 2x2 max-poolings produce from a
    ``1 x 28 x 28`` input (28 -> 24 -> 12 -> 8 -> 4).

    The attribute names and the ``nn.Sequential`` nesting determine the
    ``state_dict`` keys, so they must stay as they are for saved weights to
    keep loading.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
            nn.Sigmoid(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
            nn.Sigmoid(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.fc1 = nn.Sequential(
            nn.Linear(in_features=16 * 4 * 4, out_features=120),
            nn.Sigmoid(),
        )
        self.fc2 = nn.Sequential(
            nn.Linear(in_features=120, out_features=84),
            nn.Sigmoid(),
        )
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, img):
        """Return the raw class scores for a batch of images.

        :param img: float tensor of shape ``(batch, 1, 28, 28)``
        :return: tensor of shape ``(batch, 10)``; no softmax is applied
        """
        # Sub-modules are called directly rather than through ``.forward`` so
        # that any hooks registered on them are executed.
        img = self.conv1(img)
        img = self.conv2(img)
        # Flatten everything except the batch dimension for the linear layers.
        img = img.view(img.size(0), -1)
        img = self.fc1(img)
        img = self.fc2(img)
        return self.fc3(img)
**myDatast.py**
from torch.utils.data import Dataset
from torchvision import transforms
import numpy as np


class Mnist(Dataset):
    """Dataset wrapper that pairs MNIST image arrays with their labels.

    :param dataset: indexable collection of images, each of shape
        ``(1, rows, cols)`` with pixel values in 0..255
    :param label: indexable collection of labels, same length as ``dataset``
    """

    def __init__(self, dataset, label):
        self.dataset = dataset
        self.label = label
        self.len = len(self.label)
        # ToTensor maps uint8 pixels to [0, 1]; Normalize then maps them to
        # [-1, 1] via (x - 0.5) / 0.5.
        self.transforms = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5], std=[0.5]),
        ])

    def __len__(self):
        """Return the number of samples (the number of labels)."""
        return self.len

    def __getitem__(self, item):
        """Return ``(image_tensor, label)`` for the sample at index ``item``."""
        img = self.dataset[item]
        img_id = self.label[item]
        # ToTensor expects an H x W x C array, so move the channel axis last.
        img = np.transpose(img, (1, 2, 0))
        # ToTensor only rescales uint8 input to [0, 1]; a float array would be
        # passed through unscaled (0..255) and Normalize would then produce
        # values up to ~509 instead of [-1, 1]. Cast to uint8 so the scaling
        # actually happens. NOTE: weights trained with the previous unscaled
        # inputs must be retrained.
        img = img.astype(np.uint8)
        img = self.transforms(img)
        return img, img_id
readMnist.py
"""Readers for the raw MNIST idx3 (image) and idx1 (label) files."""
import os
import struct

import numpy as np

# The parsers below need neither matplotlib nor OpenCV; cv2 is only used by
# the preview under ``__main__``. Guard both imports so that loading the data
# keeps working on machines where these GUI libraries are missing or broken.
try:
    import matplotlib.pyplot as plt
except ImportError:
    plt = None
try:
    import cv2
except ImportError:
    cv2 = None

# Directory holding the four raw MNIST files. Set the MNIST_RAW_DIR
# environment variable to override it instead of editing this file.
fpath = os.environ.get(
    'MNIST_RAW_DIR',
    'G:/enpei_Project_Code/21_LeNet5/LeNet5-master/myLeNet5/data/MNIST/raw/',
)
# Training set file
train_images_idx3_ubyte_file = os.path.join(fpath, 'train-images-idx3-ubyte')
# Training set label file
train_labels_idx1_ubyte_file = os.path.join(fpath, 'train-labels-idx1-ubyte')
# Test set file
test_images_idx3_ubyte_file = os.path.join(fpath, 't10k-images-idx3-ubyte')
# Test set label file
test_labels_idx1_ubyte_file = os.path.join(fpath, 't10k-labels-idx1-ubyte')

# Headers are big-endian 32-bit integers: four for image files (magic, count,
# rows, cols) and two for label files (magic, count).
_IDX3_HEADER = struct.Struct('>iiii')
_IDX1_HEADER = struct.Struct('>ii')
_IDX3_MAGIC = 2051  # 0x00000803
_IDX1_MAGIC = 2049  # 0x00000801


def _read_bytes(path):
    """Return the full binary contents of *path*, closing the file afterwards."""
    with open(path, 'rb') as handle:
        return handle.read()


def decode_idx3_ubyte(idx3_ubyte_file):
    """
    Parse an idx3 (image) file.

    :param idx3_ubyte_file: idx3 file path
    :return: float64 np.array of shape (n, 1, rows, cols) with pixel values 0..255
    :raises ValueError: if the magic number is not 2051 or the file holds
        fewer pixel bytes than the header announces
    """
    bin_data = _read_bytes(idx3_ubyte_file)

    magic_number, num_images, num_rows, num_cols = _IDX3_HEADER.unpack_from(bin_data, 0)
    if magic_number != _IDX3_MAGIC:
        raise ValueError('%s is not an idx3 image file (magic number %d, expected %d)'
                         % (idx3_ubyte_file, magic_number, _IDX3_MAGIC))
    print('Magic number: %d, number of pictures: %d, picture size: %d*%d'
          % (magic_number, num_images, num_rows, num_cols))

    # Every pixel is one unsigned byte stored right after the 16-byte header,
    # so the whole payload can be viewed as a uint8 array in a single call.
    pixels = np.frombuffer(
        bin_data,
        dtype=np.uint8,
        count=num_images * num_rows * num_cols,
        offset=_IDX3_HEADER.size,
    )
    # astype() copies, giving callers a writable float64 array.
    return pixels.reshape(num_images, 1, num_rows, num_cols).astype(np.float64)


def decode_idx1_ubyte(idx1_ubyte_file):
    """
    Parse an idx1 (label) file.

    :param idx1_ubyte_file: idx1 file path
    :return: float64 np.array of shape (n,) holding the labels
    :raises ValueError: if the magic number is not 2049 or the file holds
        fewer label bytes than the header announces
    """
    bin_data = _read_bytes(idx1_ubyte_file)

    magic_number, num_images = _IDX1_HEADER.unpack_from(bin_data, 0)
    if magic_number != _IDX1_MAGIC:
        raise ValueError('%s is not an idx1 label file (magic number %d, expected %d)'
                         % (idx1_ubyte_file, magic_number, _IDX1_MAGIC))
    print('Magic number: %d, number of pictures: %d' % (magic_number, num_images))

    # Every label is one unsigned byte stored right after the 8-byte header.
    labels = np.frombuffer(bin_data, dtype=np.uint8, count=num_images,
                           offset=_IDX1_HEADER.size)
    return labels.astype(np.float64)


def load_train_images(idx_ubyte_file=train_images_idx3_ubyte_file):
    """
    TRAINING SET IMAGE FILE (train-images-idx3-ubyte):
    [offset] [type]          [value]          [description]
    0000     32 bit integer  0x00000803(2051) magic number
    0004     32 bit integer  60000            number of images
    0008     32 bit integer  28               number of rows
    0012     32 bit integer  28               number of columns
    0016     unsigned byte   ??               pixel
    0017     unsigned byte   ??               pixel
    ........
    xxxx     unsigned byte   ??               pixel
    Pixels are organized row-wise. Pixel values are 0 to 255. 0 means background (white), 255 means foreground (black).

    :param idx_ubyte_file: idx file path
    :return: n*1*row*col dimension np.array object, n is the number of pictures
    """
    return decode_idx3_ubyte(idx_ubyte_file)


def load_train_labels(idx_ubyte_file=train_labels_idx1_ubyte_file):
    """
    TRAINING SET LABEL FILE (train-labels-idx1-ubyte):
    [offset] [type]          [value]          [description]
    0000     32 bit integer  0x00000801(2049) magic number (MSB first)
    0004     32 bit integer  60000            number of items
    0008     unsigned byte   ??               label
    0009     unsigned byte   ??               label
    ........
    xxxx     unsigned byte   ??               label
    The label values are 0 to 9.

    :param idx_ubyte_file: idx file path
    :return: n-element np.array object, n is the number of pictures
    """
    return decode_idx1_ubyte(idx_ubyte_file)


def load_test_images(idx_ubyte_file=test_images_idx3_ubyte_file):
    """
    TEST SET IMAGE FILE (t10k-images-idx3-ubyte):
    [offset] [type]          [value]          [description]
    0000     32 bit integer  0x00000803(2051) magic number
    0004     32 bit integer  10000            number of images
    0008     32 bit integer  28               number of rows
    0012     32 bit integer  28               number of columns
    0016     unsigned byte   ??               pixel
    0017     unsigned byte   ??               pixel
    ........
    xxxx     unsigned byte   ??               pixel
    Pixels are organized row-wise. Pixel values are 0 to 255. 0 means background (white), 255 means foreground (black).

    :param idx_ubyte_file: idx file path
    :return: n*1*row*col dimension np.array object, n is the number of pictures
    """
    return decode_idx3_ubyte(idx_ubyte_file)


def load_test_labels(idx_ubyte_file=test_labels_idx1_ubyte_file):
    """
    TEST SET LABEL FILE (t10k-labels-idx1-ubyte):
    [offset] [type]          [value]          [description]
    0000     32 bit integer  0x00000801(2049) magic number (MSB first)
    0004     32 bit integer  10000            number of items
    0008     unsigned byte   ??               label
    0009     unsigned byte   ??               label
    ........
    xxxx     unsigned byte   ??               label
    The label values are 0 to 9.

    :param idx_ubyte_file: idx file path
    :return: n-element np.array object, n is the number of pictures
    """
    return decode_idx1_ubyte(idx_ubyte_file)


if __name__ == '__main__':
    train_images = load_train_images()
    train_labels = load_train_labels()
    test_images = load_test_images()
    test_labels = load_test_labels()

    if cv2 is None:
        raise SystemExit('opencv-python is required to preview the images')

    # Check the first ten data and their labels to see if the reading is correct
    cv2.namedWindow('img')
    for label, image in zip(train_labels[:10], train_images[:10]):
        print(label)
        # imshow treats floating-point images as being in [0, 1]; convert the
        # 0..255 float pixels to uint8 so the grey levels are displayed as-is.
        preview = np.transpose(image, (1, 2, 0)).astype(np.uint8)
        cv2.imshow('img', preview)
        cv2.waitKey(100)
    print('done')
What needs to be noted in the above code is the path of the data set, which needs to be modified to the corresponding path.
Run python train.py
Training takes about 5 hours
5. Test
"""Evaluate the trained LeNet-5 weights (lenet5.pth) on the MNIST test set."""
from LeNet5 import LeNet5
import torch
from readMnist import load_test_images, load_test_labels
from myDatast import Mnist
from torch.utils.data import DataLoader
import numpy as np
import cv2

test_images = load_test_images()
test_labels = load_test_labels()
testData = Mnist(test_images, test_labels)
test_data = DataLoader(dataset=testData, batch_size=1, shuffle=True)

lenet5 = LeNet5()
# map_location lets weights that were saved from a CUDA model be loaded on a
# machine without a GPU; inference below runs on the CPU.
lenet5.load_state_dict(torch.load('lenet5.pth', map_location='cpu'))
lenet5.eval()

showimg = True  # set to False to skip the OpenCV preview window
js = 0          # number of correct predictions
total = 0       # number of evaluated samples

if showimg:
    cv2.namedWindow("img", 0)

# Gradients are not needed for evaluation.
with torch.no_grad():
    for img, label in test_data:
        img = img.float()
        predicted = int(torch.argmax(lenet5(img)))
        truth = int(label.item())

        total += 1
        if predicted == truth:
            js += 1

        if showimg:
            frame = np.squeeze(img.numpy())
            # Rescale to [0, 1] for display; a constant image would divide by
            # zero, so show it as all zeros instead.
            minv = np.min(frame)
            span = np.max(frame) - minv
            frame = (frame - minv) / span if span > 0 else np.zeros_like(frame)
            cv2.imshow("img", frame)
            title = "img, predicted value:{},truth value:{}".format(predicted, truth)
            cv2.setWindowTitle("img", title)
            cv2.waitKey(1)

# max(..., 1) avoids a division by zero when the test set is empty.
print('Accuracy: {:.6f}'.format(js / max(total, 1)))
The accuracy of the test results reached 0.986, basically meeting the requirements.
If there is any infringement or you need the complete code, please contact the blogger in time.
ueeze(oid)
maxv = np.max(img) minv = np.min(img) img = (img - minv) / (maxv - minv) cv2.namedWindow("img", 0) cv2.imshow("img", img) title = "img, predicted value:{},truth value:{}".format(oid, id) cv2.setWindowTitle("img",title) cv2.waitKey(1)
print('Accuracy: {:.6f}'.format(js / (i + 1)))
“