Python uses the os module to divide image data sets, files, cut and copy files
Table of Contents
- Python uses the os module to divide image data sets, files, cut and copy files
-
- 1. Method API
-
-
- 1. Determine whether the file exists by path
- 2. Create a new folder
- 3. Path splicing
- 4. Return the directory and file name in the specified folder
- 5. Directory Traverser
- 6. Copy files
- 7. Traverse the list and return the index and value
-
- 2. Case
-
- 1. Topic
-
- ① Combined sample division
- ② Separate sample division
- 2. The main idea
- 3. Code
-
- ① Combined sample division
- ② Separate sample division
1. Method API
1. Determine whether the file exists by path
-
Method:
os.path.exists(path)
-
Return value: bool — `True` if the path exists, otherwise `False`.
2. Create a new folder
- Method:
os.makedirs(path)
3. Path splicing
-
Method:
os.path.join(path1,path2)
-
Example:
import os

path1 = r"D:\data"  # raw string, so the backslash is kept literally
path2 = "img1"
imgs_path = os.path.join(path1, path2)
print(imgs_path)  # on Windows this prints: D:\data\img1
4. Return the directory and file name in the specified folder
- Method:
os.listdir(path)
5. Directory Traverser
-
Method:
os.walk()
-
code:
import os

plant_path = r"D:\data"  # absolute path of the sample folder
for root, dirs, files in os.walk(plant_path):
    # One iteration per folder in the tree, starting with plant_path itself.
    print(root, dirs, files)
-
Return value: a generator that yields one 3-tuple (root, dirs, files) per folder:
- root: the path of the folder that is currently being traversed
- dirs: a list of the names of the subdirectories directly inside root (deeper levels are not included)
- files: a list of the names of the files directly inside root (files in subdirectories are not included)
6. Copy files
-
Method:
shutil.copy(path1, path2)
-
Explanation: path1 is the source file path; path2 is the destination, which may be either a target folder or a target file path.
7. Traverse the list and return the index and value
-
Method:
enumerate(list)
-
code:
names = ["apple", "pear", "peach"]  # any list works here
for index, name in enumerate(names):
    # enumerate yields (index, value) tuples:
    # index is the position of each item in the list, name is the item itself.
    print(index, name)
2. Case
1. Topic
①Merge sample division
- Divide the data set as shown in the figure:
Each sample folder under data (such as apple scab) directly contains the images of that sample class. After the division, each folder of the dataset (train, val, test) contains images from all of the sample classes.
② Separate sample division
- Divide each sample in the dataset into test, train, and val:
2. The main idea
3. Code
①Merge sample division
"""Combine the samples and divide the dataset.

Every sample folder under ``path_data`` is split into training, validation
and test portions, and the images of all samples are copied together into
three shared folders: ``train``, ``val`` and ``test``.
"""
import os
import random
import shutil

# Raw strings are required here: in a normal string literal the "\a" in
# "\algorithm" would be interpreted as the BEL control character.
path_data = r"D:\javaBC\PyChamDm\algorithm\demo4\data"  # Data set storage path (first-level directory)
split_data_path = r"D:\javaBC\PyChamDm\algorithm\demo4\split_data"  # Divided dataset path (first-level directory)

# Hyperparameters: the ratio of training set, validation set and test set.
train_pct = 0.8
valid_pct = 0.1
test_pct = 0.1

train_data = os.path.join(split_data_path, "train")  # training set folder path
valid_data = os.path.join(split_data_path, "val")  # validation set folder path
test_data = os.path.join(split_data_path, "test")  # test set folder path


def split_path(data_dir=None, out_dir=None):
    """Split every sample folder and copy its images into train/val/test.

    Args:
        data_dir: Root folder holding one sub-folder per sample. Defaults to
            the module-level ``path_data``.
        out_dir: Root folder that receives the ``train``, ``val`` and
            ``test`` folders. Defaults to the module-level ``split_data_path``.

    The three output folders are created if they do not exist yet. Images
    from different samples that share a file name overwrite each other,
    because all samples are copied into the same three folders.
    """
    data_dir = path_data if data_dir is None else data_dir
    out_dir = split_data_path if out_dir is None else out_dir

    train_dir = os.path.join(out_dir, "train")
    valid_dir = os.path.join(out_dir, "val")
    test_dir = os.path.join(out_dir, "test")
    for target_dir in (train_dir, valid_dir, test_dir):
        os.makedirs(target_dir, exist_ok=True)

    for plant_name in os.listdir(data_dir):  # folder name of each sample
        plant_path = os.path.join(data_dir, plant_name)
        for root, dirs, files in os.walk(plant_path):  # files: image names in root
            random.shuffle(files)  # shuffle so the split is random
            data_number = len(files)
            print(f"{plant_name} sample number:{data_number}")

            # Cumulative boundaries: indexes below train_number go to train,
            # indexes below valid_number go to val, the rest go to test.
            train_number = data_number * train_pct
            valid_number = data_number * valid_pct + train_number
            test_number = data_number * test_pct + valid_number
            print('a:{},b:{},c:{}'.format(train_number, valid_number, test_number))

            for i, img_name in enumerate(files):
                # Join with root (not plant_path) so that images found in
                # nested folders resolve to their real location.
                img_path = os.path.join(root, img_name)
                if i < int(train_number):
                    target_dir = train_dir
                elif i < int(valid_number):
                    target_dir = valid_dir
                else:
                    target_dir = test_dir
                shutil.copy(img_path, os.path.join(target_dir, img_name))


# main function
if __name__ == '__main__':
    # Start from a clean output folder; split_path() recreates the folders,
    # so a rerun no longer fails after the old folder has been deleted.
    if os.path.exists(split_data_path):
        shutil.rmtree(split_data_path)
    split_path()
② Separate sample division
"""Divide each sample separately.

Every sample folder under ``path_data`` gets its own ``train``, ``val`` and
``test`` folders below ``split_data_path/<sample name>``.
"""
import os
import random
import shutil

# Raw strings are required here: in a normal string literal the "\a" in
# "\algorithm" would be interpreted as the BEL control character.
path_data = r"D:\javaBC\PyChamDm\algorithm\demo4\data"  # Data set storage path (first-level directory)
split_data_path = r"D:\javaBC\PyChamDm\algorithm\demo4\split_data_two"  # Divided data set path (first-level directory)

# Hyperparameters: the ratio of training set, validation set and test set.
train_pct = 0.8
valid_pct = 0.1
test_pct = 0.1


def split_path(data_dir=None, out_dir=None):
    """Split every sample folder into its own train/val/test folders.

    Args:
        data_dir: Root folder holding one sub-folder per sample. Defaults to
            the module-level ``path_data``.
        out_dir: Root folder that receives one folder per sample, each with
            ``train``, ``val`` and ``test`` inside. Defaults to the
            module-level ``split_data_path``.
    """
    data_dir = path_data if data_dir is None else data_dir
    out_dir = split_data_path if out_dir is None else out_dir

    # Traverse the data set (first-level directory).
    for plant_name in os.listdir(data_dir):
        plant_path = os.path.join(data_dir, plant_name)
        if not os.path.isdir(plant_path):
            # A stray file next to the sample folders is not a sample.
            continue

        # Output folders for this sample.
        train_data = os.path.join(out_dir, plant_name, "train")
        valid_data = os.path.join(out_dir, plant_name, "val")
        test_data = os.path.join(out_dir, plant_name, "test")
        os_set(train_data, valid_data, test_data)

        # Traverse each sample (secondary directory).
        for root, dirs, files in os.walk(plant_path):  # files: image names in root
            random.shuffle(files)  # shuffle so the split is random
            data_number = len(files)
            print(f"{plant_name} sample number:{data_number}")

            # Cumulative boundaries: indexes below train_number go to train,
            # indexes below valid_number go to val, the rest go to test.
            train_number = data_number * train_pct
            valid_number = data_number * valid_pct + train_number
            test_number = data_number * test_pct + valid_number
            print('a:{},b:{},c:{}'.format(train_number, valid_number, test_number))

            for i, img_name in enumerate(files):
                # Join with root (not plant_path) so that images found in
                # nested folders resolve to their real location.
                img_path = os.path.join(root, img_name)
                if i < int(train_number):
                    target_dir = train_data
                elif i < int(valid_number):
                    target_dir = valid_data
                else:
                    target_dir = test_data
                shutil.copy(img_path, os.path.join(target_dir, img_name))


def os_set(train_data, valid_data, test_data):
    """Create the training, validation and test folders (parents included)."""
    for folder in (train_data, valid_data, test_data):
        os.makedirs(folder, exist_ok=True)


# main function
if __name__ == '__main__':
    # Start from a clean output folder, then always recreate it.
    if os.path.exists(split_data_path):
        shutil.rmtree(split_data_path)
    os.makedirs(split_data_path)
    split_path()