Python uses the os module to divide image datasets, cut and copy files

Python uses the os module to divide image data sets, files, cut and copy files

Table of Contents

  • Python uses the os module to divide image data sets, files, cut and copy files
    • 1. Method API
        • 1. Determine whether the file exists by path
        • 2. Create a new folder
        • 3. Path splicing
        • 4. Return the directory and file name in the specified folder
        • 5. Directory Traverser
        • 6. Copy files
        • 7. Traverse the list and return the index and value
    • 2. Case
      • 1. Topic
        • ① Combined sample division
        • ② Separate sample division
      • 2. The main idea
      • 3. Code
        • ① Combined sample division
        • ② Separate sample division

1. Method API

1. Determine whether the file exists by path

  • Method: os.path.exists(path)

  • Return value: a Boolean; True if the path exists, otherwise False.

2. Create a new folder

  • Method: os.makedirs(path)

3. Path splicing

  • Method: os.path.join(path1,path2)

  • Example:

    path1 = r"D:\data"
    path2 = "img1"
    imgs_path = os.path.join(path1, path2)
    print(imgs_path)  # prints: D:\data\img1
    

4. Return the directory and file name in the specified folder

  • Method: os.listdir(path)

5. Directory Traverser

  • Method: os.walk()

  • code:

    plant_path = r"D:\data"  # absolute path of the sample folder
    for root, dirs, files in os.walk(plant_path):
    
  • Return value: for each directory visited, a 3-tuple (root, dirs, files):

    • root: the path of the folder currently being traversed
    • dirs: a list of the names of the directories directly inside root (the contents of deeper subdirectories are not included)
    • files: a list of the names of the files directly inside root (files in subdirectories are not included)

6. Copy files

  • Method: shutil.copy(path1, path2)

  • Explanation: path1 is the source file path; path2 is the destination, which may be either a target folder or a full target file path

7. Traverse the list and return the index and value

  • Method: enumerate(list)

  • code:

    for index, value in enumerate(items):  # Each iteration yields an (index, value) tuple: index is the position of the element in the list, value is the element itself
    

2. Case

1. Topic

①Merge sample division

  • Divide the data set as shown in the figure:

Each sample folder under data (for example, apple scab) directly contains the images of that sample. After the split, each of the train, val, and test folders contains images from all of the samples.

image-20230523202321268

② Separate sample division

  • Divide each sample in the dataset into test, train, and val:

image-20230523212615962

2. The main idea

image-20230523205543363

3. Code

①Merge sample division

"""Combined-sample division: all samples share one train/val/test tree."""
import os
import random
import shutil

# Raw strings keep the Windows backslashes literal; in a normal string the
# "\a" in "\algorithm" would be read as the BEL control character.
path_data = r"D:\javaBC\PyChamDm\algorithm\demo4\data"  # Dataset storage path (first-level directory)
split_data_path = r"D:\javaBC\PyChamDm\algorithm\demo4\split_data"  # Output path of the divided dataset (first-level directory)

# Split ratios for the training, validation and test sets.
train_pct = 0.8
valid_pct = 0.1
test_pct = 0.1

train_data = os.path.join(split_data_path, "train")  # training set folder path
valid_data = os.path.join(split_data_path, "val")  # validation set folder path
test_data = os.path.join(split_data_path, "test")  # test set folder path

"""
Partition the dataset using paths
"""


def split_path():
    """Copy every image under ``path_data`` into the shared train/val/test folders.

    Each entry of ``path_data`` is treated as one sample folder. The files of
    every directory found while walking that folder are shuffled and then
    copied, by position, into ``train_data``, ``valid_data`` or ``test_data``
    according to ``train_pct`` and ``valid_pct``; whatever is left goes to the
    test folder. The three target folders must already exist.

    NOTE: images keep their original file name, so two samples that contain a
    file with the same name overwrite each other in the target folder.
    """
    for plant_name in os.listdir(path_data):  # folder name of each sample
        plant_path = os.path.join(path_data, plant_name)
        for root, dirs, files in os.walk(plant_path):  # files: image names in root
            random.shuffle(files)  # randomise which images land in which subset
            data_number = len(files)
            print(f"{plant_name} sample number:{data_number}")

            # Cumulative boundaries: indexes below train_number go to train,
            # below valid_number to val, the remainder to test.
            train_number = data_number * train_pct
            valid_number = data_number * valid_pct + train_number
            test_number = data_number * test_pct + valid_number
            print('a:{},b:{},c:{}'.format(train_number, valid_number, test_number))

            train_end = int(train_number)
            valid_end = int(valid_number)
            for i, img_name in enumerate(files):
                # Join with root (not plant_path) so that images found in
                # nested folders resolve to their real location.
                img_path = os.path.join(root, img_name)

                if i < train_end:
                    target_dir = train_data
                elif i < valid_end:
                    target_dir = valid_data
                else:
                    target_dir = test_data
                shutil.copy(img_path, os.path.join(target_dir, img_name))


# main function
if __name__ == '__main__':
    # Start from a clean output tree. The folders are recreated and the split
    # is run on every invocation, not only when the output folder was absent.
    if os.path.exists(split_data_path):
        shutil.rmtree(split_data_path)

    # os.makedirs also creates split_data_path itself as the missing parent.
    for subset_dir in (train_data, valid_data, test_data):
        os.makedirs(subset_dir)

    # Divide the dataset into the freshly created folders.
    split_path()

② Separate sample division

"""Separate-sample division: every sample gets its own train/val/test folders."""
import os
import random
import shutil

# Raw strings keep the Windows backslashes literal; in a normal string the
# "\a" in "\algorithm" would be read as the BEL control character.
path_data = r"D:\javaBC\PyChamDm\algorithm\demo4\data"  # Dataset storage path (first-level directory)
split_data_path = r"D:\javaBC\PyChamDm\algorithm\demo4\split_data_two"  # Output path of the divided dataset (first-level directory)

# Split ratios for the training, validation and test sets.
train_pct = 0.8
valid_pct = 0.1
test_pct = 0.1

"""
Partition the dataset using paths
"""
def split_path():
    """Split each sample folder of ``path_data`` into its own train/val/test folders.

    For every entry ``plant_name`` of ``path_data`` the folders
    ``split_data_path/plant_name/{train,val,test}`` are created through
    ``os_set``. The files of every directory found while walking the sample
    folder are shuffled and copied, by position, into those folders according
    to ``train_pct`` and ``valid_pct``; whatever is left goes to the test
    folder.
    """
    # Traverse the dataset (first-level directory).
    for plant_name in os.listdir(path_data):  # folder name of each sample
        # Each sample gets its own set of output folders.
        train_data = os.path.join(split_data_path, plant_name, "train")
        valid_data = os.path.join(split_data_path, plant_name, "val")
        test_data = os.path.join(split_data_path, plant_name, "test")
        os_set(train_data, valid_data, test_data)

        plant_path = os.path.join(path_data, plant_name)
        # Traverse the sample itself (second-level directory).
        for root, dirs, files in os.walk(plant_path):  # files: image names in root
            random.shuffle(files)  # randomise which images land in which subset
            data_number = len(files)
            print(f"{plant_name} sample number:{data_number}")

            # Cumulative boundaries: indexes below train_number go to train,
            # below valid_number to val, the remainder to test.
            train_number = data_number * train_pct
            valid_number = data_number * valid_pct + train_number
            test_number = data_number * test_pct + valid_number
            print('a:{},b:{},c:{}'.format(train_number, valid_number, test_number))

            train_end = int(train_number)
            valid_end = int(valid_number)
            for i, img_name in enumerate(files):
                # Join with root (not plant_path) so that images found in
                # nested folders resolve to their real location.
                img_path = os.path.join(root, img_name)

                if i < train_end:
                    target_dir = train_data
                elif i < valid_end:
                    target_dir = valid_data
                else:
                    target_dir = test_data
                shutil.copy(img_path, os.path.join(target_dir, img_name))


def os_set(train_data, valid_data, test_data):
    """Create the training, validation and test folders.

    Missing parent directories are created as well. Folders that already
    exist are left untouched, so the call is safe to repeat.

    Args:
        train_data: path of the training set folder.
        valid_data: path of the validation set folder.
        test_data: path of the test set folder.
    """
    for folder in (train_data, valid_data, test_data):
        # exist_ok avoids FileExistsError when a folder is already present.
        os.makedirs(folder, exist_ok=True)


# main function
if __name__ == '__main__':
    # Start from a clean output tree. The folder is recreated and the split is
    # run on every invocation, not only when the output folder was absent.
    if os.path.exists(split_data_path):
        shutil.rmtree(split_data_path)
    os.makedirs(split_data_path)

    # Divide the dataset; the per-sample folders are created inside split_path.
    split_path()