[Target Detection] VisDrone and CARPK dataset preprocessing

The previous blog post, [Target Detection] Running the VisDrone dataset through YOLOv5, introduced the VisDrone dataset, so I will not repeat that here. This article mainly performs target extraction and filtering on the VisDrone and CARPK datasets.

Requirements

This article extracts and merges the car and person categories from the VisDrone dataset: cars are marked as class 0 and people as class 1, and the annotations are converted into the txt format supported by YOLO.
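
For reference, each line of a YOLO txt label has the form class x_center y_center width height, with all four coordinates normalized by the image width and height. The two lines below are illustrative values only:

0 0.481250 0.633333 0.085938 0.069444
1 0.250000 0.500000 0.020000 0.050000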

VisDrone dataset

Convert the VisDrone dataset to YOLO txt format

First, convert the format of the original annotations. The following code continues to use the official YOLOv5 conversion script.

from utils.general import download, os, Path  # utilities from the YOLOv5 repository


def visdrone2yolo(dir):
    from PIL import Image
    from tqdm import tqdm

    def convert_box(size, box):
        # Convert VisDrone box to YOLO xywh box
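        # VisDrone boxes are (left, top, width, height) in pixels;
        # YOLO uses (x_center, y_center, width, height) normalized by image size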
        dw = 1. / size[0]
        dh = 1. / size[1]
        return (box[0] + box[2] / 2) * dw, (box[1] + box[3] / 2) * dh, box[2] * dw, box[3] * dh

    (dir / 'labels').mkdir(parents=True, exist_ok=True) # make labels directory
    pbar = tqdm((dir / 'annotations').glob('*.txt'), desc=f'Converting {dir}')
    for f in pbar:
        img_size = Image.open((dir / 'images' / f.name).with_suffix('.jpg')).size
        lines = []
        with open(f, 'r') as file: # read annotation.txt
            for row in [x.split(',') for x in file.read().strip().splitlines()]:
                if row[4] == '0': # VisDrone 'ignored regions' class 0
                    continue
                cls = int(row[5]) - 1  # category number minus 1
                box = convert_box(img_size, tuple(map(int, row[:4])))
                lines.append(f"{<!-- -->cls} {<!-- -->' '.join(f'{<!-- -->x:.6f}' for x in box)}\\
")
                with open(str(f).replace(os.sep + 'annotations' + os.sep, os.sep + 'labels' + os.sep), 'w') as fl:
                    fl.writelines(lines) # write label.txt


dir = Path(r'E:\Dataset\VisDrone')  # VisDrone2019 directory under the datasets folder
# Convert each split
for d in 'VisDrone2019-DET-train', 'VisDrone2019-DET-val', 'VisDrone2019-DET-test-dev':
    visdrone2yolo(dir / d) # convert VisDrone annotations to YOLO labels
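
As a quick sanity check (a minimal sketch; the path assumes the training split from the layout above), read one generated label file and confirm every value is normalized:

from pathlib import Path

# Inspect one converted label file from the training split (path is an assumption)
label_file = next(Path(r'E:\Dataset\VisDrone\VisDrone2019-DET-train\labels').glob('*.txt'))
for line in label_file.read_text().strip().splitlines():
    cls, x, y, w, h = line.split()
    # All box values should lie in [0, 1] after conversion
    assert all(0 <= float(v) <= 1 for v in (x, y, w, h)), line
print(label_file.name, 'looks OK')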

Label visualization

Visualize the txt labels to check the effect before filtering.

import os
import numpy as np
import cv2

# Modify the input image folder
img_folder = "image"
img_list = os.listdir(img_folder)
img_list.sort()
# Modify the input label folder
label_folder = "labels2"
label_list = os.listdir(label_folder)
label_list.sort()
# Output image folder location
path = os.getcwd()
output_folder = os.path.join(path, "output")
os.makedirs(output_folder, exist_ok=True)  # avoid failing if the folder already exists

# Coordinate conversion
def xywh2xyxy(x, w1, h1, img):
    label, x, y, w, h = x
    # print("Original image width and height:\nw1={}\nh1={}".format(w1, h1))
    # Bounding box denormalization
    x_t = x * w1
    y_t = y * h1
    w_t = w * w1
    h_t = h * h1
    # print("Output after denormalization:\\
First:{}\tSecond:{}\tThird:{}\tFourth:{}\ t\\
\\
".format(x_t, y_t, w_t, h_t))
    # Calculate coordinates
    top_left_x = x_t - w_t / 2
    top_left_y = y_t - h_t / 2
    bottom_right_x = x_t + w_t / 2
    bottom_right_y = y_t + h_t / 2

    # print('label:{}'.format(labels[int(label)]))
    # print("Top left x coordinate:{}".format(top_left_x))
    # print("Top left y coordinate:{}".format(top_left_y))
    # print("Bottom right x coordinate:{}".format(bottom_right_x))
    # print("Bottom right y coordinate:{}".format(bottom_right_y))
    # Draw a rectangular box
    # cv2.rectangle(img, (int(top_left_x), int(top_left_y)), (int(bottom_right_x), int(bottom_right_y)), colormap[1], 2)
    # (Optional) Draw different color boxes for different targets
    if int(label) == 0:
        cv2.rectangle(img, (int(top_left_x), int(top_left_y)), (int(bottom_right_x), int(bottom_right_y)), (0, 255, 0), 2)
    elif int(label) == 1:
        cv2.rectangle(img, (int(top_left_x), int(top_left_y)), (int(bottom_right_x), int(bottom_right_y)), (255, 0, 0), 2)
    else:
        cv2.rectangle(img, (int(top_left_x), int(top_left_y)), (int(bottom_right_x), int(bottom_right_y)), (0, 0, 0), 2)

    return img


if __name__ == '__main__':
    for i in range(len(img_list)):
        image_path = img_folder + "/" + img_list[i]
        label_path = label_folder + "/" + label_list[i]
        # Read the image file
        img = cv2.imread(str(image_path))
        h, w = img.shape[:2]
        # Read labels
        with open(label_path, 'r') as f:
            lb = np.array([x.split() for x in f.read().strip().splitlines()], dtype=np.float32)
        # Draw each target
        for x in lb:
            # Denormalize to get the top-left and bottom-right coordinates, then draw the rectangle
            img = xywh2xyxy(x, w, h, img)
        """
        # Directly view the generated result graph
        cv2.imshow('show', img)
        cv2.waitKey(0)
        """
        cv2.imwrite(output_folder + '/' + '{}.png'.format(image_path.split('/')[-1][:-4]), img)
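
Note: the script assumes it is run from the directory containing these folders; point img_folder and label_folder at whichever image/label pair you want to inspect (for example, the labels folder before filtering, or labels2 afterwards).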

The visualization effect is shown in the figure:
Note: this dataset also distinguishes human postures. People who are standing or walking are labeled pedestrian, while other postures (such as sitting or lying down) are labeled people.

Label filtering

Specific filtering rules:

  • Merge car, van, truck, and bus into car (0)
  • Merge pedestrian and people into person (1)
  • Discard all other categories

Filtering script:
import os
import numpy as np
from tqdm import tqdm

# Visdrone Category
# names: ['pedestrian', 'people', 'bicycle', 'car', 'van', 'truck', 'tricycle', 'awning-tricycle', 'bus', 'motor']

# Modify the input label folder
label_folder = "labels"
label_list = os.listdir(label_folder)

# Label output folder
label_output = "labels2"
os.makedirs(label_output, exist_ok=True)  # create the output folder if it does not exist

# Class sets to merge
car_set = [3, 4, 5, 8]  # car, van, truck, bus
person_set = [0, 1]  # pedestrian, people

if __name__ == '__main__':
    for label_file in tqdm(os.listdir(label_folder)):
        # Read labels
        with open(os.path.join(label_folder, label_file), 'r') as f:
            lb = np.array([x.split() for x in f.read().strip().splitlines()], dtype=np.float32)
        # Write filtered labels ('w' mode so reruns do not append duplicates)
        with open(os.path.join(label_output, label_file), 'w') as f:
            for obj in lb:
                # If it is a pedestrian, change the category to 1
                if int(obj[0]) in person_set:
                    obj[0] = 1
                    f.write(('%g ' * 5).rstrip() % tuple(obj) + '\n')
                # If it is a vehicle, modify the category to 0
                elif int(obj[0]) in car_set:
                    obj[0] = 0
                    f.write(('%g ' * 5).rstrip() % tuple(obj) + '\n')
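
As an optional check (a small sketch assuming the labels2 output above), count the class distribution after filtering; only classes 0 and 1 should remain:

import os
from collections import Counter

counts = Counter()
for name in os.listdir("labels2"):
    with open(os.path.join("labels2", name)) as f:
        for line in f:
            counts[line.split()[0]] += 1
print(counts)  # expect only the keys '0' (car) and '1' (person)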

The effect after filtering is shown in the figure:

CARPK dataset

The CARPK dataset was captured by a drone at an altitude of about 40 meters and contains only a single target class: cars.

Download address: https://github.com/zstar1003/Dataset

Original label format:

1019 521 1129 571 1
1013 583 1120 634 1

The fields are xmin, ymin, xmax, ymax, cls: the pixel coordinates of the top-left and bottom-right corners, followed by the class index.
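
As a quick worked check against CARPK's 1280x720 images, the first line above converts as xcenter = (1019 + 1129) / 2 = 1074, ycenter = (521 + 571) / 2 = 546, w = 1129 - 1019 = 110, h = 571 - 521 = 50, which normalizes to roughly:

1 0.839062 0.758333 0.085938 0.069444

Note that CARPK labels cars as class 1; if cars should be class 0 to match the VisDrone merge above, remap the class index in the script below.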

Processing script:

import os
import numpy as np
from tqdm import tqdm

# Modify the input label folder
# label_folder = r"E:\Dataset\CARPK_devkit\data\Annotations"
label_folder = r"annotations"
label_list = os.listdir(label_folder)

# Label output folder
label_output = r"labels"
os.makedirs(label_output, exist_ok=True)  # create the output folder if it does not exist

# Image width and height (all CARPK images are 1280x720)
img_width = 1280
img_height = 720

if __name__ == '__main__':
    for label_file in tqdm(os.listdir(label_folder)):
        # Read labels
        with open(os.path.join(label_folder, label_file), 'r') as f:
            lb = np.array([x.split() for x in f.read().strip().splitlines()], dtype=int)
        for obj in lb:
            class_index = obj[4]
            xmin, ymin, xmax, ymax = obj[0], obj[1], obj[2], obj[3]
            # Convert box information to yolo format
            xcenter = xmin + (xmax - xmin) / 2
            ycenter = ymin + (ymax - ymin) / 2
            w = xmax - xmin
            h = ymax - ymin
            # Convert absolute coordinates to relative ones, keeping 6 decimal places
            xcenter = round(xcenter / img_width, 6)
            ycenter = round(ycenter / img_height, 6)
            w = round(w / img_width, 6)
            h = round(h / img_height, 6)
            info = [str(i) for i in [class_index, xcenter, ycenter, w, h]]
            # Write the label (append mode: delete old output before rerunning to avoid duplicates)
            with open(os.path.join(label_output, label_file), 'a') as f:
                # If the file is not empty, prepend a newline
                if os.path.getsize(os.path.join(label_output, label_file)):
                    f.write("\n" + " ".join(info))
                else:
                    f.write(" ".join(info))

Visually verify the conversion effect:
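
The label visualization script from the VisDrone section can be reused here by pointing its input folders at the CARPK data; the image path below is an assumption based on the annotation path used earlier:

# In the visualization script above, change only the input folders:
img_folder = r"E:\Dataset\CARPK_devkit\data\Images"  # assumed CARPK image folder
label_folder = r"labels"  # YOLO labels generated by the script above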