[Computer Vision] Exploratory data analysis of image targets

Article directory

  • Foreword
  • 1. Data preparation
  • 2. Data processing
    • 1. Image size distribution
    • 2. Number of different categories
    • 3. Statistics on the number of targets of different sizes
  • Summary

Foreword

Exploratory data analysis of image targets helps us understand how the targets are distributed, what needs attention during training, and how to choose training hyperparameters. The code in this article applies to datasets annotated in COCO format.

1. Data preparation

This article uses the PCB defect detection data set for demonstration.
Dataset download link
Organize the data into pandas table format:

from pycocotools.coco import COCO
import pandas as pd
import os.path as osp
import logging


def is_pic(img_name):
    """Return True if ``img_name`` ends with a supported image extension.

    The check is case-insensitive and requires a real ``.ext`` suffix, so
    ``"a.JPG"`` and ``"a.Jpg"`` are accepted while a bare name such as
    ``"jpg"`` (no dot at all) is rejected.

    Args:
        img_name: File name or path of the candidate image.

    Returns:
        bool: Whether the extension is one of jpeg, jpg, bmp or png.
    """
    valid_suffix = {'jpeg', 'jpg', 'bmp', 'png'}
    # rpartition leaves ``dot`` empty when the name contains no '.' at all;
    # ``split('.')[-1]`` would treat the whole name as the suffix instead.
    _, dot, suffix = img_name.rpartition('.')
    return bool(dot) and suffix.lower() in valid_suffix


if __name__ == '__main__':
    # Flatten a COCO annotation file into a CSV with one row per valid
    # annotation: image id/path/size, category name, COCO size bucket
    # ("small" / "medium" / "large") and the clipped [x1, y1, x2, y2] box.
    anno_file = r".\pcb\pcb_cocoanno\train.json"
    data_dir = r'.\pcb\images'
    coco = COCO(anno_file)
    img_ids = sorted(coco.getImgIds())

    # Map COCO category ids directly to names. Annotations carry the COCO
    # ``category_id``, so the lookup table must be keyed by that id and not
    # by a contiguous 0..N-1 class index.
    cat_id2name = {
        cat_id: coco.loadCats(cat_id)[0]['name']
        for cat_id in coco.getCatIds()
    }

    columns = [
        'id', 'imagePath', 'imageWidth', 'imageHeight',
        'category', 'area', 'bbox',
    ]
    # Rows are collected in a list and turned into a DataFrame once;
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []
    for img_id in img_ids:
        img_anno = coco.loadImgs(img_id)[0]
        im_fname = osp.join(data_dir, img_anno['file_name'])
        if not is_pic(im_fname):
            continue
        im_w = float(img_anno['width'])
        im_h = float(img_anno['height'])
        ins_anno_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False)

        for inst in coco.loadAnns(ins_anno_ids):
            # COCO boxes are [x, y, width, height]; clip to the image.
            x, y, box_w, box_h = inst['bbox']
            x1 = max(0, x)
            y1 = max(0, y)
            x2 = min(im_w - 1, x1 + max(0, box_w))
            y2 = min(im_h - 1, y1 + max(0, box_h))

            area = inst['area']
            if not (area > 0 and x2 >= x1 and y2 >= y1):
                logging.warning(
                    "Found an invalid bbox in annotations: "
                    "im_id: {}, area: {} x1: {}, y1: {}, x2: {}, y2: {}."
                    .format(img_id, float(area), x1, y1, x2, y2))
                # Skip the annotation so no row with a missing or stale
                # size bucket / box ends up in the CSV.
                continue

            # COCO size buckets: small < 32*32 <= medium <= 96*96 < large.
            if area < 32 * 32:
                size_bucket = "small"
            elif area > 96 * 96:
                size_bucket = "large"
            else:
                size_bucket = "medium"

            # A fresh dict per annotation keeps rows independent.
            rows.append({
                'id': img_id,
                'imagePath': im_fname,
                'imageWidth': im_w,
                'imageHeight': im_h,
                'category': cat_id2name[inst['category_id']],
                'area': size_bucket,
                'bbox': [x1, y1, x2, y2],
            })

    # Passing ``columns`` guarantees a header even when no row was produced.
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv('./train_dataset.csv', index=False)

2. Data processing

1. Image size distribution

Count image sizes and visualize image distribution.

import os
import cv2
import numpy as np
import pandas as pd
import math

from mpl_toolkits.axes_grid1 import ImageGrid
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

def size_distribution(dataframe):
    """Scatter-plot image width against height, coloured by point density.

    The figure is saved to ``chart/size_distribution.png`` and then shown.

    NOTE(review): every row of ``dataframe`` is plotted, so if the table
    holds one row per annotation an image contributes one point per target;
    confirm whether de-duplicating by image is wanted.

    Args:
        dataframe: Table with numeric ``imageWidth`` and ``imageHeight``
            columns.

    Raises:
        ValueError: If ``dataframe`` has no rows.
    """
    # Work on plain NumPy arrays: indexing a Series with the positional
    # output of ``argsort`` is label-based and fails on a non-default index.
    x = dataframe['imageWidth'].to_numpy(dtype=float)
    y = dataframe['imageHeight'].to_numpy(dtype=float)
    if x.size == 0:
        raise ValueError('dataframe contains no rows to plot')

    try:
        xy = np.vstack([x, y])
        z = gaussian_kde(xy)(xy)
    except (np.linalg.LinAlgError, ValueError):
        # The KDE needs a non-singular covariance; it cannot be estimated
        # when, e.g., all images share one size. Use a flat density instead.
        z = np.ones_like(x)
    if not np.all(np.isfinite(z)):
        z = np.ones_like(x)

    # Sort the points by density, so that the densest points are plotted last
    idx = z.argsort()
    x, y, z = x[idx], y[idx], z[idx]
    plt.figure(figsize=(10, 10))
    plt.scatter(x, y, c=z, s=5, cmap='Spectral_r')
    plt.tick_params(labelsize=15)

    # Same upper limit on both axes so the aspect of the sizes is readable.
    xy_max = max(x.max(), y.max())
    plt.xlim(0, xy_max)
    plt.ylim(0, xy_max)

    plt.ylabel('height', fontsize=25)
    plt.xlabel('width', fontsize=25)

    # Ensure the output folder exists so the function works on its own.
    os.makedirs('chart', exist_ok=True)
    plt.savefig('chart/size_distribution.png', dpi=120, bbox_inches='tight')
    plt.show()

if __name__ == '__main__':
    # Create the output folder for the figure; exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs('chart', exist_ok=True)
    plt.rcParams['font.sans-serif'] = ['SimHei']  # Used to display Chinese labels normally
    plt.rcParams['axes.unicode_minus'] = False  # Used to display negative signs normally
    # Loading and plotting belong inside the guard so that importing this
    # module does not read the CSV or open a figure window.
    df = pd.read_csv('./train_dataset.csv')
    size_distribution(df)


It can be seen from the results that the image size distribution is relatively even.

2. Number of different categories

During training, we want the number of targets in each category to be as balanced as possible, so that the model is not biased towards any one category.

import os
import cv2
import numpy as np
import pandas as pd
import math

from mpl_toolkits.axes_grid1 import ImageGrid
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde


def number_of_category(dataframe):
    """Plot a bar chart of the number of targets in each category.

    Bars are ordered from the most to the least frequent category. The
    figure is saved to ``chart/number_of_category.png`` and then shown.

    Args:
        dataframe: Table with a ``category`` column, one row per target.
    """
    # Count on the ``dataframe`` argument rather than on a module-level
    # variable, so the function works for any table passed in.
    # value_counts() sorts by frequency (descending) and drops missing
    # labels, which could not be drawn as bar names anyway.
    counts = dataframe['category'].value_counts()

    plt.figure(figsize=(22, 10))

    plt.bar(list(counts.index), counts.to_numpy(),
            facecolor='#1f77b4', edgecolor='k')

    plt.xticks(rotation=90)
    plt.tick_params(labelsize=15)
    plt.xlabel('label category', fontsize=20)
    plt.ylabel('target quantity', fontsize=20)

    # Ensure the output folder exists so the function works on its own.
    os.makedirs('chart', exist_ok=True)
    plt.savefig('chart/number_of_category.png', dpi=120, bbox_inches='tight')

    plt.show()


if __name__ == '__main__':
    # Script entry point: prepare the output folder, configure fonts,
    # load the annotation table and draw the per-category bar chart.
    chart_dir_missing = not os.path.exists('chart')
    if chart_dir_missing:
        os.mkdir('./chart')
    plt.rcParams.update({
        'font.sans-serif': ['SimHei'],  # font able to render Chinese labels
        'axes.unicode_minus': False,    # render minus signs correctly
    })
    df = pd.read_csv('./train_dataset.csv')
    number_of_category(df)

It can be seen that the number of targets in different categories is roughly equal.

3. Statistics on the number of targets of different sizes

The commonly used COCO detection metrics divide targets into small targets (area < 32×32), medium targets (32×32 to 96×96), and large targets (area > 96×96). Counting the targets of each size in advance tells us which size to focus on first during training and validation.

# -*- coding: utf-8 -*-
import os
import cv2
import numpy as np
import pandas as pd
import math

from mpl_toolkits.axes_grid1 import ImageGrid
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde


def num_of_area(dataframe):
    """Plot a bar chart of the number of targets in each size bucket.

    Bars are ordered from the most to the least frequent bucket. The figure
    is saved to ``chart/num_of_area.png`` and then shown.

    Args:
        dataframe: Table with an ``area`` column holding the size label of
            each target, one row per target.
    """
    # value_counts() replaces the manual unique()/len() loop: it sorts by
    # frequency (descending) and drops missing labels, which would otherwise
    # put a non-string value on a string-labelled axis.
    counts = dataframe['area'].value_counts()

    plt.figure(figsize=(11, 5))

    plt.bar(list(counts.index), counts.to_numpy(),
            facecolor='#1f77b4', edgecolor='k')

    plt.xticks(rotation=90)
    plt.tick_params(labelsize=15)
    plt.xlabel('target size', fontsize=20)
    plt.ylabel('target quantity', fontsize=20)

    # Ensure the output folder exists so the function works on its own.
    os.makedirs('chart', exist_ok=True)
    plt.savefig('chart/num_of_area.png', dpi=120, bbox_inches='tight')

    plt.show()


if __name__ == '__main__':
    # Script entry point: make sure the figure folder is there, set the
    # plotting fonts, then chart the target-size counts from the CSV.
    have_chart_dir = os.path.exists('chart')
    if not have_chart_dir:
        os.mkdir('./chart')
    font_settings = {
        'font.sans-serif': ['SimHei'],  # font able to render Chinese labels
        'axes.unicode_minus': False,    # render minus signs correctly
    }
    for rc_key, rc_value in font_settings.items():
        plt.rcParams[rc_key] = rc_value
    df = pd.read_csv('./train_dataset.csv')
    num_of_area(df)


The chart shows that most targets are medium-sized. During validation, the main metric to watch is therefore the mAP for medium targets.

Summary

This article introduced exploratory data analysis of image targets. The same approach can be used to collect other statistics you care about, such as the distribution of bounding-box sizes or of bounding-box center positions. All of these help you understand the dataset itself, so that you can choose better hyperparameters for training.