Article directory
- Foreword
- 1. Data preparation
- 2. Data processing
-
- 1. Image size distribution
- 2. Number of different categories
- 3. Statistics on the number of targets of different sizes
- Summary
Foreword
Exploratory data analysis of image targets helps us understand how the targets are distributed, what needs attention during training, how to set training hyperparameters, and so on. The code in this article applies to datasets annotated in COCO format.
1. Data preparation
This article uses the PCB defect detection data set for demonstration.
Dataset download link
Organize the data into pandas table format:
import logging
import os.path as osp

import pandas as pd

# File-name suffixes (text after the last '.') that are treated as images.
VALID_SUFFIXES = frozenset(
    ['JPEG', 'jpeg', 'JPG', 'jpg', 'BMP', 'bmp', 'PNG', 'png'])

# Area thresholds used to bucket targets into small / medium / large.
SMALL_AREA_MAX = 32 * 32
LARGE_AREA_MIN = 96 * 96

# Column order of the exported table; fixed so that an empty result still
# produces a CSV with a header row.
COLUMNS = ['id', 'imagePath', 'imageWidth', 'imageHeight',
           'category', 'area', 'bbox']


def is_pic(img_name):
    """Return True if the text after the last '.' is a known image suffix."""
    return img_name.split('.')[-1] in VALID_SUFFIXES


def area_bucket(area):
    """Map an annotation area to "small", "medium" or "large".

    Areas below 32*32 are "small", areas above 96*96 are "large", and
    everything in between (both bounds included) is "medium".
    """
    if area < SMALL_AREA_MAX:
        return "small"
    if area > LARGE_AREA_MIN:
        return "large"
    return "medium"


def clip_bbox(bbox, im_w, im_h):
    """Convert an [x, y, w, h] box to [x1, y1, x2, y2] clipped to the image.

    The top-left corner is clamped to be non-negative, negative widths and
    heights are treated as zero, and the bottom-right corner is clamped to
    (im_w - 1, im_h - 1).
    """
    x, y, box_w, box_h = bbox
    x1 = max(0, x)
    y1 = max(0, y)
    x2 = min(im_w - 1, x1 + max(0, box_w))
    y2 = min(im_h - 1, y1 + max(0, box_h))
    return [x1, y1, x2, y2]


def build_dataframe(anno_file, data_dir):
    """Flatten a COCO annotation file into one table row per valid target.

    Args:
        anno_file: Path of the COCO-format annotation JSON.
        data_dir: Directory that is joined with each image's ``file_name``.

    Returns:
        A DataFrame with the columns listed in ``COLUMNS``. Annotations with
        a non-positive area or an inverted clipped box are logged and skipped
        rather than exported with incomplete fields.
    """
    # Imported here so the pure helpers above stay importable on machines
    # that do not have pycocotools installed.
    from pycocotools.coco import COCO

    coco = COCO(anno_file)

    # Map the annotation's category_id straight to its name. Going through a
    # contiguous class index (as the earlier version did) picks the wrong
    # name, or raises KeyError, whenever category ids do not start at 0.
    cat_id2name = {
        cat_id: coco.loadCats(cat_id)[0]['name']
        for cat_id in coco.getCatIds()
    }

    # Rows are collected in a list and converted once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for img_id in sorted(coco.getImgIds()):
        img_anno = coco.loadImgs(img_id)[0]
        im_fname = osp.join(data_dir, img_anno['file_name'])
        if not is_pic(im_fname):
            continue

        im_w = float(img_anno['width'])
        im_h = float(img_anno['height'])
        ins_anno_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False)

        for inst in coco.loadAnns(ins_anno_ids):
            x1, y1, x2, y2 = clip_bbox(inst['bbox'], im_w, im_h)
            if not (inst['area'] > 0 and x2 >= x1 and y2 >= y1):
                logging.warning(
                    "Found an invalid bbox in annotations: "
                    "im_id: {}, area: {} x1: {}, y1: {}, x2: {}, y2: {}."
                    .format(img_id, float(inst['area']), x1, y1, x2, y2))
                continue

            # A fresh dict per target, so no field can leak over from the
            # previous annotation of the same image.
            rows.append({
                'id': img_id,
                'imagePath': im_fname,
                'imageWidth': im_w,
                'imageHeight': im_h,
                'category': cat_id2name[inst['category_id']],
                'area': area_bucket(inst['area']),
                'bbox': [x1, y1, x2, y2],
            })

    return pd.DataFrame(rows, columns=COLUMNS)


if __name__ == '__main__':
    anno_file = r".\pcb\pcb_cocoanno\train.json"
    data_dir = r'.\pcb\images'
    df = build_dataframe(anno_file, data_dir)
    df.to_csv('./train_dataset.csv', index=False)
2. Data processing
1. Image size distribution
Count image sizes and visualize image distribution.
import os
import cv2
import numpy as np
import pandas as pd
import math
from mpl_toolkits.axes_grid1 import ImageGrid
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde


def size_distribution(dataframe):
    """Scatter-plot image width against height, coloured by point density.

    Args:
        dataframe: Table with ``imageWidth`` and ``imageHeight`` columns. If
            it also has an ``id`` column, rows are de-duplicated on it first
            so that each image contributes one point (assumes ``id`` is the
            image id, as written by the data-preparation step -- confirm if
            the table comes from elsewhere).

    The figure is saved to ``chart/size_distribution.png`` and then shown.

    Raises:
        ValueError: If the table contains no rows.
    """
    if 'id' in dataframe.columns:
        images = dataframe.drop_duplicates(subset='id')
    else:
        images = dataframe

    # Plain arrays so that the argsort indices below are positional; indexing
    # a Series with them would be label-based and depend on the index.
    x = images['imageWidth'].to_numpy(dtype=float)
    y = images['imageHeight'].to_numpy(dtype=float)
    if x.size == 0:
        raise ValueError("size_distribution() needs at least one row")

    xy = np.vstack([x, y])
    try:
        z = gaussian_kde(xy)(xy)
    except (np.linalg.LinAlgError, ValueError):
        # The density estimate is undefined for degenerate data (for example
        # a single image, or all images the same size); use a flat colour.
        z = np.ones_like(x)

    # Sort the points by density, so that the densest points are plotted last
    idx = z.argsort()
    x, y, z = x[idx], y[idx], z[idx]

    plt.figure(figsize=(10, 10))
    plt.scatter(x, y, c=z, s=5, cmap='Spectral_r')
    plt.tick_params(labelsize=15)

    # Same upper limit on both axes so the aspect of the cloud is readable.
    xy_max = max(x.max(), y.max())
    plt.xlim(0, xy_max)
    plt.ylim(0, xy_max)

    plt.ylabel('height', fontsize=25)
    plt.xlabel('width', fontsize=25)

    os.makedirs('chart', exist_ok=True)
    plt.savefig('chart/size_distribution.png', dpi=120, bbox_inches='tight')
    plt.show()


if __name__ == '__main__':
    os.makedirs('./chart', exist_ok=True)
    plt.rcParams['font.sans-serif'] = ['SimHei']  # Used to display Chinese labels normally
    plt.rcParams['axes.unicode_minus'] = False  # Used to display negative signs normally
    df = pd.read_csv('./train_dataset.csv')
    size_distribution(df)
It can be seen from the results that the image size distribution is relatively even.
2. Number of different categories
During training, we want the number of targets in each category to be as balanced as possible so that the model is not biased towards any one category.
import os
import cv2
import numpy as np
import pandas as pd
import math
from mpl_toolkits.axes_grid1 import ImageGrid
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde


def number_of_category(dataframe):
    """Draw a bar chart of how many rows fall into each category.

    Args:
        dataframe: Table with a ``category`` column, one row per target.

    Bars are ordered from the most to the least frequent category. The
    figure is saved to ``chart/number_of_category.png`` and then shown.
    """
    # Count on the argument itself. The earlier version iterated the global
    # ``df`` here, so it only worked when called from this script.
    # value_counts() already sorts in descending order of frequency.
    counts = dataframe['category'].value_counts()

    plt.figure(figsize=(22, 10))
    # Labels are cast to str so each category is drawn as its own bar.
    plt.bar(counts.index.astype(str), counts.to_numpy(),
            facecolor='#1f77b4', edgecolor='k')

    plt.xticks(rotation=90)
    plt.tick_params(labelsize=15)
    plt.xlabel('label category', fontsize=20)
    plt.ylabel('target quantity', fontsize=20)

    os.makedirs('chart', exist_ok=True)
    plt.savefig('chart/number_of_category.png', dpi=120, bbox_inches='tight')
    plt.show()


if __name__ == '__main__':
    os.makedirs('./chart', exist_ok=True)
    plt.rcParams['font.sans-serif'] = ['SimHei']  # Used to display Chinese labels normally
    plt.rcParams['axes.unicode_minus'] = False  # Used to display negative signs normally
    df = pd.read_csv('./train_dataset.csv')
    number_of_category(df)
It can be seen that the number of targets in different categories is roughly equal.
3. Statistics on the number of targets of different sizes
The commonly used COCO detection metrics divide targets into small targets (area < 32×32), medium targets (32×32 to 96×96), and large targets (area > 96×96). We can count the targets of each size in advance so that we know which size class to focus on first during training and validation.
# -*- coding: utf-8 -*-
import os
import cv2
import numpy as np
import pandas as pd
import math
from mpl_toolkits.axes_grid1 import ImageGrid
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde


def num_of_area(dataframe):
    """Draw a bar chart of how many rows fall into each target-size class.

    Args:
        dataframe: Table with an ``area`` column, one row per target.

    Bars are ordered from the most to the least frequent size class. The
    figure is saved to ``chart/num_of_area.png`` and then shown.
    """
    # value_counts() sorts in descending order and skips missing values.
    # Counting over unique() instead would add a zero-height NaN entry and
    # mix floats with strings on the x axis.
    counts = dataframe['area'].value_counts()

    plt.figure(figsize=(11, 5))
    # Labels are cast to str so each size class is drawn as its own bar.
    plt.bar(counts.index.astype(str), counts.to_numpy(),
            facecolor='#1f77b4', edgecolor='k')

    plt.xticks(rotation=90)
    plt.tick_params(labelsize=15)
    plt.xlabel('target size', fontsize=20)
    plt.ylabel('target quantity', fontsize=20)

    os.makedirs('chart', exist_ok=True)
    plt.savefig('chart/num_of_area.png', dpi=120, bbox_inches='tight')
    plt.show()


if __name__ == '__main__':
    os.makedirs('./chart', exist_ok=True)
    plt.rcParams['font.sans-serif'] = ['SimHei']  # Used to display Chinese labels normally
    plt.rcParams['axes.unicode_minus'] = False  # Used to display negative signs normally
    df = pd.read_csv('./train_dataset.csv')
    num_of_area(df)
It can be seen that the targets are mainly medium-sized. During validation, the main metric to watch is therefore the mAP for medium targets.
Summary
This article introduces exploratory data analysis of image targets. The same approach can be used to count other things you care about, such as the size distribution of the bounding boxes or the distribution of their center positions. All of these help you understand the dataset itself, so that you can set better hyperparameters for training.