Object Detection — VOC to COCO conversion: convert XML annotations in PASCAL VOC format to the MS COCO dataset format, and split the dataset according to a custom ratio.

1. Dataset file organization format in the form of PASCAL_VOC

VOCdevkit/
|__ VOC2007/
   |__ Annotations/ (XML annotation files for each picture are stored here, such as 000001.xml)
   |__ImageSets/ (here contains a list of image filenames for different tasks or categories)
      |__ Main/ (eg, train.txt, val.txt, trainval.txt, test.txt)
      |__Layout/ (set of images for layout tasks, e.g. train.txt, val.txt, trainval.txt, test.txt)
      |__Segmentation/ (set of images for segmentation tasks, e.g. train.txt, val.txt, trainval.txt, test.txt)
   |__ JPEGImages/ (all JPEG images in the dataset are stored here, such as 000001.jpg)
   |__SegmentationClass/ (here contains images for segmentation class labels)
   |__SegmentationObject/ (images containing segmentation object labels here)

2. VOC2COCO.py data conversion

Here we take the DIOR dataset in remote sensing as an example (DIOR uses the file organization form of the PASCAL_VOC dataset)

Generally, a VOC dataset has already been divided into train.txt, val.txt, and test.txt in advance, so the following code reads directly from these three file lists. The code is shown below:

import os
import json
import xml.etree.ElementTree as ET
from typing import List, Dict


def get_all_categories(voc_root: str, splits: List[str]) -> Dict[str, int]:
    """Scan the annotation XMLs of every split and build one shared
    category-name -> category-id mapping.

    Ids are assigned 1..N in first-seen order (COCO category ids are
    conventionally 1-based).  Using a single mapping for all splits keeps
    the ids consistent between the generated train/val/test json files.

    :param voc_root: dataset root directory.
    :param splits:   split names, e.g. ['train', 'val', 'test'].
    :return: mapping category name -> category id.
    """
    # NOTE: path is specific to the DIOR layout ('Annotations/Annotations/HBB');
    # adjust for a plain VOC tree.  Hoisted out of the loop: it never changes.
    annotation_dir = os.path.join(voc_root, 'Annotations/Annotations/HBB')

    category_dict: Dict[str, int] = {}
    for split in splits:
        print(f'scanning {split}')
        image_ids = get_image_ids_from_file(
            os.path.join(voc_root, 'ImageSets', 'Main', f'{split}.txt'))
        for image_id in image_ids:
            tree = ET.parse(os.path.join(annotation_dir, f'{image_id}.xml'))
            for obj in tree.getroot().iter('object'):
                category = obj.find('name').text
                if category not in category_dict:
                    category_dict[category] = len(category_dict) + 1
    return category_dict


def get_image_ids_from_file(file_path: str) -> List[str]:
    """Read one image id per line from *file_path*.

    Whitespace (including Windows '\\r') is stripped from each line and
    blank lines are skipped, so the returned ids are clean basenames.
    The original `split('\\n')` approach would keep stray '\\r' characters
    on files saved with CRLF line endings.
    """
    with open(file_path, 'r') as file:
        return [line.strip() for line in file if line.strip()]


def voc_to_coco(voc_root: str, split: str, category_dict: Dict[str, int]) -> Dict:
    """Convert one VOC split into a COCO-format dict.

    :param voc_root:      dataset root directory.
    :param split:         split name ('train', 'val', 'test'); image ids are
                          read from ImageSets/Main/<split>.txt.
    :param category_dict: mapping category name -> COCO category id produced
                          by get_all_categories; shared by all splits so the
                          ids stay consistent.
    :return: dict with the COCO top-level keys
             {"images": [...], "annotations": [...], "categories": [...]}.
    """
    images = []
    annotations = []

    # Image ids belonging to this split.
    image_ids = get_image_ids_from_file(
        os.path.join(voc_root, 'ImageSets', 'Main', f'{split}.txt'))

    # NOTE: path follows the DIOR layout; adjust for a plain VOC tree.
    annotation_dir = os.path.join(voc_root, 'Annotations/Annotations/HBB')

    for image_id in image_ids:
        file = f'{image_id}.xml'
        print(f'Processing {file}')

        root = ET.parse(os.path.join(annotation_dir, file)).getroot()

        # Image dimensions come from the XML, not from reading the image.
        size = root.find('size')
        width = int(size.find('width').text)
        height = int(size.find('height').text)

        # Populate the images field.
        images.append({
            "file_name": f'{image_id}.jpg',
            "height": height,
            "width": width,
            "id": image_id
        })

        # Process each annotated object.
        for obj in root.iter('object'):
            # Every category was registered by get_all_categories, so the
            # lookup below cannot raise KeyError.
            category = obj.find('name').text

            bndbox = obj.find('bndbox')
            xmin = int(bndbox.find('xmin').text)
            ymin = int(bndbox.find('ymin').text)
            xmax = int(bndbox.find('xmax').text)
            ymax = int(bndbox.find('ymax').text)

            annotations.append({
                "segmentation": [],  # only needed for segmentation tasks; ignored here
                "area": (xmax - xmin) * (ymax - ymin),
                "iscrowd": 0,
                "image_id": image_id,
                "bbox": [xmin, ymin, xmax - xmin, ymax - ymin],  # COCO bbox is [x, y, w, h]
                "category_id": category_dict[category],
                "id": len(annotations) + 1  # 1-based, unique within this split
            })

    # Populate the categories field from the shared mapping.
    categories = [
        {
            'id': category_id,
            'name': category,
            'supercategory': 'none',  # can be modified according to the actual situation
        }
        for category, category_id in category_dict.items()
    ]

    # Combine all information.
    return {
        "images": images,
        "annotations": annotations,
        "categories": categories
    }



def main():
    """Drive the DIOR VOC->COCO conversion for every predefined split."""
    voc_root = './DIOR/'  # dataset root directory
    splits = ['train', 'val', 'test']

    # One shared mapping keeps category ids consistent across all splits.
    category_dict = get_all_categories(voc_root, splits)

    for split_name in splits:
        coco_dict = voc_to_coco(voc_root, split_name, category_dict)
        with open(f'{split_name}.json', 'w') as out_file:
            json.dump(coco_dict, out_file)


if __name__ == "__main__":
    main()

However, if the dataset has not been split in advance, it can be converted directly into a single complete all.json with only a small change to the code above.

3. Divide the complete COCO dataset into training, validation and test sets

Read a COCO-format JSON file containing all the data and split it according to a custom ratio.

import json
import numpy as np


def split_dataset(json_file, ratios, names):
    """Split one COCO-format json file into several subsets by image.

    :param json_file: path to the COCO json containing all the data.
    :param ratios:    fraction of images for each subset; must sum to 1.0.
    :param names:     output name for each subset; '<name>.json' is written
                      to the current working directory.
    """
    # Tolerant float comparison: e.g. sum([0.7, 0.2, 0.1]) != 1.0 exactly,
    # which would make a strict equality check fail spuriously.
    assert abs(sum(ratios) - 1.0) < 1e-9, "Ratios must sum to 1.0"
    assert len(ratios) == len(names), "Must provide name for each split"

    # Load the entire json dataset.
    with open(json_file, "r") as read_file:
        data = json.load(read_file)

    # Shuffle the image ids, then cut them into consecutive chunks.
    image_ids = [image["id"] for image in data["images"]]
    np.random.shuffle(image_ids)
    num_images = len(image_ids)

    splits = [int(ratio * num_images) for ratio in ratios]
    splits[-1] = num_images - sum(splits[:-1])  # remainder goes to the last split
    split_ids = np.split(np.asarray(image_ids), np.cumsum(splits[:-1]))

    def create_subset(ids, name):
        # Build a set once for O(1) membership tests (ids is a numpy array;
        # .tolist() converts its elements back to native Python types).
        id_set = set(ids.tolist())
        subset = {
            # "info"/"licenses" may be absent (e.g. files produced by
            # voc_to_coco above), so fall back to sensible defaults.
            "info": data.get("info", {}),
            "licenses": data.get("licenses", []),
            "categories": data["categories"],
            "images": [image for image in data["images"]
                       if image["id"] in id_set],
            "annotations": [annotation for annotation in data["annotations"]
                            if annotation["image_id"] in id_set],
        }
        # Save as a new json file.
        with open(f"{name}.json", "w") as write_file:
            json.dump(subset, write_file)

    # Create one file per subset.  (The original loop bound the variable
    # 'names', shadowing the parameter, and then referenced an undefined
    # 'name' — a NameError on the first iteration.)
    for ids, name in zip(split_ids, names):
        create_subset(ids, name)


# Example usage:
# NOTE: this runs at import time; it expects "all.json" in the current
# working directory and writes train.json / val.json / test.json next to it.
split_dataset("all.json", [0.75, 0.25, 0.0], ["train", "val", "test"])