1. Dataset file organization in the PASCAL VOC format
VOCdevkit/
└── VOC2007/
    ├── Annotations/          (one XML annotation file per image, e.g. 000001.xml)
    ├── ImageSets/            (lists of image filenames for different tasks or categories)
    │   ├── Main/             (e.g. train.txt, val.txt, trainval.txt, test.txt)
    │   ├── Layout/           (image sets for layout tasks, e.g. train.txt, val.txt, trainval.txt, test.txt)
    │   └── Segmentation/     (image sets for segmentation tasks, e.g. train.txt, val.txt, trainval.txt, test.txt)
    ├── JPEGImages/           (all JPEG images in the dataset, e.g. 000001.jpg)
    ├── SegmentationClass/    (segmentation class-label images)
    └── SegmentationObject/   (segmentation object-label images)
2. VOC2COCO.py data conversion
Here we take the DIOR dataset in remote sensing as an example (DIOR uses the file organization form of the PASCAL_VOC dataset)
Generally, a VOC dataset has already been divided into train.txt, val.txt, and test.txt in advance, so the following code reads image ids directly from these three list files. The code is shown below:
import os
import json
import xml.etree.ElementTree as ET
from typing import List, Dict


def get_image_ids_from_file(file_path: str) -> List[str]:
    """Read one image id per line from a VOC split file (e.g. train.txt).

    BUG FIX: the original split the file contents on '\\ ' (a literal
    backslash followed by a space), so a normal newline-separated list
    collapsed into a single bogus id. We now read line by line, skipping
    blanks and surrounding whitespace.
    """
    with open(file_path, 'r') as file:
        return [line.strip() for line in file if line.strip()]


def _annotation_path(voc_root: str, image_id: str) -> str:
    """Path of the XML annotation for *image_id*.

    DIOR stores horizontal-bounding-box (HBB) annotations under this nested
    directory; adjust to match your own folder structure.
    """
    return os.path.join(voc_root, 'Annotations', 'Annotations', 'HBB',
                        f'{image_id}.xml')


def get_all_categories(voc_root: str, splits: List[str]) -> Dict[str, int]:
    """Scan every split's annotations and map category name -> 1-based id.

    Scanning all splits up front guarantees that the JSON files produced
    later share one consistent category-id assignment.
    """
    category_dict: Dict[str, int] = {}
    for split in splits:
        print(f'scanning {split}')
        image_ids = get_image_ids_from_file(
            os.path.join(voc_root, 'ImageSets', 'Main', f'{split}.txt'))
        for image_id in image_ids:
            root = ET.parse(_annotation_path(voc_root, image_id)).getroot()
            for obj in root.iter('object'):
                category = obj.find('name').text
                if category not in category_dict:
                    # COCO category ids conventionally start at 1.
                    category_dict[category] = len(category_dict) + 1
    return category_dict


def voc_to_coco(voc_root: str, split: str,
                category_dict: Dict[str, int]) -> Dict:
    """Convert one VOC split into a COCO-format dict.

    Returns a dict with the three standard COCO keys: "images",
    "annotations", and "categories". Bounding boxes are emitted in COCO's
    [x, y, width, height] convention.
    """
    images: List[Dict] = []
    annotations: List[Dict] = []

    image_ids = get_image_ids_from_file(
        os.path.join(voc_root, 'ImageSets', 'Main', f'{split}.txt'))
    for image_id in image_ids:
        print(f'Processing {image_id}.xml')
        root = ET.parse(_annotation_path(voc_root, image_id)).getroot()

        # Extract the image dimensions from the <size> element.
        size = root.find('size')
        width = int(size.find('width').text)
        height = int(size.find('height').text)
        images.append({
            "file_name": f'{image_id}.jpg',
            "height": height,
            "width": width,
            "id": image_id,
        })

        # Process each annotated object. No need to check whether the
        # category exists in the dictionary: get_all_categories() has
        # already registered every category appearing in any split.
        for obj in root.iter('object'):
            category = obj.find('name').text
            bndbox = obj.find('bndbox')
            xmin = int(bndbox.find('xmin').text)
            ymin = int(bndbox.find('ymin').text)
            xmax = int(bndbox.find('xmax').text)
            ymax = int(bndbox.find('ymax').text)
            annotations.append({
                # Segmentation masks are not available for HBB data.
                "segmentation": [],
                "area": (xmax - xmin) * (ymax - ymin),
                "iscrowd": 0,
                "image_id": image_id,
                "bbox": [xmin, ymin, xmax - xmin, ymax - ymin],
                "category_id": category_dict[category],
                # Annotation ids are 1-based and unique within this file.
                "id": len(annotations) + 1,
            })

    categories = [
        {
            'id': category_id,
            'name': category,
            'supercategory': 'none',  # adjust to the actual hierarchy if any
        }
        for category, category_id in category_dict.items()
    ]

    return {
        "images": images,
        "annotations": annotations,
        "categories": categories,
    }


def main():
    """Convert the train/val/test splits of a DIOR-style dataset to COCO JSON."""
    voc_root = './DIOR/'  # dataset root directory
    splits = ['train', 'val', 'test']
    category_dict = get_all_categories(voc_root, splits)
    for split in splits:
        coco = voc_to_coco(voc_root, split, category_dict)
        with open(f'{split}.json', 'w') as f:
            json.dump(coco, f)


if __name__ == "__main__":
    main()
However, if the dataset has not been split in advance, it can be converted directly into a single complete all.json — this requires only a small change to the code above.
3. Divide the complete coco dataset into training, verification and test sets
Read a single COCO-format JSON file containing all of the data and split it according to custom ratios.
import json

import numpy as np


def split_dataset(json_file, ratios, names):
    """Split one COCO-format json into len(ratios) files named "<name>.json".

    Image ids are shuffled and partitioned according to *ratios*; each
    output file keeps only the annotations whose image_id falls in its
    partition. Output files are written to the current working directory.
    """
    # Float-tolerant check: the original exact `== 1.0` comparison can
    # fail for ratios like 3 * (1/3).
    assert abs(sum(ratios) - 1.0) < 1e-9, "Ratios must sum to 1.0"
    assert len(ratios) == len(names), "Must provide name for each split"

    # Load the entire json dataset.
    with open(json_file, "r") as read_file:
        data = json.load(read_file)

    # Shuffle image ids, then cut them into consecutive partitions.
    image_ids = [image["id"] for image in data["images"]]
    np.random.shuffle(image_ids)

    num_images = len(image_ids)
    sizes = [int(ratio * num_images) for ratio in ratios]
    sizes[-1] = num_images - sum(sizes[:-1])  # ensure sizes sum to num_images
    split_ids = np.split(np.asarray(image_ids), np.cumsum(sizes[:-1]))

    def create_subset(ids, name):
        """Write the subset of *data* restricted to image ids in *ids*."""
        # A set gives O(1) membership tests instead of scanning a numpy
        # array once per image and once per annotation.
        id_set = set(ids.tolist())
        subset = {
            # Tolerate files that omit optional top-level COCO keys.
            "info": data.get("info", {}),
            "licenses": data.get("licenses", []),
            "categories": data["categories"],
            "images": [image for image in data["images"]
                       if image["id"] in id_set],
            "annotations": [annotation for annotation in data["annotations"]
                            if annotation["image_id"] in id_set],
        }
        # Save as a new json file.
        with open(f"{name}.json", "w") as write_file:
            json.dump(subset, write_file)

    # BUG FIX: the original loop read `for ids, names in zip(split_ids,
    # names)` and then called create_subset(ids, name) with an undefined
    # `name`, raising NameError on every run.
    for ids, name in zip(split_ids, names):
        create_subset(ids, name)


if __name__ == "__main__":
    # Example usage (guarded so that importing this module has no side
    # effects and does not crash when all.json is absent):
    split_dataset("all.json", [0.75, 0.25, 0.0], ["train", "val", "test"])