Automatically and continuously train multiple improved yolov8 models

When a model has several modification variants, they can be trained automatically one after another. There is no need to wait for one variant to finish training and then manually start the next run. This lets you make full use of night-time hours and makes remote training of the model more convenient.

The functions implemented are as follows:

1. You can choose whether to log in to the web version of WeChat on your computer, and then receive training information in real time through your mobile phone.

2. Customize the training start time.

3. Continuously train multiple models and handle the possible exception “RuntimeError: CUDA error: an illegal memory access was encountered” so that continuous training is not interrupted.

4. If you train on a laptop for more than ten hours, you can choose after which epochs training pauses, so the machine can rest for a while before continuing.

5. Due to some reasons, training should be stopped in certain time periods, such as 00:30:00~08:30:00. You can set whether to stop training in a custom time period, and then automatically continue training after this period.

Update: the third function does not actually work. After testing today, the exception “RuntimeError: CUDA error: an illegal memory access was encountered” cannot be recovered from inside the program: once it occurs, it keeps being raised. Neither adding a delay to let the machine rest nor restarting training from the original code helps. The program has to be stopped manually, and only then does the exception go away. I had assumed this would be simple to handle, but it cannot be done this way at all. I don't know whether turning off GPU overclocking reduces how often this error occurs; I turned it off and the error has not appeared since, at least for now.

Make the changes in ultralytics\models\yolo\detect\train.py and ultralytics\engine\trainer.py. Code marked with “# modified” is newly added or changed. The code is as follows:

train.py--------------------------------------------- ----------------------------------
# modified
# from ultralytics import YOLO
# if __name__ == '__main__':
# model = YOLO('yaml_files/yolov8n_helmet_1.yaml')
# model.train(data='helmet.yaml', epochs=200)

# Ultralytics YOLO , AGPL-3.0 license
from copy import copy
import numpy as np

# modified
import pygame
import time
from ultralytics.utils import colorstr
import datetime
from wxpy import *
import traceback
import os
import shutil

from ultralytics.data import build_dataloader, build_yolo_dataset
from ultralytics.engine.trainer import BaseTrainer
from ultralytics.models import yolo
from ultralytics.nn.tasks import DetectionModel
from ultralytics.utils import DEFAULT_CFG, LOGGER, RANK
from ultralytics.utils.plotting import plot_images, plot_labels, plot_results
from ultralytics.utils.torch_utils import de_parallel, torch_distributed_zero_first


# BaseTrainer python usage
class DetectionTrainer(BaseTrainer):
    """
    Trainer for YOLO detection models, extending BaseTrainer.

    Provides the detection-specific pieces used during training: dataset and dataloader construction, batch
    preprocessing, model/validator creation, loss labelling and plotting helpers.
    """

    def build_dataset(self, img_path, mode='train', batch=None):
        """
        Build YOLO Dataset.

        Args:
            img_path (str): Path to the folder containing images.
            mode (str): `train` mode or `val` mode, users are able to customize different augmentations for each mode.
            batch (int, optional): Size of batches, this is for `rect`. Defaults to None.
        """
        # Grid size: the model's largest stride (0 when no model is attached yet), but never below 32
        gs = max(int(de_parallel(self.model).stride.max() if self.model else 0), 32)
        return build_yolo_dataset(self.args, img_path, batch, self.data, mode=mode, rect=mode == 'val', stride=gs)

    def get_dataloader(self, dataset_path, batch_size=16, rank=0, mode='train'):
        """
        Construct and return dataloader.

        Args:
            dataset_path (str): Path passed on to `build_dataset`.
            batch_size (int): Batch size for the dataset and the dataloader.
            rank (int): Process rank, used for the distributed guard and passed to `build_dataloader`.
            mode (str): Either 'train' or 'val'; shuffling is only enabled for 'train'.
        """
        assert mode in ['train', 'val']
        with torch_distributed_zero_first(rank):  # init dataset *.cache only once if DDP
            dataset = self.build_dataset(dataset_path, mode, batch_size)
        shuffle = mode == 'train'
        if getattr(dataset, 'rect', False) and shuffle:
            LOGGER.warning("WARNING  'rect=True' is incompatible with DataLoader shuffle, setting shuffle=False")
            shuffle = False
        # Validation uses twice as many workers as training
        workers = self.args.workers if mode == 'train' else self.args.workers * 2
        return build_dataloader(dataset, batch_size, workers, shuffle, rank)  # return dataloader

    def preprocess_batch(self, batch):
        """Preprocesses a batch of images by moving them to the device, converting to float and dividing by 255."""
        batch['img'] = batch['img'].to(self.device, non_blocking=True).float() / 255
        return batch

    def set_model_attributes(self):
        """Attach the number of classes, class names and hyperparameters to the model."""
        # nl = de_parallel(self.model).model[-1].nl  # number of detection layers (to scale hyps)
        # self.args.box *= 3 / nl  # scale to layers
        # self.args.cls *= self.data["nc"] / 80 * 3 / nl  # scale to classes and layers
        # self.args.cls *= (self.args.imgsz / 640) ** 2 * 3 / nl  # scale to image size and layers
        self.model.nc = self.data['nc']  # attach number of classes to model
        self.model.names = self.data['names']  # attach class names to model
        self.model.args = self.args  # attach hyperparameters to model
        # TODO: self.model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc

    def get_model(self, cfg=None, weights=None, verbose=True):
        """Return a YOLO detection model built from `cfg`, loading `weights` when given."""
        model = DetectionModel(cfg, nc=self.data['nc'], verbose=verbose and RANK == -1)
        if weights:
            model.load(weights)
        return model

    def get_validator(self):
        """Returns a DetectionValidator for YOLO model validation and sets the loss names used for logging."""
        self.loss_names = 'box_loss', 'cls_loss', 'dfl_loss'
        return yolo.detect.DetectionValidator(self.test_loader, save_dir=self.save_dir, args=copy(self.args))

    def label_loss_items(self, loss_items=None, prefix='train'):
        """
        Returns a loss dict with labeled training loss items tensor.

        When `loss_items` is None only the list of '<prefix>/<loss_name>' keys is returned.
        """
        # Not needed for classification but necessary for segmentation & detection
        keys = [f'{prefix}/{x}' for x in self.loss_names]
        if loss_items is not None:
            loss_items = [round(float(x), 5) for x in loss_items]  # convert tensors to 5 decimal place floats
            return dict(zip(keys, loss_items))
        else:
            return keys

    def progress_string(self):
        """Returns a formatted string of training progress with epoch, GPU memory, loss, instances and size."""
        # One right-aligned 11-character column per header: Epoch, GPU_mem, each loss name, Instances, Size
        return ('\n' + '%11s' *
                (4 + len(self.loss_names))) % ('Epoch', 'GPU_mem', *self.loss_names, 'Instances', 'Size')

    def plot_training_samples(self, batch, ni):
        """Plots training samples with their annotations and saves them as train_batch<ni>.jpg in the save dir."""
        plot_images(images=batch['img'],
                    batch_idx=batch['batch_idx'],
                    cls=batch['cls'].squeeze(-1),
                    bboxes=batch['bboxes'],
                    paths=batch['im_file'],
                    fname=self.save_dir / f'train_batch{ni}.jpg',
                    on_plot=self.on_plot)

    def plot_metrics(self):
        """Plots metrics from a CSV file."""
        plot_results(file=self.csv, on_plot=self.on_plot)  # save results.png

    def plot_training_labels(self):
        """Create a labeled training plot of the YOLO model from all boxes and classes in the training dataset."""
        boxes = np.concatenate([lb['bboxes'] for lb in self.train_loader.dataset.labels], 0)
        cls = np.concatenate([lb['cls'] for lb in self.train_loader.dataset.labels], 0)
        plot_labels(boxes, cls.squeeze(), names=self.data['names'], save_dir=self.save_dir, on_plot=self.on_plot)


# modified
def play_voice(audio_path):
    """
    Play an audio file and block until playback has finished.

    Args:
        audio_path (str): Path of the sound file handed to `pygame.mixer.Sound`.
    """
    pygame.init()
    try:
        pygame.mixer.init()
        sound = pygame.mixer.Sound(audio_path)
        sound.play()
        # Poll with a short sleep instead of spinning, so waiting does not keep a CPU core busy
        while pygame.mixer.get_busy():
            time.sleep(0.05)
    finally:
        # Always shut pygame down, even if loading or playing the sound raised
        pygame.quit()


# modified
def train(model_path, rest_epoch, cfg=DEFAULT_CFG, use_python=False):
    """
    Train a YOLO detection model described by `model_path` on the 'helmet.yaml' dataset.

    Args:
        model_path (str): Model definition passed as the `model` override.
        rest_epoch: Forwarded to `DetectionTrainer.train` (only used when `use_python` is False).
        cfg: Configuration object; only its `device` attribute is read here.
        use_python (bool): When True, train through the `ultralytics.YOLO` Python API instead of the trainer class.
    """
    # modified: the model and dataset are fixed here instead of being read from cfg.model / cfg.data
    overrides = {
        'model': model_path,
        'data': 'helmet.yaml',
        'device': '' if cfg.device is None else cfg.device,
    }

    if not use_python:
        DetectionTrainer(overrides=overrides).train(rest_epoch)
        return

    from ultralytics import YOLO
    YOLO(model_path).train(**overrides)


# modified
def message(my_friend, string, emphasis_frequency=3, interval=1):
    """
    Send a notification to a WeChat contact several times in a row so it is hard to miss.

    Each copy is prefixed with its 1-based index, e.g. '1-<string>', '2-<string>', ...
    Sending is best-effort: the first failure is logged and the remaining repeats are skipped.

    Args:
        my_friend: Object exposing `send_msg(text)`.
        string (str): Message text.
        emphasis_frequency (int): How many times the message is sent. Defaults to 3.
        interval (float): Seconds to sleep after each successful send. Defaults to 1.
    """
    for j in range(1, emphasis_frequency + 1):
        try:
            my_friend.send_msg(str(j) + '-' + string)
        except Exception:
            # Catch Exception rather than everything, so Ctrl+C / SystemExit still stop the program
            LOGGER.info("Failed to send WeChat message.")
            traceback.print_exc()
            break
        time.sleep(interval)


# modified
def main(initial_sequence, training_number, resting_time, year, month, day, hour, minute, second, open_wechat, rest_epoch):
    """
    Train a numbered series of model configs one after another, starting at a scheduled time.

    Waits until the given start time has passed, then trains 'yaml_files/yolov8n_helmet_<i>.yaml' for every
    `i` in `range(initial_sequence, initial_sequence + training_number)`. If a run raises, the error is
    logged, the folder 'runs/detect/train<i>' is removed if it exists, and the same model is retried.

    Args:
        initial_sequence (int): Sequence number of the first yaml file to train.
        training_number (int): How many consecutive yaml files to train.
        resting_time (float): Seconds to sleep between two models (not after the last one).
        year, month, day, hour, minute, second (int): Local date and time at which training may start.
        open_wechat (bool): Whether to log in to WeChat and send progress notifications.
        rest_epoch: Forwarded unchanged to `train`.
    """
    bot = None
    my_friend = None
    if open_wechat:
        bot = Bot()
        my_friend = bot.friends().search('WeChat nickname')[0]

    start_at = datetime.datetime(year, month, day, hour, minute, second)
    last_sequence = initial_sequence + training_number - 1
    try:
        # Sleep between checks instead of spinning, so waiting for the start time does not burn a CPU core
        while datetime.datetime.now() <= start_at:
            time.sleep(1)

        for i in range(initial_sequence, last_sequence + 1):
            model_name = "yolov8n_helmet_" + str(i)
            model_path = "yaml_files/" + model_name + ".yaml"
            while True:  # retry the same model until a run finishes without an exception
                start_time = time.time()
                # play_voice("pythonCode/Start training model.mp3")
                try:
                    if open_wechat:
                        message(my_friend, model_name + " starts training.")
                    train(model_path, rest_epoch)
                except Exception:
                    # Exception (not a bare except) so that Ctrl+C can still break out of the retry loop
                    LOGGER.info("Unexpected Error.")
                    traceback.print_exc()
                    time.sleep(60)
                    # Remove the output folder of the failed run before retrying
                    folder_path = "runs/detect/train" + str(i)
                    if os.path.exists(folder_path):
                        shutil.rmtree(folder_path)
                    if open_wechat:
                        message(my_friend, model_name + " occurs error.")
                    continue
                finally:
                    # Runs for successful and failed attempts alike (also before the `continue` above)
                    # play_voice("pythonCode/Model training has ended.mp3")
                    end_time = time.time()
                    # Format by hand: time.gmtime() would wrap the hour count back to 00 after 24 hours
                    minutes, seconds = divmod(int(end_time - start_time), 60)
                    hours, minutes = divmod(minutes, 60)
                    run_time = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
                    LOGGER.info(f"\n{colorstr('red', 'Start_time:')} {colorstr('blue', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time)))}")
                    LOGGER.info(f"{colorstr('red', 'End_time:')} {colorstr('blue', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end_time)))}")
                    LOGGER.info(f"Run_time: {run_time}\n")
                if open_wechat:
                    message(my_friend, model_name + " training has ended. The running time is " + run_time)
                if i < last_sequence:
                    time.sleep(resting_time)
                break
    finally:
        # Log out of WeChat even when training is interrupted or an error propagates
        if bot is not None:
            bot.logout()


# modified
if __name__ == '__main__':
    # --- Which models to train ---
    # Sequence number of the first yaml file to be trained
    initial_sequence = 1
    # Total number of yaml files to be trained
    training_number = 3

    # --- When to train ---
    # Training start time
    year, month, day = 2023, 10, 29
    hour, minute, second = 2, 28, 0
    # Seconds to let the computer rest after training one yaml file
    resting_time = 3600
    # Rest for a period of time after training epoch=50, 100, 150; adjust at will.
    # Leave the list empty to train to the end without resting.
    rest_epoch = [50, 100, 150]

    # --- Notifications ---
    # True or False: whether to enable WeChat message notifications
    open_wechat = True

    main(
        initial_sequence=initial_sequence,
        training_number=training_number,
        resting_time=resting_time,
        year=year,
        month=month,
        day=day,
        hour=hour,
        minute=minute,
        second=second,
        open_wechat=open_wechat,
        rest_epoch=rest_epoch,
    )


trainer.py-------------------------------------------------- --------------------------------
# Excerpt of the epoch loop; the enclosing method and the rest of the loop body are not shown here.
# NOTE(review): `rest_epoch` and `open_night_break` are not defined in this excerpt - they must be
# provided by the enclosing scope (e.g. as extra parameters); confirm against the full trainer.py.
# NOTE(review): this excerpt uses `time`, `datetime` (called as `datetime(...)` / `datetime.now()`,
# i.e. the class, not the module), `colorstr` and `LOGGER`; verify they are imported in trainer.py.
for epoch in range(self.start_epoch, self.epochs):

    # modified
    # Scheduled rest: before starting an epoch listed in rest_epoch, pause for a fixed 20 minutes.
    if epoch in rest_epoch:
        resting_time = 1200
        LOGGER.info(f"Rest for {resting_time / 60} minutes after training for {epoch} epochs")
        # NOTE(review): the format string below contains a space after '%H:' - confirm whether intended.
        LOGGER.info(f"{colorstr('red', 'Start_time:')} {colorstr('blue', time.strftime('%Y-%m-%d %H: %M:%S', time.localtime(time.time())))}")
        time.sleep(resting_time)
        LOGGER.info(f"{colorstr('red', 'End_time:')} {colorstr('blue', time.strftime('%Y-%m-%d %H: %M:%S', time.localtime(time.time())))}")
    # Night break: if the current time is between 00:30:00 and 08:30:00 today, wait until 08:30:00 has passed.
    if open_night_break:
        now = datetime.now()
        if datetime(now.year, now.month, now.day, 0, 30, 00) <= datetime.now() and datetime.now() <= datetime(now.year, now.month, now.day, 8, 30, 00):
            LOGGER.info(f"Stop training between 00:30:00 and 08:30:00")
            LOGGER.info(f"{colorstr('red', 'Start_time:')} {colorstr('blue', time.strftime('%Y-%m-%d %H: %M:%S', time.localtime(time.time())))}")
            # Re-check once a minute; `now` was captured before the wait, so the 08:30 deadline is on that same date.
            while True:
                if datetime(now.year, now.month, now.day, 8, 30, 00) < datetime.now():
                    LOGGER.info(f"{colorstr('red', 'End_time:')} {colorstr('blue', time.strftime('%Y-%m-%d %H: %M:%S', time.localtime(time.time())))}")
                    break
                time.sleep(60)

    self.epoch = epoch
    self.run_callbacks('on_train_epoch_start')

YAML files corresponding to multiple models to be trained: