When you have several modified versions of a model to train, they can be trained automatically one after another. There is no need to wait for one version to finish training and then manually click to start the next. This way, you can make full use of the night hours, and remote training of the models becomes easier.
The functions implemented are as follows:
1. You can choose whether to log in to the web version of WeChat on your computer, and then receive training information in real time through your mobile phone.
2. Customize the training start time.
3. Continuously train multiple models and handle the possible exception “RuntimeError: CUDA error: an illegal memory access was encountered” so that continuous training is not interrupted.
4. If you train on a laptop for more than ten hours, you can choose the epochs after which training pauses, so the machine rests for a while before continuing.
5. Due to some reasons, training should be stopped in certain time periods, such as 00:30:00~08:30:00. You can set whether to stop training in a custom time period, and then automatically continue training after this period.
Update: the third function cannot actually be implemented. After testing today, the exception “RuntimeError: CUDA error: an illegal memory access was encountered” cannot be recovered from inside the program; it keeps being raised. Adding a delay to let the machine rest, or restarting training from the original code, does not work. The program must be stopped manually; only then does the exception go away. I had assumed this would be simple, but it cannot be done at all. I do not know whether turning off graphics-card overclocking reduces how often this error occurs; I turned it off and the error has not appeared again for the time being.
Make the changes in ultralytics\models\yolo\detect\train.py and ultralytics\engine\trainer.py. The parts marked with "# modified" are newly added or changed. The code is as follows:
# train.py -------------------------------------------------------------------------------
# modified
# from ultralytics import YOLO
# if __name__ == '__main__':
#     model = YOLO('yaml_files/yolov8n_helmet_1.yaml')
#     model.train(data='helmet.yaml', epochs=200)

# Ultralytics YOLO , AGPL-3.0 license
from copy import copy

import numpy as np

# modified
import pygame
import time
from ultralytics.utils import colorstr
import datetime
from wxpy import *
import traceback
import os
import shutil

from ultralytics.data import build_dataloader, build_yolo_dataset
from ultralytics.engine.trainer import BaseTrainer
from ultralytics.models import yolo
from ultralytics.nn.tasks import DetectionModel
from ultralytics.utils import DEFAULT_CFG, LOGGER, RANK
from ultralytics.utils.plotting import plot_images, plot_labels, plot_results
from ultralytics.utils.torch_utils import de_parallel, torch_distributed_zero_first


# BaseTrainer python usage
class DetectionTrainer(BaseTrainer):
    """Trainer for YOLO detection models, built on top of ``BaseTrainer``."""

    def build_dataset(self, img_path, mode='train', batch=None):
        """
        Build YOLO Dataset.

        Args:
            img_path (str): Path to the folder containing images.
            mode (str): `train` mode or `val` mode, users are able to customize different augmentations for each mode.
            batch (int, optional): Size of batches, this is for `rect`. Defaults to None.
        """
        # Grid size: the model's largest stride (0 when no model is set), but never below 32.
        gs = max(int(de_parallel(self.model).stride.max() if self.model else 0), 32)
        return build_yolo_dataset(self.args, img_path, batch, self.data, mode=mode, rect=mode == 'val', stride=gs)

    def get_dataloader(self, dataset_path, batch_size=16, rank=0, mode='train'):
        """Construct and return dataloader."""
        assert mode in ['train', 'val']
        with torch_distributed_zero_first(rank):  # init dataset *.cache only once if DDP
            dataset = self.build_dataset(dataset_path, mode, batch_size)
        shuffle = mode == 'train'
        if getattr(dataset, 'rect', False) and shuffle:
            LOGGER.warning("WARNING 'rect=True' is incompatible with DataLoader shuffle, setting shuffle=False")
            shuffle = False
        # Validation uses twice as many workers as training.
        workers = self.args.workers if mode == 'train' else self.args.workers * 2
        return build_dataloader(dataset, batch_size, workers, shuffle, rank)  # return dataloader

    def preprocess_batch(self, batch):
        """Preprocesses a batch of images by scaling and converting to float."""
        batch['img'] = batch['img'].to(self.device, non_blocking=True).float() / 255
        return batch

    def set_model_attributes(self):
        """Attach the number of classes, the class names and the hyperparameters to the model."""
        # nl = de_parallel(self.model).model[-1].nl  # number of detection layers (to scale hyps)
        # self.args.box *= 3 / nl  # scale to layers
        # self.args.cls *= self.data["nc"] / 80 * 3 / nl  # scale to classes and layers
        # self.args.cls *= (self.args.imgsz / 640) ** 2 * 3 / nl  # scale to image size and layers
        self.model.nc = self.data['nc']  # attach number of classes to model
        self.model.names = self.data['names']  # attach class names to model
        self.model.args = self.args  # attach hyperparameters to model
        # TODO: self.model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc

    def get_model(self, cfg=None, weights=None, verbose=True):
        """Return a YOLO detection model."""
        model = DetectionModel(cfg, nc=self.data['nc'], verbose=verbose and RANK == -1)
        if weights:
            model.load(weights)
        return model

    def get_validator(self):
        """Returns a DetectionValidator for YOLO model validation."""
        self.loss_names = 'box_loss', 'cls_loss', 'dfl_loss'
        return yolo.detect.DetectionValidator(self.test_loader, save_dir=self.save_dir, args=copy(self.args))

    def label_loss_items(self, loss_items=None, prefix='train'):
        """
        Returns a loss dict with labeled training loss items tensor.

        When ``loss_items`` is None, only the list of ``'<prefix>/<loss_name>'`` keys is returned.
        """
        # Not needed for classification but necessary for segmentation & detection
        keys = [f'{prefix}/{x}' for x in self.loss_names]
        if loss_items is not None:
            loss_items = [round(float(x), 5) for x in loss_items]  # convert tensors to 5 decimal place floats
            return dict(zip(keys, loss_items))
        else:
            return keys

    def progress_string(self):
        """Returns a formatted string of training progress with epoch, GPU memory, loss, instances and size."""
        # One right-aligned 11-character column per header: 4 fixed headers plus one per loss name.
        # (The pasted version had lost the '\n' and '%11s' tokens, which made the % operator raise TypeError.)
        return ('\n' + '%11s' *
                (4 + len(self.loss_names))) % ('Epoch', 'GPU_mem', *self.loss_names, 'Instances', 'Size')

    def plot_training_samples(self, batch, ni):
        """Plots training samples with their annotations."""
        plot_images(images=batch['img'],
                    batch_idx=batch['batch_idx'],
                    cls=batch['cls'].squeeze(-1),
                    bboxes=batch['bboxes'],
                    paths=batch['im_file'],
                    fname=self.save_dir / f'train_batch{ni}.jpg',
                    on_plot=self.on_plot)

    def plot_metrics(self):
        """Plots metrics from a CSV file."""
        plot_results(file=self.csv, on_plot=self.on_plot)  # save results.png

    def plot_training_labels(self):
        """Create a labeled training plot of the YOLO model."""
        boxes = np.concatenate([lb['bboxes'] for lb in self.train_loader.dataset.labels], 0)
        cls = np.concatenate([lb['cls'] for lb in self.train_loader.dataset.labels], 0)
        plot_labels(boxes, cls.squeeze(), names=self.data['names'], save_dir=self.save_dir, on_plot=self.on_plot)


# modified
def play_voice(audio_path):
    """
    Play an audio file with pygame and block until playback has finished.

    Args:
        audio_path (str): Path of the audio file to play.
    """
    pygame.init()
    try:
        pygame.mixer.init()
        sound = pygame.mixer.Sound(audio_path)
        sound.play()
        while pygame.mixer.get_busy():
            time.sleep(0.1)  # poll instead of spinning a CPU core while the sound plays
    finally:
        pygame.quit()  # always release pygame, even if loading or playing the file failed


# modified
def train(model_path, rest_epoch, cfg=DEFAULT_CFG, use_python=False):
    """
    Train and optimize YOLO model given training data and device.

    Args:
        model_path (str): Path of the model yaml file to train.
        rest_epoch (list): Epochs at which training pauses; forwarded to ``trainer.train``.
        cfg: Configuration object; only ``cfg.device`` is read here.
        use_python (bool): If True, train through the ``YOLO`` Python API instead of ``DetectionTrainer``.
            Note that ``rest_epoch`` is not forwarded on this path.
    """
    # modified
    # model = cfg.model or 'yolov8n.pt'
    # data = cfg.data or 'coco128.yaml'  # or yolo.ClassificationDataset("mnist")
    model = model_path
    data = 'helmet.yaml'
    device = cfg.device if cfg.device is not None else ''

    args = dict(model=model, data=data, device=device)
    if use_python:
        from ultralytics import YOLO
        YOLO(model).train(**args)
    else:
        trainer = DetectionTrainer(overrides=args)
        trainer.train(rest_epoch)


# modified
def message(my_friend, string):
    """
    Send a WeChat message several times in a row so that it is not overlooked.

    Each copy is prefixed with its sequence number ("1-", "2-", "3-"). Sending is best effort: on the first
    failure the error is logged and the remaining copies are skipped.

    Args:
        my_friend: Chat object exposing ``send_msg`` (obtained from ``bot.friends().search(...)`` in ``main``).
        string (str): Text of the message.
    """
    emphasis_frequency = 3
    for j in range(1, emphasis_frequency + 1):
        try:
            my_friend.send_msg(str(j) + '-' + string)
        except Exception:  # not a bare except: Ctrl-C / SystemExit must still be able to stop the program
            LOGGER.info("Failed to send WeChat message.")
            traceback.print_exc()
            break
        time.sleep(1)


def _wait_until(start_at):
    """Block until the local time reaches ``start_at`` (a naive ``datetime.datetime``)."""
    while True:
        remaining = (start_at - datetime.datetime.now()).total_seconds()
        if remaining <= 0:
            return
        # Sleep in short slices instead of busy-waiting; a slice is never longer than one minute.
        time.sleep(min(remaining, 60))


def _format_timestamp(timestamp):
    """Format a ``time.time()`` value as local 'YYYY-mm-dd HH:MM:SS'."""
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(timestamp))


def _format_duration(seconds):
    """Format a duration in seconds as 'HH:MM:SS'; the hour count keeps growing past 24 instead of wrapping."""
    hours, remainder = divmod(int(seconds), 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}"


# modified
def main(initial_sequence, training_number, resting_time, year, month, day, hour, minute, second, open_wechat,
         rest_epoch):
    """
    Train several model yaml files one after another, starting at a given time.

    The files trained are ``yaml_files/yolov8n_helmet_<i>.yaml`` for ``i`` from ``initial_sequence`` to
    ``initial_sequence + training_number - 1``. When training a file raises an exception, the error is logged,
    the folder ``runs/detect/train<i>`` is removed if it exists, and the same file is retried after 60 seconds.

    Args:
        initial_sequence (int): Sequence number of the first yaml file to train.
        training_number (int): Number of yaml files to train.
        resting_time (int | float): Seconds to rest between two successfully trained files.
        year, month, day, hour, minute, second (int): Local time at which training starts.
        open_wechat (bool): Whether to log in to WeChat and send notifications.
        rest_epoch (list): Epochs at which training pauses; forwarded to ``train``.
    """
    bot = None
    my_friend = None
    if open_wechat:
        bot = Bot()
        my_friend = bot.friends().search('WeChat nickname')[0]

    _wait_until(datetime.datetime(year, month, day, hour, minute, second))

    last_sequence = initial_sequence + training_number - 1
    for i in range(initial_sequence, initial_sequence + training_number):
        while True:  # retry the same yaml file until it trains without raising
            start_time = time.time()
            # play_voice("pythonCode/Start training model.mp3")
            model_path = "yaml_files/yolov8n_helmet_" + str(i) + ".yaml"
            try:
                if open_wechat:
                    message(my_friend, "yolov8n_helmet_" + str(i) + " starts training.")
                train(model_path, rest_epoch)
            except Exception:  # not a bare except: Ctrl-C must stop the run instead of restarting training
                LOGGER.info("Unexpected Error.")
                traceback.print_exc()
                time.sleep(60)
                # Remove the output folder of the failed run before retrying.
                # NOTE(review): assumes the run of yaml file <i> is saved in runs/detect/train<i> - confirm.
                folder_path = "runs/detect/train" + str(i)
                if os.path.exists(folder_path):
                    shutil.rmtree(folder_path)
                if open_wechat:
                    message(my_friend, "yolov8n_helmet_" + str(i) + " occurs error.")
                continue
            finally:
                # Runs after every attempt, successful or not, before the retry or the rest below.
                # play_voice("pythonCode/Model training has ended.mp3")
                end_time = time.time()
                run_time = _format_duration(end_time - start_time)
                LOGGER.info(f"\n{colorstr('red', 'Start_time:')} {colorstr('blue', _format_timestamp(start_time))}")
                LOGGER.info(f"{colorstr('red', 'End_time:')} {colorstr('blue', _format_timestamp(end_time))}")
                LOGGER.info(f"Run_time: {run_time}\n")
                if open_wechat:
                    message(my_friend,
                            "yolov8n_helmet_" + str(i) + " training has ended. The running time is " + run_time)
            if i < last_sequence:
                time.sleep(resting_time)  # let the computer rest before the next yaml file
            break

    if open_wechat:
        bot.logout()


# modified
if __name__ == '__main__':
    initial_sequence = 1  # The starting sequence number of the yaml file to be trained
    training_number = 3  # How many yaml files are there in total to be trained?
    resting_time = 3600  # How many seconds to let the computer rest after training a yaml file
    year, month, day, hour, minute, second = 2023, 10, 29, 2, 28, 0  # Training start time
    open_wechat = True  # True, False, whether to enable WeChat message notifications
    # Let the computer rest for a period of time after training epoch=50, 100, 150. You can adjust it at will.
    # If you want to train to the end without rest, just do not fill in any number of rounds in the square brackets.
    rest_epoch = [50, 100, 150]
    main(initial_sequence, training_number, resting_time, year, month, day, hour, minute, second, open_wechat,
         rest_epoch)

# trainer.py -----------------------------------------------------------------------------
# Fragment of the epoch loop in ultralytics\engine\trainer.py; it is not part of train.py.
# NOTE(review): `rest_epoch` and `open_night_break` are defined outside this fragment - presumably `rest_epoch`
# is the value passed in through `trainer.train(rest_epoch)`; confirm in trainer.py.
# NOTE(review): the fragment calls `datetime.now()` directly, so trainer.py presumably uses
# `from datetime import datetime`; `time` and `colorstr` must be in scope there as well.
for epoch in range(self.start_epoch, self.epochs):
    # modified
    if epoch in rest_epoch:
        resting_time = 1200
        LOGGER.info(f"Rest for {resting_time / 60} minutes after training for {epoch} epochs")
        LOGGER.info(
            f"{colorstr('red', 'Start_time:')} {colorstr('blue', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))}"
        )
        time.sleep(resting_time)
        LOGGER.info(
            f"{colorstr('red', 'End_time:')} {colorstr('blue', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))}"
        )
    if open_night_break:
        now = datetime.now()
        night_start = datetime(now.year, now.month, now.day, 0, 30, 0)
        night_end = datetime(now.year, now.month, now.day, 8, 30, 0)
        if night_start <= now <= night_end:
            LOGGER.info("Stop training between 00:30:00 and 08:30:00")
            LOGGER.info(
                f"{colorstr('red', 'Start_time:')} {colorstr('blue', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))}"
            )
            # Check once a minute whether the night break is over.
            while datetime.now() <= night_end:
                time.sleep(60)
            LOGGER.info(
                f"{colorstr('red', 'End_time:')} {colorstr('blue', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))}"
            )
    self.epoch = epoch
    self.run_callbacks('on_train_epoch_start')
YAML files corresponding to multiple models to be trained: