Using a YOLOv5 model with OpenCV for area detection

This article shows how to call a YOLOv5 model with OpenCV to detect whether an object of a specified class is inside a fixed area of the frame.

Introduction:

This approach works with both the official pre-trained YOLOv5 weights and a .pt model you trained yourself. It can run detection on images and videos, and it also supports real-time detection from a camera.

File structure:

The files in utils and models come from the official YOLOv5 repository; they are required.

Code analysis:

First import the required packages:

import cv2
import torch
from models.experimental import attempt_load
from utils.general import is_ascii, non_max_suppression, scale_coords, set_logging
from utils.plots import Annotator, colors
from utils.torch_utils import select_device

Then define the parameters that YOLOv5 needs:

weights='pretrained/yolov5s.pt', # path to the network weights
conf_thres=0.6, # confidence threshold; detections below this value are not displayed
iou_thres=0.2, # NMS IoU threshold
max_det=1, # maximum number of detections to display (up to 1000; set to 1 here)
device='', # cuda device, i.e. 0 or 0,1,2,3 or cpu
classes=0, # detect only this class; the index-to-name mapping is defined in the dataset's yaml file
agnostic_nms=False, # class-agnostic NMS
line_thickness=2, # bounding-box line thickness (pixels)
half=False, # use FP16 half-precision inference

Here, I only used these parameters. If you need others, refer to the detect.py file in the official YOLOv5 source code.
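If you are not sure which index corresponds to which class, you can read the names list from the dataset's yaml file. A minimal sketch, assuming PyYAML is installed and using 'data/coco128.yaml' as a placeholder path:

import yaml

with open('data/coco128.yaml', errors='ignore') as f:
    data = yaml.safe_load(f)
print(data['names']) # index-to-name mapping, e.g. 0 -> 'person' for COCO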

Model import:

model = attempt_load(weights, device=device)
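In the full script below, the device is selected first and the class names are read from the loaded model:

device = select_device(device) # '' picks CUDA when available, otherwise CPU
model = attempt_load(weights, device=device) # load FP32 model
names = model.module.names if hasattr(model, 'module') else model.names # get class names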

Read input (video or camera):

cap = cv2.VideoCapture('video2.mp4')
# cap = cv2.VideoCapture(0)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) # Get the width of the video
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) # Get the height of the video
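The commented-out line switches the source to camera 0. For a single picture, you can skip the capture loop and run one detection pass on the image instead; a minimal sketch with a placeholder file name:

frame = cv2.imread('picture.jpg') # read one image
# then run preprocessing, inference, and drawing once on this frame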

The video's width and height are needed so that the detection area can be placed relative to the original frame. The area is defined as follows: (x, y) is the coordinate of the upper left corner of the detection area, and w and h are its width and height.

w = 500
h = 700
x = int(width / 2 - w / 2)
y = int(height / 2 - h / 2)

font = cv2.FONT_HERSHEY_SIMPLEX # Set font style
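For example, assuming a 1920x1080 video, these values center the 500x700 region in the frame:

x = int(1920 / 2 - 500 / 2) # 710
y = int(1080 / 2 - 700 / 2) # 190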

Call model detection:

#Inference
pred = model(img, augment=False, visualize=False)[0]

#NMS
pred = non_max_suppression(pred, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det)

Here, augment enables test-time augmentation, which can improve detection results but demands more from the hardware.
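Before inference, the frame must be converted into a tensor the model accepts. The full script below does this with raw transposes; a more robust sketch uses the official letterbox helper, which resizes and pads the frame to a multiple of the model stride (depending on your yolov5 version it lives in utils.augmentations or utils.datasets). scale_coords later maps the boxes back to the original frame:

import numpy as np
from utils.augmentations import letterbox

def preprocess(frame, device, half=False, img_size=640):
    img = letterbox(frame, img_size, stride=32, auto=True)[0] # resize + pad to a stride multiple
    img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, HWC to CHW
    img = np.ascontiguousarray(img)
    img = torch.from_numpy(img).to(device)
    img = img.half() if half else img.float() # uint8 to fp16/32
    img /= 255 # 0 - 255 to 0.0 - 1.0
    return img[None] # add batch dimension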

non_max_suppression returns one tensor per image, where each row holds the box coordinates, the confidence, and the class. Obtain these for each detection box, compare the box with the delimited detection area, determine whether it lies within the area, and display the result:

for *xyxy, conf, cls in reversed(det):
    c = int(cls)  # integer class
    # print(names[c])  # print the class name
    label = f'{names[c]} {conf:.2f}'  # label
    annotator.box_label(xyxy, label, color=colors(c, True))
    # print(xyxy)
    x1 = int(xyxy[0].numpy())  # x coordinate of the upper left corner
    y1 = int(xyxy[1].numpy())  # y coordinate of the upper left corner
    x2 = int(xyxy[2].numpy())  # x coordinate of the lower right corner
    y2 = int(xyxy[3].numpy())  # y coordinate of the lower right corner
    if x1 >= x and y1 >= y and x2 <= x + w and y2 <= y + h:
        result = "True"
        color = (0, 255, 0)
    else:
        result = "False"
        color = (0, 0, 255)
    cv2.putText(frame, result, (10, 30), font, 1.0, color, 2)
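The check above counts an object as inside only when its whole bounding box fits within the area. If that is too strict, a common looser variation (my addition, not part of the original code) tests only the box center:

cx = (x1 + x2) // 2 # center of the detection box
cy = (y1 + y2) // 2
result = "True" if (x <= cx <= x + w and y <= cy <= y + h) else "False"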

Draw the detection area on the original video:

cv2.rectangle(frame, (x, y), (x + w, y + h), color, 5)

Full code:

import cv2
import torch
from models.experimental import attempt_load
from utils.general import is_ascii, non_max_suppression, scale_coords, set_logging
from utils.plots import Annotator, colors
from utils.torch_utils import select_device


@torch.no_grad()
def run(weights='pretrained/yolov5s.pt', # model.pt path(s)
        conf_thres=0.6, #confidence threshold
        iou_thres=0.2, # NMS IOU threshold
        max_det=1, # maximum detections per image
        device='', # cuda device, i.e. 0 or 0,1,2,3 or cpu
        classes=0, # filter by class: --class 0, or --class 0 2 3
        agnostic_nms=False, # class-agnostic NMS
        line_thickness=2, # bounding box thickness (pixels)
        half=False, # use FP16 half-precision inference
        ):
    #Initialize
    color = (0, 0, 255)  # default area color before the first detection
    set_logging()
    device = select_device(device)
    print(device)
    half &= device.type != 'cpu' # half precision only supported on CUDA

    model = attempt_load(weights, device=device) # load FP32 model
    names = model.module.names if hasattr(model, 'module') else model.names # get class names
    ascii = is_ascii(names) # names are ascii (use PIL for UTF-8)

    cap = cv2.VideoCapture('video2.mp4')
    # cap = cv2.VideoCapture(0)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) # Get the width of the video
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) # Get the height of the video
    w = 500
    h = 700
    x = int(width / 2 - w / 2)
    y = int(height / 2 - h / 2)
    font = cv2.FONT_HERSHEY_SIMPLEX # Set font style

    while True:
        # Get a frame
        ret, frame = cap.read()
        if not ret:
            break

        # frame = cv2.resize(frame, (width, height)) # Set the width and length of the screen
        img = torch.from_numpy(frame).to(device)
        img = img.half() if half else img.float() # uint8 to fp16/32
        img = img / 255 # 0 - 255 to 0.0 - 1.0
        if len(img.shape) == 3:
            img = img[None] # expand for batch dim
        img = img.transpose(2, 3)
        img = img.transpose(1, 2) # NHWC to NCHW for the model

        #Inference
        pred = model(img, augment=False, visualize=False)[0]

        #NMS
        pred = non_max_suppression(pred, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det)

        # Process predictions
        for i, det in enumerate(pred): # detections per image
            s = ''
            annotator = Annotator(frame, line_width=line_thickness, pil=not ascii)
            if len(det):
                # Rescale boxes from img_size to im0 size
                det[:, :4] = scale_coords(img.shape[2:], det[:, :4], frame.shape).round()

                # Print results
                for c in det[:, -1].unique():
                    n = (det[:, -1] == c).sum() # detections per class
                    s += str(n.item()) + ' ' + str(names[int(c)]) + ' ' # add to string

                # Write results
                for *xyxy, conf, cls in reversed(det):
                    c = int(cls) # integer class
                    # print(names[c]) # Output category names
                    label = f'{names[c]} {conf:.2f}' # label
                    annotator.box_label(xyxy, label, color=colors(c, True))
                    # print(xyxy)
                    # print(int(xyxy[0].numpy()), int(xyxy[1].numpy()), int(xyxy[2].numpy()), int(xyxy[3].numpy()) )
                    x1 = int(xyxy[0].numpy())
                    y1 = int(xyxy[1].numpy())
                    x2 = int(xyxy[2].numpy())
                    y2 = int(xyxy[3].numpy())
                    if x1 >= x and y1 >= y and x2 <= x + w and y2 <= y + h:
                        result = "True"
                        color = (0, 255, 0)
                    else:
                        result = "False"
                        color = (0, 0, 255)
                    cv2.putText(frame, result, (10, 30), font, 1.0, color, 2)

        cv2.rectangle(frame, (x, y), (x + w, y + h), color, 5) # draw the detection area: two corner points, color, line width

        # print('result:' + s)

        cv2.imshow('frame', frame)
        k = cv2.waitKey(1) & 0xFF
        if k == 27: # Esc key to quit
            break

    cap.release()
    cv2.destroyAllWindows()


def main():
    run()


if __name__ == "__main__":
    main()
