Using a YOLOv5 model with OpenCV to detect whether an object of a specified class is inside a fixed area
Introduction:
File structure:
The files in utils and models can be found in the official yolov5 files. These are required.
This method works both with the official pre-trained YOLOv5 models and with .pt models you have trained yourself. It can run detection on pictures and videos, and it can also perform real-time detection on a camera stream.
Code analysis:
First import the required packages:
import cv2
import torch
from models.experimental import attempt_load
from utils.general import is_ascii, non_max_suppression, scale_coords, set_logging
from utils.plots import Annotator, colors
from utils.torch_utils import select_device
Then define some parameters that yolo needs to use
# Detection settings used by the rest of the walkthrough.
# They are plain assignments (no trailing commas) so that each name holds the
# value itself rather than a one-element tuple.
weights = 'pretrained/yolov5s.pt'  # Path to the network weights
conf_thres = 0.6  # Confidence threshold: detections scoring below this value are not displayed
iou_thres = 0.2  # NMS IoU threshold
max_det = 1  # Maximum number of detections shown per image (up to 1000); set to 1 here
device = ''  # CUDA device, e.g. 0 or 0,1,2,3, or cpu
classes = 0  # Class index to detect; the available classes are listed in the dataset's yaml file
agnostic_nms = False  # Class-agnostic (cross-category) NMS
line_thickness = 2  # Line thickness of the drawn bounding boxes
half = False  # Whether to run inference in FP16 half precision
Here, I only used these parameters. If you have other needs, you can refer to the detect.py file in the official source code of yolov5 for modification.
Model import:
# Load the model from the weights file at `weights`, passing `device` as the target device
model = attempt_load(weights, device=device)
Read input (video or camera):
cap = cv2.VideoCapture('video2.mp4')  # Open the input video file
# cap = cv2.VideoCapture(0)  # Alternative: read from camera index 0 instead
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))  # Get the width of the video
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))  # Get the height of the video
The width and height of the video are obtained so that the detection area can later be drawn on the original frames. The detection area is defined below: (x, y) is the coordinate of its upper-left corner, and w and h are its width and height. With the formulas used here, the area is centered in the frame.
w = 500  # Width of the detection area in pixels
h = 700  # Height of the detection area in pixels
# Center the area in the frame: (x, y) is its upper-left corner
x = int(width / 2 - w / 2)
y = int(height / 2 - h / 2)
font = cv2.FONT_HERSHEY_SIMPLEX  # Set font style
Call model detection:
# Forward pass: keep only the first element of the model output
raw_output = model(img, augment=False, visualize=False)
pred = raw_output[0]
# Filter the raw predictions with non-maximum suppression
pred = non_max_suppression(
    pred,
    conf_thres,
    iou_thres,
    classes,
    agnostic_nms,
    max_det=max_det,
)
Here, augment switches on augmented inference (test-time augmentation), which can improve detection results but requires more computation and therefore has higher hardware requirements.
Obtain the coordinates, confidence, and class label of each detected bounding box, compare the box with the defined detection area to determine whether it lies entirely inside it, and display the result:
# Walk over the detections of the current frame; each row is (x1, y1, x2, y2, conf, cls)
for *xyxy, conf, cls in reversed(det):
    c = int(cls)  # integer class index
    label = f'{names[c]} {conf:.2f}'  # class name followed by the confidence
    annotator.box_label(xyxy, label, color=colors(c, True))
    # int() works for tensors on any device; .numpy() would raise for CUDA tensors
    x1 = int(xyxy[0])  # x-coordinate (abscissa) of the upper-left corner
    y1 = int(xyxy[1])  # y-coordinate (ordinate) of the upper-left corner
    x2 = int(xyxy[2])  # x-coordinate (abscissa) of the lower-right corner
    y2 = int(xyxy[3])  # y-coordinate (ordinate) of the lower-right corner
    # The box counts as "inside" only when all four edges are within the detection area
    if x1 >= x and y1 >= y and x2 <= x + w and y2 <= y + h:
        result = "True"
        color = (0, 255, 0)  # green (BGR)
    else:
        result = "False"
        color = (0, 0, 255)  # red (BGR)
    cv2.putText(frame, result, (10, 30), font, 1.0, color, 2)
Draw the detection area on the original video:
# Outline the detection area on the frame, using the colour chosen by the in/out check
area_top_left = (x, y)
area_bottom_right = (x + w, y + h)
cv2.rectangle(frame, area_top_left, area_bottom_right, color, 5)
Full code:
import cv2
import torch

from models.experimental import attempt_load
from utils.general import is_ascii, non_max_suppression, scale_coords, set_logging
from utils.plots import Annotator, colors
from utils.torch_utils import select_device


def _box_inside_region(box, region):
    """Return True when ``box`` (x1, y1, x2, y2) lies entirely inside ``region`` (x, y, w, h)."""
    x1, y1, x2, y2 = box
    rx, ry, rw, rh = region
    return x1 >= rx and y1 >= ry and x2 <= rx + rw and y2 <= ry + rh


@torch.no_grad()
def run(weights='pretrained/yolov5s.pt',  # model.pt path(s)
        conf_thres=0.6,  # confidence threshold
        iou_thres=0,  # NMS IOU threshold
        max_det=1,  # maximum detections per image
        device='',  # cuda device, i.e. 0 or 0,1,2,3 or cpu
        classes=0,  # filter by class: --class 0, or --class 0 2 3
        agnostic_nms=False,  # class-agnostic NMS
        line_thickness=2,  # bounding box thickness (pixels)
        half=False,  # use FP16 half-precision inference
        source='video2.mp4',  # video file path, or a camera index such as 0
        region_size=(500, 700),  # (width, height) of the centered detection area in pixels
        ):
    """Run YOLOv5 on a video stream and show whether a detection lies inside a fixed area.

    Each frame is passed through the model, the detections are drawn on the
    frame, and a rectangle of ``region_size`` centered in the frame is drawn in
    green when at least one detected box is entirely inside it and in red
    otherwise (including when nothing is detected). When there are detections,
    the text "True"/"False" is also written in the top-left corner. The
    annotated frames are shown in a window named 'frame' until the stream ends
    or ESC is pressed.

    Args:
        weights: Path of the model weights to load.
        conf_thres: Confidence threshold passed to non-maximum suppression.
        iou_thres: IoU threshold passed to non-maximum suppression.
        max_det: Maximum number of detections kept per frame.
        device: Device string handed to ``select_device``.
        classes: Class filter passed to non-maximum suppression.
        agnostic_nms: Whether NMS is class-agnostic.
        line_thickness: Line width of the drawn bounding boxes.
        half: Request FP16 inference; ignored when running on the CPU.
        source: Anything ``cv2.VideoCapture`` accepts (file path or camera index).
        region_size: ``(width, height)`` of the detection area.

    Raises:
        RuntimeError: If the video source cannot be opened.
    """
    set_logging()
    device = select_device(device)
    print(device)
    half &= device.type != 'cpu'  # half precision only supported on CUDA
    model = attempt_load(weights, device=device)  # load FP32 model
    if half:
        model.half()  # weights must match the FP16 input dtype
    names = model.module.names if hasattr(model, 'module') else model.names  # get class names
    names_are_ascii = is_ascii(names)  # names are ascii (use PIL for UTF-8)

    cap = cv2.VideoCapture(source)
    if not cap.isOpened():
        raise RuntimeError(f'Could not open video source: {source!r}')

    try:
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))  # width of the video
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))  # height of the video
        w, h = region_size
        x = int(width / 2 - w / 2)  # upper-left corner of the centered area
        y = int(height / 2 - h / 2)
        region = (x, y, w, h)
        font = cv2.FONT_HERSHEY_SIMPLEX  # font used for the True/False text

        while True:
            ret, frame = cap.read()
            if not ret:  # end of stream or read failure: stop instead of crashing on None
                break

            # OpenCV delivers BGR frames; YOLOv5 models expect RGB input.
            # NOTE(review): the frame is fed at its native resolution without
            # letterboxing - confirm the source size is compatible with the model stride.
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            img = torch.from_numpy(rgb).to(device)
            img = img.half() if half else img.float()  # uint8 to fp16/32
            img = img / 255  # 0 - 255 to 0.0 - 1.0
            if len(img.shape) == 3:
                img = img[None]  # expand for batch dim
            img = img.transpose(2, 3)  # B,H,W,C -> B,H,C,W
            img = img.transpose(1, 2)  # B,H,C,W -> B,C,H,W

            # Inference
            pred = model(img, augment=False, visualize=False)[0]
            # NMS
            pred = non_max_suppression(pred, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det)

            # Reset per frame so an empty frame never reuses a stale (or unbound) colour
            color = (0, 0, 255)
            for det in pred:  # detections per image
                annotator = Annotator(frame, line_width=line_thickness, pil=not names_are_ascii)
                if not len(det):
                    continue
                # Rescale boxes from the network input size to the frame size
                det[:, :4] = scale_coords(img.shape[2:], det[:, :4], frame.shape).round()

                inside = False
                for *xyxy, conf, cls in reversed(det):
                    c = int(cls)  # integer class
                    label = f'{names[c]} {conf:.2f}'
                    annotator.box_label(xyxy, label, color=colors(c, True))
                    # int() works on CPU and CUDA tensors alike (.numpy() fails on CUDA)
                    box = [int(v) for v in xyxy]
                    inside = inside or _box_inside_region(box, region)

                result = "True" if inside else "False"
                color = (0, 255, 0) if inside else (0, 0, 255)
                cv2.putText(frame, result, (10, 30), font, 1.0, color, 2)

            # frame, upper-left corner, lower-right corner, colour, line width
            cv2.rectangle(frame, (x, y), (x + w, y + h), color, 5)
            cv2.imshow('frame', frame)
            if cv2.waitKey(1) & 0xFF == 27:  # ESC quits
                break
    finally:
        # Always free the capture device and close the preview window
        cap.release()
        cv2.destroyAllWindows()


def main():
    """Run the detector with its default settings."""
    run()


if __name__ == "__main__":
    main()
The knowledge points of the article match the official knowledge files, and you can further learn related knowledge. OpenCV skill tree Home page Overview 23904 people are learning the system