1. Introduction to YOLOv7
Paper address: https://arxiv.org/abs/2207.02696
GitHub: https://github.com/WongKinYiu/yolov7 (WongKinYiu/yolov7: implementation of the paper "YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors")
YOLOv7 is here! It surpasses YOLOv5, YOLOX, PP-YOLOE, YOLOR, and other object detection networks. YOLOv7 outperforms all known object detectors in both speed and accuracy in the range from 5 FPS to 160 FPS, and has the highest accuracy, 56.8% AP, among all known real-time object detectors running at 30 FPS or higher on a V100 GPU.
2. Adding the YOLOv7 detection heads (IDetect and IAuxDetect) to YOLOv5
2.1 Add the implicit layers ImplicitA and ImplicitM (required by IDetect and IAuxDetect) to common.py:
####################### IDetect IAuxDetect #### start #############################


class ImplicitA(nn.Module):
    """Learnable per-channel additive term (YOLOR-style "implicit knowledge").

    Holds one parameter of shape ``(1, channel, 1, 1)`` that is added to the
    input, broadcasting over batch and spatial dimensions.

    Args:
        channel: Number of channels of the tensors passed to ``forward``.
        mean: Mean of the normal distribution used to initialise the term.
        std: Standard deviation of that distribution.
    """

    def __init__(self, channel, mean=0., std=.02):
        super().__init__()
        self.channel = channel
        self.mean = mean
        self.std = std
        self.implicit = nn.Parameter(torch.zeros(1, channel, 1, 1))
        nn.init.normal_(self.implicit, mean=self.mean, std=self.std)

    def forward(self, x):
        """Return ``x`` shifted by the learned per-channel offset."""
        return self.implicit + x


class ImplicitM(nn.Module):
    """Learnable per-channel multiplicative term (YOLOR-style "implicit knowledge").

    Holds one parameter of shape ``(1, channel, 1, 1)`` that the input is
    multiplied by, broadcasting over batch and spatial dimensions.

    Args:
        channel: Number of channels of the tensors passed to ``forward``.
        mean: Mean of the normal distribution used to initialise the factor.
            Defaults to 1 so a freshly built layer is close to the identity;
            a zero-mean factor would scale every activation to roughly zero.
        std: Standard deviation of that distribution.
    """

    def __init__(self, channel, mean=1., std=.02):
        super().__init__()
        self.channel = channel
        self.mean = mean
        self.std = std
        self.implicit = nn.Parameter(torch.ones(1, channel, 1, 1))
        # normal_ overwrites the ones above, so `mean` decides the starting scale.
        nn.init.normal_(self.implicit, mean=self.mean, std=self.std)

    def forward(self, x):
        """Return ``x`` scaled by the learned per-channel factor."""
        return self.implicit * x


####################### IDetect IAuxDetect #### end ###############################
2.2 Add IDetect and IAuxDetect to yolo.py:
class IDetect(nn.Module):
    """Detection head whose output convs are wrapped by implicit layers.

    Each input feature map goes through ``ImplicitA`` (additive), a 1x1
    convolution and ``ImplicitM`` (multiplicative). ``fuse()`` folds both
    implicit layers into the convolution; after that ``fuseforward`` (conv
    only) is the matching forward pass.

    Args:
        nc: Number of classes.
        anchors: One flat ``[w0, h0, w1, h1, ...]`` sequence per detection layer.
        ch: Input channel count of each detection layer.
        inplace: Decode boxes with in-place slice assignment when True;
            otherwise use the split/concat formulation.
    """

    stride = None  # strides computed during build
    dynamic = False  # force grid reconstruction
    export = False  # export mode
    include_nms = False
    end2end = False
    concat = False

    def __init__(self, nc=80, anchors=(), ch=(), inplace=True):  # detection layer
        super().__init__()
        self.nc = nc  # number of classes
        self.no = nc + 5  # number of outputs per anchor
        self.nl = len(anchors)  # number of detection layers
        self.na = len(anchors[0]) // 2  # number of anchors
        self.grid = [torch.zeros(1)] * self.nl  # init grid
        # forward() reads self.inplace, so it must exist even when no model
        # builder assigns it afterwards.
        self.inplace = inplace
        a = torch.tensor(anchors).float().view(self.nl, -1, 2)
        self.register_buffer('anchors', a)  # shape(nl,na,2)
        self.register_buffer('anchor_grid', a.clone().view(self.nl, 1, -1, 1, 1, 2))  # shape(nl,1,na,1,1,2)
        self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch)  # output conv
        self.ia = nn.ModuleList(ImplicitA(x) for x in ch)
        # mean=1. is passed explicitly so the multiplicative term starts near
        # identity instead of scaling the conv output to ~0.
        self.im = nn.ModuleList(ImplicitM(self.no * self.na, mean=1.) for _ in ch)

    def forward(self, x):
        """Run the head on a list of ``nl`` feature maps (modified in place).

        Returns:
            Training: the list of raw per-layer outputs, each reshaped to
            ``(bs, na, ny, nx, no)``.
            Inference: ``(pred,)`` when ``self.export`` is set, otherwise
            ``(pred, x)``, where ``pred`` is the decoded output of all layers
            concatenated to ``(bs, N, no)``.
        """
        z = []  # inference output
        for i in range(self.nl):
            x[i] = self.m[i](self.ia[i](x[i]))  # conv
            # Apply the multiplicative implicit layer: fuse() folds it into
            # the conv, so skipping it here would make fused and unfused
            # models disagree.
            x[i] = self.im[i](x[i])
            bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            if not self.training:  # inference
                if self.dynamic or self.grid[i].shape[2:4] != x[i].shape[2:4]:
                    self.grid[i] = self._make_grid(nx, ny).to(x[i].device)

                y = x[i].sigmoid()
                if self.inplace:
                    y[..., 0:2] = (y[..., 0:2] * 2 - 0.5 + self.grid[i]) * self.stride[i]  # xy
                    y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
                else:  # for YOLOv5 on AWS Inferentia https://github.com/ultralytics/yolov5/pull/2953
                    xy, wh, conf = y.split((2, 2, self.no - 4), 4)  # tensor_split((2, 4, 5), 4) if torch 1.8.0
                    # _make_grid() returns an un-shifted grid, so the -0.5
                    # offset is needed here exactly as in the in-place branch.
                    xy = (xy * 2 - 0.5 + self.grid[i]) * self.stride[i]  # xy
                    wh = (wh * 2) ** 2 * self.anchor_grid[i]  # wh
                    y = torch.cat((xy, wh, conf), 4)
                z.append(y.view(bs, -1, self.no))

        return x if self.training else (torch.cat(z, 1),) if self.export else (torch.cat(z, 1), x)

    @staticmethod
    def _make_grid(nx=20, ny=20):
        """Return a ``(1, 1, ny, nx, 2)`` float grid holding ``(x, y)`` cell indices."""
        yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
        return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()

    def fuseforward(self, x):
        """Forward pass for a fused head (conv only, no implicit layers).

        The output format is selected by the ``end2end``, ``include_nms`` and
        ``concat`` flags; with none set it is ``(pred, x)``. Note that
        ``self.export`` forces ``self.training`` on, so an exporting head
        returns the raw per-layer outputs.
        """
        z = []  # inference output
        self.training |= self.export
        for i in range(self.nl):
            x[i] = self.m[i](x[i])  # conv
            bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            if not self.training:  # inference
                if self.grid[i].shape[2:4] != x[i].shape[2:4]:
                    self.grid[i] = self._make_grid(nx, ny).to(x[i].device)

                y = x[i].sigmoid()
                if not torch.onnx.is_in_onnx_export():
                    y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                    y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
                else:
                    # Algebraically the same decode, written without slice assignment.
                    xy, wh, conf = y.split((2, 2, self.nc + 1), 4)  # y.tensor_split((2, 4, 5), 4)  # torch 1.8.0
                    xy = xy * (2. * self.stride[i]) + (self.stride[i] * (self.grid[i] - 0.5))  # new xy
                    wh = wh ** 2 * (4 * self.anchor_grid[i].data)  # new wh
                    y = torch.cat((xy, wh, conf), 4)
                z.append(y.view(bs, -1, self.no))

        if self.training:
            out = x
        elif self.end2end:
            out = torch.cat(z, 1)
        elif self.include_nms:
            z = self.convert(z)
            out = (z,)
        elif self.concat:
            out = torch.cat(z, 1)
        else:
            out = (torch.cat(z, 1), x)

        return out

    def fuse(self):
        """Fold the implicit layers into the output convolutions, in place.

        ImplicitA: ``conv(x + a) = conv(x) + W @ a``, so ``W @ a`` joins the bias.
        ImplicitM: ``m * conv(x)`` scales both bias and weight per output channel.
        """
        with torch.no_grad():
            # fuse ImplicitA and Convolution
            for conv, ia in zip(self.m, self.ia):
                c1, c2, _, _ = conv.weight.shape
                c1_, c2_, _, _ = ia.implicit.shape
                conv.bias += torch.matmul(conv.weight.reshape(c1, c2), ia.implicit.reshape(c2_, c1_)).squeeze(1)

            # fuse ImplicitM and Convolution
            for conv, im in zip(self.m, self.im):
                c1, c2, _, _ = im.implicit.shape
                conv.bias *= im.implicit.reshape(c2)
                conv.weight *= im.implicit.transpose(0, 1)

    def convert(self, z):
        """Concatenate per-layer predictions and return ``(boxes, scores)``.

        Boxes are converted from ``(cx, cy, w, h)`` to ``(x1, y1, x2, y2)``;
        scores are the class outputs multiplied by the objectness column.
        """
        z = torch.cat(z, 1)
        box = z[:, :, :4]
        conf = z[:, :, 4:5]
        score = z[:, :, 5:]
        score *= conf
        convert_matrix = torch.tensor([[1, 0, 1, 0], [0, 1, 0, 1], [-0.5, 0, 0.5, 0], [0, -0.5, 0, 0.5]],
                                      dtype=torch.float32,
                                      device=z.device)
        box = box @ convert_matrix
        return (box, score)


class IAuxDetect(nn.Module):
    """Detection head with implicit layers plus auxiliary output convs.

    The first ``nl`` inputs are lead-head feature maps (ImplicitA -> conv ->
    ImplicitM); the following ``nl`` inputs go through plain auxiliary convs
    (``m2``). Only the lead outputs are decoded at inference.

    Args:
        nc: Number of classes.
        anchors: One flat ``[w0, h0, w1, h1, ...]`` sequence per detection layer.
        ch: Input channel counts; the first ``nl`` entries belong to the lead
            head and the remaining ones to the auxiliary head.
    """

    stride = None  # strides computed during build
    export = False  # onnx export
    end2end = False
    include_nms = False
    concat = False

    def __init__(self, nc=80, anchors=(), ch=()):  # detection layer
        super().__init__()
        self.nc = nc  # number of classes
        self.no = nc + 5  # number of outputs per anchor
        self.nl = len(anchors)  # number of detection layers
        self.na = len(anchors[0]) // 2  # number of anchors
        self.grid = [torch.zeros(1)] * self.nl  # init grid
        a = torch.tensor(anchors).float().view(self.nl, -1, 2)
        self.register_buffer('anchors', a)  # shape(nl,na,2)
        self.register_buffer('anchor_grid', a.clone().view(self.nl, 1, -1, 1, 1, 2))  # shape(nl,1,na,1,1,2)
        self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch[:self.nl])  # output conv
        self.m2 = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch[self.nl:])  # output conv
        self.ia = nn.ModuleList(ImplicitA(x) for x in ch[:self.nl])
        # mean=1. is passed explicitly so the multiplicative term starts near
        # identity instead of scaling the conv output to ~0.
        self.im = nn.ModuleList(ImplicitM(self.no * self.na, mean=1.) for _ in ch[:self.nl])

    def forward(self, x):
        """Run lead and auxiliary heads on a list of ``2 * nl`` feature maps.

        Returns:
            Training: the full list of raw outputs (lead then auxiliary).
            Inference: ``(pred, x[:nl])`` with ``pred`` of shape ``(bs, N, no)``.
        """
        z = []  # inference output
        self.training |= self.export
        for i in range(self.nl):
            x[i] = self.m[i](self.ia[i](x[i]))  # conv
            x[i] = self.im[i](x[i])
            bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            # The auxiliary map is reshaped with the lead map's ny/nx, so both
            # must share the same spatial size.
            x[i + self.nl] = self.m2[i](x[i + self.nl])
            x[i + self.nl] = x[i + self.nl].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            if not self.training:  # inference
                if self.grid[i].shape[2:4] != x[i].shape[2:4]:
                    self.grid[i] = self._make_grid(nx, ny).to(x[i].device)

                y = x[i].sigmoid()
                if not torch.onnx.is_in_onnx_export():
                    y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                    y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
                else:
                    xy, wh, conf = y.split((2, 2, self.nc + 1), 4)  # y.tensor_split((2, 4, 5), 4)  # torch 1.8.0
                    xy = xy * (2. * self.stride[i]) + (self.stride[i] * (self.grid[i] - 0.5))  # new xy
                    wh = wh ** 2 * (4 * self.anchor_grid[i].data)  # new wh
                    y = torch.cat((xy, wh, conf), 4)
                z.append(y.view(bs, -1, self.no))

        return x if self.training else (torch.cat(z, 1), x[:self.nl])

    def fuseforward(self, x):
        """Forward pass for a fused head: lead convs only, no implicit layers.

        The output format is selected by the ``end2end``, ``include_nms`` and
        ``concat`` flags; with none set it is ``(pred, x)``.
        """
        z = []  # inference output
        self.training |= self.export
        for i in range(self.nl):
            x[i] = self.m[i](x[i])  # conv
            bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            if not self.training:  # inference
                if self.grid[i].shape[2:4] != x[i].shape[2:4]:
                    self.grid[i] = self._make_grid(nx, ny).to(x[i].device)

                y = x[i].sigmoid()
                if not torch.onnx.is_in_onnx_export():
                    y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                    y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
                else:
                    xy = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                    wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i].data  # wh
                    y = torch.cat((xy, wh, y[..., 4:]), -1)
                z.append(y.view(bs, -1, self.no))

        if self.training:
            out = x
        elif self.end2end:
            out = torch.cat(z, 1)
        elif self.include_nms:
            z = self.convert(z)
            out = (z,)
        elif self.concat:
            out = torch.cat(z, 1)
        else:
            out = (torch.cat(z, 1), x)

        return out

    def fuse(self):
        """Fold the implicit layers into the lead output convolutions, in place.

        ImplicitA: ``conv(x + a) = conv(x) + W @ a``, so ``W @ a`` joins the bias.
        ImplicitM: ``m * conv(x)`` scales both bias and weight per output channel.
        """
        print("IAuxDetect.fuse")
        with torch.no_grad():
            # fuse ImplicitA and Convolution
            for conv, ia in zip(self.m, self.ia):
                c1, c2, _, _ = conv.weight.shape
                c1_, c2_, _, _ = ia.implicit.shape
                conv.bias += torch.matmul(conv.weight.reshape(c1, c2), ia.implicit.reshape(c2_, c1_)).squeeze(1)

            # fuse ImplicitM and Convolution
            for conv, im in zip(self.m, self.im):
                c1, c2, _, _ = im.implicit.shape
                conv.bias *= im.implicit.reshape(c2)
                conv.weight *= im.implicit.transpose(0, 1)

    @staticmethod
    def _make_grid(nx=20, ny=20):
        """Return a ``(1, 1, ny, nx, 2)`` float grid holding ``(x, y)`` cell indices."""
        yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
        return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()

    def convert(self, z):
        """Concatenate per-layer predictions and return ``(boxes, scores)``.

        Boxes are converted from ``(cx, cy, w, h)`` to ``(x1, y1, x2, y2)``;
        scores are the class outputs multiplied by the objectness column.
        """
        z = torch.cat(z, 1)
        box = z[:, :, :4]
        conf = z[:, :, 4:5]
        score = z[:, :, 5:]
        score *= conf
        convert_matrix = torch.tensor([[1, 0, 1, 0], [0, 1, 0, 1], [-0.5, 0, 0.5, 0], [0, -0.5, 0, 0.5]],
                                      dtype=torch.float32,
                                      device=z.device)
        box = box @ convert_matrix
        return (box, score)
class DetectionModel(BaseModel):
    # NOTE(review): excerpt only. The enclosing method and the definitions of
    # `m` and `ch` are not shown here; presumably this sits in `__init__` with
    # `m` being the model's final (detection) layer. Confirm against yolo.py.
        if isinstance(m, (Detect, Segment, ASFF_Detect, IDetect)):
            s = 256  # 2x min stride
            m.inplace = self.inplace
            forward = lambda x: self.forward(x)[0] if isinstance(m, Segment) else self.forward(x)
            # Derive each output's stride from one dummy s x s forward pass.
            m.stride = torch.tensor([s / x.shape[-2] for x in forward(torch.zeros(1, ch, s, s))])  # forward
            check_anchor_order(m)
            m.anchors /= m.stride.view(-1, 1, 1)  # rescale anchors by the per-layer stride
            self.stride = m.stride
            self._initialize_biases()  # only run once
        elif isinstance(m, IAuxDetect):
            s = 256  # 2x min stride
            # Only the first four outputs of the dummy pass are used for strides.
            m.stride = torch.tensor([s / x.shape[-2] for x in self.forward(torch.zeros(1, ch, s, s))[:4]])  # forward
            # print(m.stride)
            check_anchor_order(m)
            m.anchors /= m.stride.view(-1, 1, 1)  # rescale anchors by the per-layer stride
            self.stride = m.stride
            # NOTE(review): `_initialize_aux_biases` is not defined anywhere in
            # this excerpt; confirm the model class provides it.
            self._initialize_aux_biases()  # only run once
            # print('Strides: %s' % m.stride.tolist())
def parse_model(d, ch):
elif m in {Detect, Segment, ASFF_Detect, Decoupled_Detect, IDetect, IAuxDetect}: args.append([ch[x] for x in f]) if isinstance(args[1], int): # number of anchors args[1] = [list(range(args[1] * 2))] * len(f) if m is segment: args[3] = make_divisible(args[3] * gw, 8)
2.3 Modify yolov5s_IDetect.yaml
# YOLOv5 by Ultralytics, GPL-3.0 license

# Parameters
nc: 1  # number of classes
depth_multiple: 0.33  # model depth multiple
width_multiple: 0.50  # layer channel multiple
anchors:
  - [10,13, 16,30, 33,23]  # P3/8
  - [30,61, 62,45, 59,119]  # P4/16
  - [116,90, 156,198, 373,326]  # P5/32

# YOLOv5 v6.0 backbone
backbone:
  # [from, number, module, args]
  [[-1, 1, Conv, [64, 6, 2, 2]],  # 0-P1/2
   [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
   [-1, 3, C3, [128]],
   [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
   [-1, 6, C3, [256]],
   [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
   [-1, 9, C3, [512]],
   [-1, 1, Conv, [1024, 3, 2]],  # 7-P5/32
   [-1, 3, C3, [1024]],
   [-1, 1, SPPF, [1024, 5]],  # 9
  ]

# YOLOv5 v6.0 head
head:
  [[-1, 1, Conv, [512, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 6], 1, Concat, [1]],  # cat backbone P4
   [-1, 3, C3, [512, False]],  # 13

   [-1, 1, Conv, [256, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 4], 1, Concat, [1]],  # cat backbone P3
   [-1, 3, C3, [256, False]],  # 17 (P3/8-small)

   [-1, 1, Conv, [256, 3, 2]],
   [[-1, 14], 1, Concat, [1]],  # cat head P4
   [-1, 3, C3, [512, False]],  # 20 (P4/16-medium)

   [-1, 1, Conv, [512, 3, 2]],
   [[-1, 10], 1, Concat, [1]],  # cat head P5
   [-1, 3, C3, [1024, False]],  # 23 (P5/32-large)

   [[17, 20, 23], 1, IDetect, [nc, anchors]],  # IDetect(P3, P4, P5)
  ]