Score-boosting tricks, Detect series: adding the YOLOv7 detection heads IDetect and IAuxDetect to YOLOv5

1. Introduction to YOLOv7

Paper address: https://arxiv.org/abs/2207.02696

GitHub: https://github.com/WongKinYiu/yolov7 (implementation of the paper "YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors")

YOLOv7 is here! It surpasses YOLOv5, YOLOX, PP-YOLOE, YOLOR and other object detection networks. YOLOv7 outperforms all known object detectors in both speed and accuracy in the range of 5 FPS to 160 FPS, and achieves the highest accuracy (56.8% AP) among all known real-time object detectors running at 30 FPS or higher on a V100 GPU.

2. Adding the YOLOv7 detection heads IDetect and IAuxDetect to YOLOv5

2.1 Add the ImplicitA and ImplicitM modules (required by IDetect / IAuxDetect) to common.py:

####################### IDetect / IAuxDetect support ########## start ##########################
# ImplicitA / ImplicitM rely on the imports already at the top of common.py
# (import torch, import torch.nn as nn).

class ImplicitA(nn.Module):
    def __init__(self, channel, mean=0., std=.02):
        super(ImplicitA, self).__init__()
        self.channel = channel
        self.mean = mean
        self.std = std
        self.implicit = nn.Parameter(torch.zeros(1, channel, 1, 1))
        nn.init.normal_(self.implicit, mean=self.mean, std=self.std)

    def forward(self, x):
        return self.implicit + x


class ImplicitM(nn.Module):
    def __init__(self, channel, mean=0., std=.02):
        super(ImplicitM, self).__init__()
        self.channel = channel
        self.mean = mean
        self.std = std
        self.implicit = nn.Parameter(torch.ones(1, channel, 1, 1))
        nn.init.normal_(self.implicit, mean=self.mean, std=self.std)

    def forward(self, x):
        return self.implicit * x

####################### IDetect / IAuxDetect support ########### end ###########################
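
ImplicitA learns an additive per-channel offset (initialized near zero) applied before the 1x1 output convolution, while ImplicitM learns a multiplicative per-channel scale (initialized near one) applied after it; this is YOLOR-style "implicit knowledge". A minimal usage sketch (the feature shape and channel counts are illustrative assumptions, not values from the post):

import torch
import torch.nn as nn

feat = torch.randn(1, 256, 20, 20)   # one neck feature map (assumed shape)
ia = ImplicitA(256)                  # additive implicit knowledge on the input channels
conv = nn.Conv2d(256, 255, 1)        # 1x1 output conv: 3 anchors * (80 classes + 5)
im = ImplicitM(255)                  # multiplicative implicit knowledge on the output channels
out = im(conv(ia(feat)))             # (1, 255, 20, 20) -- the per-level pattern IDetect uses
print(out.shape)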

2.2 Add the IDetect and IAuxDetect heads to yolo.py:

class IDetect(nn.Module):
    # YOLOR Detect head for detection models
    stride = None # strides computed during build
    dynamic = False # force grid reconstruction
    export = False # export mode
    include_nms = False
    end2end = False
    concat = False

    def __init__(self, nc=80, anchors=(), ch=(), inplace=True): # detection layer
        super().__init__()
        self.nc = nc # number of classes
        self.no = nc + 5 # number of outputs per anchor
        self.nl = len(anchors) # number of detection layers
        self.na = len(anchors[0]) // 2 # number of anchors
        self.grid = [torch.zeros(1)] * self.nl # init grid
        a = torch.tensor(anchors).float().view(self.nl, -1, 2)
        self.register_buffer('anchors', a)  # shape(nl,na,2)
        self.register_buffer('anchor_grid', a.clone().view(self.nl, 1, -1, 1, 1, 2)) # shape(nl,1,na,1,1,2)

        self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch) # output conv

        self.ia = nn.ModuleList(ImplicitA(x) for x in ch)
        self.im = nn.ModuleList(ImplicitM(self.no * self.na) for _ in ch)

    def forward(self, x):
        z = []  # inference output
        for i in range(self.nl):
            x[i] = self.m[i](self.ia[i](x[i]))  # implicit add, then 1x1 conv
            x[i] = self.im[i](x[i])  # implicit multiply (needed so fuse()/fuseforward stay equivalent)
            bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            if not self.training:  # inference
                if self.dynamic or self.grid[i].shape[2:4] != x[i].shape[2:4]:
                    self.grid[i] = self._make_grid(nx, ny).to(x[i].device)

                y = x[i].sigmoid()
                if self.inplace:
                    y[..., 0:2] = (y[..., 0:2] * 2 - 0.5 + self.grid[i]) * self.stride[i]  # xy
                    y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
                else:  # for YOLOv5 on AWS Inferentia https://github.com/ultralytics/yolov5/pull/2953
                    xy, wh, conf = y.split((2, 2, self.no - 4), 4)  # tensor_split((2, 4, 5), 4) if torch 1.8.0
                    xy = (xy * 2 - 0.5 + self.grid[i]) * self.stride[i]  # xy (same offset as the inplace branch; _make_grid here has no built-in -0.5)
                    wh = (wh * 2) ** 2 * self.anchor_grid[i]  # wh
                    y = torch.cat((xy, wh, conf), 4)
                z.append(y.view(bs, -1, self.no))

        return x if self.training else (torch.cat(z, 1),) if self.export else (torch.cat(z, 1), x)

    @staticmethod
    def _make_grid(nx=20, ny=20):
        yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
        return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()

    def fuseforward(self, x):
        # x = x. copy() # for profiling
        z = [] # inference output
        self.training |= self.export
        for i in range(self.nl):
            x[i] = self.m[i](x[i]) # conv
            bs, _, ny, nx = x[i].shape # x(bs,255,20,20) to x(bs,3,20,20,85)
            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            if not self.training:  # inference
                if self.grid[i].shape[2:4] != x[i].shape[2:4]:
                    self.grid[i] = self._make_grid(nx, ny).to(x[i].device)

                y = x[i].sigmoid()
                if not torch.onnx.is_in_onnx_export():
                    y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                    y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
                else:
                    xy, wh, conf = y.split((2, 2, self.nc + 1), 4)  # y.tensor_split((2, 4, 5), 4) # torch 1.8.0
                    xy = xy * (2. * self.stride[i]) + (self.stride[i] * (self.grid[i] - 0.5))  # new xy
                    wh = wh ** 2 * (4 * self.anchor_grid[i].data)  # new wh
                    y = torch.cat((xy, wh, conf), 4)
                z.append(y.view(bs, -1, self.no))

        if self.training:
            out = x
        elif self.end2end:
            out = torch.cat(z, 1)
        elif self.include_nms:
            z = self.convert(z)
            out = (z,)
        elif self.concat:
            out = torch.cat(z, 1)
        else:
            out = (torch.cat(z, 1), x)

        return out

    def fuse(self):
        # print("IDetect.fuse")
        # fuse ImplicitA and Convolution
        for i in range(len(self.m)):
            with torch.no_grad():
                c1, c2, _, _ = self.m[i].weight.shape
                c1_, c2_, _, _ = self.ia[i].implicit.shape
                self.m[i].bias += torch.matmul(self.m[i].weight.reshape(c1, c2),
                                               self.ia[i].implicit.reshape(c2_, c1_)).squeeze(1)

        # fuse ImplicitM and Convolution
        for i in range(len(self.m)):
            with torch.no_grad():
                c1, c2, _, _ = self.im[i].implicit.shape
                self.m[i].bias *= self.im[i].implicit.reshape(c2)
                self.m[i].weight *= self.im[i].implicit.transpose(0, 1)

    def convert(self, z):
        z = torch.cat(z, 1)
        box = z[:, :, :4]
        conf = z[:, :, 4:5]
        score = z[:, :, 5:]
        score *= conf
        convert_matrix = torch.tensor([[1, 0, 1, 0], [0, 1, 0, 1], [-0.5, 0, 0.5, 0], [0, -0.5, 0, 0.5]],
                                      dtype=torch.float32,
                                      device=z.device)
        box @= convert_matrix
        return (box, score)
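
fuse() folds the ImplicitA offset into the 1x1 conv bias (bias += W·a) and the ImplicitM scale into the conv weight and bias, so after fusing, fuseforward() only needs the plain convolution. A small sanity check of that equivalence (channel sizes, strides and anchors below are assumptions for illustration):

import torch

det = IDetect(nc=80, anchors=([10, 13, 16, 30, 33, 23],) * 3, ch=(128, 256, 512)).eval()
det.stride = torch.tensor([8., 16., 32.])  # normally set by DetectionModel during build
feats = [torch.randn(1, c, s, s) for c, s in zip((128, 256, 512), (80, 40, 20))]

ref = det([f.clone() for f in feats])[0]       # un-fused path: ia -> conv -> im
det.fuse()                                     # fold ia/im into the conv weights and bias
out = det.fuseforward([f.clone() for f in feats])[0]
print(torch.allclose(ref, out, atol=1e-4))     # expected: True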


class IAuxDetect(nn.Module):
    stride = None # strides computed during build
    export = False # onnx export
    end2end = False
    include_nms = False
    concat = False

    def __init__(self, nc=80, anchors=(), ch=()): # detection layer
        super(IAuxDetect, self).__init__()
        self.nc = nc # number of classes
        self.no = nc + 5 # number of outputs per anchor
        self.nl = len(anchors) # number of detection layers
        self.na = len(anchors[0]) // 2 # number of anchors
        self.grid = [torch.zeros(1)] * self.nl # init grid
        a = torch.tensor(anchors).float().view(self.nl, -1, 2)
        self.register_buffer('anchors', a)  # shape(nl,na,2)
        self.register_buffer('anchor_grid', a.clone().view(self.nl, 1, -1, 1, 1, 2)) # shape(nl,1,na,1,1,2)
        self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch[:self.nl]) # output conv
        self.m2 = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch[self.nl:]) # output conv

        self.ia = nn.ModuleList(ImplicitA(x) for x in ch[:self.nl])
        self.im = nn.ModuleList(ImplicitM(self.no * self.na) for _ in ch[:self.nl])

    def forward(self, x):
        # x = x. copy() # for profiling
        z = [] # inference output
        self.training |= self.export
        for i in range(self.nl):
            x[i] = self.m[i](self.ia[i](x[i])) # conv
            x[i] = self.im[i](x[i])
            bs, _, ny, nx = x[i].shape # x(bs,255,20,20) to x(bs,3,20,20,85)
            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            x[i + self.nl] = self.m2[i](x[i + self.nl])
            x[i + self.nl] = x[i + self.nl].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            if not self.training:  # inference
                if self.grid[i].shape[2:4] != x[i].shape[2:4]:
                    self.grid[i] = self._make_grid(nx, ny).to(x[i].device)

                y = x[i].sigmoid()
                if not torch.onnx.is_in_onnx_export():
                    y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                    y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
                else:
                    xy, wh, conf = y.split((2, 2, self.nc + 1), 4)  # y.tensor_split((2, 4, 5), 4) # torch 1.8.0
                    xy = xy * (2. * self.stride[i]) + (self.stride[i] * (self.grid[i] - 0.5))  # new xy
                    wh = wh ** 2 * (4 * self.anchor_grid[i].data)  # new wh
                    y = torch.cat((xy, wh, conf), 4)
                z.append(y.view(bs, -1, self.no))

        return x if self.training else (torch.cat(z, 1), x[:self.nl])

    def fuseforward(self, x):
        # x = x. copy() # for profiling
        z = [] # inference output
        self.training |= self.export
        for i in range(self.nl):
            x[i] = self.m[i](x[i]) # conv
            bs, _, ny, nx = x[i].shape # x(bs,255,20,20) to x(bs,3,20,20,85)
            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            if not self.training:  # inference
                if self.grid[i].shape[2:4] != x[i].shape[2:4]:
                    self.grid[i] = self._make_grid(nx, ny).to(x[i].device)

                y = x[i].sigmoid()
                if not torch.onnx.is_in_onnx_export():
                    y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                    y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
                else:
                    xy = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                    wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i].data  # wh
                    y = torch.cat((xy, wh, y[..., 4:]), -1)
                z.append(y.view(bs, -1, self.no))

        if self.training:
            out = x
        elif self.end2end:
            out = torch.cat(z, 1)
        elif self.include_nms:
            z = self.convert(z)
            out = (z,)
        elif self.concat:
            out = torch.cat(z, 1)
        else:
            out = (torch.cat(z, 1), x)

        return out

    def fuse(self):
        print("IAuxDetect.fuse")
        # fuse ImplicitA and Convolution
        for i in range(len(self.m)):
            with torch.no_grad():
                c1, c2, _, _ = self.m[i].weight.shape
                c1_, c2_, _, _ = self.ia[i].implicit.shape
                self.m[i].bias += torch.matmul(self.m[i].weight.reshape(c1, c2),
                                               self.ia[i].implicit.reshape(c2_, c1_)).squeeze(1)

        # fuse ImplicitM and Convolution
        for i in range(len(self.m)):
            with torch.no_grad():
                c1, c2, _, _ = self.im[i].implicit.shape
                self.m[i].bias *= self.im[i].implicit.reshape(c2)
                self.m[i].weight *= self.im[i].implicit.transpose(0, 1)

    @staticmethod
    def _make_grid(nx=20, ny=20):
        yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
        return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()

    def convert(self, z):
        z = torch.cat(z, 1)
        box = z[:, :, :4]
        conf = z[:, :, 4:5]
        score = z[:, :, 5:]
        score *= conf
        convert_matrix = torch.tensor([[1, 0, 1, 0], [0, 1, 0, 1], [-0.5, 0, 0.5, 0], [0, -0.5, 0, 0.5]],
                                      dtype=torch.float32,
                                      device=z.device)
        box @= convert_matrix
        return (box, score)
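
IAuxDetect takes twice as many feature maps as there are detection layers: the first nl come from the lead branches and the remaining nl from the auxiliary branches, which only add extra supervision during training. A quick shape check under assumed channel sizes:

import torch

chs = (128, 256, 512, 128, 256, 512)  # 3 lead + 3 aux branches (assumed channels)
aux = IAuxDetect(nc=80, anchors=([10, 13, 16, 30, 33, 23],) * 3, ch=chs)  # training mode by default
feats = [torch.randn(1, c, s, s) for c, s in zip(chs, (80, 40, 20, 80, 40, 20))]
outs = aux(feats)
print(len(outs), outs[0].shape)  # 6 maps of (1, 3, H, W, 85); aux maps follow the lead maps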

class DetectionModel(BaseModel):
    def __init__(self, cfg='yolov5s.yaml', ch=3, nc=None, anchors=None):
        ...
        # extend the stride / bias initialization block for the last (detection) layer m:
        if isinstance(m, (Detect, Segment, ASFF_Detect, IDetect)):
            s = 256  # 2x min stride
            m.inplace = self.inplace
            forward = lambda x: self.forward(x)[0] if isinstance(m, Segment) else self.forward(x)
            m.stride = torch.tensor([s / x.shape[-2] for x in forward(torch.zeros(1, ch, s, s))])  # forward
            check_anchor_order(m)
            m.anchors /= m.stride.view(-1, 1, 1)
            self.stride = m.stride
            self._initialize_biases()  # only run once
        elif isinstance(m, IAuxDetect):
            s = 256  # 2x min stride
            m.stride = torch.tensor([s / x.shape[-2] for x in self.forward(torch.zeros(1, ch, s, s))[:m.nl]])  # lead heads only (YOLOv7 hard-codes [:4] for its 4-level aux models)
            # print(m.stride)
            check_anchor_order(m)
            m.anchors /= m.stride.view(-1, 1, 1)
            self.stride = m.stride
            self._initialize_aux_biases()  # only run once
            # print('Strides: %s' % m.stride.tolist())
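
_initialize_aux_biases() is called above but does not exist in stock YOLOv5, so it must also be added to DetectionModel. A sketch that follows the pattern of YOLOv5's _initialize_biases() and additionally initializes the auxiliary convs in m.m2 (treat it as a reference implementation, not verbatim source; it uses the math module that yolo.py already imports):

    def _initialize_aux_biases(self, cf=None):  # initialize IAuxDetect biases, cf is class frequency
        m = self.model[-1]  # IAuxDetect() module
        for mi, mi2, s in zip(m.m, m.m2, m.stride):  # lead conv, aux conv, stride per level
            b = mi.bias.view(m.na, -1)  # conv.bias(255) to (3,85)
            b.data[:, 4] += math.log(8 / (640 / s) ** 2)  # obj (8 objects per 640 image)
            b.data[:, 5:] += math.log(0.6 / (m.nc - 0.99)) if cf is None else torch.log(cf / cf.sum())  # cls
            mi.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
            b2 = mi2.bias.view(m.na, -1)
            b2.data[:, 4] += math.log(8 / (640 / s) ** 2)  # obj
            b2.data[:, 5:] += math.log(0.6 / (m.nc - 0.99)) if cf is None else torch.log(cf / cf.sum())  # cls
            mi2.bias = torch.nn.Parameter(b2.view(-1), requires_grad=True)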

def parse_model(d, ch):
    ...
        # extend the Detect-family branch inside the layer loop:
        elif m in {Detect, Segment, ASFF_Detect, Decoupled_Detect, IDetect, IAuxDetect}:
            args.append([ch[x] for x in f])
            if isinstance(args[1], int):  # number of anchors
                args[1] = [list(range(args[1] * 2))] * len(f)
            if m is Segment:
                args[3] = make_divisible(args[3] * gw, 8)
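
For reference, with the yaml below (nc=1, width_multiple=0.50) this branch ends up calling IDetect roughly as follows; the channel numbers are what layers 17, 20 and 23 produce in yolov5s and are listed here purely for illustration:

m_ = IDetect(nc=1,
             anchors=[[10, 13, 16, 30, 33, 23],
                      [30, 61, 62, 45, 59, 119],
                      [116, 90, 156, 198, 373, 326]],
             ch=[128, 256, 512])  # output channels of layers 17, 20, 23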

2.3 Modify yolov5s_IDetect.yaml

# YOLOv5  by Ultralytics, GPL-3.0 license

# Parameters
nc: 1 # number of classes
depth_multiple: 0.33 # model depth multiple
width_multiple: 0.50 # layer channel multiple
anchors:
  - [10,13, 16,30, 33,23] # P3/8
  - [30,61, 62,45, 59,119] # P4/16
  - [116,90, 156,198, 373,326] # P5/32

# YOLOv5 v6.0 backbone
backbone:
  # [from, number, module, args]
  [[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2
   [-1, 1, Conv, [128, 3, 2]], # 1-P2/4
   [-1, 3, C3, [128]],
   [-1, 1, Conv, [256, 3, 2]], # 3-P3/8
   [-1, 6, C3, [256]],
   [-1, 1, Conv, [512, 3, 2]], # 5-P4/16
   [-1, 9, C3, [512]],
   [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32
   [-1, 3, C3, [1024]],
   [-1, 1, SPPF, [1024, 5]], # 9
  ]

# YOLOv5 v6.0 head
head:
  [[-1, 1, Conv, [512, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 6], 1, Concat, [1]], # cat backbone P4
   [-1, 3, C3, [512, False]], # 13

   [-1, 1, Conv, [256, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 4], 1, Concat, [1]], # cat backbone P3
   [-1, 3, C3, [256, False]], # 17 (P3/8-small)

   [-1, 1, Conv, [256, 3, 2]],
   [[-1, 14], 1, Concat, [1]], # cat head P4
   [-1, 3, C3, [512, False]], # 20 (P4/16-medium)

   [-1, 1, Conv, [512, 3, 2]],
   [[-1, 10], 1, Concat, [1]], # cat head P5
   [-1, 3, C3, [1024, False]], # 23 (P5/32-large)

   [[17, 20, 23], 1, IDetect, [nc, anchors]], # Detect(P3, P4, P5)
  ]
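
Once the config is saved (e.g. as models/yolov5s_IDetect.yaml; the path is an assumption), a quick build-and-forward check confirms that the strides are computed and the IDetect head produces the expected number of predictions:

import torch
from models.yolo import DetectionModel

model = DetectionModel(cfg='models/yolov5s_IDetect.yaml', ch=3, nc=1).eval()
pred = model(torch.zeros(1, 3, 640, 640))[0]
print(model.stride, pred.shape)  # expected strides [8., 16., 32.] and pred of shape (1, 25200, 6) for nc=1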