Please help me figure out what the problem is; it has been a real headache. I am training on the Pascal VOC dataset (from a Pascal VOC dataset mirror). The source code is as follows:
import os
import sys
import xml.etree.ElementTree as ET
import cv2

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Do not display prompt information below level 2

import tensorflow as tf
from tensorflow.python.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Reshape, Concatenate, \
    concatenate, ZeroPadding2D, Convolution2D, BatchNormalization, Activation, AveragePooling2D, Add
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.losses import categorical_crossentropy, binary_crossentropy
import numpy as np
from tensorflow.python.keras.saving.save import load_model
from tensorflow.python.ops.init_ops_v2 import glorot_uniform
from tensorflow.python.ops.losses.losses_impl import mean_squared_error

batch_size = 32
input_size = 224

# https://juejin.cn/post/6844903908570054670
xmls_path = 'E:\VOCdevkit\VOC2007\Annotations'
imgs_path = 'E:\VOCdevkit\VOC2007\JPEGImages'
catalogs = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat',
            'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant',
            'sheep', 'sofa', 'train', 'tvmonitor']


# region Preprocess images and labels, and use generators to feed the whole pipeline
def generator_data():
    global batch_size
    annotations = os.listdir(xmls_path)
    # Randomly shuffle
    np.random.shuffle(annotations)
    images = []
    classes = []
    labels = []
    while True:
        for anno in annotations:
            anno_path = os.path.join(xmls_path, anno)
            tree = ET.parse(anno_path)
            root = tree.getroot()
            # Image name
            img_name = root.find('filename').text
            width = int(root.find('size/width').text)
            height = int(root.find('size/height').text)
            obj_name = root.find('object/name').text
            xmin = int(root.find('object/bndbox/xmin').text)
            ymin = int(root.find('object/bndbox/ymin').text)
            xmax = int(root.find('object/bndbox/xmax').text)
            ymax = int(root.find('object/bndbox/ymax').text)
            label = [xmin, ymin, xmax, ymax]
            # size = [width, height]
            # x1, y1, x2, y2 = label
            # if y1 >= y2:
            #     print(anno_path, label)
            #     break
            img_path = os.path.join(imgs_path, img_name)
            if os.path.exists(img_path):
                image = cv2.imread(img_path)
                image, label = image_plus(image, label)
                # Resize the image to input_size
                image = cv2.resize(image, (input_size, input_size))
                label = fix_label_scale(label, [height, width])
                label = convert_to_mse(label)
                obj_catalog = np.zeros(dtype=float, shape=len(catalogs))
                obj_catalog_idx = catalogs.index(obj_name)
                obj_catalog[obj_catalog_idx] = 1
                classes.append(obj_catalog)
                images.append(image)
                labels.append(label)
                if len(images) >= batch_size:
                    yield (np.array(images), {'class_head': np.array(classes),
                                              'reg_head': np.array(labels)})
                    images = []
                    labels = []
                    classes = []


def generator_vaild_data():
    global batch_size
    annotations = os.listdir(xmls_path)
    # Randomly shuffle
    np.random.shuffle(annotations)
    images = []
    classes = []
    labels = []
    while True:
        for anno in annotations:
            anno_path = os.path.join(xmls_path, anno)
            tree = ET.parse(anno_path)
            root = tree.getroot()
            # Image name
            img_name = root.find('filename').text
            width = int(root.find('size/width').text)
            height = int(root.find('size/height').text)
            obj_name = root.find('object/name').text
            xmin = int(root.find('object/bndbox/xmin').text)
            ymin = int(root.find('object/bndbox/ymin').text)
            xmax = int(root.find('object/bndbox/xmax').text)
            ymax = int(root.find('object/bndbox/ymax').text)
            label = [xmin, ymin, xmax, ymax]
            # size = [width, height]
            img_path = os.path.join(imgs_path, img_name)
            if os.path.exists(img_path):
                image = cv2.imread(img_path)
                image, label = image_plus(image, label)
                # Resize the image to input_size
                image = cv2.resize(image, (input_size, input_size))
                label = fix_label_scale(label, [height, width])
                # if label[0] >= label[2]:
                #     print('error:', label)
                #     break
                label = convert_to_mse(label)
                obj_catalog = np.zeros(dtype=float, shape=len(catalogs))
                obj_catalog_idx = catalogs.index(obj_name)
                obj_catalog[obj_catalog_idx] = 1
                classes.append(obj_catalog)
                images.append(image)
                labels.append(label)
                if len(images) >= batch_size * 10:
                    return (np.array(images), {'class_head': np.array(classes),
                                               'reg_head': np.array(labels)})


# Convert x,y,w,h to x1,y1,x2,y2
# x,y is the center point
def convert_to_point(bbox):
    x, y, w, h = bbox
    return [
        x - (w / 2),
        y - (h / 2),
        x + (w / 2),
        y + (h / 2),
    ]


# Convert x1,y1,x2,y2 to x,y,w,h
def convert_to_mse(bbox):
    x1, y1, x2, y2 = bbox
    return [(x2 + x1) / 2,
            (y2 + y1) / 2,
            x2 - x1,
            y2 - y1]


# Restore a box from the network input size back to the original image size
def ref_label_scale(label, scale):
    x1, y1, x2, y2 = label
    w, h = input_size / scale[1], input_size / scale[0]
    label = [int(round(x1 / w)),
             int(round(y1 / h)),
             int(round(x2 / w)),
             int(round(y2 / h))]
    return label


# Scale a box to the network input size
def label_scale(label, scale):
    x1, y1, x2, y2 = label
    scale = input_size / scale
    label = [int(round(x1 * scale)),
             int(round(y1 * scale)),
             int(round(x2 * scale)),
             int(round(y2 * scale))]
    return label


def fix_label_scale(label, scale):
    x1, y1, x2, y2 = label
    w, h = input_size / scale[1], input_size / scale[0]
    label = [int(round(x1 * w)),
             int(round(y1 * h)),
             int(round(x2 * w)),
             int(round(y2 * h))]
    return label


# Data augmentation part
# Random scaling. The label must change along with the image.
def random_scale(img, min_size, label):
    h, w, _ = img.shape
    scale = min_size / min(h, w)  # Calculate the minimum scaling ratio
    new_w = int(round(scale * w))
    new_h = int(round(scale * h))
    x1, y1, x2, y2 = label
    label = [int(round(x1 * scale)),
             int(round(y1 * scale)),
             int(round(x2 * scale)),
             int(round(y2 * scale))]
    img = cv2.resize(img, (new_w, new_h))  # Scale the image
    return img, label


import random


# Random flip
def random_flip(img, flip_ratio, label):
    h, w, _ = img.shape
    x1, y1, x2, y2 = label
    if random.random() < flip_ratio:  # Flip the image with random probability
        img = cv2.flip(img, 1)  # Flip horizontally
        label = [w - x2, y1, w - x1, y2]
    else:
        img = cv2.flip(img, 0)  # Flip vertically
        label = [x1, h - y2, x2, h - y1]
    return img, label


import math


# Random rotation
def random_rotate(img, angle_range, label):
    h, w, _ = img.shape
    angle = np.random.uniform(-angle_range, angle_range)  # Generate a random rotation angle
    mat = cv2.getRotationMatrix2D((w / 2, h / 2), angle, 1.0)  # Rotation matrix
    img = cv2.warpAffine(img, mat, (w, h), flags=cv2.INTER_LINEAR, borderValue=(0, 0, 0))  # Affine transformation
    x1, y1, x2, y2 = label
    # Box center point position
    x, y = (x2 - x1) / 2, (y2 - y1) / 2
    # Rotation center point
    cx = w / 2
    cy = h / 2
    # Convert the label point to the coordinate system with the rotation center as the origin
    x -= cx
    y -= cy
    # x1 -= cx
    # y1 -= cy
    # x2 -= cx
    # y2 -= cy
    angle = angle * np.pi / 180.0  # Angle to radian
    x_new = x * math.cos(angle) + y * math.sin(angle)
    y_new = -x * math.sin(angle) + y * math.cos(angle)
    # x1_new = x1 * math.cos(angle) + y1 * math.sin(angle)
    # y1_new = -x1 * math.sin(angle) + y1 * math.cos(angle)
    # x2_new = x2 * math.cos(angle) + y2 * math.sin(angle)
    # y2_new = -x2 * math.sin(angle) + y2 * math.cos(angle)
    # Convert back to coordinates in the original coordinate system
    x_new += cx
    y_new += cy
    # x1_new += cx
    # y1_new += cy
    # x2_new += cx
    # y2_new += cy
    # Clip the rotated coordinates to avoid crossing the boundary
    x_new = max(min(x_new, w), 0)
    y_new = max(min(y_new, h), 0)
    # x1_new = max(min(x1_new, w), 0)
    # y1_new = max(min(y1_new, h), 0)
    # x2_new = max(min(x2_new, w), 0)
    # y2_new = max(min(y2_new, h), 0)
    x1_new = x_new - (x2 - x1) / 2
    y1_new = y_new - (y2 - y1) / 2
    x2_new = x_new + (x2 - x1) / 2
    y2_new = y_new + (y2 - y1) / 2
    # x1_new = (x1_new if x1_new > 0 else 0)
    # y1_new = (y1_new if y1_new > 0 else 0)
    #
    # x2_new = (x2_new if x2_new > 0 and x2_new <= w else w)
    # y2_new = (y2_new if y2_new > 0 and y2_new <= h else h)
    if y1_new >= y2_new:
        print('random_rotate error', [x1_new, y1_new, x2_new, y2_new])
    return img, [x1_new, y1_new, x2_new, y2_new]


# Image augmentation
def image_plus(img, label):
    return img, label
    # idx = np.random.randint(low=1, high=3, dtype=np.uint8)
    # if idx == 1:
    #     # Random scaling
    #     return random_scale(img, input_size + 10, label)
    # if idx == 2:
    #     # Random flip
    #     return random_flip(img, 0.5, label)
    # if idx == 3:
    #     angle = np.random.randint(10, 360)
    #     return random_rotate(img, angle, label)


# Draw text
def put_on_text(img, text, pos):
    font = cv2.FONT_HERSHEY_SIMPLEX
    fontScale = 1
    color = (255, 0, 0)
    thickness = 2
    img = cv2.putText(img, text, pos, font, fontScale, color, thickness)
    return img
# endregion


# region Construct a ResNet50 network for extracting image features
# The code here also belongs to ResNet, but it is rather redundant.
# Identity block
def identity_block(X, f, filters):
    """
    Implements an identity block in ResNet.

    Arguments:
    X -- input tensor of shape (m, n_H_prev, n_W_prev, n_C_prev)
    f -- integer specifying the kernel size of the middle convolutional layer of the main path
    filters -- list of integers defining the number of filters in each convolutional layer of the main path

    Returns:
    X -- output of the identity block, a tensor of shape (n_H, n_W, n_C)
    """
    # Get filters
    F1, F2, F3 = filters

    # Save the input to create the shortcut
    X_shortcut = X

    # First part of the main path
    X = Convolution2D(filters=F1, kernel_size=(1, 1), padding='valid', kernel_initializer=glorot_uniform(seed=0))(X)
    X = BatchNormalization(axis=3)(X)
    X = Activation('relu')(X)

    # Second part of the main path
    X = Convolution2D(filters=F2, kernel_size=(f, f), padding='same', kernel_initializer=glorot_uniform(seed=0))(X)
    X = BatchNormalization(axis=3)(X)
    X = Activation('relu')(X)

    # Third part of the main path
    X = Convolution2D(filters=F3, kernel_size=(1, 1), padding='valid', kernel_initializer=glorot_uniform(seed=0))(X)
    X = BatchNormalization(axis=3)(X)
    X = Activation('relu')(X)

    # Add the shortcut and pass the result through ReLU
    X = Add()([X, X_shortcut])
    X = Activation('relu')(X)
    return X


# Bottleneck block
def convolutional_block(X, f, filters, s=2):
    # Get filters
    F1, F2, F3 = filters

    # Save the input to create the shortcut
    X_shortcut = X

    # First part of the main path
    X = Convolution2D(filters=F1, kernel_size=(1, 1), strides=(s, s), padding='valid', kernel_initializer=glorot_uniform(seed=0))(X)
    X = BatchNormalization(axis=3)(X)
    X = Activation('relu')(X)

    # Second part of the main path
    X = Convolution2D(filters=F2, kernel_size=(f, f), strides=(1, 1), padding='same', kernel_initializer=glorot_uniform(seed=0))(X)
    X = BatchNormalization(axis=3)(X)
    X = Activation('relu')(X)

    # Third part of the main path
    X = Convolution2D(filters=F3, kernel_size=(1, 1), strides=(1, 1), padding='same', kernel_initializer=glorot_uniform(seed=0))(X)
    X = BatchNormalization(axis=3)(X)
    X = Activation('relu')(X)

    # Shortcut path
    X_shortcut = Convolution2D(filters=F3, kernel_size=(1, 1), strides=(s, s), padding='valid', kernel_initializer=glorot_uniform(seed=0))(X_shortcut)
    X_shortcut = BatchNormalization(axis=3)(X_shortcut)

    # Add the shortcut and pass the result through ReLU
    X = Add()([X, X_shortcut])
    X = Activation('relu')(X)
    return X


# Create model
def o_check_model(num_classes=1):
    num_anchors = 1
    # Define the input tensor; the shape is the same as the input image
    X_input = Input(shape=(input_size, input_size, 3), name='input_1')
    X = ZeroPadding2D((3, 3))(X_input)

    # stage 1
    X = Convolution2D(filters=64, kernel_size=(7, 7), strides=(2, 2), kernel_initializer=glorot_uniform(seed=0))(X)
    X = BatchNormalization(axis=3)(X)
    X = Activation('relu')(X)
    X = MaxPooling2D((3, 3), strides=(2, 2))(X)

    # stage 2
    X = convolutional_block(X, f=3, filters=[64, 64, 256], s=1)
    X = identity_block(X, f=3, filters=[64, 64, 256])
    X = identity_block(X, f=3, filters=[64, 64, 256])

    # stage 3
    X = convolutional_block(X, f=3, filters=[128, 128, 512], s=2)
    X = identity_block(X, f=3, filters=[128, 128, 512])
    X = identity_block(X, f=3, filters=[128, 128, 512])
    X = identity_block(X, f=3, filters=[128, 128, 512])

    # stage 4
    X = convolutional_block(X, f=3, filters=[256, 256, 1024], s=2)
    X = identity_block(X, f=3, filters=[256, 256, 1024])
    X = identity_block(X, f=3, filters=[256, 256, 1024])
    X = identity_block(X, f=3, filters=[256, 256, 1024])
    X = identity_block(X, f=3, filters=[256, 256, 1024])
    X = identity_block(X, f=3, filters=[256, 256, 1024])

    # stage 5
    X = convolutional_block(X, f=4, filters=[512, 512, 2048], s=2)
    X = identity_block(X, f=3, filters=[512, 512, 2048])
    X = identity_block(X, f=3, filters=[512, 512, 2048])

    resNet = X

    X = AveragePooling2D((2, 2))(X)
    X = Flatten()(X)
    class_head = Dense(num_classes, activation='softmax', name='class_head')(X)

    x = Conv2D(64, (3, 3), activation='relu', padding='same', name='f_f1')(resNet)
    x = BatchNormalization(axis=3)(x)
    x = Conv2D(64, (3, 3), activation='relu', padding='same', name='f_f2')(x)
    x = BatchNormalization(axis=3)(x)
    x = Conv2D(64, (3, 3), activation='relu', padding='same', name='f_f3')(x)
    x = BatchNormalization(axis=3)(x)
    x = Flatten()(x)
    # Regression head
    reg_head = Dense(num_anchors * 4, activation='linear', name='reg_head')(x)

    # Build the complete model
    model = Model(inputs=X_input, outputs={'class_head': class_head, 'reg_head': reg_head})

    # Compile the model
    model.compile(optimizer='adam',
                  loss={
                      'class_head': 'categorical_crossentropy',
                      'reg_head': 'mean_squared_error'
                  },
                  loss_weights={
                      'class_head': 1.0,
                      'reg_head': 1.0
                  },
                  metrics={
                      'class_head': 'accuracy',
                      'reg_head': 'mae'
                  })
    # model.compile(optimizer='adam',
    #               loss={
    #                   'class_head': 'categorical_crossentropy',
    #                   'reg_head': 'mse'
    #               },
    #               loss_weights={
    #                   'class_head': 1.0,
    #                   'reg_head': 1.0
    #               },
    #               metrics={
    #                   'class_head': 'accuracy',
    #                   'reg_head': 'mae'
    #               })
    # model.compile(optimizer='adam',
    #               loss=total_loss,
    #               metrics={
    #                   'class_head': 'accuracy',
    #                   'reg_head': 'mae'
    #               })
    # model.compile(optimizer='adam',
    #               loss='mean_squared_error',
    #               metrics='mae')
    # model.compile(optimizer='adam',
    #               loss=total_loss_2,
    #               metrics={
    #                   'class_head': 'accuracy',
    #                   'reg_head': 'mae'
    #               })
    # model.compile(optimizer='adam',
    #               loss=[categorical_cross_entropy_loss, smooth_l1_loss],
    #               loss_weights=[1.0, 1.0],
    #               metrics={
    #                   'class_head': 'accuracy',
    #                   'reg_head': 'mae'
    #               })
    model.summary()
    return model


# Basic loss function
def base_loss(y_true, y_pred):
    return categorical_crossentropy(y_true, y_pred)


def iou_loss(y_true, y_pred):
    print('1', y_true.shape, y_pred.shape)
    # Calculate intersection
    intersection = tf.reduce_sum(y_true * y_pred, axis=[1, 2, 3])
    print('2')
    # Calculate union
    union = tf.reduce_sum(y_true + y_pred, axis=[1, 2, 3]) - intersection
    # Calculate IoU
    iou = (intersection + 1e-7) / (union + 1e-7)
    # Convert IoU to an IoU loss
    iou_loss = 1 - iou
    print('iou', iou_loss)
    # Return the loss
    return iou_loss


# Overall loss function
def total_loss(y_true, y_pred):
    print('y_true', y_true[0].shape, 'y_pred', y_pred.shape)
    # Classification loss
    class_true = y_true[0]
    class_pred = y_pred[0]
    class_loss = base_loss(class_true, class_pred)
    # Regression loss
    reg_true = y_true[1]
    reg_pred = y_pred[1]
    reg_loss = mean_squared_error(reg_true, reg_pred)  # iou_loss(reg_true, reg_pred)
    # Combined loss
    total_loss = class_loss + reg_loss
    # Set the type to floating point. This may need to be adjusted later.
    # total_loss = tf.cast(total_loss, dtype=tf.int32)
    return total_loss


def smooth_l1_loss(y_true, y_pred):
    diff = tf.abs(tf.cast(y_true[1], dtype=tf.float32) - y_pred[1])
    less_than_one = tf.cast(tf.less(diff, 1.0), tf.float32)
    loss = (less_than_one * 0.5 * diff ** 2) + (1.0 - less_than_one) * (diff - 0.5)
    return tf.reduce_mean(loss)


# Classification loss function: cross-entropy loss
def categorical_cross_entropy_loss(y_true, y_pred):
    return categorical_crossentropy(y_true[0], y_pred[0])


def l2_loss(y_true, y_pred):
    return tf.reduce_sum(tf.square(tf.cast(y_true, dtype=tf.float32) - y_pred), axis=-1)


def total_loss_1(y_true, y_pred):
    # Classification loss
    class_true = y_true[0]
    class_pred = y_pred[0]
    class_loss = base_loss(class_true, class_pred)
    # Regression loss
    reg_true = y_true[1]
    reg_pred = y_pred[1]
    reg_loss = l2_loss(reg_true, reg_pred)  # mean_squared_error(reg_true, reg_pred)
    # Combined loss
    total_loss = class_loss + reg_loss
    return total_loss


def total_loss_2(y_true, y_pred):
    # Classification loss
    class_true = y_true[0]
    class_pred = y_pred[0]
    class_loss = base_loss(class_true, class_pred)
    l1 = smooth_l1_loss(y_true, y_pred)
    return class_loss + l1


def custom_loss(y_true, y_pred):
    # Classification loss
    class_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_true[..., :20], logits=y_pred[..., :20]))
    # Bounding box loss
    bbox_loss = mean_squared_error(y_true, y_pred)
    # Total loss
    total_loss = class_loss + 1 * bbox_loss
    return total_loss


# Modify to ResNet-50
# Residual blocks come in two types: 1: identity residual 2: bottleneck residual
def resnet50_model(num_classes=1):
    num_anchors = 1
    # Define the input tensor; the shape is the same as the input image
    X_input = Input(shape=(input_size, input_size, 3), name='input_1')
    X = ZeroPadding2D((3, 3))(X_input)

    # stage 1
    X = Convolution2D(filters=64, kernel_size=(7, 7), strides=(2, 2), kernel_initializer=glorot_uniform(seed=0))(X)
    X = BatchNormalization(axis=3)(X)
    X = Activation('relu')(X)
    X = MaxPooling2D((3, 3), strides=(2, 2))(X)

    # stage 2
    X = convolutional_block(X, f=3, filters=[64, 64, 256], s=1)
    X = identity_block(X, f=3, filters=[64, 64, 256])
    X = identity_block(X, f=3, filters=[64, 64, 256])

    # stage 3
    X = convolutional_block(X, f=3, filters=[128, 128, 512], s=2)
    X = identity_block(X, f=3, filters=[128, 128, 512])
    X = identity_block(X, f=3, filters=[128, 128, 512])
    X = identity_block(X, f=3, filters=[128, 128, 512])

    # stage 4
    X = convolutional_block(X, f=3, filters=[256, 256, 1024], s=2)
    X = identity_block(X, f=3, filters=[256, 256, 1024])
    X = identity_block(X, f=3, filters=[256, 256, 1024])
    X = identity_block(X, f=3, filters=[256, 256, 1024])
    X = identity_block(X, f=3, filters=[256, 256, 1024])
    X = identity_block(X, f=3, filters=[256, 256, 1024])

    # stage 5
    X = convolutional_block(X, f=4, filters=[512, 512, 2048], s=2)
    X = identity_block(X, f=3, filters=[512, 512, 2048])
    X = identity_block(X, f=3, filters=[512, 512, 2048])

    model = Model(inputs=X_input, outputs=X)
    return model
    # # Average pooling
    # X = AveragePooling2D((2, 2))(X)
    #
    # # Output layer
    # X = Flatten()(X)
    # class_head = Dense(num_classes, activation='softmax', name='class_head')(X)
    # print('class head ok')
    # # Regression head
    # reg_head = Dense(num_anchors * 4, activation='linear', name='reg_head')(X)
    # print('reg head ok')
    # # Build the complete model
    # model = Model(inputs=X_input, outputs={'class_head': class_head, 'reg_head': reg_head})
    # # Compile the model
    # model.compile(optimizer='adam',
    #               loss={
    #                   'class_head': 'categorical_crossentropy',
    #                   'reg_head': 'mse'
    #               },
    #               loss_weights={
    #                   'class_head': 1.0,
    #                   'reg_head': 1.0
    #               },
    #               metrics={
    #                   'class_head': 'accuracy',
    #                   'reg_head': 'mae'
    #               })
    # model.summary()
    # return model
# endregion


from tensorflow.python.keras.callbacks import Callback


# Custom callback that prints the value of y_pred
class OutputCallback(Callback):
    def on_epoch_end(self, epoch, logs=None):
        # At the end of each epoch, get the model's predictions on the validation set
        y_pred = self.model.predict(valid_sets[0])
        print(f'y_pred: {y_pred}')
        print(f'y_true: {valid_sets[1]}')


valid_sets = generator_vaild_data()


def train():
    model = o_check_model(len(catalogs))
    print('load model')
    model.fit_generator(generator=generator_data(),
                        steps_per_epoch=math.ceil(5011 / batch_size) + 1,
                        epochs=100,
                        validation_data=valid_sets)
    model.save('on_object_test.h5')


# train()

# # Test label conversion
# img = cv2.imread('D:\chinese_img\f\VOCtrainval_06-Nov-2007\VOCdevkit\VOC2007\JPEGImages\000021 .jpg')
# height, width, _ = img.shape
# bbox = [1, 235, 182, 388]
# x1, y1, x2, y2 = bbox
# cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
#
# img = cv2.resize(img, (input_size, input_size))
# label = fix_label_scale(bbox, [height, width])
# # x1, y1, x2, y2 = label
# # cv2.rectangle(img, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)
# label = convert_to_mse(label)
# # x1, y1, x2, y2 = label
# # cv2.rectangle(img, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 255), 2)
#
# label = convert_to_point(label)
#
# label = ref_label_scale(label, [height, width])
# img = cv2.resize(img, (width, height))
# x1, y1, x2, y2 = label
# cv2.rectangle(img, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 255), 2)
#
# # Display the image
# cv2.imshow('image', img)
# cv2.waitKey(0)
# cv2.destroyAllWindows()

model = load_model('on_object_test.h5')

# D:\chinese_img\920.png
# D:\chinese_img\f\VOCtrainval_06-Nov-2007\VOCdevkit\VOC2007\JPEGImages\000007.jpg
# D:\chinese_img\20230507204908.jpg
print('Please enter the picture name')
user_input = sys.stdin.readline().strip()
while user_input != 'q':
    pos = []
    img = cv2.imread(os.path.join(imgs_path, user_input + '.jpg'))
    image = cv2.resize(img, (224, 224))
    pos.append(image)
    x_train = tf.convert_to_tensor(np.array(pos))
    rsp = model.predict(x=x_train)

    # Classification
    class_head = rsp['class_head'][0]
    idx = np.argmax(class_head)
    catalog = catalogs[idx]
    print('Detection result: ' + catalog)

    # Position
    reg_head = rsp['reg_head'][0]
    # x, y, w, h = reg_head
    # cv2.rectangle(img, (int(x), int(y)), (int(w), int(h)), (0, 255, 255), 2)
    print('Predicted box:', reg_head)

    height, width, _ = img.shape
    # Restore to x1,y1,x2,y2
    bbox = convert_to_point(reg_head)
    print('Restored box:', bbox)
    bbox = ref_label_scale(bbox, [height, width])
    print('Restored to original size:', bbox)
    x1, y1, x2, y2 = bbox
    # Draw the region
    cv2.rectangle(img, (x1, y1), (x2, y2), (0, 0, 255), 2)
    img = put_on_text(img, catalog + '(' + str(np.max(class_head)) + ')', (x1, y1))
    # Show the image
    cv2.imshow('image', img)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

    print('Please enter the picture name')
    user_input = sys.stdin.readline().strip()