@@ -85,6 +85,7 @@ def _yolo_block(self, inputs, filters):
85
85
inputs = common ._conv2d_fixed_padding (inputs , filters * 2 , 3 )
86
86
return route , inputs
87
87
88
+ # 目标识别的层, 转换到合适的深度,以满足不同class_num数据的分类
88
89
def _detection_layer (self , inputs , anchors ):
89
90
num_anchors = len (anchors )
90
91
feature_map = slim .conv2d (inputs , num_anchors * (5 + self ._NUM_CLASSES ), 1 ,
@@ -93,20 +94,22 @@ def _detection_layer(self, inputs, anchors):
93
94
biases_initializer = tf .zeros_initializer ())
94
95
return feature_map
95
96
96
- # reorganization
97
+ # 将网络计算的缩放量和偏移量与anchors,网格位置结合,得到在原图中的绝对位置与大小
97
98
def _reorg_layer (self , feature_map , anchors ):
98
99
# 将张量转换为适合的格式
99
100
num_anchors = len (anchors ) # num_anchors=3
100
101
grid_size = feature_map .shape .as_list ()[1 :3 ] # 网格数
101
102
# the downscale ratio of the image in height and width
102
103
stride = tf .cast (self .img_size // grid_size , tf .float32 ) # [h,w] -> [y,x] 平均每个网络多少个像素值
104
+ # 将anchors 与 目标信息拆开 (batch_size, cell, cell , anchor_num * (5 + class_num)) -->
105
+ # (batch_size, cell, cell , anchor_num ,5 + class_num)
103
106
feature_map = tf .reshape (feature_map ,
104
107
[- 1 , grid_size [0 ], grid_size [1 ], num_anchors , 5 + self ._NUM_CLASSES ]) # 特征图
105
108
106
109
box_centers , box_sizes , conf_logits , prob_logits = tf .split (
107
110
feature_map , [2 , 2 , 1 , self ._NUM_CLASSES ], axis = - 1 ) # 分离各个值,在最后一个维度进行
108
111
109
- box_centers = tf .nn .sigmoid (box_centers ) # 使得偏移量变为非负
112
+ box_centers = tf .nn .sigmoid (box_centers ) # 使得偏移量变为非负,且在0~1之间, 超过1之后,中心点就偏移到了其他的单元中
110
113
111
114
grid_x = tf .range (grid_size [1 ], dtype = tf .int32 )
112
115
grid_y = tf .range (grid_size [0 ], dtype = tf .int32 )
@@ -150,16 +153,15 @@ def _reorg_layer(self, feature_map, anchors):
150
153
x_y_offset = tf .cast (x_y_offset , tf .float32 )
151
154
152
155
box_centers = box_centers + x_y_offset # 物体的中心坐标
153
- box_centers = box_centers * stride [::- 1 ] # 在原图的坐标位置,反归一化
156
+ box_centers = box_centers * stride [::- 1 ] # 在原图的坐标位置,反归一化 [h,w] -> [y,x]
154
157
155
- # tf.exp(box_sizes) 避免缩放出现负数, box_size[13,13,3,2],anchor[3,2]
158
+ # tf.exp(box_sizes) 避免缩放出现负数, box_size[13,13,3,2], anchor[3,2]
156
159
box_sizes = tf .exp (box_sizes ) * anchors # anchors -> [w, h] 使用网络计算出的缩放量对anchors进行缩放
157
160
boxes = tf .concat ([box_centers , box_sizes ], axis = - 1 ) # 计算除所有的方框在原图中的位置
158
161
return x_y_offset , boxes , conf_logits , prob_logits
159
162
160
163
@staticmethod # 静态静态方法不睡和类和实例进行绑定
161
- def _upsample (inputs , out_shape ): # 上采样
162
-
164
+ def _upsample (inputs , out_shape ): # 上采样, 放大图片
163
165
new_height , new_width = out_shape [1 ], out_shape [2 ]
164
166
inputs = tf .image .resize_nearest_neighbor (inputs , (new_height , new_width )) # 使用最近邻改变图像大小
165
167
inputs = tf .identity (inputs , name = 'upsampled' )
@@ -215,23 +217,25 @@ def forward(self, inputs, is_training=False, reuse=False):
215
217
with tf .variable_scope ('yolo-v3' ):
216
218
# https://github.com/YunYang1994/tensorflow-yolov3/raw/master/docs/images/levio.jpeg
217
219
# https://images2018.cnblogs.com/blog/606386/201803/606386-20180327004340505-1572852891.png
218
- # feature_map1 13x13x255
220
+ # feature_map1 13x13x1024 --> 13x13x[3x(5+class_num)]
219
221
route , inputs = self ._yolo_block (inputs , 512 )
220
222
feature_map_1 = self ._detection_layer (inputs , self ._ANCHORS [6 :9 ])
221
223
feature_map_1 = tf .identity (feature_map_1 , name = 'feature_map_1' )
222
224
223
- # feature_map2 26x26x255
225
+ # feature_map2 26x26x512 --> 26x26x[3x(5+class_num)]
224
226
inputs = common ._conv2d_fixed_padding (route , 256 , 1 )
225
227
upsample_size = route_2 .get_shape ().as_list ()
228
+ # 13x13 --> 26x26
226
229
inputs = self ._upsample (inputs , upsample_size ) # 通过直接放大进行上采样
227
230
inputs = tf .concat ([inputs , route_2 ], axis = 3 ) # 在axis=3 进行连接,
228
231
route , inputs = self ._yolo_block (inputs , 256 )
229
232
feature_map_2 = self ._detection_layer (inputs , self ._ANCHORS [3 :6 ])
230
233
feature_map_2 = tf .identity (feature_map_2 , name = 'feature_map_2' )
231
234
232
- # feature_map3 52x52x255
235
+ # feature_map3 52x52x256 --> 52x52x[3x(5+class_num)]
233
236
inputs = common ._conv2d_fixed_padding (route , 128 , 1 )
234
237
upsample_size = route_1 .get_shape ().as_list ()
238
+ # 26x26 --> 52x52
235
239
inputs = self ._upsample (inputs , upsample_size )
236
240
inputs = tf .concat ([inputs , route_1 ], axis = 3 )
237
241
route , inputs = self ._yolo_block (inputs , 128 )
@@ -241,7 +245,7 @@ def forward(self, inputs, is_training=False, reuse=False):
241
245
return feature_map_1 , feature_map_2 , feature_map_3
242
246
243
247
def _reshape (self , x_y_offset , boxes , confs , probs ):
244
-
248
+ # 构成一个(batch_size, cell*cell*len(anchors) , boxes)
245
249
grid_size = x_y_offset .shape .as_list ()[:2 ] # 网格数
246
250
boxes = tf .reshape (boxes , [- 1 , grid_size [0 ] * grid_size [1 ] * 3 , 4 ]) # 3个anchor
247
251
confs = tf .reshape (confs , [- 1 , grid_size [0 ] * grid_size [1 ] * 3 , 1 ]) # 3个anchor分别对应概率
@@ -265,13 +269,14 @@ def predict(self, feature_maps):
265
269
(feature_map_2 , self ._ANCHORS [3 :6 ]),
266
270
(feature_map_3 , self ._ANCHORS [0 :3 ])]
267
271
272
+ # boxes 的相对位置转换为绝对位置
268
273
results = [self ._reorg_layer (feature_map , anchors ) for (feature_map , anchors ) in feature_map_anchors ]
269
274
boxes_list , confs_list , probs_list = [], [], []
270
275
271
276
for result in results :
272
- # print("*results==>", *result)
273
- # print("results==>", len(result))
277
+ # *result = x_y_offset, boxes, confs, probs
274
278
boxes , conf_logits , prob_logits = self ._reshape (* result )
279
+ # --> (batch_size, cell*cell*anchor_num, boxes/conf/prob)
275
280
276
281
confs = tf .sigmoid (conf_logits ) # 转化成概率
277
282
probs = tf .sigmoid (prob_logits ) # 转化成概率,每种类和不在为0
@@ -281,12 +286,12 @@ def predict(self, feature_maps):
281
286
probs_list .append (probs )
282
287
283
288
# 将3个feature_map中所有的信息,整合到一个张量
284
- # shape : [Batch_size,10647,4] 10647= 13x13x3+ 26x26x3+ 52x52x3
289
+ # shape : [Batch_size,10647,4] 10647 = 13x13x3 + 26x26x3 + 52x52x3
285
290
boxes = tf .concat (boxes_list , axis = 1 ) # [Batch_size,10647,4]
286
291
confs = tf .concat (confs_list , axis = 1 ) # [Batch_size,10647,1]
287
292
probs = tf .concat (probs_list , axis = 1 ) # [Batch_size,10647,class_num]
288
293
289
- # 坐标转化:中心坐标转化为左上角作案表 ,右下角坐标
294
+ # 坐标转化:中心坐标转化为 左上角坐标, 右下角坐标 --> 方便计算矩形框
290
295
center_x , center_y , width , height = tf .split (boxes , [1 , 1 , 1 , 1 ], axis = - 1 )
291
296
x0 = center_x - width / 2.
292
297
y0 = center_y - height / 2.
@@ -301,7 +306,7 @@ def compute_loss(self, pred_feature_map, y_true, ignore_thresh=0.5, max_box_per_
301
306
:param pred_feature_map: list [feature_map_1,feature_map_2,feature_map3]
302
307
feature_map_1[13,13,3,(5 + self._NUM_CLASSES)]
303
308
:param y_true: list [y_true_13, y_true_26, y_true_52]
304
- y_true_13 [13,13,3,(5 + self._NUM_CLASSES)]只有含有目标的网格中存在信息,其余均为0.
309
+ y_true_13 [13,13,3,(5 + self._NUM_CLASSES)] 只有含有目标的网格中存在信息,其余均为0.
305
310
:param ignore_thresh: 0.5
306
311
:param max_box_per_image:
307
312
:return:
@@ -328,19 +333,22 @@ def loss_layer(self, feature_map_i, y_true, anchors):
328
333
grid_size = tf .shape (feature_map_i )[1 :3 ] # cellxcell
329
334
grid_size_ = feature_map_i .shape .as_list ()[1 :3 ]
330
335
331
- # 本身具有[-1, grid_size_[0], grid_size_[1], 3, 5 + self._NUM_CLASSES]的shape,只是进过tf.py_func时丢失.使用reshape重新赋予shape
336
+ # 本身具有[-1, grid_size_[0], grid_size_[1], 3, 5 + self._NUM_CLASSES]的shape,
337
+ # 但在经过tf.py_func方法时丢失shape信息,使用reshape重新赋予shape
332
338
y_true = tf .reshape (y_true , [- 1 , grid_size_ [0 ], grid_size_ [1 ], 3 , 5 + self ._NUM_CLASSES ])
333
339
334
340
# the downscale ratio in height and width
335
341
ratio = tf .cast (self .img_size / grid_size , tf .float32 )
336
342
# N: batch_size
337
343
N = tf .cast (tf .shape (feature_map_i )[0 ], tf .float32 )
338
344
345
+ # 经过self._reorg_layer后boxes会被换成绝对位置, 之后会使用ratio换算到cellxcell上
339
346
x_y_offset , pred_boxes , pred_conf_logits , pred_prob_logits = self ._reorg_layer (feature_map_i , anchors )
347
+
340
348
# shape: take 416x416 input image and 13*13 feature_map for example:
341
- #
342
349
# [N, 13, 13, 3, 1]
343
350
object_mask = y_true [..., 4 :5 ] # 该feature_map下所有的目标,有目标的为1,无目标的为0
351
+
344
352
# shape: [N, 13, 13, 3, 4] & [N, 13, 13, 3] ==> [V, 4]
345
353
# V: num of true gt box, 该feature_map下所有检测目标的数量
346
354
valid_true_boxes = tf .boolean_mask (y_true [..., 0 :4 ],
@@ -353,60 +361,67 @@ def loss_layer(self, feature_map_i, y_true, anchors):
353
361
pred_box_xy = pred_boxes [..., 0 :2 ]
354
362
pred_box_wh = pred_boxes [..., 2 :4 ]
355
363
356
- # calc iou 计算交并比
357
- # shape: [N, 13, 13, 3, V]
364
+ # calc iou 计算每个pre_boxe与所有true_boxe的交并比.
358
365
# true:[V,2],[V,2]
359
366
# pre : [13,13,3,2]
367
+ # out_shape: [N, 13, 13, 3, V],
360
368
iou = self ._broadcast_iou (valid_true_box_xy , valid_true_box_wh , pred_box_xy , pred_box_wh )
361
369
362
- # iou shape : [N,13,13,3,V]
363
- best_iou = tf .reduce_max (iou , axis = - 1 ) # 选择每个anchor中最大的那个.
370
+ # iou_shape : [N,13,13,3,V] 每个单元下每个anchor与所有的true_boxes的交并比
371
+ best_iou = tf .reduce_max (iou , axis = - 1 ) # 选择每个anchor中iou最大的那个.
372
+ # out_shape : [N,13,13,3]
373
+
364
374
# get_ignore_mask
365
- ignore_mask = tf .cast (best_iou < 0.5 , tf .float32 ) # 如果重合率低于0.5
366
- # shape: [N, 13, 13, 3, 1]
375
+ ignore_mask = tf .cast (best_iou < 0.5 , tf .float32 ) # 如果iou低于0.5将会丢弃此anchor\
376
+ # out_shape : [N,13,13,3] 0,1张量
377
+
367
378
ignore_mask = tf .expand_dims (ignore_mask , - 1 )
379
+ # out_shape: [N, 13, 13, 3, 1] 0,1张量
380
+
368
381
# get xy coordinates in one cell from the feature_map
369
382
# numerical range: 0 ~ 1
370
- # shape: [N, 13, 13, 3, 2] # 坐标未归一化
371
- true_xy = y_true [..., 0 :2 ] / ratio [::- 1 ] - x_y_offset # 偏移
372
- pred_xy = pred_box_xy / ratio [::- 1 ] - x_y_offset # 偏移
383
+ # shape: [N, 13, 13, 3, 2] # 坐标反归一化
384
+ true_xy = y_true [..., 0 :2 ] / ratio [::- 1 ] - x_y_offset # 绝对(image_size * image_size)信息 转换为 单元(cellxcell)相对信息
385
+ pred_xy = pred_box_xy / ratio [::- 1 ] - x_y_offset # 获取网络真实输出值
373
386
374
387
# get_tw_th, numerical range: 0 ~ 1
375
388
# shape: [N, 13, 13, 3, 2],
376
389
true_tw_th = y_true [..., 2 :4 ] / anchors # 缩放量
377
390
pred_tw_th = pred_box_wh / anchors
378
- # for numerical stability 稳定训练
391
+ # for numerical stability 稳定训练, 为0时不对anchors进行缩放, 在模型输出值特别小时e^out_put为0
379
392
true_tw_th = tf .where (condition = tf .equal (true_tw_th , 0 ),
380
393
x = tf .ones_like (true_tw_th ), y = true_tw_th )
381
394
pred_tw_th = tf .where (condition = tf .equal (pred_tw_th , 0 ),
382
395
x = tf .ones_like (pred_tw_th ), y = pred_tw_th )
383
-
384
- true_tw_th = tf .log (tf .clip_by_value (true_tw_th , 1e-9 , 1e9 )) # 网络输出的原值(有正负)
396
+ # 还原网络最原始的输出值(有正负的)
397
+ true_tw_th = tf .log (tf .clip_by_value (true_tw_th , 1e-9 , 1e9 ))
385
398
pred_tw_th = tf .log (tf .clip_by_value (pred_tw_th , 1e-9 , 1e9 ))
386
399
387
400
# box size punishment:
388
401
# box with smaller area has bigger weight. This is taken from the yolo darknet C source code.
389
402
# 较小的面积的box有较大的权重
390
- # shape: [N, 13, 13, 3, 1] 2. - 面积
403
+ # shape: [N, 13, 13, 3, 1] 2. - 面积 为1时表示保持原始权重
391
404
box_loss_scale = 2. - (y_true [..., 2 :3 ] / tf .cast (self .img_size [1 ], tf .float32 )) * (
392
405
y_true [..., 3 :4 ] / tf .cast (self .img_size [0 ], tf .float32 ))
393
406
394
- # shape: [N, 13, 13, 3, 1] 方框损失值
407
+ # shape: [N, 13, 13, 3, 1] 方框损失值, 中心坐标均方差损失 * mask[N, 13, 13, 3, 1]
408
+ # 仅仅计算有目标单元的loss, 不计算那些错误预测的boxes, 在预测时首先会排除那些conf,iou低的单元
395
409
xy_loss = tf .reduce_sum (tf .square (true_xy - pred_xy ) * object_mask * box_loss_scale ) / N # N:batch_size
396
410
wh_loss = tf .reduce_sum (tf .square (true_tw_th - pred_tw_th ) * object_mask * box_loss_scale ) / N
397
411
398
412
# shape: [N, 13, 13, 3, 1]
399
413
conf_pos_mask = object_mask # 只要存在目标的boxe
400
414
conf_neg_mask = (1 - object_mask ) * ignore_mask # 选择不存在目标,同时iou小于阈值(0.5),
415
+
401
416
# 分离正样本和负样本
402
417
# 正样本损失
403
418
conf_loss_pos = conf_pos_mask * tf .nn .sigmoid_cross_entropy_with_logits (labels = object_mask ,
404
419
logits = pred_conf_logits )
405
420
# 处理后的负样本损失,只计算那些是单元格中没有目标,同时IOU小于0.5的单元,
406
- # 只惩罚IOU<0.5,而不惩罚IOU>0.5 的原因是因为该单元内是有目标的,但是目标中心点却没有落在该单元中.
421
+ # 只惩罚IOU<0.5,而不惩罚IOU>0.5 的原因是可能该单元内是有目标的,仅仅只是目标中心点却没有落在该单元中.所以不计算该loss
407
422
conf_loss_neg = conf_neg_mask * tf .nn .sigmoid_cross_entropy_with_logits (labels = object_mask ,
408
423
logits = pred_conf_logits )
409
- conf_loss = tf .reduce_sum (conf_loss_pos + conf_loss_neg ) / N # 平均交叉熵,同时提高正确率,压低错误率
424
+ conf_loss = tf .reduce_sum (conf_loss_pos + conf_loss_neg ) / N # 平均交叉熵,同时提高正确分类,压低错误分类
410
425
411
426
# shape: [N, 13, 13, 3, 1], 分类loss
412
427
# object_mask 只看与anchors相匹配的anchors
0 commit comments