

网络架构部分的代码见 Mask_RCNN/mrcnn/model.py 中 class MaskRCNN 的 build 方法的 "inference" 分支。



  import os

  # Restrict the process to the GPU with index 2. This has to be set before
  # the deep-learning framework initialises CUDA to have any effect.
  os.environ["CUDA_VISIBLE_DEVICES"] = "2"


  import tensorflow as tf
  import keras.backend.tensorflow_backend as KTF

  config = tf.ConfigProto()
  # Allocate GPU memory on demand instead of claiming all of it up front.
  config.gpu_options.allow_growth = True
  # Alternative: cap the process at 30% of the GPU memory.
  # config.gpu_options.per_process_gpu_memory_fraction = 0.3
  sess = tf.Session(config=config)  # create the session with this config
  KTF.set_session(sess)  # make Keras use this session


下面的代码几乎都是基于 keras 的函数式 API 编写的;将函数式 API 搭建出的计算图转换为 Model 对象也非常方便,只需调用 KM.Model(input_tensors, output_tensors) 即可。




  import tensorflow as tf
  import keras.backend as K

  # Small demo: with only a condition argument, tf.where yields the
  # coordinates of the elements for which the condition holds.
  rpn_match = tf.placeholder(tf.int8, [10, 2])
  tf.where(K.equal(rpn_match, 1))


  class PyramidROIAlign(KE.Layer):
      """Implements ROI Pooling on multiple levels of the feature pyramid.

      Params:
      - pool_shape: [pool_height, pool_width] of the output pooled regions. Usually [7, 7]

      Inputs:
      - boxes: [batch, num_boxes, (y1, x1, y2, x2)] in normalized
               coordinates. Possibly padded with zeros if not enough
               boxes to fill the array.
      - image_meta: [batch, (meta data)] Image details. See compose_image_meta()
      - feature_maps: List of feature maps from different levels of the pyramid.
                      Each is [batch, height, width, channels]

      Output:
      Pooled regions in the shape: [batch, num_boxes, pool_height, pool_width, channels].
      The width and height are those specified in the pool_shape in the layer
      constructor.
      """

      def __init__(self, pool_shape, **kwargs):
          super(PyramidROIAlign, self).__init__(**kwargs)
          self.pool_shape = tuple(pool_shape)

      def call(self, inputs):
          # num_boxes is the number of proposals. Every proposal belongs to
          # one image, but different proposals are pooled from different
          # pyramid levels: we loop over the levels, pick the proposals
          # assigned to each one and apply ROIAlign to them.
          # Crop boxes [batch, num_boxes, (y1, x1, y2, x2)] in normalized coords
          boxes = inputs[0]

          # Image meta
          # Holds details about the image. See compose_image_meta()
          image_meta = inputs[1]

          # Feature Maps. List of feature maps from different level of the
          # feature pyramid. Each is [batch, height, width, channels]
          feature_maps = inputs[2:]

          # Assign each ROI to a level in the pyramid based on the ROI area.
          y1, x1, y2, x2 = tf.split(boxes, 4, axis=2)
          h = y2 - y1
          w = x2 - x1
          # Use shape of first image. Images in a batch must have the same size.
          image_shape = parse_image_meta_graph(image_meta)['image_shape'][0]  # h, w, c
          # Equation 1 in the Feature Pyramid Networks paper. Account for
          # the fact that our coordinates are normalized here.
          # e.g. a 224x224 ROI (in pixels) maps to P4
          image_area = tf.cast(image_shape[0] * image_shape[1], tf.float32)
          # h and w are already normalized, hence the division by sqrt(image_area).
          roi_level = log2_graph(tf.sqrt(h * w) / (224.0 / tf.sqrt(image_area)))
          # Clamp the level to the range [2, 5].
          roi_level = tf.minimum(5, tf.maximum(
              2, 4 + tf.cast(tf.round(roi_level), tf.int32)))
          roi_level = tf.squeeze(roi_level, 2)  # [batch, num_boxes]

          # Loop through levels and apply ROI pooling to each. P2 to P5.
          pooled = []
          box_to_level = []
          for i, level in enumerate(range(2, 6)):
              # tf.where returns [coord_1, coord_2, ...], whereas np.where
              # returns [[coord_1.x, coord_2.x, ...], [coord_1.y, coord_2.y, ...]].
              # Each returned coordinate reads: (image n, proposal i).
              ix = tf.where(tf.equal(roi_level, level))
              level_boxes = tf.gather_nd(boxes, ix)  # [num proposals at this level, 4]

              # Box indices for crop_and_resize: the image index of each proposal.
              box_indices = tf.cast(ix[:, 0], tf.int32)

              # Keep track of which box is mapped to which level
              box_to_level.append(ix)

              # Stop gradient propagation to ROI proposals
              level_boxes = tf.stop_gradient(level_boxes)
              box_indices = tf.stop_gradient(box_indices)

              # Crop and Resize
              # From Mask R-CNN paper: "We sample four regular locations, so
              # that we can evaluate either max or average pooling. In fact,
              # interpolating only a single value at each bin center (without
              # pooling) is nearly as effective."
              #
              # Here we use the simplified approach of a single value per bin,
              # which is how it's done in tf.crop_and_resize()
              # Result: [this_level_num_boxes, pool_height, pool_width, channels]
              #
              # Argument shapes:
              #   feature_maps[i]: [batch, image_height, image_width, channels]
              #   level_boxes:     [this_level_num_boxes, 4]
              #   box_indices:     [this_level_num_boxes]
              #   self.pool_shape: [pool_height, pool_width]
              pooled.append(tf.image.crop_and_resize(
                  feature_maps[i], level_boxes, box_indices, self.pool_shape,
                  method="bilinear"))

          # Pack pooled features into one tensor
          pooled = tf.concat(pooled, axis=0)  # [batch*num_boxes, pool_height, pool_width, channels]

          # Pack box_to_level mapping into one array and add another
          # column representing the order of pooled boxes
          box_to_level = tf.concat(box_to_level, axis=0)  # [batch*num_boxes, 2]
          box_range = tf.expand_dims(tf.range(tf.shape(box_to_level)[0]), 1)  # [batch*num_boxes, 1]
          box_to_level = tf.concat([tf.cast(box_to_level, tf.int32), box_range],
                                   axis=1)  # [batch*num_boxes, 3]

          # At this point `pooled` holds every ROIAlign crop and `box_to_level`
          # holds the bookkeeping for those crops. Because the crops were
          # collected level by level they are not in the original order
          # (by batch, then by box index); the code below restores that order.
          #
          # Rearrange pooled features to match the order of the original boxes
          # Sort box_to_level by batch then box index
          # TF doesn't have a way to sort by two columns, so merge them and sort.
          # box_to_level[i, 0] is the image index of crop i and
          # box_to_level[i, 1] is its box index. The merged key is only
          # unambiguous while the box index stays below 100000.
          sorting_tensor = box_to_level[:, 0] * 100000 + box_to_level[:, 1]  # [batch*num_boxes]
          ix = tf.nn.top_k(sorting_tensor, k=tf.shape(
              box_to_level)[0]).indices[::-1]
          ix = tf.gather(box_to_level[:, 2], ix)
          pooled = tf.gather(pooled, ix)

          # Re-add the batch dimension
          # boxes: [batch, num_boxes, (y1, x1, y2, x2)]
          # pooled: [batch*num_boxes, pool_height, pool_width, channels]
          shape = tf.concat([tf.shape(boxes)[:2], tf.shape(pooled)[1:]], axis=0)
          pooled = tf.reshape(pooled, shape)
          return pooled  # [batch, num_boxes, pool_height, pool_width, channels]

      def compute_output_shape(self, input_shape):
          # (batch, num_boxes) from the boxes input, then the pool shape, then
          # the channel count of the first feature map.
          return input_shape[0][:2] + self.pool_shape + (input_shape[2][-1], )



  # Reshape to [batch, -1, 4]; the batch size is read at run time with
  # tf.shape so it does not have to be known when the graph is built.
  rpn_bbox = KL.Lambda(lambda t: tf.reshape(t, [tf.shape(t)[0], -1, 4]))(x)



  class BatchNorm(KL.BatchNormalization):
      """Extends the Keras BatchNormalization class to allow a central place
      to make changes if needed.

      Batch normalization has a negative effect on training if batches are small
      so this layer is often frozen (via setting in Config class) and functions
      as linear layer.
      """

      def call(self, inputs, training=None):
          """
          Note about training values:
              None: Train BN layers. This is the normal mode
              False: Freeze BN layers. Good when batch size is small
              True: (don't use). Set layer in training mode even when making inferences
          """
          # Name the class explicitly: super(self.__class__, self) would
          # resolve to this very method again whenever BatchNorm is
          # subclassed, recursing without end.
          return super(BatchNorm, self).call(inputs, training=training)









  1. class MaskRCNN():
  2. """Encapsulates the Mask RCNN model functionality.
  4. The actual Keras model is in the keras_model property.
  5. """
  7. def __init__(self, mode, config, model_dir):
  8. """
  9. mode: Either "training" or "inference"
  10. config: A Sub-class of the Config class
  11. model_dir: Directory to save training logs and trained weights
  12. """
  13. assert mode in ['training', 'inference']
  14. self.mode = mode
  15. self.config = config
  16. self.model_dir = model_dir
  17. self.set_log_dir()
  18. self.keras_model = self.build(mode=mode, config=config)
  20. def build(self, mode, config):
  21. """Build Mask R-CNN architecture.
  22. input_shape: The shape of the input image.
  23. mode: Either "training" or "inference". The inputs and
  24. outputs of the model differ accordingly.
  25. """
  26. assert mode in ['training', 'inference']
  28. # Image size must be dividable by 2 multiple times
  29. h, w = config.IMAGE_SHAPE[:2] # [1024 1024 3]
  30. if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6):
  31. raise Exception("Image size must be dividable by 2 at least 6 times "
  32. "to avoid fractions when downscaling and upscaling." # <-----
  33. "For example, use 256, 320, 384, 448, 512, ... etc. ")
  35. # Inputs
  36. input_image = KL.Input(
  37. shape=[None, None, config.IMAGE_SHAPE[2]], name="input_image")
  38. input_image_meta = KL.Input(shape=[config.IMAGE_META_SIZE],
  39. name="input_image_meta")
  40. if mode == "training":
  41. ……
  42. elif mode == "inference":
  43. # Anchors in normalized coordinates
  44. input_anchors = KL.Input(shape=[None, 4], name="input_anchors")



input_image:输入图片,[batch, None, None, config.IMAGE_SHAPE[2]]

input_image_meta:图片的信息(包含形状、预处理信息等,后面会介绍),[batch, config.IMAGE_META_SIZE]

input_anchors:锚框,[batch, None, 4]



  # Build the shared convolutional layers.
  # Bottom-up Layers
  # The backbone returns the last layer of each stage, 5 in total. Stage 5
  # is requested (stage5=True); the first stage output is discarded.
  if callable(config.BACKBONE):
      _, C2, C3, C4, C5 = config.BACKBONE(input_image, stage5=True,
                                          train_bn=config.TRAIN_BN)
  else:
      _, C2, C3, C4, C5 = resnet_graph(input_image, config.BACKBONE,
                                       stage5=True, train_bn=config.TRAIN_BN)



  ############################################################
  #  Resnet Graph
  ############################################################

  # Code adopted from:
  # https://github.com/fchollet/deep-learning-models/blob/master/resnet50.py

  def identity_block(input_tensor, kernel_size, filters, stage, block,
                     use_bias=True, train_bn=True):
      """The identity_block is the block that has no conv layer at shortcut
      # Arguments
          input_tensor: input tensor
          kernel_size: default 3, the kernel size of middle conv layer at main path
          filters: list of integers, the nb_filters of 3 conv layer at main path
          stage: integer, current stage label, used for generating layer names
          block: 'a','b'..., current block label, used for generating layer names
          use_bias: Boolean. To use or not use a bias in conv layers.
          train_bn: Boolean. Train or freeze Batch Norm layers
      # Returns
          Output tensor of the block (after the final ReLU).
      """
      nb_filter1, nb_filter2, nb_filter3 = filters
      conv_name_base = 'res' + str(stage) + block + '_branch'
      bn_name_base = 'bn' + str(stage) + block + '_branch'

      # Main path: 1x1 conv -> kxk conv -> 1x1 conv, each followed by BN.
      x = KL.Conv2D(nb_filter1, (1, 1), name=conv_name_base + '2a',
                    use_bias=use_bias)(input_tensor)
      x = BatchNorm(name=bn_name_base + '2a')(x, training=train_bn)
      x = KL.Activation('relu')(x)

      x = KL.Conv2D(nb_filter2, (kernel_size, kernel_size), padding='same',
                    name=conv_name_base + '2b', use_bias=use_bias)(x)
      x = BatchNorm(name=bn_name_base + '2b')(x, training=train_bn)
      x = KL.Activation('relu')(x)

      x = KL.Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c',
                    use_bias=use_bias)(x)
      x = BatchNorm(name=bn_name_base + '2c')(x, training=train_bn)

      # Shortcut: the unmodified input is added to the main path.
      x = KL.Add()([x, input_tensor])
      x = KL.Activation('relu', name='res' + str(stage) + block + '_out')(x)
      return x
  def conv_block(input_tensor, kernel_size, filters, stage, block,
                 strides=(2, 2), use_bias=True, train_bn=True):
      """conv_block is the block that has a conv layer at shortcut
      # Arguments
          input_tensor: input tensor
          kernel_size: default 3, the kernel size of middle conv layer at main path
          filters: list of integers, the nb_filters of 3 conv layer at main path
          stage: integer, current stage label, used for generating layer names
          block: 'a','b'..., current block label, used for generating layer names
          strides: strides of the first main-path conv and of the shortcut conv
          use_bias: Boolean. To use or not use a bias in conv layers.
          train_bn: Boolean. Train or freeze Batch Norm layers
      # Returns
          Output tensor of the block (after the final ReLU).
      Note that from stage 3, the first conv layer at main path is with subsample=(2,2)
      And the shortcut should have subsample=(2,2) as well
      """
      nb_filter1, nb_filter2, nb_filter3 = filters
      conv_name_base = 'res' + str(stage) + block + '_branch'
      bn_name_base = 'bn' + str(stage) + block + '_branch'

      # Main path: strided 1x1 conv -> kxk conv -> 1x1 conv, each followed by BN.
      x = KL.Conv2D(nb_filter1, (1, 1), strides=strides,
                    name=conv_name_base + '2a', use_bias=use_bias)(input_tensor)
      x = BatchNorm(name=bn_name_base + '2a')(x, training=train_bn)
      x = KL.Activation('relu')(x)

      x = KL.Conv2D(nb_filter2, (kernel_size, kernel_size), padding='same',
                    name=conv_name_base + '2b', use_bias=use_bias)(x)
      x = BatchNorm(name=bn_name_base + '2b')(x, training=train_bn)
      x = KL.Activation('relu')(x)

      x = KL.Conv2D(nb_filter3, (1, 1), name=conv_name_base +
                    '2c', use_bias=use_bias)(x)
      x = BatchNorm(name=bn_name_base + '2c')(x, training=train_bn)

      # Shortcut: a strided 1x1 conv + BN so its shape matches the main path.
      shortcut = KL.Conv2D(nb_filter3, (1, 1), strides=strides,
                           name=conv_name_base + '1', use_bias=use_bias)(input_tensor)
      shortcut = BatchNorm(name=bn_name_base + '1')(shortcut, training=train_bn)

      x = KL.Add()([x, shortcut])
      x = KL.Activation('relu', name='res' + str(stage) + block + '_out')(x)
      return x
  def resnet_graph(input_image, architecture, stage5=False, train_bn=True):
      """Build a ResNet graph.
          architecture: Can be resnet50 or resnet101
          stage5: Boolean. If False, stage5 of the network is not created
          train_bn: Boolean. Train or freeze Batch Norm layers

      Returns [C1, C2, C3, C4, C5], the output of each stage; C5 is None
      when stage5 is False.
      """
      assert architecture in ["resnet50", "resnet101"]
      # Stage 1
      x = KL.ZeroPadding2D((3, 3))(input_image)
      x = KL.Conv2D(64, (7, 7), strides=(2, 2), name='conv1', use_bias=True)(x)
      x = BatchNorm(name='bn_conv1')(x, training=train_bn)
      x = KL.Activation('relu')(x)
      C1 = x = KL.MaxPooling2D((3, 3), strides=(2, 2), padding="same")(x)
      # Stage 2
      x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1), train_bn=train_bn)
      x = identity_block(x, 3, [64, 64, 256], stage=2, block='b', train_bn=train_bn)
      C2 = x = identity_block(x, 3, [64, 64, 256], stage=2, block='c', train_bn=train_bn)
      # Stage 3
      x = conv_block(x, 3, [128, 128, 512], stage=3, block='a', train_bn=train_bn)
      x = identity_block(x, 3, [128, 128, 512], stage=3, block='b', train_bn=train_bn)
      x = identity_block(x, 3, [128, 128, 512], stage=3, block='c', train_bn=train_bn)
      C3 = x = identity_block(x, 3, [128, 128, 512], stage=3, block='d', train_bn=train_bn)
      # Stage 4
      x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a', train_bn=train_bn)
      # Number of identity blocks that follow the conv block in stage 4.
      block_count = {"resnet50": 5, "resnet101": 22}[architecture]
      for i in range(block_count):
          # chr(98) is 'b', so the blocks are labelled 'b', 'c', 'd', ...
          x = identity_block(x, 3, [256, 256, 1024], stage=4, block=chr(98 + i), train_bn=train_bn)
      C4 = x
      # Stage 5
      if stage5:
          x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a', train_bn=train_bn)
          x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b', train_bn=train_bn)
          C5 = x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c', train_bn=train_bn)
      else:
          C5 = None
      return [C1, C2, C3, C4, C5]


  class BatchNorm(KL.BatchNormalization):
      """Extends the Keras BatchNormalization class to allow a central place
      to make changes if needed.

      Batch normalization has a negative effect on training if batches are small
      so this layer is often frozen (via setting in Config class) and functions
      as linear layer.
      """

      def call(self, inputs, training=None):
          """
          Note about training values:
              None: Train BN layers. This is the normal mode
              False: Freeze BN layers. Good when batch size is small
              True: (don't use). Set layer in training mode even when making inferences
          """
          # Name the class explicitly: super(self.__class__, self) would
          # resolve to this very method again whenever BatchNorm is
          # subclassed, recursing without end.
          return super(BatchNorm, self).call(inputs, training=training)



  # Top-down Layers
  # TODO: add assert to verify feature map sizes match what's in config
  # Each level is the 2x-upsampled level above it plus a 1x1-conv lateral
  # connection from the matching backbone stage.
  P5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c5p5')(C5)  # 256
  P4 = KL.Add(name="fpn_p4add")([
      KL.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(P5),
      KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c4p4')(C4)])
  P3 = KL.Add(name="fpn_p3add")([
      KL.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(P4),
      KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c3p3')(C3)])
  P2 = KL.Add(name="fpn_p2add")([
      KL.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(P3),
      KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c2p2')(C2)])
  # Attach 3x3 conv to all P layers to get the final feature maps.
  P2 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p2")(P2)
  P3 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p3")(P3)
  P4 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p4")(P4)
  P5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p5")(P5)
  # P6 is used for the 5th anchor scale in RPN. Generated by
  # subsampling from P5 with stride of 2.
  P6 = KL.MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(P5)


  # Note that P6 is used in RPN, but not in the classifier heads.
  rpn_feature_maps = [P2, P3, P4, P5, P6]
  mrcnn_feature_maps = [P2, P3, P4, P5]

其中rpn_feature_maps对应图中的实线输出,送入RPN网络分类/回归得到锚框的前景/背景鉴别结果;而mrcnn_feature_maps则是后面进行ROI Align时的切割目标。


  def build(self, mode, config):
      """Build Mask R-CNN architecture.

      mode: Either "training" or "inference". The inputs and
          outputs of the model differ accordingly.
      config: Configuration object; the attributes read here include
          IMAGE_SHAPE, BACKBONE, TRAIN_BN, TOP_DOWN_PYRAMID_SIZE and GPU_COUNT.

      Returns the Keras model, wrapped in ParallelModel when
      config.GPU_COUNT > 1.
      Raises Exception when the image height or width is not a multiple
      of 2**6.
      """
      assert mode in ['training', 'inference']

      # Image size must be dividable by 2 multiple times; this guarantees
      # that down- and upsampling never produce fractional coordinates.
      h, w = config.IMAGE_SHAPE[:2]  # e.g. [1024 1024 3]
      if h % 2**6 or w % 2**6:
          raise Exception("Image size must be dividable by 2 at least 6 times "
                          "to avoid fractions when downscaling and upscaling. "
                          "For example, use 256, 320, 384, 448, 512, ... etc. ")

      # Inputs
      input_image = KL.Input(
          shape=[None, None, config.IMAGE_SHAPE[2]], name="input_image")
      input_image_meta = KL.Input(shape=[config.IMAGE_META_SIZE],
                                  name="input_image_meta")
      if mode == "training":
          # RPN GT
          input_rpn_match = KL.Input(
              shape=[None, 1], name="input_rpn_match", dtype=tf.int32)
          input_rpn_bbox = KL.Input(
              shape=[None, 4], name="input_rpn_bbox", dtype=tf.float32)

          # Detection GT (class IDs, bounding boxes, and masks)
          # 1. GT Class IDs (zero padded)
          input_gt_class_ids = KL.Input(
              shape=[None], name="input_gt_class_ids", dtype=tf.int32)
          # 2. GT Boxes in pixels (zero padded)
          # [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates
          input_gt_boxes = KL.Input(
              shape=[None, 4], name="input_gt_boxes", dtype=tf.float32)
          # Normalize coordinates
          gt_boxes = KL.Lambda(lambda x: norm_boxes_graph(
              x, K.shape(input_image)[1:3]))(input_gt_boxes)
          # 3. GT Masks (zero padded)
          # [batch, height, width, MAX_GT_INSTANCES]
          if config.USE_MINI_MASK:
              input_gt_masks = KL.Input(
                  shape=[config.MINI_MASK_SHAPE[0],
                         config.MINI_MASK_SHAPE[1], None],
                  name="input_gt_masks", dtype=bool)
          else:
              input_gt_masks = KL.Input(
                  shape=[config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1], None],
                  name="input_gt_masks", dtype=bool)
      elif mode == "inference":
          # Anchors in normalized coordinates
          input_anchors = KL.Input(shape=[None, 4], name="input_anchors")

      # Build the shared convolutional layers.
      # Bottom-up Layers
      # The backbone returns the last layer of each stage, 5 in total. Stage 5
      # is requested (stage5=True); the first stage output is discarded.
      if callable(config.BACKBONE):
          _, C2, C3, C4, C5 = config.BACKBONE(input_image, stage5=True,
                                              train_bn=config.TRAIN_BN)
      else:
          _, C2, C3, C4, C5 = resnet_graph(input_image, config.BACKBONE,
                                           stage5=True, train_bn=config.TRAIN_BN)
      # Top-down Layers
      # TODO: add assert to verify feature map sizes match what's in config
      P5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c5p5')(C5)  # 256
      P4 = KL.Add(name="fpn_p4add")([
          KL.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(P5),
          KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c4p4')(C4)])
      P3 = KL.Add(name="fpn_p3add")([
          KL.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(P4),
          KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c3p3')(C3)])
      P2 = KL.Add(name="fpn_p2add")([
          KL.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(P3),
          KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c2p2')(C2)])
      # Attach 3x3 conv to all P layers to get the final feature maps.
      P2 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p2")(P2)
      P3 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p3")(P3)
      P4 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p4")(P4)
      P5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p5")(P5)
      # P6 is used for the 5th anchor scale in RPN. Generated by
      # subsampling from P5 with stride of 2.
      P6 = KL.MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(P5)

      # Note that P6 is used in RPN, but not in the classifier heads.
      rpn_feature_maps = [P2, P3, P4, P5, P6]
      mrcnn_feature_maps = [P2, P3, P4, P5]

      # Anchors
      if mode == "training":
          anchors = self.get_anchors(config.IMAGE_SHAPE)
          # Duplicate across the batch dimension because Keras requires it
          # TODO: can this be optimized to avoid duplicating the anchors?
          anchors = np.broadcast_to(anchors, (config.BATCH_SIZE,) + anchors.shape)
          # A hack to get around Keras's bad support for constants
          anchors = KL.Lambda(lambda x: tf.Variable(anchors), name="anchors")(input_image)
      else:
          anchors = input_anchors

      # RPN Model. build_rpn_model returns a Keras Model object, which is
      # itself callable and is applied to every pyramid level below.
      rpn = build_rpn_model(config.RPN_ANCHOR_STRIDE,  # e.g. 1, 3, 256
                            len(config.RPN_ANCHOR_RATIOS), config.TOP_DOWN_PYRAMID_SIZE)
      # Loop through pyramid layers and keep the RPN result of each level.
      layer_outputs = [rpn([p]) for p in rpn_feature_maps]  # list of lists
      # Concatenate layer outputs
      # Convert from list of lists of level outputs to list of lists
      # of outputs across levels.
      # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
      output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"]
      # [[logits of P2..P6], [class of P2..P6], [bbox of P2..P6]]
      outputs = list(zip(*layer_outputs))
      outputs = [KL.Concatenate(axis=1, name=n)(list(o))
                 for o, n in zip(outputs, output_names)]

      # [batch, num_anchors, 2/4]
      # where num_anchors is the total number of anchors over all levels.
      rpn_class_logits, rpn_class, rpn_bbox = outputs

      # Generate proposals
      # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
      # and zero padded.

      # e.g. POST_NMS_ROIS_TRAINING = 2000
      proposal_count = config.POST_NMS_ROIS_TRAINING if mode == "training"\
          else config.POST_NMS_ROIS_INFERENCE
      # [IMAGES_PER_GPU, num_rois, (y1, x1, y2, x2)]
      # From here on "batch" means IMAGES_PER_GPU.
      rpn_rois = ProposalLayer(
          proposal_count=proposal_count,
          nms_threshold=config.RPN_NMS_THRESHOLD,  # e.g. 0.7
          name="ROI",
          config=config)([rpn_class, rpn_bbox, anchors])

      if mode == "training":
          # Class ID mask to mark class IDs supported by the dataset the image
          # came from.
          active_class_ids = KL.Lambda(
              lambda x: parse_image_meta_graph(x)["active_class_ids"]
          )(input_image_meta)

          if not config.USE_RPN_ROIS:
              # Ignore predicted ROIs and use ROIs provided as an input.
              input_rois = KL.Input(shape=[config.POST_NMS_ROIS_TRAINING, 4],
                                    name="input_roi", dtype=np.int32)
              # Normalize coordinates
              target_rois = KL.Lambda(lambda x: norm_boxes_graph(
                  x, K.shape(input_image)[1:3]))(input_rois)
          else:
              target_rois = rpn_rois

          # Generate detection targets
          # Subsamples proposals and generates target outputs for training
          # Note that proposal class IDs, gt_boxes, and gt_masks are zero
          # padded. Equally, returned rois and targets are zero padded.
          rois, target_class_ids, target_bbox, target_mask =\
              DetectionTargetLayer(config, name="proposal_targets")([
                  target_rois, input_gt_class_ids, gt_boxes, input_gt_masks])

          # Network Heads
          # TODO: verify that this handles zero padded ROIs
          mrcnn_class_logits, mrcnn_class, mrcnn_bbox =\
              fpn_classifier_graph(rois, mrcnn_feature_maps, input_image_meta,
                                   config.POOL_SIZE, config.NUM_CLASSES,
                                   train_bn=config.TRAIN_BN,
                                   fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE)

          mrcnn_mask = build_fpn_mask_graph(rois, mrcnn_feature_maps,
                                            input_image_meta,
                                            config.MASK_POOL_SIZE,
                                            config.NUM_CLASSES,
                                            train_bn=config.TRAIN_BN)

          # TODO: clean up (use tf.identity if necessary)
          output_rois = KL.Lambda(lambda x: x * 1, name="output_rois")(rois)

          # Losses
          rpn_class_loss = KL.Lambda(lambda x: rpn_class_loss_graph(*x), name="rpn_class_loss")(
              [input_rpn_match, rpn_class_logits])
          rpn_bbox_loss = KL.Lambda(lambda x: rpn_bbox_loss_graph(config, *x), name="rpn_bbox_loss")(
              [input_rpn_bbox, input_rpn_match, rpn_bbox])
          class_loss = KL.Lambda(lambda x: mrcnn_class_loss_graph(*x), name="mrcnn_class_loss")(
              [target_class_ids, mrcnn_class_logits, active_class_ids])
          bbox_loss = KL.Lambda(lambda x: mrcnn_bbox_loss_graph(*x), name="mrcnn_bbox_loss")(
              [target_bbox, target_class_ids, mrcnn_bbox])
          mask_loss = KL.Lambda(lambda x: mrcnn_mask_loss_graph(*x), name="mrcnn_mask_loss")(
              [target_mask, target_class_ids, mrcnn_mask])

          # Model
          inputs = [input_image, input_image_meta,
                    input_rpn_match, input_rpn_bbox, input_gt_class_ids, input_gt_boxes, input_gt_masks]
          if not config.USE_RPN_ROIS:
              inputs.append(input_rois)
          outputs = [rpn_class_logits, rpn_class, rpn_bbox,
                     mrcnn_class_logits, mrcnn_class, mrcnn_bbox, mrcnn_mask,
                     rpn_rois, output_rois,
                     rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss, mask_loss]
          model = KM.Model(inputs, outputs, name='mask_rcnn')
      else:
          # Network Heads
          # Proposal classifier and BBox regressor heads
          # output shapes:
          #     mrcnn_class_logits: [batch, num_rois, NUM_CLASSES] classifier logits (before softmax)
          #     mrcnn_class: [batch, num_rois, NUM_CLASSES] classifier probabilities
          #     mrcnn_bbox(deltas): [batch, num_rois, NUM_CLASSES, (dy, dx, log(dh), log(dw))]
          mrcnn_class_logits, mrcnn_class, mrcnn_bbox =\
              fpn_classifier_graph(rpn_rois, mrcnn_feature_maps, input_image_meta,
                                   config.POOL_SIZE, config.NUM_CLASSES,
                                   train_bn=config.TRAIN_BN,
                                   fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE)

          # Detections
          # output is [batch, num_detections, (y1, x1, y2, x2, class_id, score)] in
          # normalized coordinates
          detections = DetectionLayer(config, name="mrcnn_detection")(
              [rpn_rois, mrcnn_class, mrcnn_bbox, input_image_meta])

          # Create masks for detections
          detection_boxes = KL.Lambda(lambda x: x[..., :4])(detections)
          mrcnn_mask = build_fpn_mask_graph(detection_boxes, mrcnn_feature_maps,
                                            input_image_meta,
                                            config.MASK_POOL_SIZE,
                                            config.NUM_CLASSES,
                                            train_bn=config.TRAIN_BN)

          model = KM.Model([input_image, input_image_meta, input_anchors],
                           [detections, mrcnn_class, mrcnn_bbox,
                            mrcnn_mask, rpn_rois, rpn_class, rpn_bbox],
                           name='mask_rcnn')

      # Add multi-GPU support.
      if config.GPU_COUNT > 1:
          from mrcnn.parallel_model import ParallelModel
          model = ParallelModel(model, config.GPU_COUNT)

      return model


  『计算机视觉』经典RCNN_其二:Faster-RCNN

    项目源码 一.Faster-RCNN简介 『cs231n』Faster_RCNN 『计算机视觉』Faster-RCNN学习_其一:目标检测及RCNN谱系 一篇讲的非常明白的文章:一文读懂Faster ...

  『计算机视觉』经典RCNN_其一:从RCNN到Faster-RCNN

    RCNN介绍 目标检测-RCNN系列 一文读懂Faster RCNN 一.目标检测 1.两个任务 目标检测可以拆分成两个任务:识别和定位 图像识别(classification)输入:图片输出:物体的 ...

  『计算机视觉』Mask-RCNN_训练网络其三:训练Model

    Github地址:Mask_RCNN 『计算机视觉』Mask-RCNN_论文学习 『计算机视觉』Mask-RCNN_项目文档翻译 『计算机视觉』Mask-RCNN_推断网络其一:总览 『计算机视觉』M ...

  『计算机视觉』Mask-RCNN_训练网络其二:train网络结构&损失函数

    Github地址:Mask_RCNN 『计算机视觉』Mask-RCNN_论文学习 『计算机视觉』Mask-RCNN_项目文档翻译 『计算机视觉』Mask-RCNN_推断网络其一:总览 『计算机视觉』M ...

  『计算机视觉』Mask-RCNN_训练网络其一:数据集与Dataset类

    Github地址:Mask_RCNN 『计算机视觉』Mask-RCNN_论文学习 『计算机视觉』Mask-RCNN_项目文档翻译 『计算机视觉』Mask-RCNN_推断网络其一:总览 『计算机视觉』M ...

  『计算机视觉』Mask-RCNN_从服装关键点检测看KeyPoints分支

    下图Github地址:Mask_RCNN       Mask_RCNN_KeyPoints『计算机视觉』Mask-RCNN_论文学习『计算机视觉』Mask-RCNN_项目文档翻译『计算机视觉』Mas ...

  『计算机视觉』Mask-RCNN_锚框生成

    Github地址:Mask_RCNN 『计算机视觉』Mask-RCNN_论文学习 『计算机视觉』Mask-RCNN_项目文档翻译 『计算机视觉』Mask-RCNN_推断网络其一:总览 『计算机视觉』M ...

  『计算机视觉』Mask-RCNN

    一.Mask-RCNN流程 Mask R-CNN是一个实例分割(Instance segmentation)算法,通过增加不同的分支,可以完成目标分类.目标检测.语义分割.实例分割.人体姿势识别等多种 ...

  『计算机视觉』Mask-RCNN_推断网络其六:Mask生成

    一.Mask生成概览 上一节的末尾,我们已经获取了待检测图片的分类回归信息,我们将回归信息(即待检测目标的边框信息)单独提取出来,结合金字塔特征mrcnn_feature_maps,进行Mask生成工 ...


