Faster-RCNN笔记

Faster-RCNN

faster-rcnn先将整幅图forward到最后一层conv层(conv5),然后在conv5的输出上分出两支:一支后面接RPN,另一支接ROI Pooling,等待从RPN得到的框。

Faster-RCNN笔记

RPN

RPN在最后的conv层上做滑动窗口,每一个窗口产生一个256维的向量(针对ZF网络),再根据这个向量分出两支,分别用于分类(前景/背景)和回归框。如图所示。(图:Faster-RCNN笔记)

prototxt

layer {
  name: "rpn_conv/3x3"
  type: "Convolution"
  bottom: "conv5"
  top: "rpn/output"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  convolution_param {
    num_output: 256
    kernel_size: 3 pad: 1 stride: 1
    weight_filler { type: "gaussian" std: 0.01 }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "rpn_relu/3x3"
  type: "ReLU"
  bottom: "rpn/output"
  top: "rpn/output"
}

layer {
  name: "rpn_cls_score"
  type: "Convolution"
  bottom: "rpn/output"
  top: "rpn_cls_score"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  convolution_param {
    num_output: 18   # 2(bg/fg) * 9(anchors)
    kernel_size: 1 pad: 0 stride: 1
    weight_filler { type: "gaussian" std: 0.01 }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "rpn_bbox_pred"
  type: "Convolution"
  bottom: "rpn/output"
  top: "rpn_bbox_pred"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  convolution_param {
    num_output: 36   # 4 * 9(anchors)
    kernel_size: 1 pad: 0 stride: 1
    weight_filler { type: "gaussian" std: 0.01 }
    bias_filler { type: "constant" value: 0 }
  }
}
# Reshapes the 18-channel rpn_cls_score blob (2 scores x 9 anchors) so that
# axis 1 has size 2 (bg/fg). The result feeds rpn_loss_cls and rpn_cls_prob.
layer {
   bottom: "rpn_cls_score"
   top: "rpn_cls_score_reshape"
   name: "rpn_cls_score_reshape"
   type: "Reshape"
   # Caffe Reshape semantics: dim: 0 copies that axis from the bottom blob,
   # dim: -1 is inferred from the remaining element count.
   reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } }
}
layer {
  name: 'rpn-data'
  type: 'Python'
  bottom: 'rpn_cls_score'
  bottom: 'gt_boxes'
  bottom: 'im_info'
  bottom: 'data'
  top: 'rpn_labels'
  top: 'rpn_bbox_targets'
  top: 'rpn_bbox_inside_weights'
  top: 'rpn_bbox_outside_weights'
  python_param {
    module: 'rpn.anchor_target_layer'
    layer: 'AnchorTargetLayer'
    param_str: "'feat_stride': 16"
  }
}
layer {
  name: "rpn_loss_cls"
  type: "SoftmaxWithLoss"
  bottom: "rpn_cls_score_reshape"
  bottom: "rpn_labels"
  propagate_down: 1
  propagate_down: 0
  top: "rpn_cls_loss"
  loss_weight: 1
  loss_param {
    ignore_label: -1
    normalize: true
  }
}
layer {
  name: "rpn_loss_bbox"
  type: "SmoothL1Loss"
  bottom: "rpn_bbox_pred"
  bottom: "rpn_bbox_targets"
  bottom: 'rpn_bbox_inside_weights'
  bottom: 'rpn_bbox_outside_weights'
  top: "rpn_loss_bbox"
  loss_weight: 1
  smooth_l1_loss_param { sigma: 3.0 }
}

anchor_target_layer

  1. 产生anchors,计算其与gt_boxes的IOU,据此判断每个anchor的label是前景(1)、背景(0)还是忽略(-1)。

  2. 对每个anchor,计算它相对于所匹配gt_box的bbox regression目标,生成rpn_bbox_targets。

  3. bbox_inside_weights:正类(label为1)anchor的权重为(1.0,1.0,1.0,1.0),其他为0。

  4. bbox_outside_weights(外部权重):负例外部权重=正例外部权重=np.ones((1,4))*1.0/np.sum(labels>=0),即按参与训练的anchor总数做归一化。

input


bottom: 'rpn_cls_score'#只用于确定特征图的大小(height、width)
bottom: 'gt_boxes'#框的ground truth
bottom: 'im_info'#图片尺寸信息,用于过滤不完全在图片内部的anchors
bottom: 'data'

output


top: 'rpn_labels'#大小是(1,1,A*height,width),A是anchor的数目
top: 'rpn_bbox_targets'#大小是(1,A*4,height,width)
top: 'rpn_bbox_inside_weights'#大小是(1,A*4,height,width)
top: 'rpn_bbox_outside_weights'#大小是(1,A*4,height,width)

RoI Proposal

prototxt


layer {
  name: "rpn_cls_prob"
  type: "Softmax"
  bottom: "rpn_cls_score_reshape"
  top: "rpn_cls_prob"
}
# Reshapes the softmax output rpn_cls_prob back to 18 channels, the channel
# count produced by rpn_cls_score, before it is passed to the proposal layer.
layer {
  name: 'rpn_cls_prob_reshape'
  type: 'Reshape'
  bottom: 'rpn_cls_prob'
  top: 'rpn_cls_prob_reshape'
  # Caffe Reshape semantics: dim: 0 copies that axis from the bottom blob,
  # dim: -1 is inferred from the remaining element count.
  reshape_param { shape { dim: 0 dim: 18 dim: -1 dim: 0 } }
}
layer {
  name: 'proposal'
  type: 'Python'
  bottom: 'rpn_cls_prob_reshape'
  bottom: 'rpn_bbox_pred'
  bottom: 'im_info'
  top: 'rpn_rois'
#  top: 'rpn_scores'
  python_param {
    module: 'rpn.proposal_layer'
    layer: 'ProposalLayer'
    param_str: "'feat_stride': 16"
  }
}
layer {
  name: 'roi-data'
  type: 'Python'
  bottom: 'rpn_rois'
  bottom: 'gt_boxes'
  top: 'rois'
  top: 'labels'
  top: 'bbox_targets'
  top: 'bbox_inside_weights'
  top: 'bbox_outside_weights'
  python_param {
    module: 'rpn.proposal_target_layer'
    layer: 'ProposalTargetLayer'
    param_str: "'num_classes': 21"
  }
}

proposal_layer

  1. 通过anchors和bbox_deltas产生proposals,并对其进行过滤等操作,最后得到符合条件的proposals和scores。


        # Convert anchors into proposals via bbox transformations
        proposals = bbox_transform_inv(anchors, bbox_deltas)

        # 2. clip predicted boxes to image
        proposals = clip_boxes(proposals, im_info[:2])

        # 3. remove predicted boxes with either height or width < threshold
        # (NOTE: convert min_size to input image scale stored in im_info[2])
        keep = _filter_boxes(proposals, min_size * im_info[2])
        proposals = proposals[keep, :]
        scores = scores[keep]

input


bottom: 'rpn_cls_prob_reshape'#前景/背景概率,取其中的前景得分(scores)用于排序和nms
bottom: 'rpn_bbox_pred'#预测的bbox_deltas
bottom: 'im_info'#图片的高、宽和缩放比例

        scores = bottom[0].data[:, self._num_anchors:, :, :]
        bbox_deltas = bottom[1].data
        im_info = bottom[2].data[0, :]

output


输出:rpn_rois

proposal_target_layer

  1. 根据从proposal层得到的proposals,再加上gt_boxes,通过计算overlap和bbox regression目标等操作,得到labels、采样(过滤)后的rois以及bbox_targets等信息。

    
            # Sample rois with classification labels and bounding box regression
            # targets
            labels, rois, bbox_targets, bbox_inside_weights = _sample_rois(
                all_rois, gt_boxes, fg_rois_per_image,
                rois_per_image, self._num_classes)

input


bottom: 'rpn_rois'#rpn得到的所有区域
bottom: 'gt_boxes'#包括boxes、gt_classes

output


top: 'rois'#区域块(0,x1,y1,x2,y2),第一列是batch index
top: 'labels'#标签
top: 'bbox_targets'#宽度为4N,N是类别数(每个roi相对于与其overlap最大的gt计算出的target)
top: 'bbox_inside_weights'#前景roi在其类别对应的4个位置上为1,其余为0
top: 'bbox_outside_weights'#bbox_inside_weights大于0的位置为1,其余为0

RCNN

prototxt


layer {
  name: "roi_pool_conv5"
  type: "ROIPooling"
  bottom: "conv5"
  bottom: "rois"
  top: "roi_pool_conv5"
  roi_pooling_param {
    pooled_w: 6
    pooled_h: 6
    spatial_scale: 0.0625 # 1/16
  }
}
layer {
  name: "fc6"
  type: "InnerProduct"
  bottom: "roi_pool_conv5"
  top: "fc6"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  inner_product_param {
    num_output: 4096
  }
}
layer {
  name: "relu6"
  type: "ReLU"
  bottom: "fc6"
  top: "fc6"
}
layer {
  name: "drop6"
  type: "Dropout"
  bottom: "fc6"
  top: "fc6"
  dropout_param {
    dropout_ratio: 0.5
    scale_train: false
  }
}
layer {
  name: "fc7"
  type: "InnerProduct"
  bottom: "fc6"
  top: "fc7"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  inner_product_param {
    num_output: 4096
  }
}
layer {
  name: "relu7"
  type: "ReLU"
  bottom: "fc7"
  top: "fc7"
}
layer {
  name: "drop7"
  type: "Dropout"
  bottom: "fc7"
  top: "fc7"
  dropout_param {
    dropout_ratio: 0.5
    scale_train: false
  }
}
layer {
  name: "cls_score"
  type: "InnerProduct"
  bottom: "fc7"
  top: "cls_score"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  inner_product_param {
    num_output: 21
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layer {
  name: "bbox_pred"
  type: "InnerProduct"
  bottom: "fc7"
  top: "bbox_pred"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  inner_product_param {
    num_output: 84
    weight_filler {
      type: "gaussian"
      std: 0.001
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layer {
  name: "loss_cls"
  type: "SoftmaxWithLoss"
  bottom: "cls_score"
  bottom: "labels"
  propagate_down: 1
  propagate_down: 0
  top: "cls_loss"
  loss_weight: 1
  loss_param {
    ignore_label: -1
    normalize: true
  }
}
layer {
  name: "loss_bbox"
  type: "SmoothL1Loss"
  bottom: "bbox_pred"
  bottom: "bbox_targets"
  bottom: "bbox_inside_weights"
  bottom: "bbox_outside_weights"
  top: "bbox_loss"
  loss_weight: 1
}

RoIPooling

  1. 得到大小为num_rois×256×6×6的top_data(256是conv5的输出通道数,6×6由pooled_h、pooled_w决定)。

    
    template <typename Dtype>
    void ROIPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
          const vector<Blob<Dtype>*>& top) {
      const Dtype* bottom_data = bottom[0]->cpu_data();
      const Dtype* bottom_rois = bottom[1]->cpu_data();
      // Number of ROIs
      int num_rois = bottom[1]->num();
      int batch_size = bottom[0]->num();
      int top_count = top[0]->count();
      Dtype* top_data = top[0]->mutable_cpu_data();
      caffe_set(top_count, Dtype(-FLT_MAX), top_data);
      int* argmax_data = max_idx_.mutable_cpu_data();
      caffe_set(top_count, -1, argmax_data);
    
      // For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
      for (int n = 0; n < num_rois; ++n) {
        int roi_batch_ind = bottom_rois[0];
        int roi_start_w = round(bottom_rois[1] * spatial_scale_);
        int roi_start_h = round(bottom_rois[2] * spatial_scale_);
        int roi_end_w = round(bottom_rois[3] * spatial_scale_);
        int roi_end_h = round(bottom_rois[4] * spatial_scale_);
        CHECK_GE(roi_batch_ind, 0);
        CHECK_LT(roi_batch_ind, batch_size);
    
        int roi_height = max(roi_end_h - roi_start_h + 1, 1);
        int roi_width = max(roi_end_w - roi_start_w + 1, 1);
        const Dtype bin_size_h = static_cast<Dtype>(roi_height)
                                 / static_cast<Dtype>(pooled_height_);
        const Dtype bin_size_w = static_cast<Dtype>(roi_width)
                                 / static_cast<Dtype>(pooled_width_);
    
        const Dtype* batch_data = bottom_data + bottom[0]->offset(roi_batch_ind);
    
        for (int c = 0; c < channels_; ++c) {
          for (int ph = 0; ph < pooled_height_; ++ph) {
            for (int pw = 0; pw < pooled_width_; ++pw) {
              // Compute pooling region for this output unit:
              //  start (included) = floor(ph * roi_height / pooled_height_)
              //  end (excluded) = ceil((ph + 1) * roi_height / pooled_height_)
              int hstart = static_cast<int>(floor(static_cast<Dtype>(ph)
                                                  * bin_size_h));
              int wstart = static_cast<int>(floor(static_cast<Dtype>(pw)
                                                  * bin_size_w));
              int hend = static_cast<int>(ceil(static_cast<Dtype>(ph + 1)
                                               * bin_size_h));
              int wend = static_cast<int>(ceil(static_cast<Dtype>(pw + 1)
                                               * bin_size_w));
    
              hstart = min(max(hstart + roi_start_h, 0), height_);
              hend = min(max(hend + roi_start_h, 0), height_);
              wstart = min(max(wstart + roi_start_w, 0), width_);
              wend = min(max(wend + roi_start_w, 0), width_);
    
              bool is_empty = (hend <= hstart) || (wend <= wstart);
    
              const int pool_index = ph * pooled_width_ + pw;
              if (is_empty) {
                top_data[pool_index] = 0;
                argmax_data[pool_index] = -1;
              }
    
              for (int h = hstart; h < hend; ++h) {
                for (int w = wstart; w < wend; ++w) {
                  const int index = h * width_ + w;
                  if (batch_data[index] > top_data[pool_index]) {
                    top_data[pool_index] = batch_data[index];
                    argmax_data[pool_index] = index;
                  }
                }
              }
            }
          }
          // Increment all data pointers by one channel
          batch_data += bottom[0]->offset(0, 1);
          top_data += top[0]->offset(0, 1);
          argmax_data += max_idx_.offset(0, 1);
        }
        // Increment ROI data pointer
        bottom_rois += bottom[1]->offset(1);
      }
    }