#include "roi_pool_op.h"
|
|
|
|
#include <cfloat>
|
|
|
|
namespace caffe2 {

using std::max;
using std::min;

template <>
bool RoIPoolOp<float, CPUContext>::RunOnDevice() {
  const auto& X = Input(0); // Input data to pool
  const auto& R = Input(1); // RoIs
  auto* Y = Output(0); // RoI pooled data
  auto* A = is_test_ ? nullptr : Output(1); // argmaxes

  // Each ROI is of the form [batch_index x1 y1 x2 y2]
  CAFFE_ENFORCE_EQ(R.dim32(1), 5);

  // TODO: Handle the storage_order properly to support NHWC as well.
  int batch_size = X.dim32(0);
  int channels = X.dim32(1);
  int height = X.dim32(2);
  int width = X.dim32(3);
  int num_rois = R.dim32(0);

  Y->Resize(num_rois, channels, pooled_height_, pooled_width_);
  if (!is_test_) {
    A->Resize(Y->sizes());
  }

  const float* Xdata = X.data<float>();
  const float* rois = R.data<float>();
  float* Ydata = Y->template mutable_data<float>();
  int* argmax_data = is_test_ ? nullptr : A->template mutable_data<int>();
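  // argmax_data records, for each pooled output element, the flattened
  // (h * width + w) offset within its input channel; RoIPoolGradient uses
  // these indices to route the output gradient back to the matching
  // elements of X.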

  // For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
  for (int n = 0; n < num_rois; ++n) {
    int roi_batch_id = rois[0];
    int roi_start_w = round(rois[1] * spatial_scale_);
    int roi_start_h = round(rois[2] * spatial_scale_);
    int roi_end_w = round(rois[3] * spatial_scale_);
    int roi_end_h = round(rois[4] * spatial_scale_);
    CAFFE_ENFORCE_GE(roi_batch_id, 0);
    CAFFE_ENFORCE_LT(roi_batch_id, batch_size);

    // Force malformed ROIs to be 1x1
    int roi_height = max(roi_end_h - roi_start_h + 1, 1);
    int roi_width = max(roi_end_w - roi_start_w + 1, 1);
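    // For example, with spatial_scale_ = 0.0625 (features at 1/16 of the
    // input resolution) and an ROI of [0, 32, 32, 96, 96] in input pixels,
    // roi_start_w = round(32 * 0.0625) = 2, roi_end_w = round(96 * 0.0625) = 6,
    // so roi_width = 6 - 2 + 1 = 5 feature-map columns (likewise for height).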

    const float bin_size_h =
        static_cast<float>(roi_height) / static_cast<float>(pooled_height_);
    const float bin_size_w =
        static_cast<float>(roi_width) / static_cast<float>(pooled_width_);

    const float* batch_data = Xdata + roi_batch_id * X.size_from_dim(1);

    for (int c = 0; c < channels; ++c) {
      for (int ph = 0; ph < pooled_height_; ++ph) {
        for (int pw = 0; pw < pooled_width_; ++pw) {
          // Compute pooling region for this output unit:
          // start (included) = floor(ph * roi_height / pooled_height_)
          // end (excluded) = ceil((ph + 1) * roi_height / pooled_height_)
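          // For instance, with roi_height = 5 and pooled_height_ = 2,
          // bin_size_h = 2.5: bin ph = 0 covers rows [0, 3) and bin ph = 1
          // covers rows [2, 5), so adjacent bins may share a boundary row.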
          int hstart =
              static_cast<int>(floor(static_cast<float>(ph) * bin_size_h));
          int wstart =
              static_cast<int>(floor(static_cast<float>(pw) * bin_size_w));
          int hend =
              static_cast<int>(ceil(static_cast<float>(ph + 1) * bin_size_h));
          int wend =
              static_cast<int>(ceil(static_cast<float>(pw + 1) * bin_size_w));

          // Add roi offsets and clip to input boundaries
          hstart = min(max(hstart + roi_start_h, 0), height);
          hend = min(max(hend + roi_start_h, 0), height);
          wstart = min(max(wstart + roi_start_w, 0), width);
          wend = min(max(wend + roi_start_w, 0), width);

          const int pool_index = ph * pooled_width_ + pw;

          // Define an empty pooling region to be zero
          bool is_empty = (hend <= hstart) || (wend <= wstart);
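          // Seed non-empty bins with -FLT_MAX so the first in-range element
          // always wins the max comparison below; empty bins stay at 0.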
          Ydata[pool_index] = is_empty ? 0 : -FLT_MAX;
          if (!is_test_) {
            // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
            argmax_data[pool_index] = -1;
          }

          for (int h = hstart; h < hend; ++h) {
            for (int w = wstart; w < wend; ++w) {
              const int index = h * width + w;
              if (batch_data[index] > Ydata[pool_index]) {
                Ydata[pool_index] = batch_data[index];
                if (!is_test_) {
                  argmax_data[pool_index] = index;
                }
              }
            }
          }
        }
      }
      // Increment all data pointers by one channel
      batch_data += X.size_from_dim(2);
      Ydata += Y->size_from_dim(2);
      if (!is_test_) {
        argmax_data += A->size_from_dim(2);
      }
    }
    // Increment ROI data pointer
    rois += R.size_from_dim(1);
  }

  return true;
}

REGISTER_CPU_OPERATOR(RoIPool, RoIPoolOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(RoIPoolGradient, RoIPoolGradientOp<float, CPUContext>);

// Input: X, rois
// Output case #1: Y, argmaxes (train mode)
// Output case #2: Y (test mode)
OPERATOR_SCHEMA(RoIPool)
    .NumInputs(2)
    .NumOutputs({1, 2})
    .TensorInferenceFunction([](const OperatorDef& def,
                                const vector<TensorShape>& in) {
      ArgumentHelper helper(def);
      const StorageOrder order = StringToStorageOrder(
          helper.GetSingleArgument<string>("order", "NCHW"));
      const TensorShape& X = in[0];
      const int num_channels =
          (order == StorageOrder::NCHW ? X.dims(1) : X.dims(3));
      const TensorShape& R = in[1];
      const int num_rois = R.dims(0);
      const int pooled_height = helper.GetSingleArgument<int>("pooled_h", 1);
      const int pooled_width = helper.GetSingleArgument<int>("pooled_w", 1);
      TensorShape Y = CreateTensorShape(
          vector<int>({num_rois, num_channels, pooled_height, pooled_width}),
          X.data_type());

      bool is_test = helper.GetSingleArgument<int>(OpSchema::Arg_IsTest, 0);
      if (!is_test) {
        TensorShape argmaxes = Y;
        argmaxes.set_data_type(TensorProto_DataType_INT32);
        return vector<TensorShape>({Y, argmaxes});
      } else {
        return vector<TensorShape>({Y});
      }
    })
    .SetDoc(R"DOC(
Carries out ROI Pooling for Faster-RCNN.
Depending on the mode, there are multiple output cases:

  Output case #1: Y, argmaxes (train mode)
  Output case #2: Y (test mode)
)DOC")
    .Arg(
        "is_test",
        "If set, run in test mode and skip computation of argmaxes, which "
        "are only needed for gradient computation. Only one output tensor "
        "is produced. (Default: false).")
    .Arg("order", "A StorageOrder string (Default: \"NCHW\").")
    .Arg("pooled_h", "The pooled output height (Default: 1).")
    .Arg("pooled_w", "The pooled output width (Default: 1).")
    .Arg(
        "spatial_scale",
        "Multiplicative spatial scale factor to translate ROI coords from "
        "their input scale to the scale used when pooling (Default: 1.0).")
    .Input(
        0,
        "X",
        "The input 4-D tensor of data. Only NCHW order is currently supported.")
    .Input(
        1,
        "rois",
        "RoIs (Regions of Interest) to pool over. Should be a 2-D tensor of "
        "shape (num_rois, 5) given as [[batch_id, x1, y1, x2, y2], ...].")
    .Output(
        0,
        "Y",
        "RoI pooled output 4-D tensor of shape "
        "(num_rois, channels, pooled_h, pooled_w).")
    .Output(
        1,
        "argmaxes",
        "Argmaxes corresponding to indices in X used for gradient computation. "
        "Only output if arg \"is_test\" is false.");

// Input: X, rois, argmaxes, dY (aka "gradOutput")
// Output: dX (aka "gradInput")
OPERATOR_SCHEMA(RoIPoolGradient).NumInputs(4).NumOutputs(1);

class GetRoIPoolGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
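    // The gradient op takes the forward inputs X (I(0)) and rois (I(1)),
    // the forward argmaxes output (O(1)), and the gradient of Y (GO(0)),
    // and produces the gradient with respect to X (GI(0)).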
    return SingleGradientDef(
        "RoIPoolGradient",
        "",
        vector<string>{I(0), I(1), O(1), GO(0)},
        vector<string>{GI(0)});
  }
};

REGISTER_GRADIENT(RoIPool, GetRoIPoolGradient);

} // namespace caffe2