pytorch/modules/detectron/batch_permutation_op.cu

/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "batch_permutation_op.h"
#include "caffe2/core/context_gpu.h"

namespace caffe2 {

namespace {
template <bool forward>
__global__ void BatchPermutationKernel(
    int N,
    int C,
    int H,
    int W,
    const float* src,
    const int* indices,
    float* dst) {
  CUDA_1D_KERNEL_LOOP(index, N * C * H * W) {
    int w = index % W;
    int h = (index / W) % H;
    int c = (index / W / H) % C;
    int n = (index / W / H / C);
    int idx = indices[n];
    if (forward) {
      dst[n * C * H * W + c * H * W + h * W + w] =
          src[idx * C * H * W + c * H * W + h * W + w];
    } else {
      dst[idx * C * H * W + c * H * W + h * W + w] =
          src[n * C * H * W + c * H * W + h * W + w];
    }
  }
}
}

template <>
bool BatchPermutationOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& indices = Input(1);
  auto* Y = Output(0);

  CAFFE_ENFORCE(indices.ndim() == 1, "indices must be 1-d");
  CAFFE_ENFORCE(
      X.dim32(0) == indices.dim32(0),
      "X.dim32(0) must be equal to indices.dim32(0)",
      "(",
      X.dim32(0),
      " vs. ",
      indices.dim32(0),
      ")");

  Y->ResizeLike(X);

  BatchPermutationKernel<true><<<
      CAFFE_GET_BLOCKS(X.size()),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      X.dim32(0),
      X.dim32(1),
      X.dim32(2),
      X.dim32(3),
      X.data<float>(),
      indices.data<int>(),
      Y->mutable_data<float>());

  return true;
}

template <>
bool BatchPermutationGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& indices = Input(0);
  auto& dY = Input(1);
  auto* dX = Output(0);
  dX->ResizeLike(dY);

  BatchPermutationKernel<false><<<
      CAFFE_GET_BLOCKS(dY.size()),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      dY.dim32(0),
      dY.dim32(1),
      dY.dim32(2),
      dY.dim32(3),
      dY.data<float>(),
      indices.data<int>(),
      dX->mutable_data<float>());

  return true;
}

REGISTER_CUDA_OPERATOR(
    BatchPermutation,
    BatchPermutationOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    BatchPermutationGradient,
    BatchPermutationGradientOp<float, CUDAContext>);
} // namespace caffe2