pytorch/caffe2/operators/local_response_normalization_op.cc

#include "caffe2/operators/local_response_normalization_op.h"
namespace caffe2 {
template <>
bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
// Note(Yangqing): this one is copied from my Caffe implementation.
auto& X = Input(0);
DCHECK_EQ(X.dim(), 4);
const int N = X.dim32(0);
const int C = X.dim32(1);
const int H = X.dim32(2);
const int W = X.dim32(3);
const int image_size = C * H * W;
const float* Xdata = X.data<float>();
auto* Y = Output(0, X.sizes(), at::dtype<float>());
float* Ydata = Y->template mutable_data<float>();
if (OutputSize() > 1) {
scale_ = Output(1);
} else {
if (!scale_) {
scale_ = &local_scale_tensor_;
}
}
scale_->ResizeLike(X);
float* scale_data = scale_->template mutable_data<float>();
math::Set<float, CPUContext>(X.numel(), bias_, scale_data, &context_);
Tensor padded_square(vector<int64_t>{C + size_ - 1, H, W}, CPU);
float* padded_square_data = padded_square.template mutable_data<float>();
math::Set<float, CPUContext>(
padded_square.numel(), 0., padded_square_data, &context_);
const float alpha_over_size = alpha_ / size_;
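  // The scale is computed with a sliding window over channels: seed the sum
  // for channel 0 by accumulating its size_ window slices explicitly, then
  // for each later channel reuse the previous channel's sum, adding the new
  // head slice and subtracting the old tail slice. This keeps the per-image
  // cost at O(C * H * W) rather than O(C * size * H * W).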
// go through the images
for (int n = 0; n < N; ++n) {
// compute the padded square
math::Sqr<float, CPUContext>(image_size, Xdata + image_size * n,
padded_square_data + pre_pad_ * H * W,
&context_);
// Create the first channel scale
for (int c = 0; c < size_; ++c) {
math::Axpy<float, float, CPUContext>(
H * W,
alpha_over_size,
padded_square_data + c * H * W,
scale_data + image_size * n,
&context_);
}
for (int c = 1; c < C; ++c) {
float* this_scale_slice = scale_data + n * image_size + c * H * W;
// copy previous scale
context_.CopyFromCPU<float>(
H * W, this_scale_slice - H * W, this_scale_slice);
// add head
math::Axpy<float, float, CPUContext>(
H * W,
alpha_over_size,
padded_square_data + (c + size_ - 1) * H * W,
this_scale_slice,
&context_);
// subtract tail
math::Axpy<float, float, CPUContext>(
H * W,
-alpha_over_size,
padded_square_data + (c - 1) * H * W,
this_scale_slice,
&context_);
}
}
math::Powx<float, CPUContext>(
X.numel(), scale_data, -beta_, Ydata, &context_);
math::Mul<float, CPUContext>(X.numel(), Ydata, Xdata, Ydata, &context_);
return true;
}

template <>
bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
// Note(Yangqing): This one is copied from my Decaf implementation. How many
// variants have I written...?
auto& X = Input(0);
DCHECK_EQ(X.dim(), 4);
const int N = X.dim32(0);
const int H = X.dim32(1);
const int W = X.dim32(2);
const int C = X.dim32(3);
const int num_rows = N * H * W;
const float* Xdata = X.data<float>();
auto* Y = Output(0, X.sizes(), at::dtype<float>());
float* Ydata = Y->template mutable_data<float>();
if (OutputSize() > 1) {
scale_ = Output(1);
} else {
if (!scale_) {
scale_ = &local_scale_tensor_;
}
}
scale_->ResizeLike(X);
float* scale_data = scale_->template mutable_data<float>();
Tensor padded_square(vector<int64_t>(1, C + size_ - 1), CPU);
float* padded_square_data = padded_square.template mutable_data<float>();
math::Set<float, CPUContext>(
padded_square.numel(), 0., padded_square_data, &context_);
const float alpha_over_size = alpha_ / size_;
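  // In NHWC the C channel values at each of the N * H * W spatial positions
  // are contiguous, so each row is handled independently: padded_square holds
  // (alpha / size) * x^2 with pre_pad_ zeros in front, and accum_scale keeps
  // a running window sum, adding the head element and dropping the tail
  // element as c advances.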
for (int n = 0; n < num_rows; ++n) {
for (int c = 0; c < C; ++c) {
padded_square_data[c + pre_pad_] =
Xdata[n * C + c] * Xdata[n * C + c] * alpha_over_size;
}
float accum_scale = 0.;
for (int i = 0; i < size_ - 1; ++i) {
accum_scale += padded_square_data[i];
}
for (int c = 0; c < C; ++c) {
accum_scale += padded_square_data[c + size_ - 1];
scale_data[n * C + c] = bias_ + accum_scale;
accum_scale -= padded_square_data[c];
}
}
math::Powx<float, CPUContext>(
X.numel(), scale_data, -beta_, Ydata, &context_);
math::Mul<float, CPUContext>(X.numel(), Ydata, Xdata, Ydata, &context_);
return true;
}
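
// A sketch of the chain rule behind the gradient kernels below: with
// y_i = x_i * s_i^(-beta) and s_i = bias + (alpha / n) * sum_{j in window(i)} x_j^2,
//   dL/dx_i = dy_i * s_i^(-beta)
//           - (2 * alpha * beta / n) * x_i * sum_{c : i in window(c)} dy_c * y_c / s_c.
// The code materializes dy * y / s into a padded ratio buffer and accumulates
// the mirrored window sum per channel; cache_ratio is the constant
// 2 * alpha * beta / n.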
template <>
bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
auto& X = Input(0);
auto& Y = Input(1);
auto& dY = Input(2);
DCHECK_EQ(X.dim(), 4);
const int N = X.dim32(0);
const int C = X.dim32(1);
const int H = X.dim32(2);
const int W = X.dim32(3);
const int image_size = C * H * W;
// Loosely checking the size, assuming that the shapes will be the same as
// long as the sizes check out.
DCHECK_EQ(X.numel(), Y.numel());
DCHECK_EQ(X.numel(), dY.numel());
auto* dX = Output(0, X.sizes(), at::dtype<float>());
const float* Xdata = X.data<float>();
const float* Ydata = Y.data<float>();
if (!scale_) {
scale_ = &local_scale_tensor_;
}
scale_->ResizeLike(X);
float* scale_data = scale_->template mutable_data<float>();
const float* dYdata = dY.data<float>();
float* dXdata = dX->template mutable_data<float>();
Tensor padded_ratio(vector<int64_t>{C + size_ - 1, H, W}, CPU);
float* padded_ratio_data = padded_ratio.template mutable_data<float>();
// Compute the scale values (copied from LRNOp), reusing padded_ratio as scratch
math::Set<float, CPUContext>(X.numel(), bias_, scale_data, &context_);
math::Set<float, CPUContext>(
padded_ratio.numel(), 0., padded_ratio_data, &context_);
const float alpha_over_size = alpha_ / size_;
// go through the images
for (int n = 0; n < N; ++n) {
// compute the padded square
math::Sqr<float, CPUContext>(image_size, Xdata + image_size * n,
padded_ratio_data + pre_pad_ * H * W,
&context_);
// Create the first channel scale
for (int c = 0; c < size_; ++c) {
math::Axpy<float, float, CPUContext>(
H * W,
alpha_over_size,
padded_ratio_data + c * H * W,
scale_data + image_size * n,
&context_);
}
for (int c = 1; c < C; ++c) {
float* this_scale_slice = scale_data + n * image_size + c * H * W;
// copy previous scale
context_.CopyFromCPU<float>(
H * W, this_scale_slice - H * W, this_scale_slice);
// add head
math::Axpy<float, float, CPUContext>(
H * W,
alpha_over_size,
padded_ratio_data + (c + size_ - 1) * H * W,
this_scale_slice,
&context_);
// subtract tail
math::Axpy<float, float, CPUContext>(
H * W,
-alpha_over_size,
padded_ratio_data + (c - 1) * H * W,
this_scale_slice,
&context_);
}
}
math::Set<float, CPUContext>(
padded_ratio.numel(), 0., padded_ratio_data, &context_);
Tensor accum_ratio(vector<int64_t>{H, W}, CPU);
float* accum_ratio_data = accum_ratio.template mutable_data<float>();
const float cache_ratio = 2. * alpha_ * beta_ / size_;
const int inverse_pre_pad = size_ - (size_ + 1) / 2;
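  // The backward window is the forward window mirrored: channel i contributes
  // to the scales of channels c with c - pre_pad_ <= i <= c + size_ - 1 - pre_pad_,
  // so the ratio buffer is padded in front by inverse_pre_pad slices, where
  // size_ - (size_ + 1) / 2 == size_ - 1 - pre_pad_ under integer division.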
int offset = 0;
for (int n = 0; n < N; ++n) {
// first, compute diff_i * y_i / s_i
math::Mul<float, CPUContext>(
image_size, dYdata + offset, Ydata + offset,
padded_ratio_data + inverse_pre_pad * H * W, &context_);
math::Div<float, CPUContext>(
image_size, padded_ratio_data + inverse_pre_pad * H * W,
scale_data + offset,
padded_ratio_data + inverse_pre_pad * H * W, &context_);
// Now, compute the accumulated ratios and the bottom diff
math::Set<float, CPUContext>(
accum_ratio.numel(), 0., accum_ratio_data, &context_);
for (int c = 0; c < size_ - 1; ++c) {
math::Axpy<float, float, CPUContext>(
H * W, 1, padded_ratio_data + c * H * W, accum_ratio_data, &context_);
}
for (int c = 0; c < C; ++c) {
for (int hw = 0; hw < H * W; ++hw) {
accum_ratio_data[hw] += padded_ratio_data[(c + size_ - 1) * H * W + hw];
dXdata[offset] =
dYdata[offset] * pow(scale_data[offset], -beta_) -
cache_ratio * accum_ratio_data[hw] * Xdata[offset];
accum_ratio_data[hw] -= padded_ratio_data[c * H * W + hw];
++offset;
}
}
}
return true;
}

template <>
bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
auto& X = Input(0);
auto& Y = Input(1);
auto& dY = Input(2);
DCHECK_EQ(X.dim(), 4);
const int N = X.dim32(0);
const int H = X.dim32(1);
const int W = X.dim32(2);
const int C = X.dim32(3);
const int num_rows = N * H * W;
const float* Xdata = X.data<float>();
// Loosely checking the size, assuming that the shapes will be the same as
// long as the sizes check out.
DCHECK_EQ(X.numel(), Y.numel());
DCHECK_EQ(X.numel(), dY.numel());
auto* dX = Output(0, X.sizes(), at::dtype<float>());
if (!scale_) {
scale_ = &local_scale_tensor_;
}
scale_->ResizeLike(X);
Tensor padded_ratio(vector<int64_t>(1, C + size_ - 1), CPU);
float* padded_ratio_data = padded_ratio.template mutable_data<float>();
float* scale_data = scale_->template mutable_data<float>();
// Compute the scale values (copied from LRNOp), reusing padded_ratio as scratch
math::Set<float, CPUContext>(X.numel(), bias_, scale_data, &context_);
math::Set<float, CPUContext>(
padded_ratio.numel(), 0., padded_ratio_data, &context_);
const float alpha_over_size = alpha_ / size_;
for (int n = 0; n < num_rows; ++n) {
for (int c = 0; c < C; ++c) {
padded_ratio_data[c + pre_pad_] =
Xdata[n * C + c] * Xdata[n * C + c] * alpha_over_size;
}
float accum_scale = 0.;
for (int i = 0; i < size_ - 1; ++i) {
accum_scale += padded_ratio_data[i];
}
for (int c = 0; c < C; ++c) {
accum_scale += padded_ratio_data[c + size_ - 1];
scale_data[n * C + c] = bias_ + accum_scale;
accum_scale -= padded_ratio_data[c];
}
}
math::Set<float, CPUContext>(
padded_ratio.numel(), 0., padded_ratio_data, &context_);
// the ratio 2*alpha*beta/size
const float cache_ratio = 2. * alpha_ * beta_ / size_;
const float* Ydata = Y.data<float>();
const float* dYdata = dY.data<float>();
float* dXdata = dX->template mutable_data<float>();
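  // Same formula as the NCHW gradient above, applied row by row: padded_ratio
  // holds dy * y / s for one row of C channels, and accum_ratio is the running
  // window sum.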
for (int n = 0; n < num_rows; ++n) {
const int offset = n * C;
for (int c = 0; c < C; ++c) {
padded_ratio_data[c + pre_pad_] =
Ydata[offset + c] * dYdata[offset + c] / scale_data[offset + c];
}
float accum_ratio = 0.;
for (int c = 0; c < size_ - 1; ++c) {
accum_ratio += padded_ratio_data[c];
}
for (int c = 0; c < C; ++c) {
accum_ratio += padded_ratio_data[c + size_ - 1];
dXdata[offset + c] =
dYdata[offset + c] * pow(scale_data[offset + c], -beta_) -
cache_ratio * Xdata[offset + c] * accum_ratio;
accum_ratio -= padded_ratio_data[c];
}
}
return true;
}

REGISTER_CPU_OPERATOR(LRN, LRNOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(LRNGradient, LRNGradientOp<float, CPUContext>);
OPERATOR_SCHEMA(LRN)
.NumInputs(1)
.NumOutputs(1, 2)
.SetDoc(R"DOC(
`LRN` applies Local Response Normalization to an input blob. This operation performs
a kind of "lateral inhibition" by normalizing over local input regions, where
normalization is applied across channels. This operator is typically used to
normalize the output of an unbounded activation function (such as ReLU). The output shape is the same as
the input shape. The `brew` module has a wrapper for this operator for use in a
`ModelHelper` object.
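For example, a minimal sketch of the `brew` wrapper (keyword arguments such as
`size`, `alpha`, `beta`, and `bias` are forwarded to this operator; the exact
signature may differ across versions, and `"conv1"` is a hypothetical input blob):
```
from caffe2.python import brew, model_helper

model = model_helper.ModelHelper(name="lrn_example")
# Adds an LRN op normalizing "conv1" into "norm1".
brew.lrn(model, "conv1", "norm1", size=5, alpha=1e-4, beta=0.75, bias=1.0, order="NCHW")
```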
The formula for LRN is as follows:
$$b_{c} = a_{c}\left(\text{bias} + \frac{\alpha}{n}\sum_{c'=\max(0,c-n/2)}^{\min(N-1,c+n/2)} a_{c'}^2 \right)^{-\beta}$$

where $a_{c}$ is the input value at channel $c$, $n$ is the `size` argument, and $N$ is the number of channels.
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/local_response_normalization_op.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/local_response_normalization_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
from caffe2.python import core, workspace
import numpy as np

workspace.ResetWorkspace()
op = core.CreateOperator("LRN",
["X"],
["Y", "Y_scale"],
size=11,
alpha=0.001,
beta=0.5,
bias=2.0,
order="NHWC"
)
workspace.FeedBlob("X", np.random.randn(1, 6, 6, 1).astype(np.float32))  # NHWC
print("X:\n", workspace.FetchBlob("X"), "\n")
workspace.RunOperatorOnce(op)
print("Y:\n", workspace.FetchBlob("Y"))
print("Y_scale:\n", workspace.FetchBlob("Y_scale"))
```
**Result**
```
X:
[[[[ 0.72985137]
[-0.3753357 ]
[ 2.7344604 ]
[-0.5937792 ]
[ 0.38440478]
[-2.1659644 ]]
[[-0.92846817]
[-0.9996144 ]
[ 0.212943 ]
[-1.968045 ]
[-0.77839696]
[ 0.45492038]]
[[-0.11263168]
[ 1.9901097 ]
[ 0.19275683]
[ 0.15630436]
[ 0.7536298 ]
[-0.77339894]]
[[ 0.8353551 ]
[-0.7784452 ]
[ 1.779317 ]
[ 0.22421335]
[ 1.3846219 ]
[-3.0546608 ]]
[[ 0.09977621]
[ 2.2071757 ]
[ 0.79971045]
[ 3.563886 ]
[-0.7169287 ]
[ 0.77170426]]
[[-1.4296649 ]
[ 0.19181213]
[ 0.45961624]
[-1.0201577 ]
[ 0.62854475]
[-0.6395456 ]]]]
Y:
[[[[ 0.5160766 ]
[-0.26540157]
[ 1.9332271 ]
[-0.41986194]
[ 0.27181432]
[-1.5314047 ]]
[[-0.6565133 ]
[-0.7068181 ]
[ 0.15057328]
[-1.3914955 ]
[-0.5504022 ]
[ 0.32167578]]
[[-0.0796426 ]
[ 1.4070934 ]
[ 0.13629955]
[ 0.11052381]
[ 0.53288984]
[-0.5468682 ]]
[[ 0.5906759 ]
[-0.5504363 ]
[ 1.2580767 ]
[ 0.1585426 ]
[ 0.9790328 ]
[-2.1595135 ]]
[[ 0.07055242]
[ 1.5605361 ]
[ 0.5654725 ]
[ 2.5193207 ]
[-0.50693923]
[ 0.54567 ]]
[[-1.0108787 ]
[ 0.13563155]
[ 0.3249962 ]
[-0.72134334]
[ 0.44444424]
[-0.45222285]]]]
Y_scale:
[[[[2.0000484]
[2.0000129]
[2.0006797]
[2.000032 ]
[2.0000134]
[2.0004265]]
[[2.0000784]
[2.0000908]
[2.000004 ]
[2.0003521]
[2.000055 ]
[2.0000188]]
[[2.0000012]
[2.00036 ]
[2.0000033]
[2.0000021]
[2.0000517]
[2.0000544]]
[[2.0000634]
[2.000055 ]
[2.0002878]
[2.0000045]
[2.0001743]
[2.0008483]]
[[2.000001 ]
[2.000443 ]
[2.0000582]
[2.0011547]
[2.0000467]
[2.0000541]]
[[2.0001857]
[2.0000033]
[2.0000193]
[2.0000947]
[2.000036 ]
[2.0000372]]]]
```
</details>
)DOC")
.Arg(
"size",
"*(type: int; default: 0)* Amount of neighboring channels to sum over for normalization")
.Arg(
"alpha",
"*(type: float; default: 0)* Multiplicative (scaling) factor.")
.Arg("beta", "*(type: float; default: 0)* Exponent.")
.Arg("bias", "*(type: float; default: 1.0)* Additive factor.")
.Arg("order", "*(type: float; default: 'NCHW')* Order of blob dimensions.")
.Input(0, "X", "*(type: Tensor`<float>`)* Input data tensor (ReLU output).")
.Output(0, "Y", "*(type: Tensor`<float>`)* Output tensor.")
.Output(1, "Y_scale", "*(type: Tensor`<float>`)* Output scale.")
.InheritOnnxSchema();
OPERATOR_SCHEMA(LRNGradient).NumInputs(3).NumOutputs(1);
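// LRNGradient consumes the forward input X (I(0)), the forward output Y
// (O(0)), and the output gradient dY (GO(0)), and produces the input
// gradient dX (GI(0)).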
class GetLRNGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
return SingleGradientDef(
"LRNGradient", "",
vector<string>{I(0), O(0), GO(0)},
vector<string>{GI(0)});
}
};
REGISTER_GRADIENT(LRN, GetLRNGradient);
} // namespace caffe2