mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-14 20:57:59 +00:00
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/18316 Move math::Axpy function to elementwise lib i-am-not-moving-c2-to-c10 Reviewed By: houseroad Differential Revision: D14574697 fbshipit-source-id: 7cfbb2da295c8966c5328bd6b577cce2638eea62
529 lines
15 KiB
C++
529 lines
15 KiB
C++
#include "caffe2/operators/local_response_normalization_op.h"
|
|
|
|
namespace caffe2 {
|
|
|
|
template<>
|
|
bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
|
|
// Note(Yangqing): this one is copied from my Caffe implementation.
|
|
auto& X = Input(0);
|
|
|
|
DCHECK_EQ(X.dim(), 4);
|
|
const int N = X.dim32(0);
|
|
const int C = X.dim32(1);
|
|
const int H = X.dim32(2);
|
|
const int W = X.dim32(3);
|
|
const int image_size = C * H * W;
|
|
const float* Xdata = X.data<float>();
|
|
auto* Y = Output(0, X.sizes(), at::dtype<float>());
|
|
float* Ydata = Y->template mutable_data<float>();
|
|
|
|
if (OutputSize() > 1) {
|
|
scale_ = Output(1);
|
|
} else {
|
|
if (!scale_) {
|
|
scale_ = &local_scale_tensor_;
|
|
}
|
|
}
|
|
scale_->ResizeLike(X);
|
|
float* scale_data = scale_->template mutable_data<float>();
|
|
math::Set<float, CPUContext>(X.numel(), bias_, scale_data, &context_);
|
|
Tensor padded_square(vector<int64_t>{C + size_ - 1, H, W}, CPU);
|
|
float* padded_square_data = padded_square.template mutable_data<float>();
|
|
math::Set<float, CPUContext>(
|
|
padded_square.numel(), 0., padded_square_data, &context_);
|
|
const float alpha_over_size = alpha_ / size_;
|
|
// go through the images
|
|
for (int n = 0; n < N; ++n) {
|
|
// compute the padded square
|
|
math::Sqr<float, CPUContext>(image_size, Xdata + image_size * n,
|
|
padded_square_data + pre_pad_ * H * W,
|
|
&context_);
|
|
// Create the first channel scale
|
|
for (int c = 0; c < size_; ++c) {
|
|
math::Axpy<float, float, CPUContext>(
|
|
H * W,
|
|
alpha_over_size,
|
|
padded_square_data + c * H * W,
|
|
scale_data + image_size * n,
|
|
&context_);
|
|
}
|
|
for (int c = 1; c < C; ++c) {
|
|
float* this_scale_slice = scale_data + n * image_size + c * H * W;
|
|
// copy previous scale
|
|
context_.CopyFromCPU<float>(
|
|
H * W, this_scale_slice - H * W, this_scale_slice);
|
|
// add head
|
|
math::Axpy<float, float, CPUContext>(
|
|
H * W,
|
|
alpha_over_size,
|
|
padded_square_data + (c + size_ - 1) * H * W,
|
|
this_scale_slice,
|
|
&context_);
|
|
// subtract tail
|
|
math::Axpy<float, float, CPUContext>(
|
|
H * W,
|
|
-alpha_over_size,
|
|
padded_square_data + (c - 1) * H * W,
|
|
this_scale_slice,
|
|
&context_);
|
|
}
|
|
}
|
|
math::Powx<float, CPUContext>(
|
|
X.numel(), scale_data, -beta_, Ydata, &context_);
|
|
math::Mul<float, CPUContext>(X.numel(), Ydata, Xdata, Ydata, &context_);
|
|
return true;
|
|
}
|
|
|
|
template<>
|
|
bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
|
|
// Note(Yangqing): This one is copied from my Decaf implementation. How many
|
|
// variants have I written...?
|
|
auto& X = Input(0);
|
|
|
|
DCHECK_EQ(X.dim(), 4);
|
|
const int N = X.dim32(0);
|
|
const int H = X.dim32(1);
|
|
const int W = X.dim32(2);
|
|
const int C = X.dim32(3);
|
|
const int num_rows = N * H * W;
|
|
const float* Xdata = X.data<float>();
|
|
auto* Y = Output(0, X.sizes(), at::dtype<float>());
|
|
float* Ydata = Y->template mutable_data<float>();
|
|
|
|
if (OutputSize() > 1) {
|
|
scale_ = Output(1);
|
|
} else {
|
|
if (!scale_) {
|
|
scale_ = &local_scale_tensor_;
|
|
}
|
|
}
|
|
scale_->ResizeLike(X);
|
|
float* scale_data = scale_->template mutable_data<float>();
|
|
|
|
Tensor padded_square(vector<int64_t>(1, C + size_ - 1), CPU);
|
|
float* padded_square_data = padded_square.template mutable_data<float>();
|
|
math::Set<float, CPUContext>(
|
|
padded_square.numel(), 0., padded_square_data, &context_);
|
|
const float alpha_over_size = alpha_ / size_;
|
|
|
|
for (int n = 0; n < num_rows; ++n) {
|
|
for (int c = 0; c < C; ++c) {
|
|
padded_square_data[c + pre_pad_] =
|
|
Xdata[n * C + c] * Xdata[n * C + c] * alpha_over_size;
|
|
}
|
|
float accum_scale = 0.;
|
|
for (int i = 0; i < size_ - 1; ++i) {
|
|
accum_scale += padded_square_data[i];
|
|
}
|
|
for (int c = 0; c < C; ++c) {
|
|
accum_scale += padded_square_data[c + size_ - 1];
|
|
scale_data[n * C + c] = bias_ + accum_scale;
|
|
accum_scale -= padded_square_data[c];
|
|
}
|
|
}
|
|
math::Powx<float, CPUContext>(
|
|
X.numel(), scale_data, -beta_, Ydata, &context_);
|
|
math::Mul<float, CPUContext>(X.numel(), Ydata, Xdata, Ydata, &context_);
|
|
return true;
|
|
}
|
|
|
|
template <>
|
|
bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
|
|
auto& X = Input(0);
|
|
auto& Y = Input(1);
|
|
auto& dY = Input(2);
|
|
|
|
DCHECK_EQ(X.dim(), 4);
|
|
const int N = X.dim32(0);
|
|
const int C = X.dim32(1);
|
|
const int H = X.dim32(2);
|
|
const int W = X.dim32(3);
|
|
const int image_size = C * H * W;
|
|
// Loosely checking the size, assuming that the shapes will be the same as
|
|
// long as the sizes check out.
|
|
DCHECK_EQ(X.numel(), Y.numel());
|
|
DCHECK_EQ(X.numel(), dY.numel());
|
|
auto* dX = Output(0, X.sizes(), at::dtype<float>());
|
|
|
|
const float* Xdata = X.data<float>();
|
|
const float* Ydata = Y.data<float>();
|
|
if (!scale_) {
|
|
scale_ = &local_scale_tensor_;
|
|
}
|
|
scale_->ResizeLike(X);
|
|
float* scale_data = scale_->template mutable_data<float>();
|
|
const float* dYdata = dY.data<float>();
|
|
float* dXdata = dX->template mutable_data<float>();
|
|
|
|
Tensor padded_ratio(vector<int64_t>{C + size_ - 1, H, W}, CPU);
|
|
float* padded_ratio_data = padded_ratio.template mutable_data<float>();
|
|
// Compute scale(copied from LRNOp) - reusing padded_ratio
|
|
math::Set<float, CPUContext>(X.numel(), bias_, scale_data, &context_);
|
|
math::Set<float, CPUContext>(
|
|
padded_ratio.numel(), 0., padded_ratio_data, &context_);
|
|
const float alpha_over_size = alpha_ / size_;
|
|
// go through the images
|
|
for (int n = 0; n < N; ++n) {
|
|
// compute the padded square
|
|
math::Sqr<float, CPUContext>(image_size, Xdata + image_size * n,
|
|
padded_ratio_data + pre_pad_ * H * W,
|
|
&context_);
|
|
// Create the first channel scale
|
|
for (int c = 0; c < size_; ++c) {
|
|
math::Axpy<float, float, CPUContext>(
|
|
H * W,
|
|
alpha_over_size,
|
|
padded_ratio_data + c * H * W,
|
|
scale_data + image_size * n,
|
|
&context_);
|
|
}
|
|
for (int c = 1; c < C; ++c) {
|
|
float* this_scale_slice = scale_data + n * image_size + c * H * W;
|
|
// copy previous scale
|
|
context_.CopyFromCPU<float>(
|
|
H * W, this_scale_slice - H * W, this_scale_slice);
|
|
// add head
|
|
math::Axpy<float, float, CPUContext>(
|
|
H * W,
|
|
alpha_over_size,
|
|
padded_ratio_data + (c + size_ - 1) * H * W,
|
|
this_scale_slice,
|
|
&context_);
|
|
// subtract tail
|
|
math::Axpy<float, float, CPUContext>(
|
|
H * W,
|
|
-alpha_over_size,
|
|
padded_ratio_data + (c - 1) * H * W,
|
|
this_scale_slice,
|
|
&context_);
|
|
}
|
|
}
|
|
|
|
math::Set<float, CPUContext>(
|
|
padded_ratio.numel(), 0., padded_ratio_data, &context_);
|
|
Tensor accum_ratio(vector<int64_t>{H, W}, CPU);
|
|
float* accum_ratio_data = accum_ratio.template mutable_data<float>();
|
|
|
|
const float cache_ratio = 2. * alpha_ * beta_ / size_;
|
|
const int inverse_pre_pad = size_ - (size_ + 1) / 2;
|
|
|
|
int offset = 0;
|
|
for (int n = 0; n < N; ++n) {
|
|
// first, compute diff_i * y_i / s_i
|
|
math::Mul<float, CPUContext>(
|
|
image_size, dYdata + offset, Ydata + offset,
|
|
padded_ratio_data + inverse_pre_pad * H * W, &context_);
|
|
math::Div<float, CPUContext>(
|
|
image_size, padded_ratio_data + inverse_pre_pad * H * W,
|
|
scale_data + offset,
|
|
padded_ratio_data + inverse_pre_pad * H * W, &context_);
|
|
// Now, compute the accumulated ratios and the bottom diff
|
|
math::Set<float, CPUContext>(
|
|
accum_ratio.numel(), 0., accum_ratio_data, &context_);
|
|
for (int c = 0; c < size_ - 1; ++c) {
|
|
math::Axpy<float, float, CPUContext>(
|
|
H * W, 1, padded_ratio_data + c * H * W, accum_ratio_data, &context_);
|
|
}
|
|
for (int c = 0; c < C; ++c) {
|
|
for (int hw = 0; hw < H * W; ++hw) {
|
|
accum_ratio_data[hw] += padded_ratio_data[(c + size_ - 1) * H * W + hw];
|
|
dXdata[offset] =
|
|
dYdata[offset] * pow(scale_data[offset], -beta_) -
|
|
cache_ratio * accum_ratio_data[hw] * Xdata[offset];
|
|
accum_ratio_data[hw] -= padded_ratio_data[c * H * W + hw];
|
|
++offset;
|
|
}
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
template <>
|
|
bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
|
|
auto& X = Input(0);
|
|
auto& Y = Input(1);
|
|
auto& dY = Input(2);
|
|
|
|
DCHECK_EQ(X.dim(), 4);
|
|
const int N = X.dim32(0);
|
|
const int H = X.dim32(1);
|
|
const int W = X.dim32(2);
|
|
const int C = X.dim32(3);
|
|
const int num_rows = N * H * W;
|
|
const float* Xdata = X.data<float>();
|
|
// Loosely checking the size, assuming that the shapes will be the same as
|
|
// long as the sizes check out.
|
|
DCHECK_EQ(X.numel(), Y.numel());
|
|
DCHECK_EQ(X.numel(), dY.numel());
|
|
auto* dX = Output(0, X.sizes(), at::dtype<float>());
|
|
if (!scale_) {
|
|
scale_ = &local_scale_tensor_;
|
|
}
|
|
scale_->ResizeLike(X);
|
|
Tensor padded_ratio(vector<int64_t>(1, C + size_ - 1), CPU);
|
|
float* padded_ratio_data = padded_ratio.template mutable_data<float>();
|
|
float* scale_data = scale_->template mutable_data<float>();
|
|
// Compute scale(copied from LRNOp) - reusing padded_ratio
|
|
math::Set<float, CPUContext>(X.numel(), bias_, scale_data, &context_);
|
|
math::Set<float, CPUContext>(
|
|
padded_ratio.numel(), 0., padded_ratio_data, &context_);
|
|
const float alpha_over_size = alpha_ / size_;
|
|
|
|
for (int n = 0; n < num_rows; ++n) {
|
|
for (int c = 0; c < C; ++c) {
|
|
padded_ratio_data[c + pre_pad_] =
|
|
Xdata[n * C + c] * Xdata[n * C + c] * alpha_over_size;
|
|
}
|
|
float accum_scale = 0.;
|
|
for (int i = 0; i < size_ - 1; ++i) {
|
|
accum_scale += padded_ratio_data[i];
|
|
}
|
|
for (int c = 0; c < C; ++c) {
|
|
accum_scale += padded_ratio_data[c + size_ - 1];
|
|
scale_data[n * C + c] = bias_ + accum_scale;
|
|
accum_scale -= padded_ratio_data[c];
|
|
}
|
|
}
|
|
|
|
math::Set<float, CPUContext>(
|
|
padded_ratio.numel(), 0., padded_ratio_data, &context_);
|
|
// the ratio 2*alpha*beta/size
|
|
const float cache_ratio = 2. * alpha_ * beta_ / size_;
|
|
const float* Ydata = Y.data<float>();
|
|
|
|
const float* dYdata = dY.data<float>();
|
|
float* dXdata = dX->template mutable_data<float>();
|
|
for (int n = 0; n < num_rows; ++n) {
|
|
const int offset = n * C;
|
|
for (int c = 0; c < C; ++c) {
|
|
padded_ratio_data[c + pre_pad_] =
|
|
Ydata[offset + c] * dYdata[offset + c] / scale_data[offset + c];
|
|
}
|
|
float accum_ratio = 0.;
|
|
for (int c = 0; c < size_ - 1; ++c) {
|
|
accum_ratio += padded_ratio_data[c];
|
|
}
|
|
for (int c = 0; c < C; ++c) {
|
|
accum_ratio += padded_ratio_data[c + size_ - 1];
|
|
dXdata[offset + c] =
|
|
dYdata[offset + c] * pow(scale_data[offset + c], -beta_) -
|
|
cache_ratio * Xdata[offset + c] * accum_ratio;
|
|
accum_ratio -= padded_ratio_data[c];
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
REGISTER_CPU_OPERATOR(LRN, LRNOp<float, CPUContext>);
|
|
REGISTER_CPU_OPERATOR(LRNGradient, LRNGradientOp<float, CPUContext>);
|
|
|
|
OPERATOR_SCHEMA(LRN)
|
|
.NumInputs(1)
|
|
.NumOutputs(1, 2)
|
|
.SetDoc(R"DOC(
|
|
|
|
`LRN` applies Local Response Normalization to an input blob. This operation performs
|
|
a kind of "lateral inhibition" by normalizing over local input regions, where
|
|
normalization is applied across channels. This operator is typically used to
|
|
normalize an unbounded activation (such as ReLU). The output shape is the same as
|
|
the input shape. The `brew` module has a wrapper for this operator for use in a
|
|
`ModelHelper` object.
|
|
|
|
The formula for LRN is as follows:
|
|
|
|
$$b_{c} = a_{c}(bias + \frac{\alpha}{n}\sum_{c'=max(0,c-n/2)}^{min(N-1,c+n/2)} a_{c'}^2 )^{-\beta}$$
|
|
|
|
|
|
Github Links:
|
|
|
|
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/local_response_normalization_op.h
|
|
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/local_response_normalization_op.cc
|
|
|
|
|
|
<details>
|
|
|
|
<summary> <b>Example</b> </summary>
|
|
|
|
**Code**
|
|
|
|
```
|
|
workspace.ResetWorkspace()
|
|
|
|
op = core.CreateOperator("LRN",
|
|
["X"],
|
|
["Y", "Y_scale"],
|
|
size=11,
|
|
alpha=0.001,
|
|
beta=0.5,
|
|
bias=2.0,
|
|
order="NHWC"
|
|
)
|
|
|
|
workspace.FeedBlob("X", np.random.randn(1, 6, 6, 1).astype(np.float32)) // NCHW
|
|
print("X:\n", workspace.FetchBlob("X"), "\n")
|
|
workspace.RunOperatorOnce(op)
|
|
print("Y:\n", workspace.FetchBlob("Y"))
|
|
print("Y_scale:\n", workspace.FetchBlob("Y_scale"))
|
|
```
|
|
|
|
**Result**
|
|
|
|
```
|
|
X:
|
|
[[[[ 0.72985137]
|
|
[-0.3753357 ]
|
|
[ 2.7344604 ]
|
|
[-0.5937792 ]
|
|
[ 0.38440478]
|
|
[-2.1659644 ]]
|
|
|
|
[[-0.92846817]
|
|
[-0.9996144 ]
|
|
[ 0.212943 ]
|
|
[-1.968045 ]
|
|
[-0.77839696]
|
|
[ 0.45492038]]
|
|
|
|
[[-0.11263168]
|
|
[ 1.9901097 ]
|
|
[ 0.19275683]
|
|
[ 0.15630436]
|
|
[ 0.7536298 ]
|
|
[-0.77339894]]
|
|
|
|
[[ 0.8353551 ]
|
|
[-0.7784452 ]
|
|
[ 1.779317 ]
|
|
[ 0.22421335]
|
|
[ 1.3846219 ]
|
|
[-3.0546608 ]]
|
|
|
|
[[ 0.09977621]
|
|
[ 2.2071757 ]
|
|
[ 0.79971045]
|
|
[ 3.563886 ]
|
|
[-0.7169287 ]
|
|
[ 0.77170426]]
|
|
|
|
[[-1.4296649 ]
|
|
[ 0.19181213]
|
|
[ 0.45961624]
|
|
[-1.0201577 ]
|
|
[ 0.62854475]
|
|
[-0.6395456 ]]]]
|
|
|
|
Y:
|
|
[[[[ 0.5160766 ]
|
|
[-0.26540157]
|
|
[ 1.9332271 ]
|
|
[-0.41986194]
|
|
[ 0.27181432]
|
|
[-1.5314047 ]]
|
|
|
|
[[-0.6565133 ]
|
|
[-0.7068181 ]
|
|
[ 0.15057328]
|
|
[-1.3914955 ]
|
|
[-0.5504022 ]
|
|
[ 0.32167578]]
|
|
|
|
[[-0.0796426 ]
|
|
[ 1.4070934 ]
|
|
[ 0.13629955]
|
|
[ 0.11052381]
|
|
[ 0.53288984]
|
|
[-0.5468682 ]]
|
|
|
|
[[ 0.5906759 ]
|
|
[-0.5504363 ]
|
|
[ 1.2580767 ]
|
|
[ 0.1585426 ]
|
|
[ 0.9790328 ]
|
|
[-2.1595135 ]]
|
|
|
|
[[ 0.07055242]
|
|
[ 1.5605361 ]
|
|
[ 0.5654725 ]
|
|
[ 2.5193207 ]
|
|
[-0.50693923]
|
|
[ 0.54567 ]]
|
|
|
|
[[-1.0108787 ]
|
|
[ 0.13563155]
|
|
[ 0.3249962 ]
|
|
[-0.72134334]
|
|
[ 0.44444424]
|
|
[-0.45222285]]]]
|
|
Y_scale:
|
|
[[[[2.0000484]
|
|
[2.0000129]
|
|
[2.0006797]
|
|
[2.000032 ]
|
|
[2.0000134]
|
|
[2.0004265]]
|
|
|
|
[[2.0000784]
|
|
[2.0000908]
|
|
[2.000004 ]
|
|
[2.0003521]
|
|
[2.000055 ]
|
|
[2.0000188]]
|
|
|
|
[[2.0000012]
|
|
[2.00036 ]
|
|
[2.0000033]
|
|
[2.0000021]
|
|
[2.0000517]
|
|
[2.0000544]]
|
|
|
|
[[2.0000634]
|
|
[2.000055 ]
|
|
[2.0002878]
|
|
[2.0000045]
|
|
[2.0001743]
|
|
[2.0008483]]
|
|
|
|
[[2.000001 ]
|
|
[2.000443 ]
|
|
[2.0000582]
|
|
[2.0011547]
|
|
[2.0000467]
|
|
[2.0000541]]
|
|
|
|
[[2.0001857]
|
|
[2.0000033]
|
|
[2.0000193]
|
|
[2.0000947]
|
|
[2.000036 ]
|
|
[2.0000372]]]]
|
|
```
|
|
|
|
</details>
|
|
|
|
)DOC")
|
|
.Arg(
|
|
"size",
|
|
"*(type: int; default: 0)* Amount of neighboring channels to sum over for normalization")
|
|
.Arg(
|
|
"alpha",
|
|
"*(type: float; default: 0)* Multiplicative (scaling) factor.")
|
|
.Arg("beta", "*(type: float; default: 0)* Exponent.")
|
|
.Arg("bias", "*(type: float; default: 1.0)* Additive factor.")
|
|
.Arg("order", "*(type: float; default: 'NCHW')* Order of blob dimensions.")
|
|
.Input(0, "X", "*(type: Tensor`<float>`)* Input data tensor (ReLU output).")
|
|
.Output(0, "Y", "*(type: Tensor`<float>`)* Output tensor.")
|
|
.Output(1, "Y_scale", "*(type: Tensor`<float>`)* Output scale.")
|
|
.InheritOnnxSchema();
|
|
OPERATOR_SCHEMA(LRNGradient).NumInputs(3).NumOutputs(1);
|
|
|
|
class GetLRNGradient : public GradientMakerBase {
|
|
using GradientMakerBase::GradientMakerBase;
|
|
vector<OperatorDef> GetGradientDefs() override {
|
|
return SingleGradientDef(
|
|
"LRNGradient", "",
|
|
vector<string>{I(0), O(0), GO(0)},
|
|
vector<string>{GI(0)});
|
|
}
|
|
};
|
|
REGISTER_GRADIENT(LRN, GetLRNGradient);
|
|
} // namespace caffe2
|