2017-05-26 23:46:04 +00:00
|
|
|
from __future__ import absolute_import
|
|
|
|
|
from __future__ import division
|
|
|
|
|
from __future__ import print_function
|
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
2017-08-03 07:17:36 +00:00
|
|
|
from caffe2.python import schema
|
|
|
|
|
from caffe2.python.layers.layers import ModelLayer
|
2017-05-26 23:46:04 +00:00
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class BatchNormalization(ModelLayer):
|
|
|
|
|
def __init__(
|
|
|
|
|
self,
|
|
|
|
|
model,
|
|
|
|
|
input_record,
|
|
|
|
|
name='batch_normalization',
|
|
|
|
|
scale_optim=None,
|
|
|
|
|
bias_optim=None,
|
|
|
|
|
momentum=0.9,
|
|
|
|
|
order='NCHW',
|
2020-01-10 19:54:07 +00:00
|
|
|
scale_init_value=1.0,
|
2017-05-26 23:46:04 +00:00
|
|
|
**kwargs
|
|
|
|
|
):
|
|
|
|
|
super(BatchNormalization, self).__init__(
|
|
|
|
|
model, name, input_record, **kwargs)
|
|
|
|
|
|
|
|
|
|
assert isinstance(input_record, schema.Scalar), "Incorrect input type"
|
|
|
|
|
|
|
|
|
|
self.input_shape = input_record.field_type().shape
|
|
|
|
|
|
|
|
|
|
if len(self.input_shape) == 3:
|
|
|
|
|
if order == "NCHW":
|
|
|
|
|
input_dims = self.input_shape[0]
|
|
|
|
|
elif order == "NHWC":
|
2017-07-14 01:22:01 +00:00
|
|
|
input_dims = self.input_shape[2]
|
2017-05-26 23:46:04 +00:00
|
|
|
else:
|
|
|
|
|
raise ValueError("Please specify a correct order")
|
|
|
|
|
else:
|
|
|
|
|
assert len(self.input_shape) == 1, (
|
|
|
|
|
"This layer supports only 4D or 2D tesnors")
|
|
|
|
|
input_dims = self.input_shape[0]
|
|
|
|
|
|
|
|
|
|
self.output_schema = schema.Scalar(
|
|
|
|
|
(np.float32, self.input_shape),
|
2017-08-03 07:17:36 +00:00
|
|
|
self.get_next_blob_reference('output')
|
2017-05-26 23:46:04 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
|
|
self.momentum = momentum
|
|
|
|
|
self.order = order
|
|
|
|
|
|
2017-08-03 07:17:36 +00:00
|
|
|
self.scale = self.create_param(param_name='scale',
|
|
|
|
|
shape=[input_dims],
|
2020-01-10 19:54:07 +00:00
|
|
|
initializer=('ConstantFill', {'value': scale_init_value}),
|
2017-08-03 07:17:36 +00:00
|
|
|
optimizer=scale_optim)
|
|
|
|
|
self.bias = self.create_param(param_name='bias',
|
|
|
|
|
shape=[input_dims],
|
|
|
|
|
initializer=('ConstantFill', {'value': 0.0}),
|
|
|
|
|
optimizer=bias_optim)
|
|
|
|
|
self.rm = self.create_param(param_name='running_mean',
|
|
|
|
|
shape=[input_dims],
|
|
|
|
|
initializer=('ConstantFill', {'value': 0.0}),
|
|
|
|
|
optimizer=model.NoOptim)
|
|
|
|
|
self.riv = self.create_param(param_name='running_inv_var',
|
|
|
|
|
shape=[input_dims],
|
|
|
|
|
initializer=('ConstantFill', {'value': 1.0}),
|
|
|
|
|
optimizer=model.NoOptim)
|
2017-05-26 23:46:04 +00:00
|
|
|
|
implement drelu and unittest
Summary:
In this revision, I mainly implemented the DRelu activation. See https://arxiv.org/pdf/1706.06978v1.pdf for details.
To sum up, different from standard relu and purely, which divide the scope into two parts with boundary at zero, DRelu calculate another value p to divide the activation into two part. P is the softmax value of the output of Batch Normalization. For f(x)=x part in relu, you can find similar patten in f(x)=px, and for f(x)=0 part in rely, you can find similar pattern in f(x)=a(1-p)x, in which a is a parameter to tune. Drelu activation result is the sum of these two parts, f(x) = a(1-p)x + px.
To implement DRelu, I take BatchNormalization as super class and then use the above formula for computation. In order to allow users to choose activation methods, which usually takes place when calling add_mlp function in processor_util.py, I pass the parameter transfer in model_option from UI to the details, just as what dropout do. Currently, I place it in extra_option, but can modify it if AML team needs to redesign the UI.
I also add units test for DRelu. We check the shape of output and also do the numeric unit tests.
For Unit test, I first check the numeric value of BatchNormalization, since there is no similar test before. I then compute the value of DRelu outputs and compare the results with current DRelu layer.
Reviewed By: chocjy
Differential Revision: D5341464
fbshipit-source-id: 896b4dcc49cfd5493d97a8b448401b19e9c80630
2017-07-20 18:37:39 +00:00
|
|
|
def _add_ops(self, net, is_test, out_blob=None):
|
|
|
|
|
original_input_blob = self.input_record.field_blobs()
|
|
|
|
|
input_blob = net.NextScopedBlob('expand_input')
|
2017-05-26 23:46:04 +00:00
|
|
|
if len(self.input_shape) == 1:
|
implement drelu and unittest
Summary:
In this revision, I mainly implemented the DRelu activation. See https://arxiv.org/pdf/1706.06978v1.pdf for details.
To sum up, different from standard relu and purely, which divide the scope into two parts with boundary at zero, DRelu calculate another value p to divide the activation into two part. P is the softmax value of the output of Batch Normalization. For f(x)=x part in relu, you can find similar patten in f(x)=px, and for f(x)=0 part in rely, you can find similar pattern in f(x)=a(1-p)x, in which a is a parameter to tune. Drelu activation result is the sum of these two parts, f(x) = a(1-p)x + px.
To implement DRelu, I take BatchNormalization as super class and then use the above formula for computation. In order to allow users to choose activation methods, which usually takes place when calling add_mlp function in processor_util.py, I pass the parameter transfer in model_option from UI to the details, just as what dropout do. Currently, I place it in extra_option, but can modify it if AML team needs to redesign the UI.
I also add units test for DRelu. We check the shape of output and also do the numeric unit tests.
For Unit test, I first check the numeric value of BatchNormalization, since there is no similar test before. I then compute the value of DRelu outputs and compare the results with current DRelu layer.
Reviewed By: chocjy
Differential Revision: D5341464
fbshipit-source-id: 896b4dcc49cfd5493d97a8b448401b19e9c80630
2017-07-20 18:37:39 +00:00
|
|
|
input_blob = net.ExpandDims(original_input_blob,
|
2017-05-26 23:46:04 +00:00
|
|
|
dims=[2, 3])
|
implement drelu and unittest
Summary:
In this revision, I mainly implemented the DRelu activation. See https://arxiv.org/pdf/1706.06978v1.pdf for details.
To sum up, different from standard relu and purely, which divide the scope into two parts with boundary at zero, DRelu calculate another value p to divide the activation into two part. P is the softmax value of the output of Batch Normalization. For f(x)=x part in relu, you can find similar patten in f(x)=px, and for f(x)=0 part in rely, you can find similar pattern in f(x)=a(1-p)x, in which a is a parameter to tune. Drelu activation result is the sum of these two parts, f(x) = a(1-p)x + px.
To implement DRelu, I take BatchNormalization as super class and then use the above formula for computation. In order to allow users to choose activation methods, which usually takes place when calling add_mlp function in processor_util.py, I pass the parameter transfer in model_option from UI to the details, just as what dropout do. Currently, I place it in extra_option, but can modify it if AML team needs to redesign the UI.
I also add units test for DRelu. We check the shape of output and also do the numeric unit tests.
For Unit test, I first check the numeric value of BatchNormalization, since there is no similar test before. I then compute the value of DRelu outputs and compare the results with current DRelu layer.
Reviewed By: chocjy
Differential Revision: D5341464
fbshipit-source-id: 896b4dcc49cfd5493d97a8b448401b19e9c80630
2017-07-20 18:37:39 +00:00
|
|
|
else:
|
|
|
|
|
input_blob = original_input_blob[0]
|
2017-05-26 23:46:04 +00:00
|
|
|
|
implement drelu and unittest
Summary:
In this revision, I mainly implemented the DRelu activation. See https://arxiv.org/pdf/1706.06978v1.pdf for details.
To sum up, different from standard relu and purely, which divide the scope into two parts with boundary at zero, DRelu calculate another value p to divide the activation into two part. P is the softmax value of the output of Batch Normalization. For f(x)=x part in relu, you can find similar patten in f(x)=px, and for f(x)=0 part in rely, you can find similar pattern in f(x)=a(1-p)x, in which a is a parameter to tune. Drelu activation result is the sum of these two parts, f(x) = a(1-p)x + px.
To implement DRelu, I take BatchNormalization as super class and then use the above formula for computation. In order to allow users to choose activation methods, which usually takes place when calling add_mlp function in processor_util.py, I pass the parameter transfer in model_option from UI to the details, just as what dropout do. Currently, I place it in extra_option, but can modify it if AML team needs to redesign the UI.
I also add units test for DRelu. We check the shape of output and also do the numeric unit tests.
For Unit test, I first check the numeric value of BatchNormalization, since there is no similar test before. I then compute the value of DRelu outputs and compare the results with current DRelu layer.
Reviewed By: chocjy
Differential Revision: D5341464
fbshipit-source-id: 896b4dcc49cfd5493d97a8b448401b19e9c80630
2017-07-20 18:37:39 +00:00
|
|
|
if out_blob is None:
|
|
|
|
|
bn_output = self.output_schema.field_blobs()
|
|
|
|
|
else:
|
|
|
|
|
bn_output = out_blob
|
2017-05-26 23:46:04 +00:00
|
|
|
if is_test:
|
|
|
|
|
output_blobs = bn_output
|
|
|
|
|
else:
|
|
|
|
|
output_blobs = bn_output + [self.rm, self.riv,
|
|
|
|
|
net.NextScopedBlob('bn_saved_mean'),
|
|
|
|
|
net.NextScopedBlob('bn_saved_iv')]
|
|
|
|
|
|
|
|
|
|
net.SpatialBN([input_blob, self.scale,
|
|
|
|
|
self.bias, self.rm, self.riv],
|
|
|
|
|
output_blobs,
|
|
|
|
|
momentum=self.momentum,
|
|
|
|
|
is_test=is_test,
|
|
|
|
|
order=self.order)
|
|
|
|
|
|
|
|
|
|
if len(self.input_shape) == 1:
|
|
|
|
|
net.Squeeze(bn_output,
|
|
|
|
|
bn_output,
|
|
|
|
|
dims=[2, 3])
|
|
|
|
|
|
|
|
|
|
def add_train_ops(self, net):
|
|
|
|
|
self._add_ops(net, is_test=False)
|
|
|
|
|
|
|
|
|
|
def add_eval_ops(self, net):
|
|
|
|
|
self._add_ops(net, is_test=True)
|
|
|
|
|
|
|
|
|
|
def add_ops(self, net):
|
|
|
|
|
self.add_eval_ops(net)
|