batch size 0 support in norm operators (#26894)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/26894

Add batch_size == 0 support and tests for the norm DNNLOWP operators (GroupNorm and SpatialBN).

Test Plan: CI

Reviewed By: jianyuh

Differential Revision: D17595416

fbshipit-source-id: 23086ecf8818be30da031eb4fc2922daea79ea7c
Authored by Jongsoo Park on 2019-09-26 16:04:06 -07:00; committed by Facebook Github Bot.
parent f99bc714c7
commit ec1f0f08f1
5 changed files with 45 additions and 21 deletions
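
For context, the zero-batch inputs these tests now draw look like the following (an illustrative numpy sketch, not code from this change; the shape is made up):

```python
import numpy as np

# A batch_size == 0 NCHW activation: zero elements, but a well-defined shape.
X = np.zeros((0, 6, 4, 4), dtype=np.float32)
print(X.size)   # 0
print(X.shape)  # (0, 6, 4, 4)
# A norm operator given this input should produce an output of the same
# (empty) shape rather than dividing by the batch size or reading X[0].
```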


@@ -41,13 +41,18 @@ class GroupNormOp final : public Operator<Context> {
     const int ndim = X.dim();
     const int N = X.dim32(0);
     const int C = order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(ndim - 1);
-    const size_t HxW = X.numel() / (N * C);
+    const size_t HxW = order_ == StorageOrder::NCHW
+        ? X.size_from_dim(2)
+        : X.size_between_dim(0, ndim - 1);
     CAFFE_ENFORCE_EQ(C % group_, 0);
     CAFFE_ENFORCE_EQ(gamma.numel(), C);
     CAFFE_ENFORCE_EQ(beta.numel(), C);
     const int G = group_;
     const int K = C / G;
     auto* Y = Output(OUTPUT, X.sizes(), at::dtype<T>());
+    if (N == 0) {
+      return true;
+    }
     T* mu_data = nullptr;
     T* rsig_data = nullptr;
     if (OutputSize() == 3) {
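
The HxW rewrite above is what makes N == 0 safe here: the old expression divides the element count by N * C, which is zero for an empty batch, while deriving HxW from the non-batch dimensions never involves N. A rough numpy analogue (not the Caffe2 API):

```python
import numpy as np

X = np.zeros((0, 6, 4, 4), dtype=np.float32)  # N == 0, NCHW layout
N, C = X.shape[0], X.shape[1]
# Old formula: numel / (N * C) -> 0 / 0
# X.size // (N * C)               # ZeroDivisionError when N == 0
# New formula: product of the spatial dims, independent of N.
HxW = int(np.prod(X.shape[2:]))   # 16
```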


@@ -122,9 +122,14 @@ void GroupNormDNNLowPOp<T>::QuantizeBeta() {
     const auto& beta_int8 = this->template Input<int8::Int8TensorCPU>(BETA);
     beta_qparams.scale = beta_int8.scale;
     beta_qparams.zero_point = beta_int8.zero_point;
-    CAFFE_ENFORCE_LE(
-        std::abs(beta_qparams.scale - X_qparams.scale * gamma_qparams.scale),
-        1e-4);
+    const auto& X = InputTensorCPU_(INPUT);
+    const int N = X.dim32(0);
+    if (N > 0) {
+      CAFFE_ENFORCE_LE(
+          std::abs(
+              beta_qparams.scale - X_qparams.scale * gamma_qparams.scale),
+          1e-4);
+    }
     CAFFE_ENFORCE_EQ(beta_qparams.zero_point, 0);
     beta_quantized_data_ = beta.template data<int32_t>();
     if (dequantize_output_) {
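
Guarding the scale consistency check with N > 0 makes sense because the input quantization parameters are degenerate for an empty tensor: with nothing to observe, min and max fall back to 0 and the chosen X scale is just a placeholder, so X_qparams.scale * gamma_qparams.scale has no reason to match the pre-quantized beta scale. A toy min/max sketch (hypothetical helper, not the dnnlowp implementation):

```python
def toy_scale(x_min, x_max, num_bits=8):
    # Hypothetical min/max quantization: the scale spans the observed range.
    span = x_max - x_min
    return span / (2 ** num_bits - 1) if span > 0 else 1.0  # placeholder when there is no data

print(toy_scale(-1.0, 1.0))  # data-driven scale
print(toy_scale(0.0, 0.0))   # empty input: arbitrary placeholder value
```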
@@ -300,7 +305,7 @@ bool GroupNormDNNLowPOp<T>::RunOnDeviceWithOrderNCHW() {
   const auto& X = InputTensorCPU_(INPUT);
   const int N = X.dim32(0);
   const int C = X.dim32(1);
-  const int HxW = X.size() / (N * C);
+  const int HxW = X.size_from_dim(2);
   const int G = group_;
   CAFFE_ENFORCE_EQ(C % G, 0);
   const int K = C / G;
@@ -312,6 +317,9 @@ bool GroupNormDNNLowPOp<T>::RunOnDeviceWithOrderNCHW() {
   if (dequantize_output_) {
     float* Y_data = Y->template mutable_data<float>();
+    if (N == 0) {
+      return true;
+    }
     mu_dequantized_.resize(N * G);
     rsig_dequantized_.resize(N * G);
     float* mu_data = mu_dequantized_.data();
@@ -335,6 +343,9 @@ bool GroupNormDNNLowPOp<T>::RunOnDeviceWithOrderNCHW() {
         N, C, HxW, X_dequantized_.data(), scale_data, bias_data, Y_data);
   } else {
     T* Y_data = GetQuantizedOutputData_();
+    if (N == 0) {
+      return true;
+    }
     mu_quantized_.resize(N * G);
     rsig_quantized_.resize(N * G);
     int32_t* mu_data = mu_quantized_.data();
@@ -368,7 +379,7 @@ bool GroupNormDNNLowPOp<T>::RunOnDeviceWithOrderNHWC() {
   const int ndim = X.dim();
   const int N = X.dim32(0);
   const int C = X.dim32(ndim - 1);
-  const int HxW = X.size() / (N * C);
+  const int HxW = X.size_between_dim(0, ndim - 1);
   const int G = group_;
   CAFFE_ENFORCE_EQ(C % G, 0);
   const int K = C / G;
@@ -380,6 +391,9 @@ bool GroupNormDNNLowPOp<T>::RunOnDeviceWithOrderNHWC() {
   if (dequantize_output_) {
     float* Y_data = Y->template mutable_data<float>();
+    if (N == 0) {
+      return true;
+    }
     mu_dequantized_.resize(N * G);
     rsig_dequantized_.resize(N * G);
     float* mu_data = mu_dequantized_.data();
@@ -403,6 +417,9 @@ bool GroupNormDNNLowPOp<T>::RunOnDeviceWithOrderNHWC() {
         N, C, HxW, X_dequantized_.data(), scale_data, bias_data, Y_data);
   } else {
     T* Y_data = GetQuantizedOutputData_();
+    if (N == 0) {
+      return true;
+    }
     mu_quantized_.resize(N * G);
     rsig_quantized_.resize(N * G);
     int32_t* mu_data = mu_quantized_.data();


@@ -17,7 +17,7 @@ workspace.GlobalInit(["caffe2", "--caffe2_omp_num_threads=11"])
 class DNNLowPOpGroupNormTest(hu.HypothesisTestCase):
     @given(
-        N=st.integers(1, 4),
+        N=st.integers(0, 4),
         G=st.integers(2, 4),
         K=st.integers(2, 12),
         H=st.integers(4, 16),
@@ -80,7 +80,9 @@ class DNNLowPOpGroupNormTest(hu.HypothesisTestCase):
             )
             net.Proto().op.extend([int8_given_tensor_fill])
-            X_q_param = dnnlowp_utils.choose_quantization_params(X.min(), X.max())
+            X_min = 0 if X.size == 0 else X.min()
+            X_max = 0 if X.size == 0 else X.max()
+            X_q_param = dnnlowp_utils.choose_quantization_params(X_min, X_max)
             int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                 beta, "beta_q", X_q_param, gamma_q_param
             )
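
The fallback above is needed because numpy refuses to reduce an empty array, e.g.:

```python
import numpy as np

X = np.zeros((0, 4, 4, 8), dtype=np.float32)
# X.min() raises ValueError ("zero-size array to reduction operation"),
# so fall back to 0 as the quantization range when there is no data.
X_min = 0 if X.size == 0 else X.min()
X_max = 0 if X.size == 0 else X.max()
print(X_min, X_max)  # 0 0
```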


@@ -70,10 +70,7 @@ bool SpatialBNDNNLowPOp<T, ReluFused>::RunOnDevice() {
   const int N = X.dim32(0);
   const int C = (order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(ndim - 1));
   const std::vector<int> X_dims(X.sizes().cbegin(), X.sizes().cend());
-  const int HxW =
-      std::accumulate(
-          X_dims.cbegin() + 1, X_dims.cend(), 1, std::multiplies<int>()) /
-      C;
+  const int HxW = X.size_from_dim(1) / C;
   CAFFE_ENFORCE_EQ(scale.numel(), C);
   CAFFE_ENFORCE_EQ(bias.numel(), C);
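
The simplified HxW is the same quantity the std::accumulate version computed: the product of every dimension after the batch axis, divided by the channel count, which works for either layout and stays well defined for a zero batch. In numpy terms (illustrative only):

```python
import numpy as np

def hxw(shape, C):
    # product of all dims past the batch axis, divided by channels
    return int(np.prod(shape[1:])) // C

print(hxw((2, 3, 8, 8), 3))  # NCHW -> 64
print(hxw((2, 8, 8, 3), 3))  # NHWC -> 64
print(hxw((0, 3, 8, 8), 3))  # batch_size == 0 is still well defined -> 64
```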
@@ -89,13 +86,18 @@ bool SpatialBNDNNLowPOp<T, ReluFused>::RunOnDevice() {
       &beta_, {C}, at::dtype<float>().device(CPUContext::GetDeviceType()));
   float* alpha_data = alpha_.template mutable_data<float>();
   float* beta_data = beta_.template mutable_data<float>();
+  if (N == 0) {
+    return true;
+  }
   const auto& mean = Input(EST_MEAN);
   const auto& var = Input(EST_VAR);
   CAFFE_ENFORCE_EQ(mean.numel(), C);
   CAFFE_ENFORCE_EQ(var.numel(), C);
+  auto* Y = OutputTensorCPU_(OUTPUT);
+  Y->Resize(X.sizes());
+  T* Y_data = GetQuantizedOutputData_();
+  if (N == 0) {
+    return true;
+  }
   ComputeFusedParam_(
       C,
       scale_data,
@@ -108,9 +110,6 @@ bool SpatialBNDNNLowPOp<T, ReluFused>::RunOnDevice() {
   vector<T> X_temp;
   const T* X_data =
       dnnlowp::QuantizeInputIfNeeded(this, 0, in_qparams_[0], X_temp);
-  auto* Y = OutputTensorCPU_(OUTPUT);
-  Y->Resize(X.sizes());
-  T* Y_data = GetQuantizedOutputData_();
   if (order_ == StorageOrder::NCHW) {
     for (int c = 0; c < C; ++c) {
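
The net effect of the last two hunks is that the output is allocated and resized earlier, so a zero batch can skip ComputeFusedParam_ and the per-element loops while Y still ends up with X's (empty) shape. Roughly (numpy stand-ins, not the operator code):

```python
import numpy as np

X = np.zeros((0, 16, 10, 10), dtype=np.float32)  # zero-batch input
Y = np.empty(X.shape, dtype=np.uint8)            # output resized up front
if X.shape[0] == 0:
    pass  # nothing to normalize, but Y already has the right (empty) shape
```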


@@ -21,7 +21,7 @@ class DNNLowPOpSpatialBNTest(hu.HypothesisTestCase):
         size=st.integers(10, 16),
         input_channels=st.integers(2, 16),
         output_channels=st.integers(2, 16),
-        batch_size=st.integers(1, 3),
+        batch_size=st.integers(0, 3),
         order=st.sampled_from(["NCHW", "NHWC"]),
         in_quantized=st.booleans(),
         out_quantized=st.booleans(),
@@ -46,8 +46,9 @@ class DNNLowPOpSpatialBNTest(hu.HypothesisTestCase):
         X = np.round(np.random.rand(batch_size, size, size, input_channels)).astype(
             np.float32
         )
-        X[0, 0, 0, 0] = X_min
-        X[0, 0, 0, 1] = X_max
+        if batch_size != 0:
+            X[0, 0, 0, 0] = X_min
+            X[0, 0, 0, 1] = X_max
         epsilon = np.abs(np.random.rand())
         scale = np.random.rand(input_channels).astype(np.float32)
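
The batch_size != 0 guard is required because pinning X_min / X_max into the data indexes row 0, which does not exist for an empty batch:

```python
import numpy as np

batch_size, size, input_channels = 0, 10, 2
X = np.round(np.random.rand(batch_size, size, size, input_channels)).astype(np.float32)
# X[0, 0, 0, 0] = -1.0  would raise IndexError: index 0 is out of bounds for axis 0 with size 0
if batch_size != 0:
    X[0, 0, 0, 0] = -1.0  # X_min (placeholder values; the test uses its drawn range)
    X[0, 0, 0, 1] = 1.0   # X_max
```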