batch size 0 support in norm operators (#26894)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/26894

Add batch_size == 0 support and tests for the norm DNNLOWP operators (GroupNorm and SpatialBN).

Test Plan: CI

Reviewed By: jianyuh

Differential Revision: D17595416

fbshipit-source-id: 23086ecf8818be30da031eb4fc2922daea79ea7c
Authored by Jongsoo Park on 2019-09-26 16:04:06 -07:00; committed by Facebook Github Bot.
parent f99bc714c7
commit ec1f0f08f1
5 changed files with 45 additions and 21 deletions
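
For context, the zero-batch inputs these tests now draw look like the following (an illustrative numpy sketch, not code from this change; the shape is made up):

```python
import numpy as np

# A batch_size == 0 NCHW activation: zero elements, but a well-defined shape.
X = np.zeros((0, 6, 4, 4), dtype=np.float32)
print(X.size)   # 0
print(X.shape)  # (0, 6, 4, 4)
# A norm operator given this input should produce an output of the same
# (empty) shape rather than dividing by the batch size or reading X[0].
```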


@@ -41,13 +41,18 @@ class GroupNormOp final : public Operator<Context> {
     const int ndim = X.dim();
     const int N = X.dim32(0);
     const int C = order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(ndim - 1);
-    const size_t HxW = X.numel() / (N * C);
+    const size_t HxW = order_ == StorageOrder::NCHW
+        ? X.size_from_dim(2)
+        : X.size_between_dim(0, ndim - 1);
     CAFFE_ENFORCE_EQ(C % group_, 0);
     CAFFE_ENFORCE_EQ(gamma.numel(), C);
     CAFFE_ENFORCE_EQ(beta.numel(), C);
     const int G = group_;
     const int K = C / G;
     auto* Y = Output(OUTPUT, X.sizes(), at::dtype<T>());
+    if (N == 0) {
+      return true;
+    }
     T* mu_data = nullptr;
     T* rsig_data = nullptr;
     if (OutputSize() == 3) {
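
The HxW rewrite above is what makes N == 0 safe here: the old expression divides the element count by N * C, which is zero for an empty batch, while deriving HxW from the non-batch dimensions never involves N. A rough numpy analogue (not the Caffe2 API):

```python
import numpy as np

X = np.zeros((0, 6, 4, 4), dtype=np.float32)  # N == 0, NCHW layout
N, C = X.shape[0], X.shape[1]
# Old formula: numel / (N * C) -> 0 / 0
# X.size // (N * C)               # ZeroDivisionError when N == 0
# New formula: product of the spatial dims, independent of N.
HxW = int(np.prod(X.shape[2:]))   # 16
```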


@@ -122,9 +122,14 @@ void GroupNormDNNLowPOp<T>::QuantizeBeta() {
     const auto& beta_int8 = this->template Input<int8::Int8TensorCPU>(BETA);
     beta_qparams.scale = beta_int8.scale;
     beta_qparams.zero_point = beta_int8.zero_point;
-    CAFFE_ENFORCE_LE(
-        std::abs(beta_qparams.scale - X_qparams.scale * gamma_qparams.scale),
-        1e-4);
+    const auto& X = InputTensorCPU_(INPUT);
+    const int N = X.dim32(0);
+    if (N > 0) {
+      CAFFE_ENFORCE_LE(
+          std::abs(
+              beta_qparams.scale - X_qparams.scale * gamma_qparams.scale),
+          1e-4);
+    }
     CAFFE_ENFORCE_EQ(beta_qparams.zero_point, 0);
     beta_quantized_data_ = beta.template data<int32_t>();
     if (dequantize_output_) {
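
Guarding the scale consistency check with N > 0 makes sense because the input quantization parameters are degenerate for an empty tensor: with nothing to observe, min and max fall back to 0 and the chosen X scale is just a placeholder, so X_qparams.scale * gamma_qparams.scale has no reason to match the pre-quantized beta scale. A toy min/max sketch (hypothetical helper, not the dnnlowp implementation):

```python
def toy_scale(x_min, x_max, num_bits=8):
    # Hypothetical min/max quantization: the scale spans the observed range.
    span = x_max - x_min
    return span / (2 ** num_bits - 1) if span > 0 else 1.0  # placeholder when there is no data

print(toy_scale(-1.0, 1.0))  # data-driven scale
print(toy_scale(0.0, 0.0))   # empty input: arbitrary placeholder value
```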
@@ -300,7 +305,7 @@ bool GroupNormDNNLowPOp<T>::RunOnDeviceWithOrderNCHW() {
   const auto& X = InputTensorCPU_(INPUT);
   const int N = X.dim32(0);
   const int C = X.dim32(1);
-  const int HxW = X.size() / (N * C);
+  const int HxW = X.size_from_dim(2);
   const int G = group_;
   CAFFE_ENFORCE_EQ(C % G, 0);
   const int K = C / G;
@@ -312,6 +317,9 @@ bool GroupNormDNNLowPOp<T>::RunOnDeviceWithOrderNCHW() {
   if (dequantize_output_) {
     float* Y_data = Y->template mutable_data<float>();
+    if (N == 0) {
+      return true;
+    }
     mu_dequantized_.resize(N * G);
     rsig_dequantized_.resize(N * G);
     float* mu_data = mu_dequantized_.data();
@@ -335,6 +343,9 @@ bool GroupNormDNNLowPOp<T>::RunOnDeviceWithOrderNCHW() {
         N, C, HxW, X_dequantized_.data(), scale_data, bias_data, Y_data);
   } else {
     T* Y_data = GetQuantizedOutputData_();
+    if (N == 0) {
+      return true;
+    }
     mu_quantized_.resize(N * G);
     rsig_quantized_.resize(N * G);
     int32_t* mu_data = mu_quantized_.data();
@@ -368,7 +379,7 @@ bool GroupNormDNNLowPOp<T>::RunOnDeviceWithOrderNHWC() {
   const int ndim = X.dim();
   const int N = X.dim32(0);
   const int C = X.dim32(ndim - 1);
-  const int HxW = X.size() / (N * C);
+  const int HxW = X.size_between_dim(0, ndim - 1);
   const int G = group_;
   CAFFE_ENFORCE_EQ(C % G, 0);
   const int K = C / G;
@@ -380,6 +391,9 @@ bool GroupNormDNNLowPOp<T>::RunOnDeviceWithOrderNHWC() {
   if (dequantize_output_) {
     float* Y_data = Y->template mutable_data<float>();
+    if (N == 0) {
+      return true;
+    }
     mu_dequantized_.resize(N * G);
     rsig_dequantized_.resize(N * G);
     float* mu_data = mu_dequantized_.data();
@@ -403,6 +417,9 @@ bool GroupNormDNNLowPOp<T>::RunOnDeviceWithOrderNHWC() {
         N, C, HxW, X_dequantized_.data(), scale_data, bias_data, Y_data);
   } else {
     T* Y_data = GetQuantizedOutputData_();
+    if (N == 0) {
+      return true;
+    }
     mu_quantized_.resize(N * G);
     rsig_quantized_.resize(N * G);
     int32_t* mu_data = mu_quantized_.data();


@@ -17,7 +17,7 @@ workspace.GlobalInit(["caffe2", "--caffe2_omp_num_threads=11"])
 class DNNLowPOpGroupNormTest(hu.HypothesisTestCase):
     @given(
-        N=st.integers(1, 4),
+        N=st.integers(0, 4),
         G=st.integers(2, 4),
         K=st.integers(2, 12),
         H=st.integers(4, 16),
@@ -80,7 +80,9 @@ class DNNLowPOpGroupNormTest(hu.HypothesisTestCase):
             )
             net.Proto().op.extend([int8_given_tensor_fill])
-            X_q_param = dnnlowp_utils.choose_quantization_params(X.min(), X.max())
+            X_min = 0 if X.size == 0 else X.min()
+            X_max = 0 if X.size == 0 else X.max()
+            X_q_param = dnnlowp_utils.choose_quantization_params(X_min, X_max)
             int8_bias_tensor_fill = dnnlowp_utils.create_int8_bias_tensor_fill(
                 beta, "beta_q", X_q_param, gamma_q_param
             )
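
The fallback above is needed because numpy refuses to reduce an empty array, e.g.:

```python
import numpy as np

X = np.zeros((0, 4, 4, 8), dtype=np.float32)
# X.min() raises ValueError ("zero-size array to reduction operation"),
# so fall back to 0 as the quantization range when there is no data.
X_min = 0 if X.size == 0 else X.min()
X_max = 0 if X.size == 0 else X.max()
print(X_min, X_max)  # 0 0
```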


@@ -70,10 +70,7 @@ bool SpatialBNDNNLowPOp<T, ReluFused>::RunOnDevice() {
   const int N = X.dim32(0);
   const int C = (order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(ndim - 1));
   const std::vector<int> X_dims(X.sizes().cbegin(), X.sizes().cend());
-  const int HxW =
-      std::accumulate(
-          X_dims.cbegin() + 1, X_dims.cend(), 1, std::multiplies<int>()) /
-      C;
+  const int HxW = X.size_from_dim(1) / C;
   CAFFE_ENFORCE_EQ(scale.numel(), C);
   CAFFE_ENFORCE_EQ(bias.numel(), C);
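
The simplified HxW is the same quantity the std::accumulate version computed: the product of every dimension after the batch axis, divided by the channel count, which works for either layout and stays well defined for a zero batch. In numpy terms (illustrative only):

```python
import numpy as np

def hxw(shape, C):
    # product of all dims past the batch axis, divided by channels
    return int(np.prod(shape[1:])) // C

print(hxw((2, 3, 8, 8), 3))  # NCHW -> 64
print(hxw((2, 8, 8, 3), 3))  # NHWC -> 64
print(hxw((0, 3, 8, 8), 3))  # batch_size == 0 is still well defined -> 64
```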
@@ -89,13 +86,18 @@ bool SpatialBNDNNLowPOp<T, ReluFused>::RunOnDevice() {
       &beta_, {C}, at::dtype<float>().device(CPUContext::GetDeviceType()));
   float* alpha_data = alpha_.template mutable_data<float>();
   float* beta_data = beta_.template mutable_data<float>();
+  if (N == 0) {
+    return true;
+  }
   const auto& mean = Input(EST_MEAN);
   const auto& var = Input(EST_VAR);
   CAFFE_ENFORCE_EQ(mean.numel(), C);
   CAFFE_ENFORCE_EQ(var.numel(), C);
+  auto* Y = OutputTensorCPU_(OUTPUT);
+  Y->Resize(X.sizes());
+  T* Y_data = GetQuantizedOutputData_();
+  if (N == 0) {
+    return true;
+  }
   ComputeFusedParam_(
       C,
       scale_data,
@@ -108,9 +110,6 @@ bool SpatialBNDNNLowPOp<T, ReluFused>::RunOnDevice() {
   vector<T> X_temp;
   const T* X_data =
       dnnlowp::QuantizeInputIfNeeded(this, 0, in_qparams_[0], X_temp);
-  auto* Y = OutputTensorCPU_(OUTPUT);
-  Y->Resize(X.sizes());
-  T* Y_data = GetQuantizedOutputData_();
   if (order_ == StorageOrder::NCHW) {
     for (int c = 0; c < C; ++c) {
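
The net effect of the last two hunks is that the output is allocated and resized earlier, so a zero batch can skip ComputeFusedParam_ and the per-element loops while Y still ends up with X's (empty) shape. Roughly (numpy stand-ins, not the operator code):

```python
import numpy as np

X = np.zeros((0, 16, 10, 10), dtype=np.float32)  # zero-batch input
Y = np.empty(X.shape, dtype=np.uint8)            # output resized up front
if X.shape[0] == 0:
    pass  # nothing to normalize, but Y already has the right (empty) shape
```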


@@ -21,7 +21,7 @@ class DNNLowPOpSpatialBNTest(hu.HypothesisTestCase):
         size=st.integers(10, 16),
         input_channels=st.integers(2, 16),
         output_channels=st.integers(2, 16),
-        batch_size=st.integers(1, 3),
+        batch_size=st.integers(0, 3),
         order=st.sampled_from(["NCHW", "NHWC"]),
         in_quantized=st.booleans(),
         out_quantized=st.booleans(),
@@ -46,8 +46,9 @@ class DNNLowPOpSpatialBNTest(hu.HypothesisTestCase):
         X = np.round(np.random.rand(batch_size, size, size, input_channels)).astype(
             np.float32
         )
-        X[0, 0, 0, 0] = X_min
-        X[0, 0, 0, 1] = X_max
+        if batch_size != 0:
+            X[0, 0, 0, 0] = X_min
+            X[0, 0, 0, 1] = X_max
         epsilon = np.abs(np.random.rand())
         scale = np.random.rand(input_channels).astype(np.float32)
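
The batch_size != 0 guard is required because pinning X_min / X_max into the data indexes row 0, which does not exist for an empty batch:

```python
import numpy as np

batch_size, size, input_channels = 0, 10, 2
X = np.round(np.random.rand(batch_size, size, size, input_channels)).astype(np.float32)
# X[0, 0, 0, 0] = -1.0  would raise IndexError: index 0 is out of bounds for axis 0 with size 0
if batch_size != 0:
    X[0, 0, 0, 0] = -1.0  # X_min (placeholder values; the test uses its drawn range)
    X[0, 0, 0, 1] = 1.0   # X_max
```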