From 55f5a52b23ee5e5c19d1597e9304383f24c95a0a Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri <shariharan91@gmail.com>
Date: Wed, 18 Sep 2019 17:17:28 -0700
Subject: [PATCH] Support opset-11 DepthToSpace CPU kernel (#1759)

* Initial commit

* Uncomment tests

* Updates

* Updates

* Disable CRD mode DepthToSpace for NGraph builds

* Disable test

* Update tests

* PR feedback

* Add unit test for CRD mode

* Reflect class variable in naming

* Add a test to NGRAPH disabled list

* Update main.cc

* Update main.cc
---
 .../providers/cpu/cpu_execution_provider.cc   |   7 +-
 .../providers/cpu/tensor/space_depth_ops.cc   |  37 ++++--
 .../providers/cpu/tensor/space_depth_ops.h    |  13 ++
 onnxruntime/test/onnx/main.cc                 |   4 +-
 .../cpu/tensor/space_depth_ops_test.cc        | 119 +++++++++++++++++-
 .../test/python/onnx_backend_test_series.py   |   2 +-
 6 files changed, 165 insertions(+), 17 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
index cb3cdc5ada..c7f13f1ee0 100644
--- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
+++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
@@ -195,7 +195,7 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOn
 class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 9, int64_t, Slice);
 class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 9, string, Slice);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, SpaceToDepth);
-class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 4, DepthToSpace);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 10, DepthToSpace);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 2, Split);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, Squeeze);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, Tile);
@@ -315,6 +315,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, Ha
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, LogSoftmax);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, Softmax);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, Loop);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, DepthToSpace);
 
 void RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) {
   static const BuildKernelCreateInfoFn function_table[] = {
@@ -495,7 +496,7 @@ void RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 9, int64_t, Slice)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 9, string, Slice)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, SpaceToDepth)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 4, DepthToSpace)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 10, DepthToSpace)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 2, Split)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, Squeeze)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 6, Tile)>,
@@ -615,7 +616,7 @@ void RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, LogSoftmax)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, Softmax)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, Loop)>,
-
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, DepthToSpace)>,
   };
 
   for (auto& function_table_entry : function_table) {
diff --git a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc
index 968b5eb6ed..28a0a7d7f4 100644
--- a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc
+++ b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.cc
@@ -15,8 +15,13 @@ ONNX_CPU_OPERATOR_KERNEL(
 
 ONNX_CPU_OPERATOR_VERSIONED_KERNEL(
     DepthToSpace,
-    1,
-    4,
+    1, 10,
+    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
+    DepthToSpace<float>);
+
+ONNX_CPU_OPERATOR_KERNEL(
+    DepthToSpace,
+    11,
     KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
     DepthToSpace<float>);
 
@@ -75,14 +80,28 @@ Status DepthToSpace<float>::Compute(OpKernelContext* context) const {
   const int64_t output_width = input_width * blocksize_;
 
   Tensor& output = *context->Output(0, {batch, output_depth, output_height, output_width});
+  
+  // Process "DCR" mode
+  if (is_dcr_) {
+    std::array<int64_t, IntermediateTensorRank> permutation{{0, 3, 4, 1, 5, 2}};
+    EigenTensorMap(output.template MutableData<float>(), batch, input_depth / blocksize_ / blocksize_,
+                   input_height, blocksize_, input_width, blocksize_) =
+        EigenTensorMap(const_cast<float*>(input.template Data<float>()), batch,
+                       blocksize_, blocksize_, input_depth / blocksize_ / blocksize_,
+                       input_height, input_width)
+            .shuffle(permutation);   
+  }
 
-  std::array<int64_t, IntermediateTensorRank> permutation{{0, 3, 4, 1, 5, 2}};
-  EigenTensorMap(output.template MutableData<float>(), batch, input_depth / blocksize_ / blocksize_,
-                 input_height, blocksize_, input_width, blocksize_) =
-      EigenTensorMap(const_cast<float*>(input.template Data<float>()), batch,
-                     blocksize_, blocksize_, input_depth / blocksize_ / blocksize_,
-                     input_height, input_width)
-          .shuffle(permutation);
+  // Process "CRD" mode
+  else {
+    std::array<int64_t, IntermediateTensorRank> permutation{{0, 1, 4, 2, 5, 3}};
+    EigenTensorMap(output.template MutableData<float>(), batch, input_depth / blocksize_ / blocksize_,
+                   input_height, blocksize_, input_width, blocksize_) =
+        EigenTensorMap(const_cast<float*>(input.template Data<float>()), batch,
+                       input_depth / blocksize_ / blocksize_, blocksize_, blocksize_,
+                       input_height, input_width)
+            .shuffle(permutation);  
+  }
 
   return Status::OK();
 }
diff --git a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.h b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.h
index 660d36e90d..28c4c4ee70 100644
--- a/onnxruntime/core/providers/cpu/tensor/space_depth_ops.h
+++ b/onnxruntime/core/providers/cpu/tensor/space_depth_ops.h
@@ -31,9 +31,22 @@ template <typename T>
 class DepthToSpace final : public SpaceDepthBase {
  public:
   DepthToSpace(const OpKernelInfo& info) : SpaceDepthBase(info) {
+    std::string mode;
+    // If "mode" is absent, is_dcr_ keeps its default (true): either the model relies on the
+    // default "DCR" mode, or it is an opset < 11 model, for which "DCR" is the only mode.
+    if (info.GetAttr("mode", &mode).IsOK()) {
+      if (mode == "CRD")
+        is_dcr_ = false;
+
+      else if (mode != "DCR")
+        ORT_THROW("DepthToSpace op: only 'DCR' and 'CRD' modes are supported"); 
+    }
   }
 
   Status Compute(OpKernelContext* context) const override;
+
+ private:
+  bool is_dcr_ = true;  // true selects "DCR" mode; set to false only when mode == "CRD"
 };
 
 }  //namespace onnxruntime
diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc
index 477cb59728..99bbce454d 100644
--- a/onnxruntime/test/onnx/main.cc
+++ b/onnxruntime/test/onnx/main.cc
@@ -404,8 +404,6 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
       {"round", "not implemented yet"},
       {"gather_elements_1", "not implemented yet"},
       {"gather_elements_0", "not implemented yet"},
-      {"depthtospace_crd_mode_example", "not implemented yet"},
-      {"depthtospace_crd_mode", "not implemented yet"},
       {"cumsum_2d_axis_1", "not implemented yet"},
       {"cumsum_2d_axis_0", "not implemented yet"},
       {"cumsum_1d_reverse_exclusive", "not implemented yet"},
@@ -457,6 +455,8 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
   broken_tests.insert({"clip_default_min", "not implemented yet for opset 11"});	
   broken_tests.insert({"clip_default_max", "not implemented yet for opset 11"});
   broken_tests.insert({"clip", "not implemented yet for opset 11"});
+  broken_tests.insert({"depthtospace_crd_mode_example", "NGraph does not support CRD mode"});
+  broken_tests.insert({"depthtospace_crd_mode", "NGraph does not support CRD mode"});
 #endif
 
 #ifdef USE_MKLDNN
diff --git a/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc b/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc
index 8db47d2a19..7a8f7b10ef 100644
--- a/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc
@@ -73,7 +73,7 @@ TEST(TensorOpTest, SpaceToDepthTest_2) {
 }
 
 TEST(TensorOpTest, DepthToSpaceTest_1) {
-  OpTester test("DepthToSpace");
+  OpTester test("DepthToSpace", 7); // create an opset 7 model
   const int64_t blocksize = 2;
   test.AddAttribute("blocksize", blocksize);
 
@@ -104,7 +104,7 @@ TEST(TensorOpTest, DepthToSpaceTest_1) {
 }
 
 TEST(TensorOpTest, DepthToSpaceTest_2) {
-  OpTester test("DepthToSpace");
+  OpTester test("DepthToSpace", 7); // create an opset 7 model
   const int64_t blocksize = 2;
   test.AddAttribute("blocksize", blocksize);
 
@@ -145,5 +145,120 @@ TEST(TensorOpTest, DepthToSpaceTest_2) {
   test.AddOutput<float>("output", {2, 3, 6, 4}, result);
   test.Run();
 }
+
+TEST(TensorOpTest, DepthToSpaceTest_3) {
+  OpTester test("DepthToSpace", 11);  // opset 11 model; the "mode" attribute is deliberately not set
+  const int64_t blocksize = 2;
+  test.AddAttribute("blocksize", blocksize);
+
+  const int64_t N = 2, C = 12, H = 3, W = 2;
+  const std::vector<float> X = {
+      0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10.,
+      11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21.,
+      22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
+      33., 34., 35., 36., 37., 38., 39., 40., 41., 42., 43.,
+      44., 45., 46., 47., 48., 49., 50., 51., 52., 53., 54.,
+      55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
+      66., 67., 68., 69., 70., 71., 72., 73., 74., 75., 76.,
+      77., 78., 79., 80., 81., 82., 83., 84., 85., 86., 87.,
+      88., 89., 90., 91., 92., 93., 94., 95., 96., 97., 98.,
+      99., 100., 101., 102., 103., 104., 105., 106., 107., 108., 109.,
+      110., 111., 112., 113., 114., 115., 116., 117., 118., 119., 120.,
+      121., 122., 123., 124., 125., 126., 127., 128., 129., 130., 131.,
+      132., 133., 134., 135., 136., 137., 138., 139., 140., 141., 142.,
+      143.};
+
+  test.AddInput<float>("input", {N, C, H, W}, X);
+
+  const std::vector<float> result = {
+      0., 18., 1., 19., 36., 54., 37., 55., 2., 20., 3.,
+      21., 38., 56., 39., 57., 4., 22., 5., 23., 40., 58.,
+      41., 59., 6., 24., 7., 25., 42., 60., 43., 61., 8.,
+      26., 9., 27., 44., 62., 45., 63., 10., 28., 11., 29.,
+      46., 64., 47., 65., 12., 30., 13., 31., 48., 66., 49.,
+      67., 14., 32., 15., 33., 50., 68., 51., 69., 16., 34.,
+      17., 35., 52., 70., 53., 71., 72., 90., 73., 91., 108.,
+      126., 109., 127., 74., 92., 75., 93., 110., 128., 111., 129.,
+      76., 94., 77., 95., 112., 130., 113., 131., 78., 96., 79.,
+      97., 114., 132., 115., 133., 80., 98., 81., 99., 116., 134.,
+      117., 135., 82., 100., 83., 101., 118., 136., 119., 137., 84.,
+      102., 85., 103., 120., 138., 121., 139., 86., 104., 87., 105.,
+      122., 140., 123., 141., 88., 106., 89., 107., 124., 142., 125.,
+      143.};
+  test.AddOutput<float>("output", {2, 3, 6, 4}, result);  // {N, C / blocksize^2, H * blocksize, W * blocksize}
+  test.Run();
+}
+
+TEST(TensorOpTest, DepthToSpaceTest_4) {
+  OpTester test("DepthToSpace", 11);  // opset 11 model with the "mode" attribute explicitly set to "DCR"
+  const int64_t blocksize = 2;
+  test.AddAttribute("blocksize", blocksize);
+  test.AddAttribute("mode", "DCR");
+
+  const int64_t N = 2, C = 12, H = 3, W = 2;
+  const std::vector<float> X = {
+      0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10.,
+      11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21.,
+      22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
+      33., 34., 35., 36., 37., 38., 39., 40., 41., 42., 43.,
+      44., 45., 46., 47., 48., 49., 50., 51., 52., 53., 54.,
+      55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
+      66., 67., 68., 69., 70., 71., 72., 73., 74., 75., 76.,
+      77., 78., 79., 80., 81., 82., 83., 84., 85., 86., 87.,
+      88., 89., 90., 91., 92., 93., 94., 95., 96., 97., 98.,
+      99., 100., 101., 102., 103., 104., 105., 106., 107., 108., 109.,
+      110., 111., 112., 113., 114., 115., 116., 117., 118., 119., 120.,
+      121., 122., 123., 124., 125., 126., 127., 128., 129., 130., 131.,
+      132., 133., 134., 135., 136., 137., 138., 139., 140., 141., 142.,
+      143.};
+
+  test.AddInput<float>("input", {N, C, H, W}, X);
+
+  const std::vector<float> result = {
+      0., 18., 1., 19., 36., 54., 37., 55., 2., 20., 3.,
+      21., 38., 56., 39., 57., 4., 22., 5., 23., 40., 58.,
+      41., 59., 6., 24., 7., 25., 42., 60., 43., 61., 8.,
+      26., 9., 27., 44., 62., 45., 63., 10., 28., 11., 29.,
+      46., 64., 47., 65., 12., 30., 13., 31., 48., 66., 49.,
+      67., 14., 32., 15., 33., 50., 68., 51., 69., 16., 34.,
+      17., 35., 52., 70., 53., 71., 72., 90., 73., 91., 108.,
+      126., 109., 127., 74., 92., 75., 93., 110., 128., 111., 129.,
+      76., 94., 77., 95., 112., 130., 113., 131., 78., 96., 79.,
+      97., 114., 132., 115., 133., 80., 98., 81., 99., 116., 134.,
+      117., 135., 82., 100., 83., 101., 118., 136., 119., 137., 84.,
+      102., 85., 103., 120., 138., 121., 139., 86., 104., 87., 105.,
+      122., 140., 123., 141., 88., 106., 89., 107., 124., 142., 125.,
+      143.};
+  test.AddOutput<float>("output", {2, 3, 6, 4}, result);  // {N, C / blocksize^2, H * blocksize, W * blocksize}
+  test.Run();
+}
+
+TEST(TensorOpTest, DepthToSpaceTest_5) {
+  OpTester test("DepthToSpace", 11);  // opset 11 model with the "mode" attribute explicitly set to "CRD"
+  const int64_t blocksize = 2;
+  test.AddAttribute("blocksize", blocksize);
+  test.AddAttribute("mode", "CRD");
+
+  const int64_t N = 1, C = 4, H = 2, W = 3;
+  const std::vector<float> X = {0., 1., 2.,
+                                3., 4., 5.,
+                                9., 10., 11.,
+                                12., 13., 14.,
+                                18., 19., 20.,
+                                21., 22., 23.,
+                                27., 28., 29.,
+                                30., 31., 32.};
+
+  test.AddInput<float>("input", {N, C, H, W}, X);
+
+  const std::vector<float> result = {0., 9., 1., 10., 2., 11.,
+                                     18., 27., 19., 28., 20., 29.,
+                                     3., 12., 4., 13., 5., 14.,
+                                    21., 30., 22., 31., 23., 32.};
+
+  test.AddOutput<float>("output", {1, 1, 4, 6}, result);  // {N, C / blocksize^2, H * blocksize, W * blocksize}
+  test.Run();
+}
+
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/python/onnx_backend_test_series.py b/onnxruntime/test/python/onnx_backend_test_series.py
index 61552a83b4..1022bd7688 100644
--- a/onnxruntime/test/python/onnx_backend_test_series.py
+++ b/onnxruntime/test/python/onnx_backend_test_series.py
@@ -113,7 +113,6 @@ def create_backend_test(testname=None):
                                  '^test_dynamicquantizelinear_expanded*',
                                  '^test_dynamicquantizelinear_max_adjusted_expanded*',
                                  '^test_dynamicquantizelinear_min_adjusted_expanded*',
-                                 '^test_depthtospace*',
                                  '^test_gather_elements*',
                                  '^test_scatter_elements*',
                                  '^test_top_k*',
@@ -159,6 +158,7 @@ def create_backend_test(testname=None):
         #    current_failing_tests = current_failing_tests + ('|^test_operator_repeat_dim_overflow_cpu.*',)
         if c2.supports_device('NGRAPH'):
             current_failing_tests = current_failing_tests + ('|^test_clip*',)
+            current_failing_tests = current_failing_tests + ('|^test_depthtospace_crd*',)
 
         filters = current_failing_tests + \
                   tests_with_pre_opset7_dependencies_filters() + \