From 4ac8ecb76ef2aea628b30e5e8a5c8dff9b67009d Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh@fb.com>
Date: Mon, 16 Oct 2017 18:18:04 -0700
Subject: [PATCH] Some bug-fixs in mpscnn backend

Summary: att

Reviewed By: ajtulloch

Differential Revision: D6037723

fbshipit-source-id: d7405b27089210abfd48a33ecee47a87f67ae9a0
---
 caffe2/mobile/contrib/ios/mpscnn/MPSCNN.metal |  36 +++++-
 caffe2/mobile/contrib/ios/mpscnn/mpscnn.h     |   2 +-
 caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm    | 111 +++++++++++++++---
 .../contrib/ios/mpscnn/mpscnn_graph_mask.mm   |  35 ++++--
 .../contrib/ios/mpscnn/mpscnn_kernels.h       |  35 +++++-
 .../mobile/contrib/ios/mpscnn/mpscnn_test.mm  |  67 ++++++++++-
 6 files changed, 257 insertions(+), 29 deletions(-)

diff --git a/caffe2/mobile/contrib/ios/mpscnn/MPSCNN.metal b/caffe2/mobile/contrib/ios/mpscnn/MPSCNN.metal
index 35cb409b29e..90bbe9d0cff 100644
--- a/caffe2/mobile/contrib/ios/mpscnn/MPSCNN.metal
+++ b/caffe2/mobile/contrib/ios/mpscnn/MPSCNN.metal
@@ -654,23 +654,55 @@ kernel void elementwise_mul(texture2d<half, access::read> in0[[texture(0), funct
                             texture2d_array<half, access::write> outa[[texture(2), function_constant(in0_is_arr)]],
                             constant float* in1[[buffer(1)]],
                             ushort3 gid[[thread_position_in_grid]]) {
+  ushort last_dim = ushort_arg_2;
+  ushort idx;
   if (in0_is_tex) {
     if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
       return;
     }
+    idx = gid.y * out.get_width() + gid.x;
   } else {
     if (gid.x >= outa.get_width() || gid.y >= outa.get_height()) {
       return;
     }
+    idx = gid.y * outa.get_width() + gid.x;
   }
   ushort2 gid_ = ushort2(gid.x, gid.y);
   if (in0_is_tex) {
-    out.write(in0.read(gid_) * in1[0], gid_);
+    out.write(in0.read(gid_) * in1[idx % last_dim], gid_);
   } else {
-    outa.write(ina0.read(gid_, gid.z) * in1[0], gid_, gid.z);
+    outa.write(ina0.read(gid_, gid.z) * in1[idx % last_dim], gid_, gid.z);
   }
 }
 
+kernel void elementwise_sub(texture2d<half, access::read> in0[[texture(0), function_constant(in0_is_tex)]],
+                            texture2d_array<half, access::read> ina0[[texture(0), function_constant(in0_is_arr)]],
+                            texture2d<half, access::write> out[[texture(2), function_constant(in0_is_tex)]],
+                            texture2d_array<half, access::write> outa[[texture(2), function_constant(in0_is_arr)]],
+                            constant float* in1[[buffer(1)]],
+                            ushort3 gid[[thread_position_in_grid]]) {
+  ushort last_dim = ushort_arg_2;
+  ushort idx;
+  if (in0_is_tex) {
+    if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
+      return;
+    }
+    idx = gid.y * out.get_width() + gid.x;
+  } else {
+    if (gid.x >= outa.get_width() || gid.y >= outa.get_height()) {
+      return;
+    }
+    idx = gid.y * outa.get_width() + gid.x;
+  }
+  ushort2 gid_ = ushort2(gid.x, gid.y);
+  if (in0_is_tex) {
+    out.write(in0.read(gid_) - in1[idx % last_dim], gid_);
+  } else {
+    outa.write(ina0.read(gid_, gid.z) - in1[idx % last_dim], gid_, gid.z);
+  }
+}
+
+
 kernel void elementwise_add_nonarray(texture2d<half, access::read> in0[[texture(0)]],
                                      texture2d<half, access::read> in1[[texture(1)]],
                                      texture2d<half, access::write> out[[texture(2)]],
diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.h b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.h
index 2ed3531e330..aafcb6ea089 100644
--- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.h
+++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.h
@@ -21,7 +21,7 @@
 namespace caffe2 {
 static constexpr const char* kMPSCNNReadCountArg = "__mpscnn_read_count__";
 static constexpr const char* kMPSCNNOutputIsTempImageArg = "__mpscnn_output_is_temp_img__";
-
+static constexpr const int kMetalMaxTextureArrLength = 2048;
 // We currently only try to convert a fixed set of operators that handle a subset of a full
 // CNN. We also only run when MPSCNN is available, provides a speedup.
 // On failure, returns false. On success, returns true, and sets the MPSCNN net in the output
diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm
index e2f1f067a00..98826536ca3 100644
--- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm
+++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 
-
 #include "caffe2/core/common.h"
 #include "caffe2/core/context.h"
 
@@ -781,11 +780,13 @@ class MPSCNNMulOp final : public Operator<CPUContext> {
  public:
   MPSCNNMulOp(const OperatorDef& operator_def, Workspace* ws)
       : Operator<CPUContext>(operator_def, ws) {
-    OPERATOR_NEEDS_FEATURE(OperatorBase::GetSingleArgument<int>("broadcast", 0) == 1,
-                           "OpenGLMul only supports broadcast");
+    OPERATOR_NEEDS_FEATURE(
+        OperatorBase::GetSingleArgument<int>("broadcast", 0) == 1,
+        "MPSCNNMul only supports broadcast");
 
-    OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false,
-                           "OpenGLMul does not support axis");
+    OPERATOR_NEEDS_FEATURE(
+        OperatorBase::HasArgument("axis") == false,
+        "MPSCNNMul does not support axis");
   }
 
   bool RunOnDevice() override {
@@ -795,11 +796,15 @@ class MPSCNNMulOp final : public Operator<CPUContext> {
     MPSImage* X0 = wrapper0.getImage();
 
     const auto& X1 = Input(1);
-    CAFFE_ENFORCE_EQ(X1.size(), 1); // only scalar is supported
+    CAFFE_ENFORCE_EQ(
+        X1.ndim(),
+        1,
+        "MPSCNNMulOp: Only ndim == 1 for Input(1) is supported for now");
 
-    auto X1_ = [getMPSCNNContext().device newBufferWithBytes:X1.template data<float>()
-                                                      length:sizeof(float)
-                                                     options:MTLResourceOptionCPUCacheModeDefault];
+    auto X1_ = [getMPSCNNContext().device
+        newBufferWithBytes:X1.template data<float>()
+                    length:sizeof(float) * X1.size()
+                   options:MTLResourceOptionCPUCacheModeDefault];
 
     auto outputWrapper = MPSImageWrapper(
         this, &wrapper0, X0.numberOfImages, X0.height, X0.width, X0.featureChannels);
@@ -807,8 +812,12 @@ class MPSCNNMulOp final : public Operator<CPUContext> {
     MPSImage* output = outputWrapper.getImage();
 
     id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];
-    id<MTLComputePipelineState> state = getMPSCNNContext().getSpecializedPipelineState(
-        @"elementwise_mul", {{ushort(X0.numberOfImages), ushort(X0.featureChannels)}});
+    id<MTLComputePipelineState> state =
+        getMPSCNNContext().getSpecializedPipelineState(
+            @"elementwise_mul",
+            {{ushort(X0.numberOfImages),
+              ushort(X0.featureChannels),
+              ushort(X1.dim32(0))}});
 
     [encoder setComputePipelineState:state];
     [encoder setTexture:[X0 texture] atIndex:0];
@@ -828,6 +837,74 @@ class MPSCNNMulOp final : public Operator<CPUContext> {
 REGISTER_CPU_OPERATOR(MPSCNNMul, MPSCNNMulOp);
 OPERATOR_SCHEMA(MPSCNNMul).NumInputs(2).NumOutputs(1).AllowInplace({{0, 0}});
 
+class MPSCNNSubOp final : public Operator<CPUContext> {
+ public:
+  MPSCNNSubOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<CPUContext>(operator_def, ws) {
+    OPERATOR_NEEDS_FEATURE(
+        OperatorBase::GetSingleArgument<int>("broadcast", 0) == 1,
+        "MPSCNNSub only supports broadcast");
+
+    OPERATOR_NEEDS_FEATURE(
+        OperatorBase::HasArgument("axis") == false,
+        "MPSCNNSub does not support axis");
+  }
+
+  bool RunOnDevice() override {
+    caffe2::Timer t;
+
+    auto wrapper0 = Inputs()[0]->Get<MPSImageWrapper>();
+    MPSImage* X0 = wrapper0.getImage();
+
+    const auto& X1 = Input(1);
+    CAFFE_ENFORCE_EQ(
+        X1.ndim(),
+        1,
+        "MPSCNNSubOp: Only ndim == 1 for Input(1) is supported for now");
+
+    auto X1_ = [getMPSCNNContext().device
+        newBufferWithBytes:X1.template data<float>()
+                    length:sizeof(float) * X1.size()
+                   options:MTLResourceOptionCPUCacheModeDefault];
+
+    auto outputWrapper = MPSImageWrapper(
+        this,
+        &wrapper0,
+        X0.numberOfImages,
+        X0.height,
+        X0.width,
+        X0.featureChannels);
+    auto commandBuffer = outputWrapper.getCommandBuffer();
+    MPSImage* output = outputWrapper.getImage();
+
+    id<MTLComputeCommandEncoder> encoder =
+        [commandBuffer computeCommandEncoder];
+    id<MTLComputePipelineState> state =
+        getMPSCNNContext().getSpecializedPipelineState(
+            @"elementwise_sub",
+            {{ushort(X0.numberOfImages),
+              ushort(X0.featureChannels),
+              ushort(X1.dim32(0))}});
+
+    [encoder setComputePipelineState:state];
+    [encoder setTexture:[X0 texture] atIndex:0];
+    [encoder setBuffer:X1_ offset:0 atIndex:1];
+    [encoder setTexture:[output texture] atIndex:2];
+    const auto& launchParams =
+        spatialPointwiseKernelLaunchParams(state, output);
+    [encoder dispatchThreadgroups:launchParams.threadgroupsPerGrid
+            threadsPerThreadgroup:launchParams.threadsPerThreadgroup];
+    [encoder endEncoding];
+    wrapper0.markRead();
+    outputWrapper.copyToOutputBlob(Outputs()[0]);
+    VLOG(2) << "ElementwiseSub took: " << t.MilliSeconds();
+    return true;
+  }
+};
+
+REGISTER_CPU_OPERATOR(MPSCNNSub, MPSCNNSubOp);
+OPERATOR_SCHEMA(MPSCNNSub).NumInputs(2).NumOutputs(1).AllowInplace({{0, 0}});
+
 class MPSCNNAddOp final : public Operator<CPUContext> {
  public:
   MPSCNNAddOp(const OperatorDef& operator_def, Workspace* ws)
@@ -1224,7 +1301,8 @@ class MPSCNNConvTransposeOp final : public ConvTransposeUnpoolBase<CPUContext> {
     CAFFE_ENFORCE_EQ(conv_.outputFeatureChannels, output_channels * kH * kW);
     CAFFE_ENFORCE_EQ(conv_.kernelWidth, 1);
     CAFFE_ENFORCE_EQ(conv_.kernelHeight, 1);
-    if (divRoundUp(X.numberOfImages * output_channels * kH * kW, 4) > MAX_ARR_LENGTH) {
+    if (divRoundUp(X.numberOfImages * output_channels * kH * kW, 4) >
+        kMetalMaxTextureArrLength) {
       LOG(INFO) << "ConvTranspose " << X.numberOfImages << " " << output_channels << " " << kH
                 << " " << kW;
       LOG(ERROR) << "arrayLength exceeds the maximum allowed length in texture";
@@ -1280,8 +1358,6 @@ class MPSCNNConvTransposeOp final : public ConvTransposeUnpoolBase<CPUContext> {
   // Output: Y
   INPUT_TAGS(INPUT, FILTER, BIAS);
   id<MTLBuffer> biasBuffer_;
-  int MAX_ARR_LENGTH = 2048;
-
   MPSCNNConvolution* conv_{nullptr};
 };
 
@@ -1568,6 +1644,13 @@ class MPSCNNRoIWarpOp final : public Operator<CPUContext> {
       inputWrapper.cleanup();
       return false;
     }
+    if (divRoundUp(R.dim32(0) * featureChannels, 4) >
+        kMetalMaxTextureArrLength) {
+      LOG(INFO) << "MPSCNNRoIWarp " << R.dim32(0) << " " << featureChannels;
+      LOG(ERROR) << "arrayLength exceeds the maximum allowed length in texture";
+      inputWrapper.cleanup();
+      return false;
+    }
     auto outputWrapper = MPSImageWrapper(
         this, &inputWrapper, R.dim32(0), pooled_height_, pooled_width_, featureChannels);
     auto commandBuffer = outputWrapper.getCommandBuffer();
diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_graph_mask.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_graph_mask.mm
index df088bb30dd..90e76442e09 100644
--- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_graph_mask.mm
+++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_graph_mask.mm
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 
-
 #include "mpscnn_graph_mask.h"
 #include "caffe2/core/operator.h"
 #include "mpscnn_context.h"
@@ -48,17 +47,35 @@ string as_string(StorageType st) {
 
 std::unordered_map<string, std::vector<StorageType>> inputStorageTypeMap = {
     {"MPSCNNGenerateProposalsCPP",
-     std::vector<StorageType>{
-         StorageType::CPU, StorageType::CPU, StorageType::CPU, StorageType::CPU}},
-    {"MPSCNNRoIWarp", std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE, StorageType::CPU}},
+     std::vector<StorageType>{StorageType::CPU,
+                              StorageType::CPU,
+                              StorageType::CPU,
+                              StorageType::CPU}},
+    {"MPSCNNRoIWarp",
+     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
+                              StorageType::CPU}},
     {"MPSCNNConvRelu",
-     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE, StorageType::CPU, StorageType::CPU}},
+     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
+                              StorageType::CPU,
+                              StorageType::CPU}},
     {"MPSCNNFC",
-     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE, StorageType::CPU, StorageType::CPU}},
+     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
+                              StorageType::CPU,
+                              StorageType::CPU}},
     {"MPSCNNConv",
-     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE, StorageType::CPU, StorageType::CPU}},
+     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
+                              StorageType::CPU,
+                              StorageType::CPU}},
     {"MPSCNNConvTranspose",
-     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE, StorageType::CPU, StorageType::CPU}}};
+     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
+                              StorageType::CPU,
+                              StorageType::CPU}},
+    {"MPSCNNMul",
+     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
+                              StorageType::CPU}},
+    {"MPSCNNSub",
+     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
+                              StorageType::CPU}}};
 std::unordered_map<string, std::vector<StorageType>> outputStorageTypeMap = {
     {"MPSCNNGenerateProposalsCPP", std::vector<StorageType>{StorageType::CPU, StorageType::CPU}}};
 std::vector<string> opsNeedsSync = {"MPSCNNGenerateProposalsCPP", "CopyFromMPSCNN", "CopyToMPSCNN"};
@@ -430,7 +447,7 @@ NetDef insertCopies(const NetDef& def) {
         expectedBlobType = StorageType::CPU;
       }
       auto inputBlob = ogOp.input(j);
-      auto version = analysis.ssa[i].outVersions[inputBlob];
+      auto version = analysis.ssa[i].inVersions[inputBlob];
       // Check whether the blob is produced by previous operators
       if (analysis.blobInfoMap.find(inputBlob) != analysis.blobInfoMap.end() &&
           analysis.blobInfoMap[inputBlob][version].storageType != StorageType::INVALID) {
diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_kernels.h b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_kernels.h
index 25d6f91a209..ff1b0108454 100644
--- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_kernels.h
+++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_kernels.h
@@ -674,20 +674,51 @@ kernel void elementwise_mul(texture2d<half, access::read> in0[[texture(0), funct
                             texture2d_array<half, access::write> outa[[texture(2), function_constant(in0_is_arr)]],
                             constant float* in1[[buffer(1)]],
                             ushort3 gid[[thread_position_in_grid]]) {
+  ushort last_dim = ushort_arg_2;
+  ushort idx;
   if (in0_is_tex) {
     if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
       return;
     }
+    idx = gid.y * out.get_width() + gid.x;
   } else {
     if (gid.x >= outa.get_width() || gid.y >= outa.get_height()) {
       return;
     }
+    idx = gid.y * outa.get_width() + gid.x;
   }
   ushort2 gid_ = ushort2(gid.x, gid.y);
   if (in0_is_tex) {
-    out.write(in0.read(gid_) * in1[0], gid_);
+    out.write(in0.read(gid_) * in1[idx % last_dim], gid_);
   } else {
-    outa.write(ina0.read(gid_, gid.z) * in1[0], gid_, gid.z);
+    outa.write(ina0.read(gid_, gid.z) * in1[idx % last_dim], gid_, gid.z);
+  }
+}
+
+kernel void elementwise_sub(texture2d<half, access::read> in0[[texture(0), function_constant(in0_is_tex)]],
+                            texture2d_array<half, access::read> ina0[[texture(0), function_constant(in0_is_arr)]],
+                            texture2d<half, access::write> out[[texture(2), function_constant(in0_is_tex)]],
+                            texture2d_array<half, access::write> outa[[texture(2), function_constant(in0_is_arr)]],
+                            constant float* in1[[buffer(1)]],
+                            ushort3 gid[[thread_position_in_grid]]) {
+  ushort last_dim = ushort_arg_2;
+  ushort idx;
+  if (in0_is_tex) {
+    if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
+      return;
+    }
+    idx = gid.y * out.get_width() + gid.x;
+  } else {
+    if (gid.x >= outa.get_width() || gid.y >= outa.get_height()) {
+      return;
+    }
+    idx = gid.y * outa.get_width() + gid.x;
+  }
+  ushort2 gid_ = ushort2(gid.x, gid.y);
+  if (in0_is_tex) {
+    out.write(in0.read(gid_) - in1[idx % last_dim], gid_);
+  } else {
+    outa.write(ina0.read(gid_, gid.z) - in1[idx % last_dim], gid_, gid.z);
   }
 }
 
diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm
index 3bd945871c8..f03faefd23c 100644
--- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm
+++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm
@@ -1582,7 +1582,7 @@ void testMPSCNN() {
 
     {
       auto* t = ws.CreateBlob("X1_cpu")->GetMutable<TensorCPU>();
-      t->Resize(1);
+      t->Resize(72);
       CPUContext ctx;
       math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
     }
@@ -1633,6 +1633,71 @@ void testMPSCNN() {
     }
   }
 
+  {
+    LOG(INFO) << "MPSCNNSub Test";
+    Workspace ws;
+    {
+      auto* t = ws.CreateBlob("X0_cpu")->GetMutable<TensorCPU>();
+      t->Resize(1, 12, 57, 72);
+      CPUContext ctx;
+      math::RandGaussian<float, CPUContext>(
+          t->size(), 0, 1, t->mutable_data<float>(), &ctx);
+    }
+
+    {
+      auto* t = ws.CreateBlob("X1_cpu")->GetMutable<TensorCPU>();
+      t->Resize(72);
+      CPUContext ctx;
+      math::RandGaussian<float, CPUContext>(
+          t->size(), 0, 1, t->mutable_data<float>(), &ctx);
+    }
+
+    NetDef netdef;
+    {
+      auto& op = *(netdef.add_op());
+      op.set_type("CopyToMPSCNN");
+      op.add_input("X0_cpu");
+      op.add_output("X0_mtl");
+    }
+
+    {
+      auto& op = *(netdef.add_op());
+      op.set_type("MPSCNNSub");
+      op.add_input("X0_mtl");
+      op.add_input("X1_cpu");
+      op.add_output("Y_mtl");
+      add_arg_int(op, "broadcast", 1);
+    }
+
+    {
+      auto& op = *(netdef.add_op());
+      op.set_type("CopyFromMPSCNN");
+      op.add_input("Y_mtl");
+      op.add_output("Y_cpu");
+    }
+
+    {
+      auto& op = *(netdef.add_op());
+      op.set_type("Sub");
+      op.add_input("X0_cpu");
+      op.add_input("X1_cpu");
+      op.add_output("Y_ref");
+      add_arg_int(op, "broadcast", 1);
+    }
+
+    ws.RunNetOnce(netdef);
+    const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
+    const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
+
+    CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
+    for (auto i = 0; i < t1.size(); ++i) {
+      // FP16 <-> FP32 round trip, accumulation, etc.
+      const float t1_i = t1.data<float>()[i];
+      const float t2_i = t2.data<float>()[i];
+      CHECK_NEAR(t1_i, t2_i, 0.01);
+    }
+  }
+
   {
     LOG(INFO) << "MPSAdd Test";
     Workspace ws;