Some bug fixes in MPSCNN backend

Summary: att

Reviewed By: ajtulloch

Differential Revision: D6037723

fbshipit-source-id: d7405b27089210abfd48a33ecee47a87f67ae9a0
Jerry Zhang 2017-10-16 18:18:04 -07:00 committed by Facebook Github Bot
parent 6ac393a32b
commit 4ac8ecb76e
6 changed files with 257 additions and 29 deletions

@@ -654,23 +654,55 @@ kernel void elementwise_mul(texture2d<half, access::read> in0[[texture(0), funct
                             texture2d_array<half, access::write> outa[[texture(2), function_constant(in0_is_arr)]],
                             constant float* in1[[buffer(1)]],
                             ushort3 gid[[thread_position_in_grid]]) {
+  ushort last_dim = ushort_arg_2;
+  ushort idx;
+  if (in0_is_tex) {
+    if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
+      return;
+    }
+    idx = gid.y * out.get_width() + gid.x;
+  } else {
+    if (gid.x >= outa.get_width() || gid.y >= outa.get_height()) {
+      return;
+    }
+    idx = gid.y * outa.get_width() + gid.x;
+  }
   ushort2 gid_ = ushort2(gid.x, gid.y);
   if (in0_is_tex) {
-    out.write(in0.read(gid_) * in1[0], gid_);
+    out.write(in0.read(gid_) * in1[idx % last_dim], gid_);
   } else {
-    outa.write(ina0.read(gid_, gid.z) * in1[0], gid_, gid.z);
+    outa.write(ina0.read(gid_, gid.z) * in1[idx % last_dim], gid_, gid.z);
   }
 }
+
+kernel void elementwise_sub(texture2d<half, access::read> in0[[texture(0), function_constant(in0_is_tex)]],
+                            texture2d_array<half, access::read> ina0[[texture(0), function_constant(in0_is_arr)]],
+                            texture2d<half, access::write> out[[texture(2), function_constant(in0_is_tex)]],
+                            texture2d_array<half, access::write> outa[[texture(2), function_constant(in0_is_arr)]],
+                            constant float* in1[[buffer(1)]],
+                            ushort3 gid[[thread_position_in_grid]]) {
+  ushort last_dim = ushort_arg_2;
+  ushort idx;
+  if (in0_is_tex) {
+    if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
+      return;
+    }
+    idx = gid.y * out.get_width() + gid.x;
+  } else {
+    if (gid.x >= outa.get_width() || gid.y >= outa.get_height()) {
+      return;
+    }
+    idx = gid.y * outa.get_width() + gid.x;
+  }
+  ushort2 gid_ = ushort2(gid.x, gid.y);
+  if (in0_is_tex) {
+    out.write(in0.read(gid_) - in1[idx % last_dim], gid_);
+  } else {
+    outa.write(ina0.read(gid_, gid.z) - in1[idx % last_dim], gid_, gid.z);
+  }
+}
+
 kernel void elementwise_add_nonarray(texture2d<half, access::read> in0[[texture(0)]],
                                      texture2d<half, access::read> in1[[texture(1)]],
                                      texture2d<half, access::write> out[[texture(2)]],

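The kernel change above is the core of the fix: instead of always reading in1[0], each thread now computes a flat pixel index and wraps it by last_dim, so a rank-1 second input broadcasts along the last dimension. As a minimal sketch of the same addressing in plain C++ (broadcast_mul, height, and width are illustrative names, not identifiers from this diff):

#include <cstddef>
#include <vector>

// Reference semantics of the broadcast elementwise_mul kernel above (a
// sketch, not the shipped Metal code): in0 is a height x width image in
// row-major order, and in1 holds last_dim values that are tiled across
// the image via idx % last_dim.
std::vector<float> broadcast_mul(const std::vector<float>& in0,
                                 const std::vector<float>& in1,
                                 std::size_t height,
                                 std::size_t width) {
  const std::size_t last_dim = in1.size();
  std::vector<float> out(in0.size());
  for (std::size_t y = 0; y < height; ++y) {
    for (std::size_t x = 0; x < width; ++x) {
      const std::size_t idx = y * width + x; // same flat index the kernel builds
      out[idx] = in0[idx] * in1[idx % last_dim];
    }
  }
  return out;
}

When the image width equals last_dim, as in the tests below (W = 72 and X1 has 72 elements), idx % last_dim reduces to the column index x, which matches Caffe2's broadcast-along-the-last-dimension semantics.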
@@ -21,7 +21,7 @@
 namespace caffe2 {

 static constexpr const char* kMPSCNNReadCountArg = "__mpscnn_read_count__";
 static constexpr const char* kMPSCNNOutputIsTempImageArg = "__mpscnn_output_is_temp_img__";
+static constexpr const int kMetalMaxTextureArrLength = 2048;

 // We currently only try to convert a fixed set of operators that handle a subset of a full
 // CNN. We also only run when MPSCNN is available and provides a speedup.
 // On failure, returns false. On success, returns true, and sets the MPSCNN net in the output
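For context, the conversion entry point this comment describes is used roughly as sketched below; tryConvertToMPSCNN and its exact signature are assumptions based on the comment, not something shown in this diff:

#include "caffe2/proto/caffe2.pb.h" // NetDef

// Sketch only: assumes a bool tryConvertToMPSCNN(initNet, predictNet, out)
// declared near the comment above.
bool runOnMPSCNNIfPossible(const caffe2::NetDef& initNet,
                           const caffe2::NetDef& predictNet) {
  caffe2::NetDef mpscnnNet;
  if (!caffe2::tryConvertToMPSCNN(initNet, predictNet, &mpscnnNet)) {
    return false; // conversion refused; caller keeps the CPU net
  }
  // mpscnnNet now holds the MPSCNN operators plus the CopyToMPSCNN /
  // CopyFromMPSCNN ops inserted at the CPU boundaries; run it instead.
  return true;
}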

@@ -14,7 +14,6 @@
  * limitations under the License.
  */

 #include "caffe2/core/common.h"
 #include "caffe2/core/context.h"
@@ -781,11 +780,13 @@ class MPSCNNMulOp final : public Operator<CPUContext> {
  public:
   MPSCNNMulOp(const OperatorDef& operator_def, Workspace* ws)
       : Operator<CPUContext>(operator_def, ws) {
-    OPERATOR_NEEDS_FEATURE(OperatorBase::GetSingleArgument<int>("broadcast", 0) == 1,
-                           "OpenGLMul only supports broadcast");
+    OPERATOR_NEEDS_FEATURE(
+        OperatorBase::GetSingleArgument<int>("broadcast", 0) == 1,
+        "MPSCNNMul only supports broadcast");
-    OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false,
-                           "OpenGLMul does not support axis");
+    OPERATOR_NEEDS_FEATURE(
+        OperatorBase::HasArgument("axis") == false,
+        "MPSCNNMul does not support axis");
   }

   bool RunOnDevice() override {
@@ -795,11 +796,15 @@ class MPSCNNMulOp final : public Operator<CPUContext> {
     MPSImage* X0 = wrapper0.getImage();

     const auto& X1 = Input(1);
-    CAFFE_ENFORCE_EQ(X1.size(), 1); // only scalar is supported
+    CAFFE_ENFORCE_EQ(
+        X1.ndim(),
+        1,
+        "MPSCNNMulOp: Only ndim == 1 for Input(1) is supported for now");
-    auto X1_ = [getMPSCNNContext().device newBufferWithBytes:X1.template data<float>()
-                                                      length:sizeof(float)
-                                                     options:MTLResourceOptionCPUCacheModeDefault];
+    auto X1_ = [getMPSCNNContext().device
+        newBufferWithBytes:X1.template data<float>()
+                    length:sizeof(float) * X1.size()
+                   options:MTLResourceOptionCPUCacheModeDefault];

     auto outputWrapper = MPSImageWrapper(
         this, &wrapper0, X0.numberOfImages, X0.height, X0.width, X0.featureChannels);
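The second change in this hunk is the actual bug fix: the Metal buffer was previously created with length:sizeof(float), room for a single element, so only in1[0] was ever readable; it is now sized for every element of X1. The same mistake in plain C++ terms (copyForDevice is an illustrative name, not from the diff):

#include <cstddef>
#include <cstring>
#include <vector>

// Sketch of the sizing bug fixed above: copying n floats into storage
// sized with sizeof(float) instead of sizeof(float) * n leaves every
// element past index 0 unusable, which is why the old kernel could only
// apply a scalar.
std::vector<float> copyForDevice(const float* src, std::size_t n) {
  std::vector<float> buf(n); // fixed: sizeof(float) * n bytes
  std::memcpy(buf.data(), src, sizeof(float) * n);
  return buf;
}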
@@ -807,8 +812,12 @@ class MPSCNNMulOp final : public Operator<CPUContext> {
     MPSImage* output = outputWrapper.getImage();

     id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];
-    id<MTLComputePipelineState> state = getMPSCNNContext().getSpecializedPipelineState(
-        @"elementwise_mul", {{ushort(X0.numberOfImages), ushort(X0.featureChannels)}});
+    id<MTLComputePipelineState> state =
+        getMPSCNNContext().getSpecializedPipelineState(
+            @"elementwise_mul",
+            {{ushort(X0.numberOfImages),
+              ushort(X0.featureChannels),
+              ushort(X1.dim32(0))}});

     [encoder setComputePipelineState:state];
     [encoder setTexture:[X0 texture] atIndex:0];
@@ -828,6 +837,74 @@ class MPSCNNMulOp final : public Operator<CPUContext> {
 REGISTER_CPU_OPERATOR(MPSCNNMul, MPSCNNMulOp);
 OPERATOR_SCHEMA(MPSCNNMul).NumInputs(2).NumOutputs(1).AllowInplace({{0, 0}});
+
+class MPSCNNSubOp final : public Operator<CPUContext> {
+ public:
+  MPSCNNSubOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<CPUContext>(operator_def, ws) {
+    OPERATOR_NEEDS_FEATURE(
+        OperatorBase::GetSingleArgument<int>("broadcast", 0) == 1,
+        "MPSCNNSub only supports broadcast");
+    OPERATOR_NEEDS_FEATURE(
+        OperatorBase::HasArgument("axis") == false,
+        "MPSCNNSub does not support axis");
+  }
+
+  bool RunOnDevice() override {
+    caffe2::Timer t;
+    auto wrapper0 = Inputs()[0]->Get<MPSImageWrapper>();
+    MPSImage* X0 = wrapper0.getImage();
+
+    const auto& X1 = Input(1);
+    CAFFE_ENFORCE_EQ(
+        X1.ndim(),
+        1,
+        "MPSCNNSubOp: Only ndim == 1 for Input(1) is supported for now");
+    auto X1_ = [getMPSCNNContext().device
+        newBufferWithBytes:X1.template data<float>()
+                    length:sizeof(float) * X1.size()
+                   options:MTLResourceOptionCPUCacheModeDefault];
+
+    auto outputWrapper = MPSImageWrapper(
+        this,
+        &wrapper0,
+        X0.numberOfImages,
+        X0.height,
+        X0.width,
+        X0.featureChannels);
+    auto commandBuffer = outputWrapper.getCommandBuffer();
+    MPSImage* output = outputWrapper.getImage();
+
+    id<MTLComputeCommandEncoder> encoder =
+        [commandBuffer computeCommandEncoder];
+    id<MTLComputePipelineState> state =
+        getMPSCNNContext().getSpecializedPipelineState(
+            @"elementwise_sub",
+            {{ushort(X0.numberOfImages),
+              ushort(X0.featureChannels),
+              ushort(X1.dim32(0))}});
+
+    [encoder setComputePipelineState:state];
+    [encoder setTexture:[X0 texture] atIndex:0];
+    [encoder setBuffer:X1_ offset:0 atIndex:1];
+    [encoder setTexture:[output texture] atIndex:2];
+
+    const auto& launchParams =
+        spatialPointwiseKernelLaunchParams(state, output);
+    [encoder dispatchThreadgroups:launchParams.threadgroupsPerGrid
+            threadsPerThreadgroup:launchParams.threadsPerThreadgroup];
+    [encoder endEncoding];
+    wrapper0.markRead();
+    outputWrapper.copyToOutputBlob(Outputs()[0]);
+
+    VLOG(2) << "ElementwiseSub took: " << t.MilliSeconds();
+    return true;
+  }
+};
+
+REGISTER_CPU_OPERATOR(MPSCNNSub, MPSCNNSubOp);
+OPERATOR_SCHEMA(MPSCNNSub).NumInputs(2).NumOutputs(1).AllowInplace({{0, 0}});

 class MPSCNNAddOp final : public Operator<CPUContext> {
  public:
   MPSCNNAddOp(const OperatorDef& operator_def, Workspace* ws)
@@ -1224,7 +1301,8 @@ class MPSCNNConvTransposeOp final : public ConvTransposeUnpoolBase<CPUContext> {
     CAFFE_ENFORCE_EQ(conv_.outputFeatureChannels, output_channels * kH * kW);
     CAFFE_ENFORCE_EQ(conv_.kernelWidth, 1);
     CAFFE_ENFORCE_EQ(conv_.kernelHeight, 1);
-    if (divRoundUp(X.numberOfImages * output_channels * kH * kW, 4) > MAX_ARR_LENGTH) {
+    if (divRoundUp(X.numberOfImages * output_channels * kH * kW, 4) >
+        kMetalMaxTextureArrLength) {
       LOG(INFO) << "ConvTranspose " << X.numberOfImages << " " << output_channels << " " << kH
                 << " " << kW;
       LOG(ERROR) << "arrayLength exceeds the maximum allowed length in texture";
@@ -1280,8 +1358,6 @@ class MPSCNNConvTransposeOp final : public ConvTransposeUnpoolBase<CPUContext> {
   // Output: Y
   INPUT_TAGS(INPUT, FILTER, BIAS);
   id<MTLBuffer> biasBuffer_;
-  int MAX_ARR_LENGTH = 2048;
   MPSCNNConvolution* conv_{nullptr};
 };
@@ -1568,6 +1644,13 @@ class MPSCNNRoIWarpOp final : public Operator<CPUContext> {
       inputWrapper.cleanup();
       return false;
     }
+    if (divRoundUp(R.dim32(0) * featureChannels, 4) >
+        kMetalMaxTextureArrLength) {
+      LOG(INFO) << "MPSCNNRoIWarp " << R.dim32(0) << " " << featureChannels;
+      LOG(ERROR) << "arrayLength exceeds the maximum allowed length in texture";
+      inputWrapper.cleanup();
+      return false;
+    }
     auto outputWrapper = MPSImageWrapper(
         this, &inputWrapper, R.dim32(0), pooled_height_, pooled_width_, featureChannels);
     auto commandBuffer = outputWrapper.getCommandBuffer();

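Both new guards follow the same pattern: MPSCNN packs four feature channels per texture slice, and Metal caps a texture2d_array's arrayLength at the value now named kMetalMaxTextureArrLength (2048). A sketch of the check; the body of divRoundUp is an assumption, since only its call sites appear in this diff and ceil division is the standard reading of the name:

// Ceil division, as the name divRoundUp suggests (assumed; not shown here).
inline int divRoundUp(int x, int y) { return (x + y - 1) / y; }

constexpr int kMetalMaxTextureArrLength = 2048;

// One texture slice holds 4 channels, so N images of C channels need
// divRoundUp(N * C, 4) slices; anything beyond the cap must fall back.
bool fitsInTextureArray(int numImages, int channels) {
  return divRoundUp(numImages * channels, 4) <= kMetalMaxTextureArrLength;
}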
@@ -14,7 +14,6 @@
  * limitations under the License.
  */

 #include "mpscnn_graph_mask.h"
 #include "caffe2/core/operator.h"
 #include "mpscnn_context.h"
@@ -48,17 +47,35 @@ string as_string(StorageType st) {
 std::unordered_map<string, std::vector<StorageType>> inputStorageTypeMap = {
     {"MPSCNNGenerateProposalsCPP",
-     std::vector<StorageType>{
-         StorageType::CPU, StorageType::CPU, StorageType::CPU, StorageType::CPU}},
-    {"MPSCNNRoIWarp", std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE, StorageType::CPU}},
+     std::vector<StorageType>{StorageType::CPU,
+                              StorageType::CPU,
+                              StorageType::CPU,
+                              StorageType::CPU}},
+    {"MPSCNNRoIWarp",
+     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
+                              StorageType::CPU}},
     {"MPSCNNConvRelu",
-     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE, StorageType::CPU, StorageType::CPU}},
+     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
+                              StorageType::CPU,
+                              StorageType::CPU}},
     {"MPSCNNFC",
-     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE, StorageType::CPU, StorageType::CPU}},
+     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
+                              StorageType::CPU,
+                              StorageType::CPU}},
     {"MPSCNNConv",
-     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE, StorageType::CPU, StorageType::CPU}},
+     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
+                              StorageType::CPU,
+                              StorageType::CPU}},
     {"MPSCNNConvTranspose",
-     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE, StorageType::CPU, StorageType::CPU}}};
+     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
+                              StorageType::CPU,
+                              StorageType::CPU}},
+    {"MPSCNNMul",
+     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
+                              StorageType::CPU}},
+    {"MPSCNNSub",
+     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
+                              StorageType::CPU}}};

 std::unordered_map<string, std::vector<StorageType>> outputStorageTypeMap = {
     {"MPSCNNGenerateProposalsCPP", std::vector<StorageType>{StorageType::CPU, StorageType::CPU}}};

 std::vector<string> opsNeedsSync = {"MPSCNNGenerateProposalsCPP", "CopyFromMPSCNN", "CopyToMPSCNN"};
@@ -430,7 +447,7 @@ NetDef insertCopies(const NetDef& def) {
           expectedBlobType = StorageType::CPU;
         }
         auto inputBlob = ogOp.input(j);
-        auto version = analysis.ssa[i].outVersions[inputBlob];
+        auto version = analysis.ssa[i].inVersions[inputBlob];
         // Check whether the blob is produced by previous operators
         if (analysis.blobInfoMap.find(inputBlob) != analysis.blobInfoMap.end() &&
             analysis.blobInfoMap[inputBlob][version].storageType != StorageType::INVALID) {

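The one-line change above fixes a lookup in the copy-insertion pass: when type-checking operator i's j-th input, the relevant SSA version of the blob is the one it carries before the op runs (inVersions), not the one the op produces (outVersions); the two differ exactly for in-place ops, which this backend allows via AllowInplace. A sketch of the bookkeeping with hypothetical structures (the real ones live in mpscnn_graph_mask.mm):

#include <cstddef>
#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical shape of the SSA records implied by the fix above.
struct SSAEntry {
  std::unordered_map<std::string, int> inVersions;  // version each input is read at
  std::unordered_map<std::string, int> outVersions; // version each output is written at
};

// For an in-place op, blob "X" may have inVersions["X"] == k but
// outVersions["X"] == k + 1; indexing blobInfoMap with the out version
// would describe the op's own output rather than the input being checked.
int inputVersion(const std::vector<SSAEntry>& ssa,
                 std::size_t i,
                 const std::string& blob) {
  return ssa[i].inVersions.at(blob); // the corrected lookup
}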
@@ -674,20 +674,51 @@ kernel void elementwise_mul(texture2d<half, access::read> in0[[texture(0), funct
                             texture2d_array<half, access::write> outa[[texture(2), function_constant(in0_is_arr)]],
                             constant float* in1[[buffer(1)]],
                             ushort3 gid[[thread_position_in_grid]]) {
+  ushort last_dim = ushort_arg_2;
+  ushort idx;
+  if (in0_is_tex) {
+    if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
+      return;
+    }
+    idx = gid.y * out.get_width() + gid.x;
+  } else {
+    if (gid.x >= outa.get_width() || gid.y >= outa.get_height()) {
+      return;
+    }
+    idx = gid.y * outa.get_width() + gid.x;
+  }
   ushort2 gid_ = ushort2(gid.x, gid.y);
   if (in0_is_tex) {
-    out.write(in0.read(gid_) * in1[0], gid_);
+    out.write(in0.read(gid_) * in1[idx % last_dim], gid_);
   } else {
-    outa.write(ina0.read(gid_, gid.z) * in1[0], gid_, gid.z);
+    outa.write(ina0.read(gid_, gid.z) * in1[idx % last_dim], gid_, gid.z);
   }
 }
+
+kernel void elementwise_sub(texture2d<half, access::read> in0[[texture(0), function_constant(in0_is_tex)]],
+                            texture2d_array<half, access::read> ina0[[texture(0), function_constant(in0_is_arr)]],
+                            texture2d<half, access::write> out[[texture(2), function_constant(in0_is_tex)]],
+                            texture2d_array<half, access::write> outa[[texture(2), function_constant(in0_is_arr)]],
+                            constant float* in1[[buffer(1)]],
+                            ushort3 gid[[thread_position_in_grid]]) {
+  ushort last_dim = ushort_arg_2;
+  ushort idx;
+  if (in0_is_tex) {
+    if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
+      return;
+    }
+    idx = gid.y * out.get_width() + gid.x;
+  } else {
+    if (gid.x >= outa.get_width() || gid.y >= outa.get_height()) {
+      return;
+    }
+    idx = gid.y * outa.get_width() + gid.x;
+  }
+  ushort2 gid_ = ushort2(gid.x, gid.y);
+  if (in0_is_tex) {
+    out.write(in0.read(gid_) - in1[idx % last_dim], gid_);
+  } else {
+    outa.write(ina0.read(gid_, gid.z) - in1[idx % last_dim], gid_, gid.z);
+  }
+}

@@ -1582,7 +1582,7 @@ void testMPSCNN() {
     {
       auto* t = ws.CreateBlob("X1_cpu")->GetMutable<TensorCPU>();
-      t->Resize(1);
+      t->Resize(72);
       CPUContext ctx;
       math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
     }
@@ -1633,6 +1633,71 @@ void testMPSCNN() {
     }
   }

+  {
+    LOG(INFO) << "MPSCNNSub Test";
+    Workspace ws;
+    {
+      auto* t = ws.CreateBlob("X0_cpu")->GetMutable<TensorCPU>();
+      t->Resize(1, 12, 57, 72);
+      CPUContext ctx;
+      math::RandGaussian<float, CPUContext>(
+          t->size(), 0, 1, t->mutable_data<float>(), &ctx);
+    }
+    {
+      auto* t = ws.CreateBlob("X1_cpu")->GetMutable<TensorCPU>();
+      t->Resize(72);
+      CPUContext ctx;
+      math::RandGaussian<float, CPUContext>(
+          t->size(), 0, 1, t->mutable_data<float>(), &ctx);
+    }
+
+    NetDef netdef;
+    {
+      auto& op = *(netdef.add_op());
+      op.set_type("CopyToMPSCNN");
+      op.add_input("X0_cpu");
+      op.add_output("X0_mtl");
+    }
+    {
+      auto& op = *(netdef.add_op());
+      op.set_type("MPSCNNSub");
+      op.add_input("X0_mtl");
+      op.add_input("X1_cpu");
+      op.add_output("Y_mtl");
+      add_arg_int(op, "broadcast", 1);
+    }
+    {
+      auto& op = *(netdef.add_op());
+      op.set_type("CopyFromMPSCNN");
+      op.add_input("Y_mtl");
+      op.add_output("Y_cpu");
+    }
+    {
+      auto& op = *(netdef.add_op());
+      op.set_type("Sub");
+      op.add_input("X0_cpu");
+      op.add_input("X1_cpu");
+      op.add_output("Y_ref");
+      add_arg_int(op, "broadcast", 1);
+    }
+
+    ws.RunNetOnce(netdef);
+    const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
+    const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
+    CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
+
+    for (auto i = 0; i < t1.size(); ++i) {
+      // FP16 <-> FP32 round trip, accumulation, etc.
+      const float t1_i = t1.data<float>()[i];
+      const float t2_i = t2.data<float>()[i];
+      CHECK_NEAR(t1_i, t2_i, 0.01);
+    }
+  }
+
   {
     LOG(INFO) << "MPSAdd Test";
     Workspace ws;