From 4ac8ecb76ef2aea628b30e5e8a5c8dff9b67009d Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Mon, 16 Oct 2017 18:18:04 -0700 Subject: [PATCH] Some bug-fixs in mpscnn backend Summary: att Reviewed By: ajtulloch Differential Revision: D6037723 fbshipit-source-id: d7405b27089210abfd48a33ecee47a87f67ae9a0 --- caffe2/mobile/contrib/ios/mpscnn/MPSCNN.metal | 36 +++++- caffe2/mobile/contrib/ios/mpscnn/mpscnn.h | 2 +- caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm | 111 +++++++++++++++--- .../contrib/ios/mpscnn/mpscnn_graph_mask.mm | 35 ++++-- .../contrib/ios/mpscnn/mpscnn_kernels.h | 35 +++++- .../mobile/contrib/ios/mpscnn/mpscnn_test.mm | 67 ++++++++++- 6 files changed, 257 insertions(+), 29 deletions(-) diff --git a/caffe2/mobile/contrib/ios/mpscnn/MPSCNN.metal b/caffe2/mobile/contrib/ios/mpscnn/MPSCNN.metal index 35cb409b29e..90bbe9d0cff 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/MPSCNN.metal +++ b/caffe2/mobile/contrib/ios/mpscnn/MPSCNN.metal @@ -654,23 +654,55 @@ kernel void elementwise_mul(texture2d in0[[texture(0), funct texture2d_array outa[[texture(2), function_constant(in0_is_arr)]], constant float* in1[[buffer(1)]], ushort3 gid[[thread_position_in_grid]]) { + ushort last_dim = ushort_arg_2; + ushort idx; if (in0_is_tex) { if (gid.x >= out.get_width() || gid.y >= out.get_height()) { return; } + idx = gid.y * out.get_width() + gid.x; } else { if (gid.x >= outa.get_width() || gid.y >= outa.get_height()) { return; } + idx = gid.y * outa.get_width() + gid.x; } ushort2 gid_ = ushort2(gid.x, gid.y); if (in0_is_tex) { - out.write(in0.read(gid_) * in1[0], gid_); + out.write(in0.read(gid_) * in1[idx % last_dim], gid_); } else { - outa.write(ina0.read(gid_, gid.z) * in1[0], gid_, gid.z); + outa.write(ina0.read(gid_, gid.z) * in1[idx % last_dim], gid_, gid.z); } } +kernel void elementwise_sub(texture2d in0[[texture(0), function_constant(in0_is_tex)]], + texture2d_array ina0[[texture(0), function_constant(in0_is_arr)]], + texture2d out[[texture(2), function_constant(in0_is_tex)]], + texture2d_array outa[[texture(2), function_constant(in0_is_arr)]], + constant float* in1[[buffer(1)]], + ushort3 gid[[thread_position_in_grid]]) { + ushort last_dim = ushort_arg_2; + ushort idx; + if (in0_is_tex) { + if (gid.x >= out.get_width() || gid.y >= out.get_height()) { + return; + } + idx = gid.y * out.get_width() + gid.x; + } else { + if (gid.x >= outa.get_width() || gid.y >= outa.get_height()) { + return; + } + idx = gid.y * outa.get_width() + gid.x; + } + ushort2 gid_ = ushort2(gid.x, gid.y); + if (in0_is_tex) { + out.write(in0.read(gid_) - in1[idx % last_dim], gid_); + } else { + outa.write(ina0.read(gid_, gid.z) - in1[idx % last_dim], gid_, gid.z); + } +} + + kernel void elementwise_add_nonarray(texture2d in0[[texture(0)]], texture2d in1[[texture(1)]], texture2d out[[texture(2)]], diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.h b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.h index 2ed3531e330..aafcb6ea089 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.h +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.h @@ -21,7 +21,7 @@ namespace caffe2 { static constexpr const char* kMPSCNNReadCountArg = "__mpscnn_read_count__"; static constexpr const char* kMPSCNNOutputIsTempImageArg = "__mpscnn_output_is_temp_img__"; - +static constexpr const int kMetalMaxTextureArrLength = 2048; // We currently only try to convert a fixed set of operators that handle a subset of a full // CNN. We also only run when MPSCNN is available, provides a speedup. // On failure, returns false. On success, returns true, and sets the MPSCNN net in the output diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm index e2f1f067a00..98826536ca3 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm @@ -14,7 +14,6 @@ * limitations under the License. */ - #include "caffe2/core/common.h" #include "caffe2/core/context.h" @@ -781,11 +780,13 @@ class MPSCNNMulOp final : public Operator { public: MPSCNNMulOp(const OperatorDef& operator_def, Workspace* ws) : Operator(operator_def, ws) { - OPERATOR_NEEDS_FEATURE(OperatorBase::GetSingleArgument("broadcast", 0) == 1, - "OpenGLMul only supports broadcast"); + OPERATOR_NEEDS_FEATURE( + OperatorBase::GetSingleArgument("broadcast", 0) == 1, + "MPSCNNMul only supports broadcast"); - OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false, - "OpenGLMul does not support axis"); + OPERATOR_NEEDS_FEATURE( + OperatorBase::HasArgument("axis") == false, + "MPSCNNMul does not support axis"); } bool RunOnDevice() override { @@ -795,11 +796,15 @@ class MPSCNNMulOp final : public Operator { MPSImage* X0 = wrapper0.getImage(); const auto& X1 = Input(1); - CAFFE_ENFORCE_EQ(X1.size(), 1); // only scalar is supported + CAFFE_ENFORCE_EQ( + X1.ndim(), + 1, + "MPSCNNMulOp: Only ndim == 1 for Input(1) is supported for now"); - auto X1_ = [getMPSCNNContext().device newBufferWithBytes:X1.template data() - length:sizeof(float) - options:MTLResourceOptionCPUCacheModeDefault]; + auto X1_ = [getMPSCNNContext().device + newBufferWithBytes:X1.template data() + length:sizeof(float) * X1.size() + options:MTLResourceOptionCPUCacheModeDefault]; auto outputWrapper = MPSImageWrapper( this, &wrapper0, X0.numberOfImages, X0.height, X0.width, X0.featureChannels); @@ -807,8 +812,12 @@ class MPSCNNMulOp final : public Operator { MPSImage* output = outputWrapper.getImage(); id encoder = [commandBuffer computeCommandEncoder]; - id state = getMPSCNNContext().getSpecializedPipelineState( - @"elementwise_mul", {{ushort(X0.numberOfImages), ushort(X0.featureChannels)}}); + id state = + getMPSCNNContext().getSpecializedPipelineState( + @"elementwise_mul", + {{ushort(X0.numberOfImages), + ushort(X0.featureChannels), + ushort(X1.dim32(0))}}); [encoder setComputePipelineState:state]; [encoder setTexture:[X0 texture] atIndex:0]; @@ -828,6 +837,74 @@ class MPSCNNMulOp final : public Operator { REGISTER_CPU_OPERATOR(MPSCNNMul, MPSCNNMulOp); OPERATOR_SCHEMA(MPSCNNMul).NumInputs(2).NumOutputs(1).AllowInplace({{0, 0}}); +class MPSCNNSubOp final : public Operator { + public: + MPSCNNSubOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) { + OPERATOR_NEEDS_FEATURE( + OperatorBase::GetSingleArgument("broadcast", 0) == 1, + "MPSCNNSub only supports broadcast"); + + OPERATOR_NEEDS_FEATURE( + OperatorBase::HasArgument("axis") == false, + "MPSCNNSub does not support axis"); + } + + bool RunOnDevice() override { + caffe2::Timer t; + + auto wrapper0 = Inputs()[0]->Get(); + MPSImage* X0 = wrapper0.getImage(); + + const auto& X1 = Input(1); + CAFFE_ENFORCE_EQ( + X1.ndim(), + 1, + "MPSCNNSubOp: Only ndim == 1 for Input(1) is supported for now"); + + auto X1_ = [getMPSCNNContext().device + newBufferWithBytes:X1.template data() + length:sizeof(float) * X1.size() + options:MTLResourceOptionCPUCacheModeDefault]; + + auto outputWrapper = MPSImageWrapper( + this, + &wrapper0, + X0.numberOfImages, + X0.height, + X0.width, + X0.featureChannels); + auto commandBuffer = outputWrapper.getCommandBuffer(); + MPSImage* output = outputWrapper.getImage(); + + id encoder = + [commandBuffer computeCommandEncoder]; + id state = + getMPSCNNContext().getSpecializedPipelineState( + @"elementwise_sub", + {{ushort(X0.numberOfImages), + ushort(X0.featureChannels), + ushort(X1.dim32(0))}}); + + [encoder setComputePipelineState:state]; + [encoder setTexture:[X0 texture] atIndex:0]; + [encoder setBuffer:X1_ offset:0 atIndex:1]; + [encoder setTexture:[output texture] atIndex:2]; + const auto& launchParams = + spatialPointwiseKernelLaunchParams(state, output); + [encoder dispatchThreadgroups:launchParams.threadgroupsPerGrid + threadsPerThreadgroup:launchParams.threadsPerThreadgroup]; + [encoder endEncoding]; + wrapper0.markRead(); + outputWrapper.copyToOutputBlob(Outputs()[0]); + VLOG(2) << "ElementwiseSub took: " << t.MilliSeconds(); + return true; + } +}; + +REGISTER_CPU_OPERATOR(MPSCNNSub, MPSCNNSubOp); +OPERATOR_SCHEMA(MPSCNNSub).NumInputs(2).NumOutputs(1).AllowInplace({{0, 0}}); + class MPSCNNAddOp final : public Operator { public: MPSCNNAddOp(const OperatorDef& operator_def, Workspace* ws) @@ -1224,7 +1301,8 @@ class MPSCNNConvTransposeOp final : public ConvTransposeUnpoolBase { CAFFE_ENFORCE_EQ(conv_.outputFeatureChannels, output_channels * kH * kW); CAFFE_ENFORCE_EQ(conv_.kernelWidth, 1); CAFFE_ENFORCE_EQ(conv_.kernelHeight, 1); - if (divRoundUp(X.numberOfImages * output_channels * kH * kW, 4) > MAX_ARR_LENGTH) { + if (divRoundUp(X.numberOfImages * output_channels * kH * kW, 4) > + kMetalMaxTextureArrLength) { LOG(INFO) << "ConvTranspose " << X.numberOfImages << " " << output_channels << " " << kH << " " << kW; LOG(ERROR) << "arrayLength exceeds the maximum allowed length in texture"; @@ -1280,8 +1358,6 @@ class MPSCNNConvTransposeOp final : public ConvTransposeUnpoolBase { // Output: Y INPUT_TAGS(INPUT, FILTER, BIAS); id biasBuffer_; - int MAX_ARR_LENGTH = 2048; - MPSCNNConvolution* conv_{nullptr}; }; @@ -1568,6 +1644,13 @@ class MPSCNNRoIWarpOp final : public Operator { inputWrapper.cleanup(); return false; } + if (divRoundUp(R.dim32(0) * featureChannels, 4) > + kMetalMaxTextureArrLength) { + LOG(INFO) << "MPSCNNRoIWarp " << R.dim32(0) << " " << featureChannels; + LOG(ERROR) << "arrayLength exceeds the maximum allowed length in texture"; + inputWrapper.cleanup(); + return false; + } auto outputWrapper = MPSImageWrapper( this, &inputWrapper, R.dim32(0), pooled_height_, pooled_width_, featureChannels); auto commandBuffer = outputWrapper.getCommandBuffer(); diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_graph_mask.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_graph_mask.mm index df088bb30dd..90e76442e09 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_graph_mask.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_graph_mask.mm @@ -14,7 +14,6 @@ * limitations under the License. */ - #include "mpscnn_graph_mask.h" #include "caffe2/core/operator.h" #include "mpscnn_context.h" @@ -48,17 +47,35 @@ string as_string(StorageType st) { std::unordered_map> inputStorageTypeMap = { {"MPSCNNGenerateProposalsCPP", - std::vector{ - StorageType::CPU, StorageType::CPU, StorageType::CPU, StorageType::CPU}}, - {"MPSCNNRoIWarp", std::vector{StorageType::MPSTEMPORARYIMAGE, StorageType::CPU}}, + std::vector{StorageType::CPU, + StorageType::CPU, + StorageType::CPU, + StorageType::CPU}}, + {"MPSCNNRoIWarp", + std::vector{StorageType::MPSTEMPORARYIMAGE, + StorageType::CPU}}, {"MPSCNNConvRelu", - std::vector{StorageType::MPSTEMPORARYIMAGE, StorageType::CPU, StorageType::CPU}}, + std::vector{StorageType::MPSTEMPORARYIMAGE, + StorageType::CPU, + StorageType::CPU}}, {"MPSCNNFC", - std::vector{StorageType::MPSTEMPORARYIMAGE, StorageType::CPU, StorageType::CPU}}, + std::vector{StorageType::MPSTEMPORARYIMAGE, + StorageType::CPU, + StorageType::CPU}}, {"MPSCNNConv", - std::vector{StorageType::MPSTEMPORARYIMAGE, StorageType::CPU, StorageType::CPU}}, + std::vector{StorageType::MPSTEMPORARYIMAGE, + StorageType::CPU, + StorageType::CPU}}, {"MPSCNNConvTranspose", - std::vector{StorageType::MPSTEMPORARYIMAGE, StorageType::CPU, StorageType::CPU}}}; + std::vector{StorageType::MPSTEMPORARYIMAGE, + StorageType::CPU, + StorageType::CPU}}, + {"MPSCNNMul", + std::vector{StorageType::MPSTEMPORARYIMAGE, + StorageType::CPU}}, + {"MPSCNNSub", + std::vector{StorageType::MPSTEMPORARYIMAGE, + StorageType::CPU}}}; std::unordered_map> outputStorageTypeMap = { {"MPSCNNGenerateProposalsCPP", std::vector{StorageType::CPU, StorageType::CPU}}}; std::vector opsNeedsSync = {"MPSCNNGenerateProposalsCPP", "CopyFromMPSCNN", "CopyToMPSCNN"}; @@ -430,7 +447,7 @@ NetDef insertCopies(const NetDef& def) { expectedBlobType = StorageType::CPU; } auto inputBlob = ogOp.input(j); - auto version = analysis.ssa[i].outVersions[inputBlob]; + auto version = analysis.ssa[i].inVersions[inputBlob]; // Check whether the blob is produced by previous operators if (analysis.blobInfoMap.find(inputBlob) != analysis.blobInfoMap.end() && analysis.blobInfoMap[inputBlob][version].storageType != StorageType::INVALID) { diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_kernels.h b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_kernels.h index 25d6f91a209..ff1b0108454 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_kernels.h +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_kernels.h @@ -674,20 +674,51 @@ kernel void elementwise_mul(texture2d in0[[texture(0), funct texture2d_array outa[[texture(2), function_constant(in0_is_arr)]], constant float* in1[[buffer(1)]], ushort3 gid[[thread_position_in_grid]]) { + ushort last_dim = ushort_arg_2; + ushort idx; if (in0_is_tex) { if (gid.x >= out.get_width() || gid.y >= out.get_height()) { return; } + idx = gid.y * out.get_width() + gid.x; } else { if (gid.x >= outa.get_width() || gid.y >= outa.get_height()) { return; } + idx = gid.y * outa.get_width() + gid.x; } ushort2 gid_ = ushort2(gid.x, gid.y); if (in0_is_tex) { - out.write(in0.read(gid_) * in1[0], gid_); + out.write(in0.read(gid_) * in1[idx % last_dim], gid_); } else { - outa.write(ina0.read(gid_, gid.z) * in1[0], gid_, gid.z); + outa.write(ina0.read(gid_, gid.z) * in1[idx % last_dim], gid_, gid.z); + } +} + +kernel void elementwise_sub(texture2d in0[[texture(0), function_constant(in0_is_tex)]], + texture2d_array ina0[[texture(0), function_constant(in0_is_arr)]], + texture2d out[[texture(2), function_constant(in0_is_tex)]], + texture2d_array outa[[texture(2), function_constant(in0_is_arr)]], + constant float* in1[[buffer(1)]], + ushort3 gid[[thread_position_in_grid]]) { + ushort last_dim = ushort_arg_2; + ushort idx; + if (in0_is_tex) { + if (gid.x >= out.get_width() || gid.y >= out.get_height()) { + return; + } + idx = gid.y * out.get_width() + gid.x; + } else { + if (gid.x >= outa.get_width() || gid.y >= outa.get_height()) { + return; + } + idx = gid.y * outa.get_width() + gid.x; + } + ushort2 gid_ = ushort2(gid.x, gid.y); + if (in0_is_tex) { + out.write(in0.read(gid_) - in1[idx % last_dim], gid_); + } else { + outa.write(ina0.read(gid_, gid.z) - in1[idx % last_dim], gid_, gid.z); } } diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm index 3bd945871c8..f03faefd23c 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm @@ -1582,7 +1582,7 @@ void testMPSCNN() { { auto* t = ws.CreateBlob("X1_cpu")->GetMutable(); - t->Resize(1); + t->Resize(72); CPUContext ctx; math::RandGaussian(t->size(), 0, 1, t->mutable_data(), &ctx); } @@ -1633,6 +1633,71 @@ void testMPSCNN() { } } + { + LOG(INFO) << "MPSCNNSub Test"; + Workspace ws; + { + auto* t = ws.CreateBlob("X0_cpu")->GetMutable(); + t->Resize(1, 12, 57, 72); + CPUContext ctx; + math::RandGaussian( + t->size(), 0, 1, t->mutable_data(), &ctx); + } + + { + auto* t = ws.CreateBlob("X1_cpu")->GetMutable(); + t->Resize(72); + CPUContext ctx; + math::RandGaussian( + t->size(), 0, 1, t->mutable_data(), &ctx); + } + + NetDef netdef; + { + auto& op = *(netdef.add_op()); + op.set_type("CopyToMPSCNN"); + op.add_input("X0_cpu"); + op.add_output("X0_mtl"); + } + + { + auto& op = *(netdef.add_op()); + op.set_type("MPSCNNSub"); + op.add_input("X0_mtl"); + op.add_input("X1_cpu"); + op.add_output("Y_mtl"); + add_arg_int(op, "broadcast", 1); + } + + { + auto& op = *(netdef.add_op()); + op.set_type("CopyFromMPSCNN"); + op.add_input("Y_mtl"); + op.add_output("Y_cpu"); + } + + { + auto& op = *(netdef.add_op()); + op.set_type("Sub"); + op.add_input("X0_cpu"); + op.add_input("X1_cpu"); + op.add_output("Y_ref"); + add_arg_int(op, "broadcast", 1); + } + + ws.RunNetOnce(netdef); + const auto& t2 = ws.GetBlob("Y_cpu")->Get(); + const auto& t1 = ws.GetBlob("Y_ref")->Get(); + + CAFFE_ENFORCE_EQ(t1.dims(), t2.dims()); + for (auto i = 0; i < t1.size(); ++i) { + // FP16 <-> FP32 round trip, accumulation, etc. + const float t1_i = t1.data()[i]; + const float t2_i = t2.data()[i]; + CHECK_NEAR(t1_i, t2_i, 0.01); + } + } + { LOG(INFO) << "MPSAdd Test"; Workspace ws;