Some bug fixes in MPSCNN backend

Summary: att

Reviewed By: ajtulloch

Differential Revision: D6037723

fbshipit-source-id: d7405b27089210abfd48a33ecee47a87f67ae9a0
Jerry Zhang 2017-10-16 18:18:04 -07:00 committed by Facebook Github Bot
parent 6ac393a32b
commit 4ac8ecb76e
6 changed files with 257 additions and 29 deletions

@@ -654,23 +654,55 @@ kernel void elementwise_mul(texture2d<half, access::read> in0[[texture(0), funct
                             texture2d_array<half, access::write> outa[[texture(2), function_constant(in0_is_arr)]],
                             constant float* in1[[buffer(1)]],
                             ushort3 gid[[thread_position_in_grid]]) {
+  ushort last_dim = ushort_arg_2;
+  ushort idx;
+  if (in0_is_tex) {
+    if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
+      return;
+    }
+    idx = gid.y * out.get_width() + gid.x;
+  } else {
+    if (gid.x >= outa.get_width() || gid.y >= outa.get_height()) {
+      return;
+    }
+    idx = gid.y * outa.get_width() + gid.x;
+  }
   ushort2 gid_ = ushort2(gid.x, gid.y);
   if (in0_is_tex) {
-    out.write(in0.read(gid_) * in1[0], gid_);
+    out.write(in0.read(gid_) * in1[idx % last_dim], gid_);
   } else {
-    outa.write(ina0.read(gid_, gid.z) * in1[0], gid_, gid.z);
+    outa.write(ina0.read(gid_, gid.z) * in1[idx % last_dim], gid_, gid.z);
   }
 }
+
+kernel void elementwise_sub(texture2d<half, access::read> in0[[texture(0), function_constant(in0_is_tex)]],
+                            texture2d_array<half, access::read> ina0[[texture(0), function_constant(in0_is_arr)]],
+                            texture2d<half, access::write> out[[texture(2), function_constant(in0_is_tex)]],
+                            texture2d_array<half, access::write> outa[[texture(2), function_constant(in0_is_arr)]],
+                            constant float* in1[[buffer(1)]],
+                            ushort3 gid[[thread_position_in_grid]]) {
+  ushort last_dim = ushort_arg_2;
+  ushort idx;
+  if (in0_is_tex) {
+    if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
+      return;
+    }
+    idx = gid.y * out.get_width() + gid.x;
+  } else {
+    if (gid.x >= outa.get_width() || gid.y >= outa.get_height()) {
+      return;
+    }
+    idx = gid.y * outa.get_width() + gid.x;
+  }
+  ushort2 gid_ = ushort2(gid.x, gid.y);
+  if (in0_is_tex) {
+    out.write(in0.read(gid_) - in1[idx % last_dim], gid_);
+  } else {
+    outa.write(ina0.read(gid_, gid.z) - in1[idx % last_dim], gid_, gid.z);
+  }
+}
+
 kernel void elementwise_add_nonarray(texture2d<half, access::read> in0[[texture(0)]],
                                      texture2d<half, access::read> in1[[texture(1)]],
                                      texture2d<half, access::write> out[[texture(2)]],

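The kernel change above is the core of the fix: instead of always reading in1[0], each thread now computes a flat pixel index and wraps it by last_dim, so a rank-1 second input broadcasts along the last dimension. As a minimal sketch of the same addressing in plain C++ (broadcast_mul, height, and width are illustrative names, not identifiers from this diff):

#include <cstddef>
#include <vector>

// Reference semantics of the broadcast elementwise_mul kernel above (a
// sketch, not the shipped Metal code): in0 is a height x width image in
// row-major order, and in1 holds last_dim values that are tiled across
// the image via idx % last_dim.
std::vector<float> broadcast_mul(const std::vector<float>& in0,
                                 const std::vector<float>& in1,
                                 std::size_t height,
                                 std::size_t width) {
  const std::size_t last_dim = in1.size();
  std::vector<float> out(in0.size());
  for (std::size_t y = 0; y < height; ++y) {
    for (std::size_t x = 0; x < width; ++x) {
      const std::size_t idx = y * width + x; // same flat index the kernel builds
      out[idx] = in0[idx] * in1[idx % last_dim];
    }
  }
  return out;
}

When the image width equals last_dim, as in the tests below (W = 72 and X1 has 72 elements), idx % last_dim reduces to the column index x, which matches Caffe2's broadcast-along-the-last-dimension semantics.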
@@ -21,7 +21,7 @@
 namespace caffe2 {

 static constexpr const char* kMPSCNNReadCountArg = "__mpscnn_read_count__";
 static constexpr const char* kMPSCNNOutputIsTempImageArg = "__mpscnn_output_is_temp_img__";
+static constexpr const int kMetalMaxTextureArrLength = 2048;

 // We currently only try to convert a fixed set of operators that handle a subset of a full
 // CNN. We also only run when MPSCNN is available and provides a speedup.
 // On failure, returns false. On success, returns true, and sets the MPSCNN net in the output
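For context, the conversion entry point this comment describes is used roughly as sketched below; tryConvertToMPSCNN and its exact signature are assumptions based on the comment, not something shown in this diff:

#include "caffe2/proto/caffe2.pb.h" // NetDef

// Sketch only: assumes a bool tryConvertToMPSCNN(initNet, predictNet, out)
// declared near the comment above.
bool runOnMPSCNNIfPossible(const caffe2::NetDef& initNet,
                           const caffe2::NetDef& predictNet) {
  caffe2::NetDef mpscnnNet;
  if (!caffe2::tryConvertToMPSCNN(initNet, predictNet, &mpscnnNet)) {
    return false; // conversion refused; caller keeps the CPU net
  }
  // mpscnnNet now holds the MPSCNN operators plus the CopyToMPSCNN /
  // CopyFromMPSCNN ops inserted at the CPU boundaries; run it instead.
  return true;
}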

@@ -14,7 +14,6 @@
  * limitations under the License.
  */

 #include "caffe2/core/common.h"
 #include "caffe2/core/context.h"
@@ -781,11 +780,13 @@ class MPSCNNMulOp final : public Operator<CPUContext> {
  public:
   MPSCNNMulOp(const OperatorDef& operator_def, Workspace* ws)
       : Operator<CPUContext>(operator_def, ws) {
-    OPERATOR_NEEDS_FEATURE(OperatorBase::GetSingleArgument<int>("broadcast", 0) == 1,
-                           "OpenGLMul only supports broadcast");
+    OPERATOR_NEEDS_FEATURE(
+        OperatorBase::GetSingleArgument<int>("broadcast", 0) == 1,
+        "MPSCNNMul only supports broadcast");
-    OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false,
-                           "OpenGLMul does not support axis");
+    OPERATOR_NEEDS_FEATURE(
+        OperatorBase::HasArgument("axis") == false,
+        "MPSCNNMul does not support axis");
   }

   bool RunOnDevice() override {
@@ -795,11 +796,15 @@ class MPSCNNMulOp final : public Operator<CPUContext> {
     MPSImage* X0 = wrapper0.getImage();

     const auto& X1 = Input(1);
-    CAFFE_ENFORCE_EQ(X1.size(), 1); // only scalar is supported
+    CAFFE_ENFORCE_EQ(
+        X1.ndim(),
+        1,
+        "MPSCNNMulOp: Only ndim == 1 for Input(1) is supported for now");
-    auto X1_ = [getMPSCNNContext().device newBufferWithBytes:X1.template data<float>()
-                                                      length:sizeof(float)
-                                                     options:MTLResourceOptionCPUCacheModeDefault];
+    auto X1_ = [getMPSCNNContext().device
+        newBufferWithBytes:X1.template data<float>()
+                    length:sizeof(float) * X1.size()
+                   options:MTLResourceOptionCPUCacheModeDefault];

     auto outputWrapper = MPSImageWrapper(
         this, &wrapper0, X0.numberOfImages, X0.height, X0.width, X0.featureChannels);
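The second change in this hunk is the actual bug fix: the Metal buffer was previously created with length:sizeof(float), room for a single element, so only in1[0] was ever readable; it is now sized for every element of X1. The same mistake in plain C++ terms (copyForDevice is an illustrative name, not from the diff):

#include <cstddef>
#include <cstring>
#include <vector>

// Sketch of the sizing bug fixed above: copying n floats into storage
// sized with sizeof(float) instead of sizeof(float) * n leaves every
// element past index 0 unusable, which is why the old kernel could only
// apply a scalar.
std::vector<float> copyForDevice(const float* src, std::size_t n) {
  std::vector<float> buf(n); // fixed: sizeof(float) * n bytes
  std::memcpy(buf.data(), src, sizeof(float) * n);
  return buf;
}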
@@ -807,8 +812,12 @@ class MPSCNNMulOp final : public Operator<CPUContext> {
     MPSImage* output = outputWrapper.getImage();

     id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];
-    id<MTLComputePipelineState> state = getMPSCNNContext().getSpecializedPipelineState(
-        @"elementwise_mul", {{ushort(X0.numberOfImages), ushort(X0.featureChannels)}});
+    id<MTLComputePipelineState> state =
+        getMPSCNNContext().getSpecializedPipelineState(
+            @"elementwise_mul",
+            {{ushort(X0.numberOfImages),
+              ushort(X0.featureChannels),
+              ushort(X1.dim32(0))}});

     [encoder setComputePipelineState:state];
     [encoder setTexture:[X0 texture] atIndex:0];
@@ -828,6 +837,74 @@ class MPSCNNMulOp final : public Operator<CPUContext> {
 REGISTER_CPU_OPERATOR(MPSCNNMul, MPSCNNMulOp);
 OPERATOR_SCHEMA(MPSCNNMul).NumInputs(2).NumOutputs(1).AllowInplace({{0, 0}});
+
+class MPSCNNSubOp final : public Operator<CPUContext> {
+ public:
+  MPSCNNSubOp(const OperatorDef& operator_def, Workspace* ws)
+      : Operator<CPUContext>(operator_def, ws) {
+    OPERATOR_NEEDS_FEATURE(
+        OperatorBase::GetSingleArgument<int>("broadcast", 0) == 1,
+        "MPSCNNSub only supports broadcast");
+    OPERATOR_NEEDS_FEATURE(
+        OperatorBase::HasArgument("axis") == false,
+        "MPSCNNSub does not support axis");
+  }
+
+  bool RunOnDevice() override {
+    caffe2::Timer t;
+    auto wrapper0 = Inputs()[0]->Get<MPSImageWrapper>();
+    MPSImage* X0 = wrapper0.getImage();
+
+    const auto& X1 = Input(1);
+    CAFFE_ENFORCE_EQ(
+        X1.ndim(),
+        1,
+        "MPSCNNSubOp: Only ndim == 1 for Input(1) is supported for now");
+    auto X1_ = [getMPSCNNContext().device
+        newBufferWithBytes:X1.template data<float>()
+                    length:sizeof(float) * X1.size()
+                   options:MTLResourceOptionCPUCacheModeDefault];
+
+    auto outputWrapper = MPSImageWrapper(
+        this,
+        &wrapper0,
+        X0.numberOfImages,
+        X0.height,
+        X0.width,
+        X0.featureChannels);
+    auto commandBuffer = outputWrapper.getCommandBuffer();
+    MPSImage* output = outputWrapper.getImage();
+
+    id<MTLComputeCommandEncoder> encoder =
+        [commandBuffer computeCommandEncoder];
+    id<MTLComputePipelineState> state =
+        getMPSCNNContext().getSpecializedPipelineState(
+            @"elementwise_sub",
+            {{ushort(X0.numberOfImages),
+              ushort(X0.featureChannels),
+              ushort(X1.dim32(0))}});
+
+    [encoder setComputePipelineState:state];
+    [encoder setTexture:[X0 texture] atIndex:0];
+    [encoder setBuffer:X1_ offset:0 atIndex:1];
+    [encoder setTexture:[output texture] atIndex:2];
+
+    const auto& launchParams =
+        spatialPointwiseKernelLaunchParams(state, output);
+    [encoder dispatchThreadgroups:launchParams.threadgroupsPerGrid
+            threadsPerThreadgroup:launchParams.threadsPerThreadgroup];
+    [encoder endEncoding];
+    wrapper0.markRead();
+    outputWrapper.copyToOutputBlob(Outputs()[0]);
+
+    VLOG(2) << "ElementwiseSub took: " << t.MilliSeconds();
+    return true;
+  }
+};
+
+REGISTER_CPU_OPERATOR(MPSCNNSub, MPSCNNSubOp);
+OPERATOR_SCHEMA(MPSCNNSub).NumInputs(2).NumOutputs(1).AllowInplace({{0, 0}});

 class MPSCNNAddOp final : public Operator<CPUContext> {
  public:
   MPSCNNAddOp(const OperatorDef& operator_def, Workspace* ws)
@@ -1224,7 +1301,8 @@ class MPSCNNConvTransposeOp final : public ConvTransposeUnpoolBase<CPUContext> {
     CAFFE_ENFORCE_EQ(conv_.outputFeatureChannels, output_channels * kH * kW);
     CAFFE_ENFORCE_EQ(conv_.kernelWidth, 1);
     CAFFE_ENFORCE_EQ(conv_.kernelHeight, 1);
-    if (divRoundUp(X.numberOfImages * output_channels * kH * kW, 4) > MAX_ARR_LENGTH) {
+    if (divRoundUp(X.numberOfImages * output_channels * kH * kW, 4) >
+        kMetalMaxTextureArrLength) {
       LOG(INFO) << "ConvTranspose " << X.numberOfImages << " " << output_channels << " " << kH
                 << " " << kW;
       LOG(ERROR) << "arrayLength exceeds the maximum allowed length in texture";
@@ -1280,8 +1358,6 @@ class MPSCNNConvTransposeOp final : public ConvTransposeUnpoolBase<CPUContext> {
   // Output: Y
   INPUT_TAGS(INPUT, FILTER, BIAS);
   id<MTLBuffer> biasBuffer_;
-  int MAX_ARR_LENGTH = 2048;
   MPSCNNConvolution* conv_{nullptr};
 };
@@ -1568,6 +1644,13 @@ class MPSCNNRoIWarpOp final : public Operator<CPUContext> {
       inputWrapper.cleanup();
       return false;
     }
+    if (divRoundUp(R.dim32(0) * featureChannels, 4) >
+        kMetalMaxTextureArrLength) {
+      LOG(INFO) << "MPSCNNRoIWarp " << R.dim32(0) << " " << featureChannels;
+      LOG(ERROR) << "arrayLength exceeds the maximum allowed length in texture";
+      inputWrapper.cleanup();
+      return false;
+    }
     auto outputWrapper = MPSImageWrapper(
         this, &inputWrapper, R.dim32(0), pooled_height_, pooled_width_, featureChannels);
     auto commandBuffer = outputWrapper.getCommandBuffer();

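Both new guards follow the same pattern: MPSCNN packs four feature channels per texture slice, and Metal caps a texture2d_array's arrayLength at the value now named kMetalMaxTextureArrLength (2048). A sketch of the check; the body of divRoundUp is an assumption, since only its call sites appear in this diff and ceil division is the standard reading of the name:

// Ceil division, as the name divRoundUp suggests (assumed; not shown here).
inline int divRoundUp(int x, int y) { return (x + y - 1) / y; }

constexpr int kMetalMaxTextureArrLength = 2048;

// One texture slice holds 4 channels, so N images of C channels need
// divRoundUp(N * C, 4) slices; anything beyond the cap must fall back.
bool fitsInTextureArray(int numImages, int channels) {
  return divRoundUp(numImages * channels, 4) <= kMetalMaxTextureArrLength;
}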
@@ -14,7 +14,6 @@
  * limitations under the License.
  */

 #include "mpscnn_graph_mask.h"
 #include "caffe2/core/operator.h"
 #include "mpscnn_context.h"
@@ -48,17 +47,35 @@ string as_string(StorageType st) {
 std::unordered_map<string, std::vector<StorageType>> inputStorageTypeMap = {
     {"MPSCNNGenerateProposalsCPP",
-     std::vector<StorageType>{
-         StorageType::CPU, StorageType::CPU, StorageType::CPU, StorageType::CPU}},
-    {"MPSCNNRoIWarp", std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE, StorageType::CPU}},
+     std::vector<StorageType>{StorageType::CPU,
+                              StorageType::CPU,
+                              StorageType::CPU,
+                              StorageType::CPU}},
+    {"MPSCNNRoIWarp",
+     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
+                              StorageType::CPU}},
     {"MPSCNNConvRelu",
-     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE, StorageType::CPU, StorageType::CPU}},
+     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
+                              StorageType::CPU,
+                              StorageType::CPU}},
     {"MPSCNNFC",
-     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE, StorageType::CPU, StorageType::CPU}},
+     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
+                              StorageType::CPU,
+                              StorageType::CPU}},
     {"MPSCNNConv",
-     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE, StorageType::CPU, StorageType::CPU}},
+     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
+                              StorageType::CPU,
+                              StorageType::CPU}},
     {"MPSCNNConvTranspose",
-     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE, StorageType::CPU, StorageType::CPU}}};
+     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
+                              StorageType::CPU,
+                              StorageType::CPU}},
+    {"MPSCNNMul",
+     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
+                              StorageType::CPU}},
+    {"MPSCNNSub",
+     std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
+                              StorageType::CPU}}};

 std::unordered_map<string, std::vector<StorageType>> outputStorageTypeMap = {
     {"MPSCNNGenerateProposalsCPP", std::vector<StorageType>{StorageType::CPU, StorageType::CPU}}};

 std::vector<string> opsNeedsSync = {"MPSCNNGenerateProposalsCPP", "CopyFromMPSCNN", "CopyToMPSCNN"};
@@ -430,7 +447,7 @@ NetDef insertCopies(const NetDef& def) {
           expectedBlobType = StorageType::CPU;
         }
         auto inputBlob = ogOp.input(j);
-        auto version = analysis.ssa[i].outVersions[inputBlob];
+        auto version = analysis.ssa[i].inVersions[inputBlob];
         // Check whether the blob is produced by previous operators
         if (analysis.blobInfoMap.find(inputBlob) != analysis.blobInfoMap.end() &&
             analysis.blobInfoMap[inputBlob][version].storageType != StorageType::INVALID) {

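The one-line change above fixes a lookup in the copy-insertion pass: when type-checking operator i's j-th input, the relevant SSA version of the blob is the one it carries before the op runs (inVersions), not the one the op produces (outVersions); the two differ exactly for in-place ops, which this backend allows via AllowInplace. A sketch of the bookkeeping with hypothetical structures (the real ones live in mpscnn_graph_mask.mm):

#include <cstddef>
#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical shape of the SSA records implied by the fix above.
struct SSAEntry {
  std::unordered_map<std::string, int> inVersions;  // version each input is read at
  std::unordered_map<std::string, int> outVersions; // version each output is written at
};

// For an in-place op, blob "X" may have inVersions["X"] == k but
// outVersions["X"] == k + 1; indexing blobInfoMap with the out version
// would describe the op's own output rather than the input being checked.
int inputVersion(const std::vector<SSAEntry>& ssa,
                 std::size_t i,
                 const std::string& blob) {
  return ssa[i].inVersions.at(blob); // the corrected lookup
}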
@@ -674,20 +674,51 @@ kernel void elementwise_mul(texture2d<half, access::read> in0[[texture(0), funct
                             texture2d_array<half, access::write> outa[[texture(2), function_constant(in0_is_arr)]],
                             constant float* in1[[buffer(1)]],
                             ushort3 gid[[thread_position_in_grid]]) {
+  ushort last_dim = ushort_arg_2;
+  ushort idx;
+  if (in0_is_tex) {
+    if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
+      return;
+    }
+    idx = gid.y * out.get_width() + gid.x;
+  } else {
+    if (gid.x >= outa.get_width() || gid.y >= outa.get_height()) {
+      return;
+    }
+    idx = gid.y * outa.get_width() + gid.x;
+  }
   ushort2 gid_ = ushort2(gid.x, gid.y);
   if (in0_is_tex) {
-    out.write(in0.read(gid_) * in1[0], gid_);
+    out.write(in0.read(gid_) * in1[idx % last_dim], gid_);
   } else {
-    outa.write(ina0.read(gid_, gid.z) * in1[0], gid_, gid.z);
+    outa.write(ina0.read(gid_, gid.z) * in1[idx % last_dim], gid_, gid.z);
   }
 }
+
+kernel void elementwise_sub(texture2d<half, access::read> in0[[texture(0), function_constant(in0_is_tex)]],
+                            texture2d_array<half, access::read> ina0[[texture(0), function_constant(in0_is_arr)]],
+                            texture2d<half, access::write> out[[texture(2), function_constant(in0_is_tex)]],
+                            texture2d_array<half, access::write> outa[[texture(2), function_constant(in0_is_arr)]],
+                            constant float* in1[[buffer(1)]],
+                            ushort3 gid[[thread_position_in_grid]]) {
+  ushort last_dim = ushort_arg_2;
+  ushort idx;
+  if (in0_is_tex) {
+    if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
+      return;
+    }
+    idx = gid.y * out.get_width() + gid.x;
+  } else {
+    if (gid.x >= outa.get_width() || gid.y >= outa.get_height()) {
+      return;
+    }
+    idx = gid.y * outa.get_width() + gid.x;
+  }
+  ushort2 gid_ = ushort2(gid.x, gid.y);
+  if (in0_is_tex) {
+    out.write(in0.read(gid_) - in1[idx % last_dim], gid_);
+  } else {
+    outa.write(ina0.read(gid_, gid.z) - in1[idx % last_dim], gid_, gid.z);
+  }
+}

@@ -1582,7 +1582,7 @@ void testMPSCNN() {
     {
       auto* t = ws.CreateBlob("X1_cpu")->GetMutable<TensorCPU>();
-      t->Resize(1);
+      t->Resize(72);
       CPUContext ctx;
       math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
     }
@@ -1633,6 +1633,71 @@ void testMPSCNN() {
     }
   }

+  {
+    LOG(INFO) << "MPSCNNSub Test";
+    Workspace ws;
+    {
+      auto* t = ws.CreateBlob("X0_cpu")->GetMutable<TensorCPU>();
+      t->Resize(1, 12, 57, 72);
+      CPUContext ctx;
+      math::RandGaussian<float, CPUContext>(
+          t->size(), 0, 1, t->mutable_data<float>(), &ctx);
+    }
+    {
+      auto* t = ws.CreateBlob("X1_cpu")->GetMutable<TensorCPU>();
+      t->Resize(72);
+      CPUContext ctx;
+      math::RandGaussian<float, CPUContext>(
+          t->size(), 0, 1, t->mutable_data<float>(), &ctx);
+    }
+
+    NetDef netdef;
+    {
+      auto& op = *(netdef.add_op());
+      op.set_type("CopyToMPSCNN");
+      op.add_input("X0_cpu");
+      op.add_output("X0_mtl");
+    }
+    {
+      auto& op = *(netdef.add_op());
+      op.set_type("MPSCNNSub");
+      op.add_input("X0_mtl");
+      op.add_input("X1_cpu");
+      op.add_output("Y_mtl");
+      add_arg_int(op, "broadcast", 1);
+    }
+    {
+      auto& op = *(netdef.add_op());
+      op.set_type("CopyFromMPSCNN");
+      op.add_input("Y_mtl");
+      op.add_output("Y_cpu");
+    }
+    {
+      auto& op = *(netdef.add_op());
+      op.set_type("Sub");
+      op.add_input("X0_cpu");
+      op.add_input("X1_cpu");
+      op.add_output("Y_ref");
+      add_arg_int(op, "broadcast", 1);
+    }
+
+    ws.RunNetOnce(netdef);
+    const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
+    const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
+    CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
+
+    for (auto i = 0; i < t1.size(); ++i) {
+      // FP16 <-> FP32 round trip, accumulation, etc.
+      const float t1_i = t1.data<float>()[i];
+      const float t2_i = t2.data<float>()[i];
+      CHECK_NEAR(t1_i, t2_i, 0.01);
+    }
+  }
+
   {
     LOG(INFO) << "MPSAdd Test";
     Workspace ws;