mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-15 21:00:47 +00:00
Some bug fixes in the MPSCNN backend
Summary: att Reviewed By: ajtulloch Differential Revision: D6037723 fbshipit-source-id: d7405b27089210abfd48a33ecee47a87f67ae9a0
This commit is contained in:
parent
6ac393a32b
commit
4ac8ecb76e
6 changed files with 257 additions and 29 deletions
|
|
@ -654,23 +654,55 @@ kernel void elementwise_mul(texture2d<half, access::read> in0[[texture(0), funct
|
|||
texture2d_array<half, access::write> outa[[texture(2), function_constant(in0_is_arr)]],
|
||||
constant float* in1[[buffer(1)]],
|
||||
ushort3 gid[[thread_position_in_grid]]) {
|
||||
ushort last_dim = ushort_arg_2;
|
||||
ushort idx;
|
||||
if (in0_is_tex) {
|
||||
if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
|
||||
return;
|
||||
}
|
||||
idx = gid.y * out.get_width() + gid.x;
|
||||
} else {
|
||||
if (gid.x >= outa.get_width() || gid.y >= outa.get_height()) {
|
||||
return;
|
||||
}
|
||||
idx = gid.y * outa.get_width() + gid.x;
|
||||
}
|
||||
ushort2 gid_ = ushort2(gid.x, gid.y);
|
||||
if (in0_is_tex) {
|
||||
out.write(in0.read(gid_) * in1[0], gid_);
|
||||
out.write(in0.read(gid_) * in1[idx % last_dim], gid_);
|
||||
} else {
|
||||
outa.write(ina0.read(gid_, gid.z) * in1[0], gid_, gid.z);
|
||||
outa.write(ina0.read(gid_, gid.z) * in1[idx % last_dim], gid_, gid.z);
|
||||
}
|
||||
}
|
||||
|
||||
// Broadcast element-wise subtraction: out = in0 - in1, where in1 is a 1-D
// float buffer broadcast along the last dimension (its length arrives via
// ushort_arg_2). The plain-texture vs. texture-array variant is selected at
// pipeline-specialization time through the in0_is_tex / in0_is_arr function
// constants, so only one branch survives compilation.
kernel void elementwise_sub(texture2d<half, access::read> in0[[texture(0), function_constant(in0_is_tex)]],
                            texture2d_array<half, access::read> ina0[[texture(0), function_constant(in0_is_arr)]],
                            texture2d<half, access::write> out[[texture(2), function_constant(in0_is_tex)]],
                            texture2d_array<half, access::write> outa[[texture(2), function_constant(in0_is_arr)]],
                            constant float* in1[[buffer(1)]],
                            ushort3 gid[[thread_position_in_grid]]) {
  const ushort in1Length = ushort_arg_2;
  const ushort2 loc = ushort2(gid.x, gid.y);
  if (in0_is_tex) {
    // Guard against threads dispatched past the texture edge.
    if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
      return;
    }
    const ushort linear = gid.y * out.get_width() + gid.x;
    out.write(in0.read(loc) - in1[linear % in1Length], loc);
  } else {
    if (gid.x >= outa.get_width() || gid.y >= outa.get_height()) {
      return;
    }
    const ushort linear = gid.y * outa.get_width() + gid.x;
    outa.write(ina0.read(loc, gid.z) - in1[linear % in1Length], loc, gid.z);
  }
}
|
||||
|
||||
|
||||
kernel void elementwise_add_nonarray(texture2d<half, access::read> in0[[texture(0)]],
|
||||
texture2d<half, access::read> in1[[texture(1)]],
|
||||
texture2d<half, access::write> out[[texture(2)]],
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@
|
|||
namespace caffe2 {
|
||||
static constexpr const char* kMPSCNNReadCountArg = "__mpscnn_read_count__";
|
||||
static constexpr const char* kMPSCNNOutputIsTempImageArg = "__mpscnn_output_is_temp_img__";
|
||||
|
||||
static constexpr const int kMetalMaxTextureArrLength = 2048;
|
||||
// We currently only try to convert a fixed set of operators that handle a subset of a full
|
||||
// CNN. We also only run when MPSCNN is available, as it provides a speedup.
|
||||
// On failure, returns false. On success, returns true, and sets the MPSCNN net in the output
|
||||
|
|
|
|||
|
|
@ -14,7 +14,6 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/context.h"
|
||||
|
||||
|
|
@ -781,11 +780,13 @@ class MPSCNNMulOp final : public Operator<CPUContext> {
|
|||
public:
|
||||
MPSCNNMulOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<CPUContext>(operator_def, ws) {
|
||||
OPERATOR_NEEDS_FEATURE(OperatorBase::GetSingleArgument<int>("broadcast", 0) == 1,
|
||||
"OpenGLMul only supports broadcast");
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
OperatorBase::GetSingleArgument<int>("broadcast", 0) == 1,
|
||||
"MPSCNNMul only supports broadcast");
|
||||
|
||||
OPERATOR_NEEDS_FEATURE(OperatorBase::HasArgument("axis") == false,
|
||||
"OpenGLMul does not support axis");
|
||||
OPERATOR_NEEDS_FEATURE(
|
||||
OperatorBase::HasArgument("axis") == false,
|
||||
"MPSCNNMul does not support axis");
|
||||
}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
|
|
@ -795,11 +796,15 @@ class MPSCNNMulOp final : public Operator<CPUContext> {
|
|||
MPSImage* X0 = wrapper0.getImage();
|
||||
|
||||
const auto& X1 = Input(1);
|
||||
CAFFE_ENFORCE_EQ(X1.size(), 1); // only scalar is supported
|
||||
CAFFE_ENFORCE_EQ(
|
||||
X1.ndim(),
|
||||
1,
|
||||
"MPSCNNMulOp: Only ndim == 1 for Input(1) is supported for now");
|
||||
|
||||
auto X1_ = [getMPSCNNContext().device newBufferWithBytes:X1.template data<float>()
|
||||
length:sizeof(float)
|
||||
options:MTLResourceOptionCPUCacheModeDefault];
|
||||
auto X1_ = [getMPSCNNContext().device
|
||||
newBufferWithBytes:X1.template data<float>()
|
||||
length:sizeof(float) * X1.size()
|
||||
options:MTLResourceOptionCPUCacheModeDefault];
|
||||
|
||||
auto outputWrapper = MPSImageWrapper(
|
||||
this, &wrapper0, X0.numberOfImages, X0.height, X0.width, X0.featureChannels);
|
||||
|
|
@ -807,8 +812,12 @@ class MPSCNNMulOp final : public Operator<CPUContext> {
|
|||
MPSImage* output = outputWrapper.getImage();
|
||||
|
||||
id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];
|
||||
id<MTLComputePipelineState> state = getMPSCNNContext().getSpecializedPipelineState(
|
||||
@"elementwise_mul", {{ushort(X0.numberOfImages), ushort(X0.featureChannels)}});
|
||||
id<MTLComputePipelineState> state =
|
||||
getMPSCNNContext().getSpecializedPipelineState(
|
||||
@"elementwise_mul",
|
||||
{{ushort(X0.numberOfImages),
|
||||
ushort(X0.featureChannels),
|
||||
ushort(X1.dim32(0))}});
|
||||
|
||||
[encoder setComputePipelineState:state];
|
||||
[encoder setTexture:[X0 texture] atIndex:0];
|
||||
|
|
@ -828,6 +837,74 @@ class MPSCNNMulOp final : public Operator<CPUContext> {
|
|||
REGISTER_CPU_OPERATOR(MPSCNNMul, MPSCNNMulOp);
|
||||
OPERATOR_SCHEMA(MPSCNNMul).NumInputs(2).NumOutputs(1).AllowInplace({{0, 0}});
|
||||
|
||||
// Broadcast element-wise subtraction on MPS images: Y = X0 - X1, where X0 is a
// GPU-resident MPSImage and X1 is a 1-D CPU tensor broadcast along the last
// dimension. Mirrors MPSCNNMulOp; only broadcast == 1 is accepted and an
// explicit "axis" argument is rejected.
class MPSCNNSubOp final : public Operator<CPUContext> {
 public:
  MPSCNNSubOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CPUContext>(operator_def, ws) {
    OPERATOR_NEEDS_FEATURE(
        OperatorBase::GetSingleArgument<int>("broadcast", 0) == 1,
        "MPSCNNSub only supports broadcast");

    OPERATOR_NEEDS_FEATURE(
        OperatorBase::HasArgument("axis") == false,
        "MPSCNNSub does not support axis");
  }

  bool RunOnDevice() override {
    caffe2::Timer timer;

    // Input(0) is already wrapped as an MPS image; Input(1) is a plain CPU
    // tensor holding the vector to subtract.
    auto imageWrapper = Inputs()[0]->Get<MPSImageWrapper>();
    MPSImage* X0 = imageWrapper.getImage();

    const auto& X1 = Input(1);
    CAFFE_ENFORCE_EQ(
        X1.ndim(),
        1,
        "MPSCNNSubOp: Only ndim == 1 for Input(1) is supported for now");

    // Stage the full broadcast vector in a Metal buffer the kernel can read.
    auto broadcastBuffer = [getMPSCNNContext().device
        newBufferWithBytes:X1.template data<float>()
                    length:sizeof(float) * X1.size()
                   options:MTLResourceOptionCPUCacheModeDefault];

    // Output has the same geometry as X0.
    auto outputWrapper = MPSImageWrapper(
        this,
        &imageWrapper,
        X0.numberOfImages,
        X0.height,
        X0.width,
        X0.featureChannels);
    auto commandBuffer = outputWrapper.getCommandBuffer();
    MPSImage* output = outputWrapper.getImage();

    id<MTLComputeCommandEncoder> encoder =
        [commandBuffer computeCommandEncoder];
    // Specialize the kernel on (numberOfImages, featureChannels, len(X1)) so
    // the shader knows the broadcast length and texture-vs-array layout.
    id<MTLComputePipelineState> state =
        getMPSCNNContext().getSpecializedPipelineState(
            @"elementwise_sub",
            {{ushort(X0.numberOfImages),
              ushort(X0.featureChannels),
              ushort(X1.dim32(0))}});

    [encoder setComputePipelineState:state];
    [encoder setTexture:[X0 texture] atIndex:0];
    [encoder setBuffer:broadcastBuffer offset:0 atIndex:1];
    [encoder setTexture:[output texture] atIndex:2];
    const auto& launchParams =
        spatialPointwiseKernelLaunchParams(state, output);
    [encoder dispatchThreadgroups:launchParams.threadgroupsPerGrid
            threadsPerThreadgroup:launchParams.threadsPerThreadgroup];
    [encoder endEncoding];
    imageWrapper.markRead();
    outputWrapper.copyToOutputBlob(Outputs()[0]);
    VLOG(2) << "ElementwiseSub took: " << timer.MilliSeconds();
    return true;
  }
};

REGISTER_CPU_OPERATOR(MPSCNNSub, MPSCNNSubOp);
OPERATOR_SCHEMA(MPSCNNSub).NumInputs(2).NumOutputs(1).AllowInplace({{0, 0}});
|
||||
|
||||
class MPSCNNAddOp final : public Operator<CPUContext> {
|
||||
public:
|
||||
MPSCNNAddOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
|
|
@ -1224,7 +1301,8 @@ class MPSCNNConvTransposeOp final : public ConvTransposeUnpoolBase<CPUContext> {
|
|||
CAFFE_ENFORCE_EQ(conv_.outputFeatureChannels, output_channels * kH * kW);
|
||||
CAFFE_ENFORCE_EQ(conv_.kernelWidth, 1);
|
||||
CAFFE_ENFORCE_EQ(conv_.kernelHeight, 1);
|
||||
if (divRoundUp(X.numberOfImages * output_channels * kH * kW, 4) > MAX_ARR_LENGTH) {
|
||||
if (divRoundUp(X.numberOfImages * output_channels * kH * kW, 4) >
|
||||
kMetalMaxTextureArrLength) {
|
||||
LOG(INFO) << "ConvTranspose " << X.numberOfImages << " " << output_channels << " " << kH
|
||||
<< " " << kW;
|
||||
LOG(ERROR) << "arrayLength exceeds the maximum allowed length in texture";
|
||||
|
|
@ -1280,8 +1358,6 @@ class MPSCNNConvTransposeOp final : public ConvTransposeUnpoolBase<CPUContext> {
|
|||
// Output: Y
|
||||
INPUT_TAGS(INPUT, FILTER, BIAS);
|
||||
id<MTLBuffer> biasBuffer_;
|
||||
int MAX_ARR_LENGTH = 2048;
|
||||
|
||||
MPSCNNConvolution* conv_{nullptr};
|
||||
};
|
||||
|
||||
|
|
@ -1568,6 +1644,13 @@ class MPSCNNRoIWarpOp final : public Operator<CPUContext> {
|
|||
inputWrapper.cleanup();
|
||||
return false;
|
||||
}
|
||||
if (divRoundUp(R.dim32(0) * featureChannels, 4) >
|
||||
kMetalMaxTextureArrLength) {
|
||||
LOG(INFO) << "MPSCNNRoIWarp " << R.dim32(0) << " " << featureChannels;
|
||||
LOG(ERROR) << "arrayLength exceeds the maximum allowed length in texture";
|
||||
inputWrapper.cleanup();
|
||||
return false;
|
||||
}
|
||||
auto outputWrapper = MPSImageWrapper(
|
||||
this, &inputWrapper, R.dim32(0), pooled_height_, pooled_width_, featureChannels);
|
||||
auto commandBuffer = outputWrapper.getCommandBuffer();
|
||||
|
|
|
|||
|
|
@ -14,7 +14,6 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
#include "mpscnn_graph_mask.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "mpscnn_context.h"
|
||||
|
|
@ -48,17 +47,35 @@ string as_string(StorageType st) {
|
|||
|
||||
std::unordered_map<string, std::vector<StorageType>> inputStorageTypeMap = {
|
||||
{"MPSCNNGenerateProposalsCPP",
|
||||
std::vector<StorageType>{
|
||||
StorageType::CPU, StorageType::CPU, StorageType::CPU, StorageType::CPU}},
|
||||
{"MPSCNNRoIWarp", std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE, StorageType::CPU}},
|
||||
std::vector<StorageType>{StorageType::CPU,
|
||||
StorageType::CPU,
|
||||
StorageType::CPU,
|
||||
StorageType::CPU}},
|
||||
{"MPSCNNRoIWarp",
|
||||
std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
|
||||
StorageType::CPU}},
|
||||
{"MPSCNNConvRelu",
|
||||
std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE, StorageType::CPU, StorageType::CPU}},
|
||||
std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
|
||||
StorageType::CPU,
|
||||
StorageType::CPU}},
|
||||
{"MPSCNNFC",
|
||||
std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE, StorageType::CPU, StorageType::CPU}},
|
||||
std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
|
||||
StorageType::CPU,
|
||||
StorageType::CPU}},
|
||||
{"MPSCNNConv",
|
||||
std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE, StorageType::CPU, StorageType::CPU}},
|
||||
std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
|
||||
StorageType::CPU,
|
||||
StorageType::CPU}},
|
||||
{"MPSCNNConvTranspose",
|
||||
std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE, StorageType::CPU, StorageType::CPU}}};
|
||||
std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
|
||||
StorageType::CPU,
|
||||
StorageType::CPU}},
|
||||
{"MPSCNNMul",
|
||||
std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
|
||||
StorageType::CPU}},
|
||||
{"MPSCNNSub",
|
||||
std::vector<StorageType>{StorageType::MPSTEMPORARYIMAGE,
|
||||
StorageType::CPU}}};
|
||||
std::unordered_map<string, std::vector<StorageType>> outputStorageTypeMap = {
|
||||
{"MPSCNNGenerateProposalsCPP", std::vector<StorageType>{StorageType::CPU, StorageType::CPU}}};
|
||||
std::vector<string> opsNeedsSync = {"MPSCNNGenerateProposalsCPP", "CopyFromMPSCNN", "CopyToMPSCNN"};
|
||||
|
|
@ -430,7 +447,7 @@ NetDef insertCopies(const NetDef& def) {
|
|||
expectedBlobType = StorageType::CPU;
|
||||
}
|
||||
auto inputBlob = ogOp.input(j);
|
||||
auto version = analysis.ssa[i].outVersions[inputBlob];
|
||||
auto version = analysis.ssa[i].inVersions[inputBlob];
|
||||
// Check whether the blob is produced by previous operators
|
||||
if (analysis.blobInfoMap.find(inputBlob) != analysis.blobInfoMap.end() &&
|
||||
analysis.blobInfoMap[inputBlob][version].storageType != StorageType::INVALID) {
|
||||
|
|
|
|||
|
|
@ -674,20 +674,51 @@ kernel void elementwise_mul(texture2d<half, access::read> in0[[texture(0), funct
|
|||
texture2d_array<half, access::write> outa[[texture(2), function_constant(in0_is_arr)]],
|
||||
constant float* in1[[buffer(1)]],
|
||||
ushort3 gid[[thread_position_in_grid]]) {
|
||||
ushort last_dim = ushort_arg_2;
|
||||
ushort idx;
|
||||
if (in0_is_tex) {
|
||||
if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
|
||||
return;
|
||||
}
|
||||
idx = gid.y * out.get_width() + gid.x;
|
||||
} else {
|
||||
if (gid.x >= outa.get_width() || gid.y >= outa.get_height()) {
|
||||
return;
|
||||
}
|
||||
idx = gid.y * outa.get_width() + gid.x;
|
||||
}
|
||||
ushort2 gid_ = ushort2(gid.x, gid.y);
|
||||
if (in0_is_tex) {
|
||||
out.write(in0.read(gid_) * in1[0], gid_);
|
||||
out.write(in0.read(gid_) * in1[idx % last_dim], gid_);
|
||||
} else {
|
||||
outa.write(ina0.read(gid_, gid.z) * in1[0], gid_, gid.z);
|
||||
outa.write(ina0.read(gid_, gid.z) * in1[idx % last_dim], gid_, gid.z);
|
||||
}
|
||||
}
|
||||
|
||||
// Broadcast element-wise subtraction: out = in0 - in1, where in1 is a 1-D
// float buffer broadcast along the last dimension (its length arrives via
// ushort_arg_2). The plain-texture vs. texture-array variant is selected at
// pipeline-specialization time through the in0_is_tex / in0_is_arr function
// constants, so only one branch survives compilation.
kernel void elementwise_sub(texture2d<half, access::read> in0[[texture(0), function_constant(in0_is_tex)]],
                            texture2d_array<half, access::read> ina0[[texture(0), function_constant(in0_is_arr)]],
                            texture2d<half, access::write> out[[texture(2), function_constant(in0_is_tex)]],
                            texture2d_array<half, access::write> outa[[texture(2), function_constant(in0_is_arr)]],
                            constant float* in1[[buffer(1)]],
                            ushort3 gid[[thread_position_in_grid]]) {
  const ushort in1Length = ushort_arg_2;
  const ushort2 loc = ushort2(gid.x, gid.y);
  if (in0_is_tex) {
    // Guard against threads dispatched past the texture edge.
    if (gid.x >= out.get_width() || gid.y >= out.get_height()) {
      return;
    }
    const ushort linear = gid.y * out.get_width() + gid.x;
    out.write(in0.read(loc) - in1[linear % in1Length], loc);
  } else {
    if (gid.x >= outa.get_width() || gid.y >= outa.get_height()) {
      return;
    }
    const ushort linear = gid.y * outa.get_width() + gid.x;
    outa.write(ina0.read(loc, gid.z) - in1[linear % in1Length], loc, gid.z);
  }
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1582,7 +1582,7 @@ void testMPSCNN() {
|
|||
|
||||
{
|
||||
auto* t = ws.CreateBlob("X1_cpu")->GetMutable<TensorCPU>();
|
||||
t->Resize(1);
|
||||
t->Resize(72);
|
||||
CPUContext ctx;
|
||||
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
|
||||
}
|
||||
|
|
@ -1633,6 +1633,71 @@ void testMPSCNN() {
|
|||
}
|
||||
}
|
||||
|
||||
{
|
||||
LOG(INFO) << "MPSCNNSub Test";
|
||||
Workspace ws;
|
||||
{
|
||||
auto* t = ws.CreateBlob("X0_cpu")->GetMutable<TensorCPU>();
|
||||
t->Resize(1, 12, 57, 72);
|
||||
CPUContext ctx;
|
||||
math::RandGaussian<float, CPUContext>(
|
||||
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
|
||||
}
|
||||
|
||||
{
|
||||
auto* t = ws.CreateBlob("X1_cpu")->GetMutable<TensorCPU>();
|
||||
t->Resize(72);
|
||||
CPUContext ctx;
|
||||
math::RandGaussian<float, CPUContext>(
|
||||
t->size(), 0, 1, t->mutable_data<float>(), &ctx);
|
||||
}
|
||||
|
||||
NetDef netdef;
|
||||
{
|
||||
auto& op = *(netdef.add_op());
|
||||
op.set_type("CopyToMPSCNN");
|
||||
op.add_input("X0_cpu");
|
||||
op.add_output("X0_mtl");
|
||||
}
|
||||
|
||||
{
|
||||
auto& op = *(netdef.add_op());
|
||||
op.set_type("MPSCNNSub");
|
||||
op.add_input("X0_mtl");
|
||||
op.add_input("X1_cpu");
|
||||
op.add_output("Y_mtl");
|
||||
add_arg_int(op, "broadcast", 1);
|
||||
}
|
||||
|
||||
{
|
||||
auto& op = *(netdef.add_op());
|
||||
op.set_type("CopyFromMPSCNN");
|
||||
op.add_input("Y_mtl");
|
||||
op.add_output("Y_cpu");
|
||||
}
|
||||
|
||||
{
|
||||
auto& op = *(netdef.add_op());
|
||||
op.set_type("Sub");
|
||||
op.add_input("X0_cpu");
|
||||
op.add_input("X1_cpu");
|
||||
op.add_output("Y_ref");
|
||||
add_arg_int(op, "broadcast", 1);
|
||||
}
|
||||
|
||||
ws.RunNetOnce(netdef);
|
||||
const auto& t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
|
||||
const auto& t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
|
||||
|
||||
CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
|
||||
for (auto i = 0; i < t1.size(); ++i) {
|
||||
// FP16 <-> FP32 round trip, accumulation, etc.
|
||||
const float t1_i = t1.data<float>()[i];
|
||||
const float t2_i = t2.data<float>()[i];
|
||||
CHECK_NEAR(t1_i, t2_i, 0.01);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
LOG(INFO) << "MPSAdd Test";
|
||||
Workspace ws;
|
||||
|
|
|
|||
Loading…
Reference in a new issue