From ab3fea540dc75f2dfe9d3688258502042e0bbb59 Mon Sep 17 00:00:00 2001
From: Yangqing Jia
Date: Thu, 1 Dec 2016 22:30:55 -0800
Subject: [PATCH] Add serialization interface for MKLMemory

Summary: This allows us to serialize things between MKLMemory and a TensorProto.

Reviewed By: dzhulgakov

Differential Revision: D4218044

fbshipit-source-id: 934181493b482cb259c17ff4b17008eac52fd885
---
 caffe2/core/blob_serialization.cc            |   9 +-
 caffe2/core/common.h                         |  29 +++++
 caffe2/mkl/mklmemory_serialization.cc        | 124 ++++++++++++++++++
 caffe2/mkl/mklmemory_serialization_test.cc   |  54 ++++++++
 caffe2/mkl/operators/operator_fallback_mkl.h | 127 +++++++++++++++++++
 caffe2/operators/conv_pool_op_base.h         |  13 +-
 caffe2/operators/operator_fallback_gpu.h     |  26 ----
 caffe2/utils/mkl/mkl_memory.h                | 109 ++++++++++++----
 8 files changed, 433 insertions(+), 58 deletions(-)
 create mode 100644 caffe2/mkl/mklmemory_serialization.cc
 create mode 100644 caffe2/mkl/mklmemory_serialization_test.cc
 create mode 100644 caffe2/mkl/operators/operator_fallback_mkl.h

diff --git a/caffe2/core/blob_serialization.cc b/caffe2/core/blob_serialization.cc
index 70676cb2e08..0f06bbccd24 100644
--- a/caffe2/core/blob_serialization.cc
+++ b/caffe2/core/blob_serialization.cc
@@ -69,7 +69,14 @@ std::string TensorDeviceTypeName(const int32_t& d) {
     case MKLDNN:
       return "TensorMKLDNN";
     default:
-      CAFFE_THROW("Unknown device: ", d);
+      CAFFE_THROW(
+          "Unknown device: ",
+          d,
+          ". If you have recently updated the caffe2.proto file to add a new "
+          "device type, did you forget to update the TensorDeviceTypeName() "
+          "function to reflect such recent changes?");
+      // The below code won't run but is needed to suppress some compiler
+      // warnings.
       return "";
   }
 };
diff --git a/caffe2/core/common.h b/caffe2/core/common.h
index d1651ee9b61..82b331be109 100644
--- a/caffe2/core/common.h
+++ b/caffe2/core/common.h
@@ -114,6 +114,35 @@ inline Dst dynamic_cast_if_rtti(Src ptr) {
 #endif
 }
 
+// SkipIndices is used in operator_fallback_gpu.h and operator_fallback_mkl.h
+// as a utility that marks the input / output indices to skip when we use a
+// CPU operator as the fallback for a GPU/MKL operator.
+template <int... values>
+class SkipIndices {
+ private:
+  template <int V>
+  static inline bool ContainsInternal(const int i) {
+    return (i == V);
+  }
+  template <int First, int Second, int... Rest>
+  static inline bool ContainsInternal(const int i) {
+    return (i == First) || ContainsInternal<Second, Rest...>(i);
+  }
+
+ public:
+  static inline bool Contains(const int i) {
+    return ContainsInternal<values...>(i);
+  }
+};
+
+template <>
+class SkipIndices<> {
+ public:
+  static inline bool Contains(const int i) {
+    return false;
+  }
+};
+
 } // namespace caffe2
 
 #endif // CAFFE2_CORE_COMMON_H_
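Annotation: the SkipIndices helper moved into common.h is a compile-time list with a runtime membership test. A minimal usage sketch (illustrative, not part of the patch; it only assumes the caffe2 headers are on the include path):

    #include <cassert>
    #include "caffe2/core/common.h"

    int main() {
      using caffe2::SkipIndices;
      assert(SkipIndices<1, 3>::Contains(1));   // 1 is in the skip list
      assert(SkipIndices<1, 3>::Contains(3));   // so is 3
      assert(!SkipIndices<1, 3>::Contains(2));  // 2 is not
      assert(!SkipIndices<>::Contains(0));      // the empty list skips nothing
      return 0;
    }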
diff --git a/caffe2/mkl/mklmemory_serialization.cc b/caffe2/mkl/mklmemory_serialization.cc
new file mode 100644
index 00000000000..8164d570440
--- /dev/null
+++ b/caffe2/mkl/mklmemory_serialization.cc
@@ -0,0 +1,124 @@
+#include "caffe2/core/blob.h"
+#include "caffe2/core/blob_serialization.h"
+#include "caffe2/utils/mkl_utils.h"
+
+#ifdef CAFFE2_HAS_MKL_DNN
+
+namespace caffe2 {
+namespace mkl {
+/**
+ * @brief MKLMemorySerializer is the serializer for MKLMemory.
+ *
+ * MKLMemorySerializer takes in a blob that contains an MKLMemory, and
+ * serializes it into a TensorProto protocol buffer.
+ */
+class MKLMemorySerializer : public BlobSerializerBase {
+ public:
+  MKLMemorySerializer() {}
+  ~MKLMemorySerializer() {}
+
+  void Serialize(
+      const Blob& blob,
+      const string& name,
+      SerializationAcceptor acceptor) override {
+    BlobProto blob_proto;
+    blob_proto.set_name(name);
+    blob_proto.set_type(kTensorBlobType);
+    TensorProto* proto = blob_proto.mutable_tensor();
+    auto* device_detail = proto->mutable_device_detail();
+    device_detail->set_device_type(MKLDNN);
+    proto->set_name(name);
+    if (blob.IsType<MKLMemory<float>>()) {
+      const MKLMemory<float>& src = blob.Get<MKLMemory<float>>();
+      CAFFE_ENFORCE(
+          src.buffer(), "Cannot serialize an empty MKLMemory object.");
+      size_t total = 1;
+      for (int i = 0; i < src.dims().size(); ++i) {
+        proto->add_dims(src.dims()[i]);
+        total *= src.dims()[i];
+      }
+      proto->mutable_float_data()->Reserve(total);
+      while (total--) {
+        proto->add_float_data(0);
+      }
+      src.CopyTo(proto->mutable_float_data()->mutable_data());
+    } else if (blob.IsType<MKLMemory<double>>()) {
+      const MKLMemory<double>& src = blob.Get<MKLMemory<double>>();
+      CAFFE_ENFORCE(
+          src.buffer(), "Cannot serialize an empty MKLMemory object.");
+      size_t total = 1;
+      for (int i = 0; i < src.dims().size(); ++i) {
+        proto->add_dims(src.dims()[i]);
+        total *= src.dims()[i];
+      }
+      proto->mutable_double_data()->Reserve(total);
+      while (total--) {
+        proto->add_double_data(0);
+      }
+      src.CopyTo(proto->mutable_double_data()->mutable_data());
+    } else {
+      CAFFE_THROW(
+          "MKLMemory can only be either float or double. "
+          "Encountered unsupported type.");
+    }
+    acceptor(name, blob_proto.SerializeAsString());
+  }
+};
+
+/**
+ * @brief MKLMemoryDeserializer is the deserializer for TensorProtos that have
+ * MKLDNN as their device.
+ *
+ * The device that the deserialized Tensor will live under is determined by the
+ * device_detail field. If you want to specify the device of the deserialized
+ * tensor, change the TensorProto's corresponding fields before calling
+ * Deserialize.
+ */
+class MKLMemoryDeserializer : public BlobDeserializerBase {
+ public:
+  bool Deserialize(const BlobProto& blob_proto, Blob* blob) override {
+    const TensorProto& proto = blob_proto.tensor();
+    CAFFE_ENFORCE(
+        proto.data_type() == TensorProto_DataType_FLOAT ||
+            proto.data_type() == TensorProto_DataType_DOUBLE,
+        "MKLMemory only supports either float or double formats.");
+    CAFFE_ENFORCE(
+        !proto.has_segment(), "MKLMemory does not support segments right now.");
+    vector<TIndex> dims;
+    for (const TIndex d : proto.dims()) {
+      dims.push_back(d);
+    }
+    // TODO: right now, every deserialization creates a new MKLMemory object.
+    // Optionally, we can change that.
+    switch (proto.data_type()) {
+      case TensorProto_DataType_FLOAT: {
+        auto dst = make_unique<MKLMemory<float>>(dims);
+        dst->CopyFrom(proto.float_data().data());
+        blob->Reset(dst.release());
+        break;
+      }
+      case TensorProto_DataType_DOUBLE: {
+        auto dst = make_unique<MKLMemory<double>>(dims);
+        dst->CopyFrom(proto.double_data().data());
+        blob->Reset(dst.release());
+        break;
+      }
+      default:
+        CAFFE_THROW("This should not happen, we guarded things above already.");
+    }
+    return true;
+  }
+};
+
+} // namespace mkl
+
+REGISTER_BLOB_SERIALIZER(
+    (TypeMeta::Id<mkl::MKLMemory<float>>()),
+    mkl::MKLMemorySerializer);
+REGISTER_BLOB_SERIALIZER(
+    (TypeMeta::Id<mkl::MKLMemory<double>>()),
+    mkl::MKLMemorySerializer);
+REGISTER_BLOB_DESERIALIZER(TensorMKLDNN, mkl::MKLMemoryDeserializer);
+} // namespace caffe2
+
+#endif // CAFFE2_HAS_MKL_DNN
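Annotation: neither class is called directly. Blob::Serialize finds MKLMemorySerializer through the TypeMeta ids registered above, and Blob::Deserialize finds MKLMemoryDeserializer through the "TensorMKLDNN" key that TensorDeviceTypeName() maps the MKLDNN device to. A rough round-trip sketch (the blob name and shape are illustrative; the new test that follows exercises the same path end to end):

    caffe2::Blob blob;
    blob.Reset(new caffe2::mkl::MKLMemory<float>(std::vector<caffe2::TIndex>{2, 3}));
    // ... fill the memory via CopyFrom ...
    std::string bytes = blob.Serialize("X");  // BlobProto of type "Tensor" with
                                              // device_detail MKLDNN, dims {2, 3}
    caffe2::Blob restored;
    restored.Deserialize(bytes);              // dispatches to MKLMemoryDeserializer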
diff --git a/caffe2/mkl/mklmemory_serialization_test.cc b/caffe2/mkl/mklmemory_serialization_test.cc
new file mode 100644
index 00000000000..e8274f7f6d5
--- /dev/null
+++ b/caffe2/mkl/mklmemory_serialization_test.cc
@@ -0,0 +1,54 @@
+#include "caffe2/core/blob.h"
+#include "caffe2/core/blob_serialization.h"
+#include "caffe2/core/common.h"
+#include "caffe2/utils/mkl_utils.h"
+
+#include "gtest/gtest.h"
+
+#ifdef CAFFE2_HAS_MKL_DNN
+
+namespace caffe2 {
+
+using mkl::MKLMemory;
+
+TEST(MKLTest, MKLMemorySerialization) {
+  Blob blob;
+  vector<TIndex> shape{2, 3, 4};
+  float data[2 * 3 * 4];
+  for (int i = 0; i < 2 * 3 * 4; ++i) {
+    data[i] = i;
+  }
+  blob.Reset<MKLMemory<float>>(new MKLMemory<float>(shape));
+  MKLMemory<float>* mkl_memory = blob.GetMutable<MKLMemory<float>>();
+  mkl_memory->CopyFrom(data);
+  string serialized = blob.Serialize("test");
+  BlobProto proto;
+  CHECK(proto.ParseFromString(serialized));
+  EXPECT_EQ(proto.name(), "test");
+  EXPECT_EQ(proto.type(), "Tensor");
+  EXPECT_TRUE(proto.has_tensor());
+  const TensorProto& tensor_proto = proto.tensor();
+  EXPECT_EQ(
+      tensor_proto.data_type(), TypeMetaToDataType(TypeMeta::Make<float>()));
+  EXPECT_EQ(tensor_proto.float_data_size(), 2 * 3 * 4);
+  for (int i = 0; i < 2 * 3 * 4; ++i) {
+    EXPECT_EQ(tensor_proto.float_data(i), static_cast<float>(i));
+  }
+  Blob new_blob;
+  EXPECT_TRUE(new_blob.Deserialize(serialized));
+  EXPECT_TRUE(new_blob.IsType<MKLMemory<float>>());
+  const auto& new_mkl_memory = new_blob.Get<MKLMemory<float>>();
+  EXPECT_EQ(new_mkl_memory.dims().size(), 3);
+  EXPECT_EQ(new_mkl_memory.dims()[0], 2);
+  EXPECT_EQ(new_mkl_memory.dims()[1], 3);
+  EXPECT_EQ(new_mkl_memory.dims()[2], 4);
+  float recovered_data[2 * 3 * 4];
+  new_mkl_memory.CopyTo(recovered_data);
+  for (int i = 0; i < 2 * 3 * 4; ++i) {
+    EXPECT_EQ(recovered_data[i], i);
+  }
+}
+
+} // namespace caffe2
+
+#endif // CAFFE2_HAS_MKL_DNN
diff --git a/caffe2/mkl/operators/operator_fallback_mkl.h b/caffe2/mkl/operators/operator_fallback_mkl.h
new file mode 100644
index 00000000000..bec3fd0390d
--- /dev/null
+++ b/caffe2/mkl/operators/operator_fallback_mkl.h
@@ -0,0 +1,127 @@
+#ifndef CAFFE2_OPERATORS_OPERATOR_FALLBACK_MKL_H_
+#define CAFFE2_OPERATORS_OPERATOR_FALLBACK_MKL_H_
+
+#include "caffe2/core/common.h"
+#include "caffe2/core/context.h"
+#include "caffe2/core/operator.h"
+#include "caffe2/proto/caffe2.pb.h"
+#include "caffe2/utils/mkl_utils.h"
+
+namespace caffe2 {
+
+/**
+ * @brief A templated class to allow one to wrap a CPU operator as an MKL
+ * operator.
+ *
+ * This class can be used when one does not have the MKL implementation ready
+ * yet for an operator. Essentially, what this op does is to automatically
+ * deal with data copy for you. Plausibly, this causes a lot of overhead and
+ * is not optimal, so you should use this operator mostly for quick
+ * prototyping purposes.
+ *
+ * All the input and output of the original operator should be TensorCPU.
+ *
+ * Example usage: if you have a class MyMagicOp that is CPU based, and you use
+ * the registration code
+ *   REGISTER_CPU_OPERATOR(MyMagic, MyMagicOp);
+ * to register the CPU side, you can create its corresponding MKL operator
+ * (with performance hits of course) via
+ *   REGISTER_MKL_OPERATOR(MyMagic,
+ *                         MKLFallbackOp<MyMagicOp>);
+ *
+ * Advanced usage: if you want to have some specific outputs never copied, you
+ * can use the SkipOutputCopy template argument to do that. For example, if
+ * MyMagic produces two outputs and the first output is always going to live
+ * on the CPU, you can do
+ *   REGISTER_MKL_OPERATOR(MyMagic,
+ *                         MKLFallbackOp<MyMagicOp, SkipIndices<0>>);
+ */
+template <class CPUOp, typename SkipOutputCopy = SkipIndices<>>
+class MKLFallbackOp final : public Operator<MKLContext> {
+ public:
+  USE_OPERATOR_FUNCTIONS(MKLContext);
+  MKLFallbackOp(const OperatorDef& def, Workspace* ws)
+      : Operator<MKLContext>(def, ws) {
+    CAFFE_ENFORCE_EQ(def.device_option().device_type(), MKLDNN);
+    OperatorDef base_def_(def);
+    // base_def_ runs on CPU, so we will set its device option to CPU.
+    base_def_.clear_device_option();
+    base_def_.mutable_device_option()->set_device_type(CPU);
+    // Set up the symbols for the local workspace.
+    for (const string& name : def.input()) {
+      local_input_blobs_.push_back(local_ws_.CreateBlob(name));
+      CHECK_NOTNULL(local_input_blobs_.back());
+    }
+    base_op_.reset(new CPUOp(base_def_, &local_ws_));
+    for (const string& name : def.output()) {
+      local_output_blobs_.push_back(local_ws_.GetBlob(name));
+      CHECK_NOTNULL(local_output_blobs_.back());
+    }
+  }
+
+  bool RunOnDevice() override {
+    for (int i = 0; i < InputSize(); ++i) {
+      if (OperatorBase::InputIsType<mkl::MKLMemory<float>>(i)) {
+        OperatorBase::Input<mkl::MKLMemory<float>>(i).CopyTo(
+            local_input_blobs_[i]->template GetMutable<TensorCPU>());
+      } else if (OperatorBase::InputIsType<mkl::MKLMemory<double>>(i)) {
+        OperatorBase::Input<mkl::MKLMemory<double>>(i).CopyTo(
+            local_input_blobs_[i]->template GetMutable<TensorCPU>());
+      } else {
+        VLOG(1) << "Input " << i << " is not MKLMemory. Skipping copy.";
+        // Note(jiayq): This removes a const but conceptually
+        // local_input_blobs will only be used as const blob input for the
+        // base op so we are still fine.
+        local_input_blobs_[i]->ShareExternal(
+            const_cast<void*>(OperatorBase::Inputs()[i]->GetRaw()),
+            OperatorBase::Inputs()[i]->meta());
+      }
+    }
+
+    if (!base_op_->Run()) {
+      LOG(ERROR) << "Base op run failed in MKLFallbackOp. Def: "
+                 << ProtoDebugString(def());
+      return false;
+    }
+
+    for (int i = 0; i < OutputSize(); ++i) {
+      if (SkipOutputCopy::Contains(i)) {
+        VLOG(1) << "Copy output: index " << i << " skipped.";
+        continue;
+      }
+      CAFFE_ENFORCE(
+          local_output_blobs_[i]->template IsType<TensorCPU>(),
+          "MKL fallback op currently does not support non-TensorCPU "
+          "output types that need copying.");
+      const auto& src = local_output_blobs_[i]->template Get<TensorCPU>();
+      if (src.IsType<float>()) {
+        Blob& dst = OperatorBase::OutputAt(i);
+        if (!dst.IsType<mkl::MKLMemory<float>>() ||
+            dst.Get<mkl::MKLMemory<float>>().dims() != src.dims()) {
+          dst.Reset(new mkl::MKLMemory<float>(src.dims()));
+        }
+        dst.GetMutable<mkl::MKLMemory<float>>()->CopyFrom(src);
+      } else if (src.IsType<double>()) {
+        Blob& dst = OperatorBase::OutputAt(i);
+        if (!dst.IsType<mkl::MKLMemory<double>>() ||
+            dst.Get<mkl::MKLMemory<double>>().dims() != src.dims()) {
+          dst.Reset(new mkl::MKLMemory<double>(src.dims()));
+        }
+        dst.GetMutable<mkl::MKLMemory<double>>()->CopyFrom(src);
+      } else {
+        CAFFE_THROW("MKLMemory only supports float and double.");
+      }
+    }
+    return true;
+  }
+
+ protected:
+  Workspace local_ws_;
+  vector<Blob*> local_input_blobs_;
+  vector<Blob*> local_output_blobs_;
+  std::unique_ptr<CPUOp> base_op_;
+};
+
+} // namespace caffe2
+
+#endif // CAFFE2_OPERATORS_OPERATOR_FALLBACK_MKL_H_
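Annotation: with this header in place, exposing a CPU-only operator on the MKL device mirrors the example in the class comment. A hypothetical registration (MyMagicOp and the skipped index are illustrative, not part of the patch):

    #include "caffe2/mkl/operators/operator_fallback_mkl.h"

    namespace caffe2 {
    // Runs the CPU implementation under the hood; output 0 stays a TensorCPU
    // and is never copied back into an MKLMemory.
    REGISTER_MKL_OPERATOR(
        MyMagic,
        MKLFallbackOp<MyMagicOp, SkipIndices<0>>);
    } // namespace caffe2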
diff --git a/caffe2/operators/conv_pool_op_base.h b/caffe2/operators/conv_pool_op_base.h
index 6fbc784f7cc..7a2fef079d1 100644
--- a/caffe2/operators/conv_pool_op_base.h
+++ b/caffe2/operators/conv_pool_op_base.h
@@ -97,9 +97,14 @@
   // it may not be identical to the input channels.
   // This function can be used in the forward functions to obtain the output
   // sizes.
+  // Note(jiayq): the templatization of this function is mainly to help
+  // implementations that do not use first-class Tensor objects, such as the
+  // MKL operator. One can still call this function with dummy
+  // Tensor<CPUContext> objects in order to obtain the sizes.
+  template <typename AlternativeContext>
   void SetOutputSize(
-      const Tensor<Context>& input,
-      Tensor<Context>* output,
+      const Tensor<AlternativeContext>& input,
+      Tensor<AlternativeContext>* output,
       int output_channel) {
     CAFFE_ENFORCE(4 == input.ndim());
     CAFFE_ENFORCE(input.size() > 0);
@@ -119,7 +124,7 @@ class ConvPoolOpBase : public Operator<Context> {
         W = input.dim32(3);
         break;
       default:
-        LOG(FATAL) << "Unknown Storage order: " << order_;
+        CAFFE_THROW("Unknown storage order: ", order_);
     }
 
     int output_height = 0, output_width = 0;
@@ -191,7 +196,7 @@ class ConvPoolOpBase : public Operator<Context> {
        // VLOG(2) << "Running NCHW";
        return RunOnDeviceWithOrderNCHW();
      default:
-       LOG(FATAL) << "Unknown storage order: " << order_;
+       CAFFE_THROW("Unknown storage order: ", order_);
    }
    // To suppress old compiler warnings
    return true;
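Annotation: SetOutputSize only inspects the input's shape (ndim(), size(), dim32()), so an MKL operator can now obtain output dimensions from a shape-only dummy tensor. A sketch, assuming an op deriving from ConvPoolOpBase<MKLContext> with a known NCHW input shape (the variable names are illustrative):

    std::vector<TIndex> input_dims{N, C, H, W};
    TensorCPU dummy_input(input_dims);  // shape only; no data is ever touched
    TensorCPU dummy_output;
    SetOutputSize(dummy_input, &dummy_output, output_channels);
    const auto& out_dims = dummy_output.dims();  // e.g. {N, M, H_out, W_out}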
diff --git a/caffe2/operators/operator_fallback_gpu.h b/caffe2/operators/operator_fallback_gpu.h
index d26d44913d6..d7fe666fb9e 100644
--- a/caffe2/operators/operator_fallback_gpu.h
+++ b/caffe2/operators/operator_fallback_gpu.h
@@ -9,32 +9,6 @@
 
 namespace caffe2 {
 
-template <int... values>
-class SkipIndices {
- private:
-  template <int V>
-  static inline bool ContainsInternal(const int i) {
-    return (i == V);
-  }
-  template <int First, int Second, int... Rest>
-  static inline bool ContainsInternal(const int i) {
-    return (i == First) || ContainsInternal<Second, Rest...>(i);
-  }
-
- public:
-  static inline bool Contains(const int i) {
-    return ContainsInternal<values...>(i);
-  }
-};
-
-template <>
-class SkipIndices<> {
- public:
-  static inline bool Contains(const int i) {
-    return false;
-  }
-};
-
 /**
  * @brief A templated class to allow one to wrap a CPU operator as a CUDA
  * operator.
diff --git a/caffe2/utils/mkl/mkl_memory.h b/caffe2/utils/mkl/mkl_memory.h
index ed739d690fd..9da7f56e792 100644
--- a/caffe2/utils/mkl/mkl_memory.h
+++ b/caffe2/utils/mkl/mkl_memory.h
@@ -116,6 +116,8 @@ class LayoutWrapper {
 /**
  * @brief A wrapper around an opaque MKL internal resource that has certain
  * layouts and conversion primitives set up.
+ *
+ * Most of the MKLMemory functions are not thread safe.
  */
 template <typename T>
 class MKLMemory {
@@ -131,6 +133,29 @@ class MKLMemory {
       const dnnPrimitive_t primitive = nullptr,
       const dnnResourceType_t type = dnnResourceNumber,
       bool share_mem_if_possible = false) {
+    Reset(dimension, size, strides, primitive, type, share_mem_if_possible);
+  }
+
+  // Initialize an MKLMemory, with the given dimension assuming a C-contiguous
+  // storage.
+  template <typename IndexType>
+  explicit MKLMemory(
+      const vector<IndexType>& dims,
+      const dnnPrimitive_t primitive = nullptr,
+      const dnnResourceType_t type = dnnResourceNumber,
+      bool share_mem_if_possible = false) {
+    Reset(dims, primitive, type, share_mem_if_possible);
+  }
+
+  // Initialize an MKLMemory with the given size, strides, dnn
+  // primitive and type.
+  void Reset(
+      const size_t dimension,
+      const size_t size[],
+      const size_t strides[],
+      const dnnPrimitive_t primitive = nullptr,
+      const dnnResourceType_t type = dnnResourceNumber,
+      bool share_mem_if_possible = false) {
     dims_.resize(dimension);
     for (int i = 0; i < dimension; ++i) {
       dims_[i] = size[dimension - 1 - i];
@@ -143,27 +168,22 @@ class MKLMemory {
     }
     convert_in_.Reset(dnnConversionCreate<T>, user_layout_, layout_);
     convert_out_.Reset(dnnConversionCreate<T>, layout_, user_layout_);
-    share_mem_ =
-        share_mem_if_possible && dnnLayoutCompare<T>(layout_, user_layout_);
-    if (!share_mem_) {
-      // If we do not do copy, we will create the buffer and own it.
-      void* allocated = nullptr;
-      MKLDNN_SAFE_CALL(dnnAllocateBuffer<T>(&allocated, layout_));
-      buffer_.reset(allocated, [](void* ptr) -> void {
-        MKLDNN_CHECK(dnnReleaseBuffer<T>(ptr));
-      });
-    }
+    share_mem_if_possible_ = share_mem_if_possible;
+    layout_is_user_layout_ = dnnLayoutCompare<T>(layout_, user_layout_);
   }
 
   // Initialize an MKLMemory, with the given dimension assuming a C-contiguous
   // storage.
   template <typename IndexType>
-  explicit MKLMemory(
+  void Reset(
       const vector<IndexType>& dims,
       const dnnPrimitive_t primitive = nullptr,
       const dnnResourceType_t type = dnnResourceNumber,
       bool share_mem_if_possible = false) {
-    dims_ = dims;
+    dims_.resize(dims.size());
+    for (int i = 0; i < dims.size(); ++i) {
+      dims_[i] = dims[i];
+    }
     size_t dimension = dims.size();
     size_t size[dimension];
     size_t strides[dimension];
@@ -179,27 +199,19 @@ class MKLMemory {
     }
     convert_in_.Reset(dnnConversionCreate<T>, user_layout_, layout_);
     convert_out_.Reset(dnnConversionCreate<T>, layout_, user_layout_);
-    share_mem_ =
-        share_mem_if_possible && dnnLayoutCompare<T>(layout_, user_layout_);
-    if (!share_mem_) {
-      // If we do not do copy, we will create the buffer and own it.
-      void* allocated = nullptr;
-      MKLDNN_SAFE_CALL(dnnAllocateBuffer<T>(&allocated, layout_));
-      buffer_.reset(allocated, [](void* ptr) -> void {
-        MKLDNN_CHECK(dnnReleaseBuffer<T>(ptr));
-      });
-    }
+    share_mem_if_possible_ = share_mem_if_possible;
+    layout_is_user_layout_ = dnnLayoutCompare<T>(layout_, user_layout_);
   }
 
   // Destructs the MKLMemory.
   ~MKLMemory() {}
 
   void CopyFrom(const void* ptr) {
-    if (share_mem_) {
+    if (share_mem_if_possible_ && layout_is_user_layout_) {
       buffer_.reset(const_cast<void*>(ptr), [](void*) -> void {});
     } else {
       MKLDNN_SAFE_CALL(dnnConversionExecute<T>(
-          convert_in_, const_cast<void*>(ptr), buffer_.get()));
+          convert_in_, const_cast<void*>(ptr), buffer()));
     }
   }
@@ -211,8 +223,19 @@ class MKLMemory {
     CopyFrom(tensor.template data<T>());
   }
 
+  void CopyFrom(const MKLMemory<T>& other) {
+    if (share_mem_if_possible_ && dnnLayoutCompare<T>(other.layout_, layout_)) {
+      buffer_ = other.buffer_;
+    } else {
+      PrimitiveWrapper<T> convert(
+          dnnConversionCreate<T>, other.layout_, layout_);
+      MKLDNN_SAFE_CALL(
+          dnnConversionExecute<T>(convert, other.buffer_.get(), buffer()));
+    }
+  }
+
   bool ShareFrom(const void* ptr) {
-    if (share_mem_) {
+    if (share_mem_if_possible_ && layout_is_user_layout_) {
       buffer_.reset(const_cast<void*>(ptr), [](void*) -> void {});
       return true;
     } else {
@@ -228,13 +251,22 @@ class MKLMemory {
     return ShareFrom(tensor.template data<T>());
   }
 
+  bool ShareFrom(const MKLMemory<T>& other) {
+    if (share_mem_if_possible_ && dnnLayoutCompare<T>(other.layout_, layout_)) {
+      buffer_ = other.buffer_;
+      return true;
+    } else {
+      return false;
+    }
+  }
+
   void CopyTo(void* ptr) const {
     if (buffer_.get() == ptr) {
       // This is already mapping to the same memory region. Skip copy.
       return;
     }
     CAFFE_ENFORCE(
-        buffer_.get(), "Canot copy out from an empty internal resource.");
+        buffer_.get(), "Cannot copy out from an uninitialized MKLMemory.");
     MKLDNN_SAFE_CALL(dnnConversionExecute<T>(convert_out_, buffer_.get(), ptr));
   }
@@ -247,7 +279,29 @@ class MKLMemory {
     CopyTo(tensor->mutable_data<T>());
   }
 
+  void CopyTo(MKLMemory<T>* other) {
+    if (buffer_.get() == other->buffer_.get()) {
+      return;
+    }
+    CAFFE_ENFORCE(
+        buffer_.get(), "Cannot copy out from an uninitialized MKLMemory.");
+    // TODO(jiayq): if primitive creation is a big overhead and we will be
+    // consistently copying stuff with fixed src and dst layouts, consider
+    // making a cache for the primitive below.
+    PrimitiveWrapper<T> convert(
+        dnnConversionCreate<T>, layout_, other->layout_);
+    MKLDNN_SAFE_CALL(
+        dnnConversionExecute<T>(convert, buffer_.get(), other->buffer()));
+  }
+
   inline void* buffer() {
+    if (buffer_ == nullptr) {
+      void* allocated = nullptr;
+      MKLDNN_SAFE_CALL(dnnAllocateBuffer<T>(&allocated, layout_));
+      buffer_.reset(allocated, [](void* ptr) -> void {
+        MKLDNN_CHECK(dnnReleaseBuffer<T>(ptr));
+      });
+    }
     return buffer_.get();
   }
@@ -279,7 +333,8 @@ class MKLMemory {
   }
 
  private:
-  bool share_mem_;
+  bool share_mem_if_possible_;
+  bool layout_is_user_layout_;
   // The internal buffer in the specific dnn layout.
   std::shared_ptr<void> buffer_;
   // The dimensions in the same order as Caffe2 does. This is used to
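Annotation: the reworked MKLMemory allocates its internal buffer lazily inside buffer(), so an instance that only ever shares memory never pays for an allocation, and the new MKLMemory-to-MKLMemory copies build a one-off conversion primitive between the two layouts. A short usage sketch (shapes and names are illustrative):

    caffe2::mkl::MKLMemory<float> m(std::vector<caffe2::TIndex>{2, 3, 4});
    float src[24] = {0};
    float dst[24];
    m.CopyFrom(src);  // converts into the internal MKL layout; the buffer is
                      // allocated here, on first use
    m.CopyTo(dst);    // converts back out to the user (C-contiguous) layout

    caffe2::mkl::MKLMemory<float> n(std::vector<caffe2::TIndex>{2, 3, 4});
    m.CopyTo(&n);     // layout-to-layout copy via a temporary conversion primitive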