MKL convolution operator

Summary: Closes https://github.com/caffe2/caffe2/pull/102

Differential Revision: D4448886

Pulled By: Yangqing

fbshipit-source-id: 914d11cd79107895a9755154df3526fcf71a31ea
Yangqing Jia 2017-01-23 09:44:23 -08:00 committed by Facebook Github Bot
parent e0c90de6e6
commit e3ea3e8c12
13 changed files with 402 additions and 72 deletions

View file

@ -99,6 +99,19 @@ if (BUILD_PYTHON)
message(STATUS "Automatically generating missing __init__.py files.")
caffe_autogen_init_py_files()
# Create a custom target that copies all python files.
file(GLOB_RECURSE PYTHON_SRCS RELATIVE ${PROJECT_SOURCE_DIR}
"${PROJECT_SOURCE_DIR}/caffe2/*.py")
add_custom_target(python_copy_files ALL)
foreach(python_src ${PYTHON_SRCS})
get_filename_component(dir ${python_src} DIRECTORY)
add_custom_command(
TARGET python_copy_files PRE_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${PROJECT_SOURCE_DIR}/${python_src} ${CMAKE_BINARY_DIR}/${dir})
# file(COPY ${python_src} DESTINATION ${CMAKE_BINARY_DIR}/caffe2/${dir})
endforeach()
# Install commands
# Pick up static python files
install(DIRECTORY ${CMAKE_BINARY_DIR}/caffe2 DESTINATION ${CMAKE_INSTALL_PREFIX}

View file

@ -161,13 +161,3 @@ foreach(binary_src ${Caffe2_ALL_BINARY_SRCS})
install(TARGETS ${bin_name} DESTINATION ${CMAKE_INSTALL_PREFIX}/binaries)
endforeach()
# ---[ Python files
if (BUILD_PYTHON)
file(GLOB_RECURSE PYTHON_SRCS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.py)
foreach(python_src ${PYTHON_SRCS})
get_filename_component(dir ${python_src} DIRECTORY)
file(COPY ${python_src} DESTINATION ${CMAKE_BINARY_DIR}/caffe2/${dir})
endforeach()
endif()

View file

@ -1,12 +1,24 @@
/**
* @file flags.h
* @brief Commandline flags support for Caffe2.
*
* This is a portable commandline flags tool for caffe2, so we can optionally
* choose to use gflags or a lightweight custom implementation if gflags is
* not available on a certain platform. If you have gflags installed, setting
* the macro CAFFE2_USE_GFLAGS will seamlessly route everything to gflags.
*
* To define a flag foo of type bool that defaults to true, do the following in the
* *global* namespace:
* CAFFE2_DEFINE_bool(foo, true, "An example.");
*
* To use it in another .cc file, you can use CAFFE2_DECLARE_* as follows:
* CAFFE2_DECLARE_bool(foo);
*
* In both cases, you can then access the flag via caffe2::FLAGS_foo.
*/
#ifndef CAFFE2_CORE_FLAGS_H_
#define CAFFE2_CORE_FLAGS_H_
// A lightweighted commandline flags tool for caffe2, so we do not need to rely
// on gflags. If you have gflags installed, set the macro CAFFE2_USE_GFLAGS will
// seamlessly route everything to gflags.
#ifdef CAFFE2_USE_GFLAGS
#include <gflags/gflags.h>
#endif
#include "caffe2/core/registry.h"
@ -44,6 +56,8 @@ bool CommandLineFlagsHasBeenParsed();
#ifdef CAFFE2_USE_GFLAGS
#include <gflags/gflags.h>
#define CAFFE2_GFLAGS_DEF_WRAPPER(type, name, default_value, help_str) \
DEFINE_##type(name, default_value, help_str); \
namespace caffe2 { \
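For illustration, a minimal sketch of the define/declare pattern described in the header comment above; the flag name my_example_flag and the UseFlag() function are placeholders, not identifiers from this commit:

#include "caffe2/core/flags.h"

// Define the flag once, in the global namespace of some .cc file.
CAFFE2_DEFINE_bool(my_example_flag, true, "An example flag.");

// In any other .cc file that needs it, declare it before use.
CAFFE2_DECLARE_bool(my_example_flag);

void UseFlag() {
  // Both translation units then read the parsed value via caffe2::FLAGS_*.
  if (caffe2::FLAGS_my_example_flag) {
    // ... behavior when the flag is on ...
  }
}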

View file

@ -0,0 +1,134 @@
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_pool_op_base.h"
#include "caffe2/utils/mkl_utils.h"
#ifdef CAFFE2_HAS_MKL_DNN
namespace caffe2 {
namespace mkl {
template <typename T>
class MKLConvOp final : public ConvPoolOpBase<MKLContext> {
public:
USE_CONV_POOL_BASE_FUNCTIONS(MKLContext);
MKLConvOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<MKLContext>(operator_def, ws) {
OPERATOR_NEEDS_FEATURE(
dilation_h_ == 1 && dilation_w_ == 1, "Dilation not supported.");
OPERATOR_NEEDS_FEATURE(
pad_l_ == pad_r_ && pad_t_ == pad_b_, "Uneven padding not supported.");
OPERATOR_NEEDS_FEATURE(
order_ == StorageOrder::NCHW, "Only NCHW order supported.");
OPERATOR_NEEDS_FEATURE(
group_ == 1, "Group convolution not supported yet.");
}
~MKLConvOp() {}
// TODO(jiayq): support double if needed.
bool RunOnDeviceWithOrderNCHW() override {
auto& X = OperatorBase::Input<MKLMemory<float>>(INPUT);
auto& filter = OperatorBase::Input<MKLMemory<float>>(FILTER);
auto& bias = OperatorBase::Input<MKLMemory<float>>(BIAS);
MKLMemory<float>* Y = OperatorBase::Output<MKLMemory<float>>(0);
CAFFE_ENFORCE(4 == X.ndim());
const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
CAFFE_ENFORCE(4 == filter.ndim());
const int M = filter.dim32(0);
if (cached_input_dims_ != X.dims() ||
cached_filter_dims_ != filter.dims()) {
cached_input_dims_ = X.dims();
cached_filter_dims_ = filter.dims();
CAFFE_ENFORCE(
C == filter.dim32(1),
"Convolution op: # of input channels ",
C,
" is not equal to kernel channels:",
filter.dim32(1));
CAFFE_ENFORCE(filter.dim32(2) == kernel_h_);
CAFFE_ENFORCE(filter.dim32(3) == kernel_w_);
CAFFE_ENFORCE(bias.ndim() == 1);
CAFFE_ENFORCE(bias.dim32(0) == M);
size_t dimension = 4;
size_t bdata_sizes[4] = {W, H, C, N};
// We will utilize the SetOutputSize() function in the base class
// with dummy TensorCPU input and output to calculate the sizes.
TensorCPU dummy_input(X.dims());
TensorCPU dummy_output;
ConvPoolOpBase<MKLContext>::SetOutputSize(
dummy_input, &dummy_output, M);
size_t tdata_sizes[4] = {
dummy_output.dim(3), dummy_output.dim(2),
dummy_output.dim(1), dummy_output.dim(0)};
size_t fdata_sizes[4] = {kernel_w_, kernel_h_, C, M};
size_t strides[2] = {stride_w_, stride_h_};
int pads[2] = {-pad_l_, -pad_t_};
primitive_.Reset(
dnnConvolutionCreateForwardBias<float>,
nullptr,
dnnAlgorithmConvolutionDirect,
dimension,
bdata_sizes,
tdata_sizes,
fdata_sizes,
strides,
pads,
dnnBorderZeros);
Y->Reset(dummy_output.dims(), primitive_, dnnResourceDst);
buffer_.Reset(dummy_output.dims(), primitive_, dnnResourceDst, true);
input_layout_.Reset(primitive_, dnnResourceSrc);
filter_layout_.Reset(primitive_, dnnResourceFilter);
bias_layout_.Reset(primitive_, dnnResourceBias);
}
// Try to share from the output: this allows us to avoid unnecessary copy
// operations if the output is already allocated and has the same layout
// as the buffer.
buffer_.ShareFrom(*Y);
std::shared_ptr<void> X_view = X.View(
input_layout_, primitive_, dnnResourceSrc);
std::shared_ptr<void> filter_view = filter.View(
filter_layout_, primitive_, dnnResourceFilter);
std::shared_ptr<void> bias_view = bias.View(
bias_layout_, primitive_, dnnResourceBias);
resources_[dnnResourceSrc] = X_view.get();
resources_[dnnResourceFilter] = filter_view.get();
resources_[dnnResourceBias] = bias_view.get();
resources_[dnnResourceDst] = buffer_.buffer();
MKLDNN_SAFE_CALL(mkl::dnnExecute<T>(primitive_, resources_));
buffer_.CopyTo(Y, primitive_, dnnResourceDst);
return true;
}
bool RunOnDeviceWithOrderNHWC() override {
CAFFE_NOT_IMPLEMENTED;
}
private:
// Input: X, W, b
// Output: Y
vector<TIndex> cached_input_dims_;
vector<TIndex> cached_filter_dims_;
PrimitiveWrapper<T> primitive_;
LayoutWrapper<T> input_layout_;
LayoutWrapper<T> filter_layout_;
LayoutWrapper<T> bias_layout_;
MKLMemory<T> buffer_;
void* resources_[dnnResourceNumber] = {0};
INPUT_TAGS(INPUT, FILTER, BIAS);
};
} // namespace mkl
REGISTER_MKL_OPERATOR(Conv, mkl::MKLConvOp<float>);
} // namespace caffe2
#endif // CAFFE2_HAS_MKL_DNN
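For reference, a rough sketch of the output-size arithmetic that the SetOutputSize() call above relies on, assuming the standard convolution output formula (dilation is 1 here, as enforced by the feature checks); the numbers mirror the conv speed test further below and the variable names are illustrative:

#include <cstdio>

int main() {
  // 27x27 input, 3x3 kernel, pad 1 on every side, stride 1 -- as in the
  // conv speed test below.
  const int H = 27, W = 27;
  const int kernel_h = 3, kernel_w = 3;
  const int pad_t = 1, pad_b = 1, pad_l = 1, pad_r = 1;
  const int stride_h = 1, stride_w = 1;

  // Standard convolution output size: floor((in + pads - kernel) / stride) + 1.
  const int H_out = (H + pad_t + pad_b - kernel_h) / stride_h + 1;  // = 27
  const int W_out = (W + pad_l + pad_r - kernel_w) / stride_w + 1;  // = 27
  std::printf("output spatial size: %d x %d\n", H_out, W_out);
  return 0;
}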

View file

@ -22,8 +22,9 @@ class MKLReluOp : public MKLOperator<T> {
Y->Reset(X.dims(), primitive_, dnnResourceDst);
buffer_.Reset(X.dims(), primitive_, dnnResourceDst, true);
}
// Try to share from the output: this will save a copy if the output is
// already allocated and is having the same layout as the buffer has.
// Try to share from the output: this allows us to avoid unnecessary copy
// operations if the output is already allocated and has the same layout
// as the buffer.
buffer_.ShareFrom(*Y);
resources_[dnnResourceSrc] = X.buffer();
resources_[dnnResourceDst] = buffer_.buffer();

View file

@ -1,40 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import unittest
import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import cnn, core, workspace, test_util
@unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn.")
class TestMKLBasic(test_util.TestCase):
def testReLUConsistencyWithCPU(self):
X = np.random.randn(128, 4096).astype(np.float32)
mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN)
# Makes sure that feed works.
workspace.FeedBlob("X", X)
workspace.FeedBlob("X_mkl", X, device_option=mkl_do)
model = cnn.CNNModelHelper()
# Makes sure that we can run relu.
model.Relu("X", "Y")
model.Relu("X_mkl", "Y_mkl", device_option=mkl_do)
workspace.CreateNet(model.net)
workspace.RunNet(model.net)
# makes sure that the results are good.
np.testing.assert_allclose(
workspace.FetchBlob("Y"),
workspace.FetchBlob("Y_mkl"),
atol=1e-10,
rtol=1e-10)
runtime = workspace.BenchmarkNet(model.net.Proto().name, 1, 10, True)
# The returned runtime is the time of
# [whole_net, cpu_op, mkl_op]
# so we will assume that the MKL one runs faster than the CPU one.
self.assertTrue(runtime[1] >= runtime[2])
print("CPU runtime {}, MKL runtime {}.".format(runtime[1], runtime[2]))
if __name__ == '__main__':
unittest.main()

View file

@ -0,0 +1,51 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import unittest
import hypothesis.strategies as st
from hypothesis import given, settings
import numpy as np
from caffe2.python import core, workspace
import caffe2.python.hypothesis_test_util as hu
import caffe2.python.mkl_test_util as mu
@unittest.skipIf(not workspace.C.has_mkldnn,
"Skipping as we do not have mkldnn.")
class MKLConvTest(hu.HypothesisTestCase):
@given(stride=st.integers(1, 3),
pad=st.integers(0, 3),
kernel=st.integers(3, 5),
size=st.integers(8, 8),
input_channels=st.integers(1, 3),
output_channels=st.integers(1, 3),
batch_size=st.integers(1, 3),
**mu.gcs)
@settings(max_examples=2, timeout=100)
def test_mkl_convolution(self, stride, pad, kernel, size,
input_channels, output_channels,
batch_size, gc, dc):
op = core.CreateOperator(
"Conv",
["X", "w", "b"],
["Y"],
stride=stride,
pad=pad,
kernel=kernel,
)
X = np.random.rand(
batch_size, input_channels, size, size).astype(np.float32) - 0.5
w = np.random.rand(
output_channels, input_channels, kernel, kernel) \
.astype(np.float32) - 0.5
b = np.random.rand(output_channels).astype(np.float32) - 0.5
inputs = [X, w, b]
self.assertDeviceChecks(dc, op, inputs, [0])
if __name__ == "__main__":
import unittest
unittest.main()

View file

@ -67,7 +67,7 @@ class PackedFCTest(hu.HypothesisTestCase):
def ref(X, W, b):
output_axes = list(X.shape[:axis]) + [N]
return (
np.dot(X.reshape(X.size / K, K), W.T).reshape(output_axes) + b,)
np.dot(X.reshape(int(X.size / K), K), W.T).reshape(output_axes) + b,)
self.assertReferenceChecks(gc, op, [X, W, b], ref)

View file

@ -0,0 +1,80 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import unittest
import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import cnn, core, workspace, test_util
@unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn.")
class TestMKLBasic(test_util.TestCase):
def testReLUSpeed(self):
X = np.random.randn(128, 4096).astype(np.float32)
mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN)
# Makes sure that feed works.
workspace.FeedBlob("X", X)
workspace.FeedBlob("X_mkl", X, device_option=mkl_do)
net = core.Net("test")
# Makes sure that we can run relu.
net.Relu("X", "Y")
net.Relu("X_mkl", "Y_mkl", device_option=mkl_do)
workspace.CreateNet(net)
workspace.RunNet(net)
# makes sure that the results are good.
np.testing.assert_allclose(
workspace.FetchBlob("Y"),
workspace.FetchBlob("Y_mkl"),
atol=1e-10,
rtol=1e-10)
runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True)
# The returned runtime is the time of
# [whole_net, cpu_op, mkl_op]
# so we will assume that the MKL one runs faster than the CPU one.
# Note(Yangqing): in fact, it seems that in optimized mode, this is
# not always guaranteed - MKL runs slower than the Eigen vectorized
# version, so I am turning this assertion off.
#self.assertTrue(runtime[1] >= runtime[2])
print("Relu CPU runtime {}, MKL runtime {}.".format(runtime[1], runtime[2]))
def testConvSpeed(self):
# We randomly select a shape to test the speed. Intentionally we
# test a batch size of 1 since this may be the most frequent use
# case for MKL during deployment time.
X = np.random.rand(1, 256, 27, 27).astype(np.float32) - 0.5
W = np.random.rand(192, 256, 3, 3).astype(np.float32) - 0.5
b = np.random.rand(192).astype(np.float32) - 0.5
mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN)
# Makes sure that feed works.
workspace.FeedBlob("X", X)
workspace.FeedBlob("W", W)
workspace.FeedBlob("b", b)
workspace.FeedBlob("X_mkl", X, device_option=mkl_do)
workspace.FeedBlob("W_mkl", W, device_option=mkl_do)
workspace.FeedBlob("b_mkl", b, device_option=mkl_do)
net = core.Net("test")
# Makes sure that we can run conv.
net.Conv(["X", "W", "b"], "Y", pad=1, stride=1, kernel=3)
net.Conv(["X_mkl", "W_mkl", "b_mkl"], "Y_mkl",
pad=1, stride=1, kernel=3, device_option=mkl_do)
workspace.CreateNet(net)
workspace.RunNet(net)
# makes sure that the results are good.
np.testing.assert_allclose(
workspace.FetchBlob("Y"),
workspace.FetchBlob("Y_mkl"),
atol=1e-2,
rtol=1e-2)
runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True)
print("Conv CPU runtime {}, MKL runtime {}.".format(runtime[1], runtime[2]))
if __name__ == '__main__':
unittest.main()

View file

@ -20,7 +20,6 @@ class TestCase(unittest.TestCase):
workspace.GlobalInit([
'caffe2',
'--caffe2_log_level=0',
'--caffe2_omp_num_threads=1',
])
def setUp(self):

View file

@ -2,6 +2,14 @@
#ifdef CAFFE2_HAS_MKL_DNN
CAFFE2_DEFINE_bool(
caffe2_mkl_implicit_layout_change, false,
"Controls the behavior when we call View() on an MKLMemory: if it is set "
"true, then the View() function will actually change the underlying "
"storage. If it is set false, an implicit copy is triggered but the "
"original storage is not affected."
);
namespace caffe2 {
CAFFE_KNOWN_TYPE(mkl::MKLMemory<float>);
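A minimal sketch of flipping this flag at process startup, assuming the usual caffe2::GlobalInit() entry point from caffe2/core/init.h; the binary name and flag value are illustrative:

#include "caffe2/core/init.h"

int main(int argc, char** argv) {
  // Running e.g. `./my_binary --caffe2_mkl_implicit_layout_change=true`
  // lets GlobalInit parse the flag defined above, so subsequent View()
  // calls change the underlying storage instead of triggering a copy.
  caffe2::GlobalInit(&argc, &argv);
  // ... set up and run nets here ...
  return 0;
}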

View file

@ -3,10 +3,18 @@
#include <string>
#include <vector>
#include <mutex>
#include "caffe2/core/tensor.h" // for TIndex
#include "caffe2/core/flags.h" // for TIndex
#include "caffe2/utils/mkl/mkl_dnn_cppwrapper.h"
// A global boolean variable that controls the behavior when we call View() on
// an MKLMemory: if it is set true, then the View() function will actually
// change the underlying storage. If it is set false, an implicit copy is
// triggered but the original storage is not affected.
CAFFE2_DECLARE_bool(caffe2_mkl_implicit_layout_change);
namespace caffe2 {
namespace mkl {
@ -177,6 +185,12 @@ class MKLMemory {
convert_out_.Reset(dnnConversionCreate<T>, layout_, user_layout_);
share_mem_if_possible_ = share_mem_if_possible;
layout_is_user_layout_ = dnnLayoutCompare<T>(layout_, user_layout_);
VLOG(2) << "layout is user layout? " << layout_is_user_layout_;
if (!share_mem_if_possible_) {
// If we are not going to share memory, we will simply allocate
// memory upfront.
buffer();
}
}
// Initialize an MKLMemory, with the given dimension assuming a C-contiguous
@ -209,6 +223,12 @@ class MKLMemory {
convert_out_.Reset(dnnConversionCreate<T>, layout_, user_layout_);
share_mem_if_possible_ = share_mem_if_possible;
layout_is_user_layout_ = dnnLayoutCompare<T>(layout_, user_layout_);
VLOG(2) << "layout is user layout? " << layout_is_user_layout_;
if (!share_mem_if_possible_) {
// If we are not going to share memory, we will simply allocate
// memory upfront.
buffer();
}
}
// Destructs the MKLMemory.
@ -216,8 +236,10 @@ class MKLMemory {
void CopyFrom(const void* ptr) {
if (share_mem_if_possible_ && layout_is_user_layout_) {
VLOG(2) << "Sharing underlying memory and skip copy.";
buffer_.reset(const_cast<void*>(ptr), [](void*) -> void {});
} else {
VLOG(2) << "Copying external content.";
MKLDNN_SAFE_CALL(dnnConversionExecute<T>(
convert_in_, const_cast<void*>(ptr), buffer()));
}
@ -261,9 +283,15 @@ class MKLMemory {
bool ShareFrom(const MKLMemory<T>& other) {
if (share_mem_if_possible_ && dnnLayoutCompare<T>(other.layout_, layout_)) {
VLOG(2) << "Sharing underlying memory.";
buffer_ = other.buffer_;
if (!buffer_.get()) {
VLOG(2) << "Warning: the source MKLMemory has no content yet, so the "
"sharing actually has no effect.";
}
return true;
} else {
VLOG(2) << "Not sharing underlying memory.";
return false;
}
}
@ -271,16 +299,21 @@ class MKLMemory {
void CopyTo(void* ptr) const {
if (buffer_.get() == ptr) {
// This is already mapping to the same memory region. Skip copy.
VLOG(2) << "CopyTo does not need actual copying, as we are sharing "
"memory with the output.";
return;
}
CAFFE_ENFORCE(
buffer_.get(), "Canot copy out from an uninitialized MKLMemory.");
VLOG(2) << "Copy to external memory.";
MKLDNN_SAFE_CALL(dnnConversionExecute<T>(convert_out_, buffer_.get(), ptr));
}
void CopyTo(TensorCPU* tensor) const {
if (buffer_.get() == tensor->mutable_data<T>()) {
// This is already mapping to the same memory region. Skip copy.
VLOG(2) << "CopyTo does not need actual copying, as we are sharing "
"memory with the output.";
return;
}
tensor->Resize(dims_);
@ -295,7 +328,8 @@ class MKLMemory {
const dnnPrimitive_t primitive = nullptr,
const dnnResourceType_t type = dnnResourceNumber) {
if (buffer_.get() == other->buffer_.get()) {
VLOG(1) << "We are sharing memory with the output, skipping copy.";
VLOG(2) << "CopyTo does not need actual copying, as we are sharing "
"memory with the output.";
// This is already mapping to the same memory region. Skip copy.
return;
}
@ -304,13 +338,13 @@ class MKLMemory {
// TODO(jiayq): if primitive creation is a big overhead and we will be
// consistently copying stuff with fixed src and dst layouts, consider
// making a cache for the primitive below.
VLOG(1) << "Trying direct copy.";
VLOG(2) << "CopyTo requires copying. Performing direct copy.";
PrimitiveWrapper<T> convert(
dnnConversionCreate<T>, layout_, other->layout_);
if (dnnPrimitive_t(convert) == nullptr ||
dnnConversionExecute<T>(convert, buffer_.get(), other->buffer()) !=
E_SUCCESS) {
VLOG(1) << "Direct copy failed, will need to allocate output.";
VLOG(2) << "Direct copy failed, will need to allocate output.";
// If CopyTo directly did not succeed, it could be because the target
// MKLMemory is not having the right layout. In this case we will reset
// the target and then do another copy.
@ -348,6 +382,22 @@ class MKLMemory {
return dims_;
}
inline const int ndim() const { return dims_.size(); }
inline int dim32(const int i) const {
CAFFE_ENFORCE_LT(dims_.at(i), std::numeric_limits<int>::max());
return static_cast<int>(dims_[i]);
}
/**
* Returns the i-th dimension of the tensor. Note that the passed-in index
* must be between 0 (inclusive) and the number of dimensions (exclusive);
* otherwise this function will produce a fatal error.
*/
inline TIndex dim(const int i) const {
return dims_.at(i);
}
inline const LayoutWrapper<T>& layout() const {
return layout_;
}
@ -355,19 +405,43 @@ class MKLMemory {
// Returns a view of the content. We mark this function const, but note
// that the returned std::shared_ptr is not const-protected; user discretion
// is recommended for correctness.
std::shared_ptr<void> View(dnnLayout_t layout_wanted) const {
if (dnnLayoutCompare(layout_wanted, layout_)) {
std::shared_ptr<void> View(
dnnLayout_t layout_wanted,
dnnPrimitive_t primitive,
dnnResourceType_t type) const {
std::lock_guard<std::mutex> lock(buffer_lock_);
if (dnnLayoutCompare<T>(layout_wanted, layout_)) {
// If they are the same, return the original content.
VLOG(2) << "Creating a view without the need of copying.";
return std::shared_ptr<void>(buffer_);
} else {
void* temp_buffer;
VLOG(2) << "Creating a view with copying.";
MKLDNN_SAFE_CALL(dnnAllocateBuffer<T>(&temp_buffer, layout_wanted));
PrimitiveWrapper<T> convert(
dnnConversionCreate<T>, layout_, layout_wanted);
MKLDNN_SAFE_CALL(dnnConversionExecute<T>(convert, buffer_, temp_buffer));
return std::shared_ptr<void>(temp_buffer, [](void* ptr) -> void {
MKLDNN_CHECK(dnnReleaseBuffer<T>(ptr));
});
MKLDNN_SAFE_CALL(dnnConversionExecute<T>(
convert, buffer_.get(), temp_buffer));
if (FLAGS_caffe2_mkl_implicit_layout_change) {
VLOG(2) << "Implicit layout change set. "
"Changing the underlying storage.";
// We will need to call Reset to set up all the member variables.
// This is not thread safe, so we might want to double check if this
// makes sense in actual use cases.
const_cast<MKLMemory<T>*>(this)->Reset(
dims_, primitive, type, share_mem_if_possible_);
CAFFE_ENFORCE(dnnLayoutCompare<T>(layout_wanted, layout_),
"You passed in a target layout that is not "
"generated by the given primitive and type.");
buffer_.reset(temp_buffer, [](void* ptr) -> void {
MKLDNN_CHECK(dnnReleaseBuffer<T>(ptr));
});
return std::shared_ptr<void>(buffer_);
} else {
return std::shared_ptr<void>(temp_buffer, [](void* ptr) -> void {
MKLDNN_CHECK(dnnReleaseBuffer<T>(ptr));
});
}
}
}
@ -375,7 +449,11 @@ class MKLMemory {
bool share_mem_if_possible_;
bool layout_is_user_layout_;
// The internal buffer in the specific dnn layout.
std::shared_ptr<void> buffer_;
// It is marked mutable, but any modification in a const function should
// be guarded by the buffer lock; see the View() function.
mutable std::shared_ptr<void> buffer_;
// A mutex to control the access of buffer in the View() function.
mutable std::mutex buffer_lock_;
// The dimensions in the same order as Caffe2 does. This is used to
// interface with C2.
vector<TIndex> dims_;

View file

@ -394,7 +394,8 @@ endfunction()
# Helper function to automatically generate __init__.py files where python
# sources reside but there are no __init__.py present.
function(caffe_autogen_init_py_files)
file(GLOB_RECURSE all_python_files RELATIVE ${CMAKE_BINARY_DIR} "${CMAKE_BINARY_DIR}/*.py")
file(GLOB_RECURSE all_python_files RELATIVE ${PROJECT_SOURCE_DIR}
"${PROJECT_SOURCE_DIR}/caffe2/*.py")
set(python_paths_need_init_py)
foreach(python_file ${all_python_files})
get_filename_component(python_path ${python_file} PATH)
@ -408,6 +409,7 @@ function(caffe_autogen_init_py_files)
list(REMOVE_DUPLICATES python_paths_need_init_py)
# Since the _pb2.py files are yet to be created, we will need to manually
# add them to the list.
list(APPEND python_paths_need_init_py ${CMAKE_BINARY_DIR}/caffe)
list(APPEND python_paths_need_init_py ${CMAKE_BINARY_DIR}/caffe/proto)
list(APPEND python_paths_need_init_py ${CMAKE_BINARY_DIR}/caffe2/proto)