diff --git a/caffe2/operators/quantized/int8_add_op.h b/caffe2/operators/quantized/int8_add_op.h index fa2a53eeb97..ea5cb8d5114 100644 --- a/caffe2/operators/quantized/int8_add_op.h +++ b/caffe2/operators/quantized/int8_add_op.h @@ -90,7 +90,7 @@ class Int8AddOp final : public Operator { setupStatus == qnnp_status_success, "failed to setup QNNPACK add operator"); -#ifdef FBCODE_CAFFE2 +#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL) const qnnp_status runStatus = qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */); #else diff --git a/caffe2/operators/quantized/int8_average_pool_op.h b/caffe2/operators/quantized/int8_average_pool_op.h index e9fd24b6818..1df8f180756 100644 --- a/caffe2/operators/quantized/int8_average_pool_op.h +++ b/caffe2/operators/quantized/int8_average_pool_op.h @@ -85,7 +85,7 @@ class Int8AveragePoolOp final : public ConvPoolOpBase { setupStatus == qnnp_status_success, "failed to setup QNNPACK Global Average Pooling operator"); -#ifdef FBCODE_CAFFE2 +#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL) const qnnp_status runStatus = qnnp_run_operator( this->qnnpackGlobalOperator_, nullptr /* thread pool */); #else @@ -137,7 +137,7 @@ class Int8AveragePoolOp final : public ConvPoolOpBase { setupStatus == qnnp_status_success, "failed to setup QNNPACK Average Pooling operator"); -#ifdef FBCODE_CAFFE2 +#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL) const qnnp_status runStatus = qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */); #else diff --git a/caffe2/operators/quantized/int8_channel_shuffle_op.h b/caffe2/operators/quantized/int8_channel_shuffle_op.h index 4c280c2f574..77a34c41c6b 100644 --- a/caffe2/operators/quantized/int8_channel_shuffle_op.h +++ b/caffe2/operators/quantized/int8_channel_shuffle_op.h @@ -74,7 +74,7 @@ class Int8ChannelShuffleOp final : public ConvPoolOpBase { setupStatus == qnnp_status_success, "failed to setup QNNPACK channel shuffle operator"); -#ifdef FBCODE_CAFFE2 +#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL) const qnnp_status runStatus = qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */); #else diff --git a/caffe2/operators/quantized/int8_conv_op.h b/caffe2/operators/quantized/int8_conv_op.h index 9690e6fe849..a309be9c11a 100644 --- a/caffe2/operators/quantized/int8_conv_op.h +++ b/caffe2/operators/quantized/int8_conv_op.h @@ -141,7 +141,7 @@ class Int8ConvOp final : public ConvPoolOpBase { lastOutputPointer_ = Y->t.template mutable_data(); } -#ifdef FBCODE_CAFFE2 +#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL) const qnnp_status runStatus = qnnp_run_operator(this->qnnpackObject_, nullptr /* thread pool */); #else diff --git a/caffe2/operators/quantized/int8_conv_transpose_op.h b/caffe2/operators/quantized/int8_conv_transpose_op.h index ac3c99b9943..66f9ff94710 100644 --- a/caffe2/operators/quantized/int8_conv_transpose_op.h +++ b/caffe2/operators/quantized/int8_conv_transpose_op.h @@ -140,7 +140,7 @@ class Int8ConvTransposeOp final : public ConvTransposeUnpoolBase { lastOutputPointer_ = Y->t.template mutable_data(); } -#ifdef FBCODE_CAFFE2 +#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL) const qnnp_status runStatus = qnnp_run_operator(this->qnnpackObject_, nullptr /* thread pool */); #else diff --git a/caffe2/operators/quantized/int8_fc_op.h b/caffe2/operators/quantized/int8_fc_op.h index a5a2af3392d..58e85caebeb 100644 --- a/caffe2/operators/quantized/int8_fc_op.h +++ b/caffe2/operators/quantized/int8_fc_op.h @@ -104,7 +104,7 @@ class Int8FCOp final : public Operator { lastOutputPointer_ = Y->t.template mutable_data(); } -#ifdef FBCODE_CAFFE2 +#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL) const qnnp_status runStatus = qnnp_run_operator(this->qnnpackObject_, nullptr /* thread pool */); #else diff --git a/caffe2/operators/quantized/int8_leaky_relu_op.h b/caffe2/operators/quantized/int8_leaky_relu_op.h index ec2542832e8..c05f9e47e89 100644 --- a/caffe2/operators/quantized/int8_leaky_relu_op.h +++ b/caffe2/operators/quantized/int8_leaky_relu_op.h @@ -81,7 +81,7 @@ class Int8LeakyReluOp final : public Operator { setupStatus == qnnp_status_success, "failed to setup QNNPACK Leaky ReLU operator"); -#ifdef FBCODE_CAFFE2 +#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL) const qnnp_status runStatus = qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */); #else diff --git a/caffe2/operators/quantized/int8_max_pool_op.h b/caffe2/operators/quantized/int8_max_pool_op.h index ba1edef2721..df85ef10211 100644 --- a/caffe2/operators/quantized/int8_max_pool_op.h +++ b/caffe2/operators/quantized/int8_max_pool_op.h @@ -84,7 +84,7 @@ class Int8MaxPoolOp final : public ConvPoolOpBase { setupStatus == qnnp_status_success, "failed to setup QNNPACK Max Pooling operator"); -#ifdef FBCODE_CAFFE2 +#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL) const qnnp_status runStatus = qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */); #else diff --git a/caffe2/operators/quantized/int8_relu_op.h b/caffe2/operators/quantized/int8_relu_op.h index 477e24a6b1f..100cf4459ef 100644 --- a/caffe2/operators/quantized/int8_relu_op.h +++ b/caffe2/operators/quantized/int8_relu_op.h @@ -65,7 +65,7 @@ class Int8ReluOp final : public Operator { setupStatus == qnnp_status_success, "failed to setup QNNPACK Clamp operator"); -#ifdef FBCODE_CAFFE2 +#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL) const qnnp_status runStatus = qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */); #else diff --git a/caffe2/operators/quantized/int8_sigmoid_op.h b/caffe2/operators/quantized/int8_sigmoid_op.h index 87b10c1e747..fdf5e330dff 100644 --- a/caffe2/operators/quantized/int8_sigmoid_op.h +++ b/caffe2/operators/quantized/int8_sigmoid_op.h @@ -74,7 +74,7 @@ class Int8SigmoidOp final : public Operator { setupStatus == qnnp_status_success, "failed to setup QNNPACK Sigmoid operator"); -#ifdef FBCODE_CAFFE2 +#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL) const qnnp_status runStatus = qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */); #else diff --git a/caffe2/operators/quantized/int8_softmax_op.h b/caffe2/operators/quantized/int8_softmax_op.h index 1bf836634ed..e62b2237b1c 100644 --- a/caffe2/operators/quantized/int8_softmax_op.h +++ b/caffe2/operators/quantized/int8_softmax_op.h @@ -72,7 +72,7 @@ class Int8SoftmaxOp final : public Operator { setupStatus == qnnp_status_success, "failed to setup QNNPACK SoftArgMax operator"); -#ifdef FBCODE_CAFFE2 +#if defined(FBCODE_CAFFE2) || !defined(USE_INTERNAL_PTHREADPOOL_IMPL) const qnnp_status runStatus = qnnp_run_operator(this->qnnpackOperator_, nullptr /* thread pool */); #else diff --git a/caffe2/share/contrib/nnpack/conv_op.cc b/caffe2/share/contrib/nnpack/conv_op.cc index 67dd1804531..ad071f46c48 100644 --- a/caffe2/share/contrib/nnpack/conv_op.cc +++ b/caffe2/share/contrib/nnpack/conv_op.cc @@ -195,7 +195,12 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() { const nnp_size output_subsample = {.width = static_cast(stride_w()), .height = static_cast(stride_h())}; initNNPACK(); + +#if !defined(USE_INTERNAL_PTHREADPOOL_IMPL) + pthreadpool_t pool = nullptr; +#else pthreadpool_t pool = reinterpret_cast(ws_->GetThreadPool()); +#endif runWithSharedBuffer(ws_, [&](Tensor* buffer) { if (transformStrategy_ == nnp_convolution_transform_strategy_precompute) { diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index d6ffc506606..487bdd98965 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -239,10 +239,10 @@ if(USE_NNPACK OR USE_QNNPACK OR USE_PYTORCH_QNNPACK OR USE_XNNPACK) endif() if(DISABLE_NNPACK_AND_FAMILY) - set(USE_NNPACK OFF) - set(USE_QNNPACK OFF) - set(USE_PYTORCH_QNNPACK OFF) - set(USE_XNNPACK OFF) + caffe2_update_option(USE_NNPACK OFF) + caffe2_update_option(USE_QNNPACK OFF) + caffe2_update_option(USE_PYTORCH_QNNPACK OFF) + caffe2_update_option(USE_XNNPACK OFF) else() set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party") @@ -261,10 +261,6 @@ if(USE_NNPACK OR USE_QNNPACK OR USE_PYTORCH_QNNPACK OR USE_XNNPACK) if(NOT DEFINED PTHREADPOOL_SOURCE_DIR) set(PTHREADPOOL_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/pthreadpool" CACHE STRING "pthreadpool source directory") endif() - - set(CPUINFO_LIBRARY_TYPE "static" CACHE STRING "") - set(CPUINFO_LOG_LEVEL "error" CACHE STRING "") - set(PTHREADPOOL_LIBRARY_TYPE "static" CACHE STRING "") endif() else() set(DISABLE_NNPACK_AND_FAMILY ON) @@ -283,42 +279,45 @@ if(INTERN_BUILD_MOBILE AND INTERN_USE_EIGEN_BLAS) endif() # ---[ pthreadpool -if(NOT USE_SYSTEM_PTHREADPOOL AND (INTERN_BUILD_MOBILE OR NOT DISABLE_NNPACK_AND_FAMILY)) +# Only add a dependency on pthreadpool if we are on a mobile build +# or are building any of the libraries in the {Q/X}NNPACK family. +if(INTERN_BUILD_MOBILE OR NOT DISABLE_NNPACK_AND_FAMILY) set(USE_PTHREADPOOL ON CACHE BOOL "" FORCE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_PTHREADPOOL") - # Opt for custom Caffe2 implementation on MSVC. Windows support seems to have - # been added to pthreadpool recently but the current third party revision we are - # using right now does not suppor it. Should unify later after updating pthreadpool. - if(MSVC) - set(USE_INTERNAL_PTHREADPOOL_IMPL ON CACHE BOOL "" FORCE) - # XNNPACK cannot link against a custom implementation of pthreadpool - caffe2_update_option(USE_XNNPACK OFF) - else() - # We would like to maintain the ability to build against the internal C2 - # pthreadpool implementation for now, hence this flag. This flag is not - # exposed as a build option to the user and is purly internal. - set(USE_INTERNAL_PTHREADPOOL_IMPL OFF CACHE BOOL "" FORCE) + # Always use third_party/pthreadpool. + set(USE_INTERNAL_PTHREADPOOL_IMPL OFF CACHE BOOL "" FORCE) - if(NOT DEFINED PTHREADPOOL_SOURCE_DIR) - set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party") - set(PTHREADPOOL_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/pthreadpool" CACHE STRING "pthreadpool source directory") - endif() + if(NOT TARGET pthreadpool) + if(USE_SYSTEM_PTHREADPOOL) + add_library(pthreadpool SHARED IMPORTED) + find_library(PTHREADPOOL_LIBRARY pthreadpool) + set_property(TARGET pthreadpool PROPERTY IMPORTED_LOCATION "${PTHREADPOOL_LIBRARY}") + if(NOT PTHREADPOOL_LIBRARY) + message(FATAL_ERROR "Cannot find pthreadpool") + endif() + message("-- Found pthreadpool: ${PTHREADPOOL_LIBRARY}") + elseif(NOT USE_INTERNAL_PTHREADPOOL_IMPL) + if(NOT DEFINED PTHREADPOOL_SOURCE_DIR) + set(CAFFE2_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party") + set(PTHREADPOOL_SOURCE_DIR "${CAFFE2_THIRD_PARTY_ROOT}/pthreadpool" CACHE STRING "pthreadpool source directory") + endif() - if(NOT TARGET pthreadpool) set(PTHREADPOOL_BUILD_TESTS OFF CACHE BOOL "") set(PTHREADPOOL_BUILD_BENCHMARKS OFF CACHE BOOL "") + set(PTHREADPOOL_LIBRARY_TYPE "static" CACHE STRING "") + set(PTHREADPOOL_ALLOW_DEPRECATED_API ON CACHE BOOL "") add_subdirectory( "${PTHREADPOOL_SOURCE_DIR}" "${CONFU_DEPENDENCIES_BINARY_DIR}/pthreadpool") set_property(TARGET pthreadpool PROPERTY POSITION_INDEPENDENT_CODE ON) endif() - list(APPEND Caffe2_DEPENDENCY_LIBS pthreadpool) - endif() - - if(USE_INTERNAL_PTHREADPOOL_IMPL) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_INTERNAL_PTHREADPOOL_IMPL") + if(USE_INTERNAL_PTHREADPOOL_IMPL) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_INTERNAL_PTHREADPOOL_IMPL") + else() + list(APPEND Caffe2_DEPENDENCY_LIBS pthreadpool) + endif() endif() else() set(USE_PTHREADPOOL OFF CACHE BOOL "" FORCE) @@ -385,6 +384,28 @@ if(USE_QNNPACK) # them into a shared library for Caffe2, so they need PIC. set_property(TARGET qnnpack PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET cpuinfo PROPERTY POSITION_INDEPENDENT_CODE ON) + + if(QNNPACK_CUSTOM_THREADPOOL) + target_compile_definitions( + qnnpack PRIVATE + pthreadpool_t=legacy_pthreadpool_t + pthreadpool_function_1d_t=legacy_pthreadpool_function_1d_t + pthreadpool_function_1d_tiled_t=legacy_pthreadpool_function_1d_tiled_t + pthreadpool_function_2d_t=legacy_pthreadpool_function_2d_t + pthreadpool_function_2d_tiled_t=legacy_pthreadpool_function_2d_tiled_t + pthreadpool_function_3d_tiled_t=legacy_pthreadpool_function_3d_tiled_t + pthreadpool_function_4d_tiled_t=legacy_pthreadpool_function_4d_tiled_t + pthreadpool_create=legacy_pthreadpool_create + pthreadpool_destroy=legacy_pthreadpool_destroy + pthreadpool_get_threads_count=legacy_pthreadpool_get_threads_count + pthreadpool_compute_1d=legacy_pthreadpool_compute_1d + pthreadpool_parallelize_1d=legacy_pthreadpool_parallelize_1d + pthreadpool_compute_1d_tiled=legacy_pthreadpool_compute_1d_tiled + pthreadpool_compute_2d=legacy_pthreadpool_compute_2d + pthreadpool_compute_2d_tiled=legacy_pthreadpool_compute_2d_tiled + pthreadpool_compute_3d_tiled=legacy_pthreadpool_compute_3d_tiled + pthreadpool_compute_4d_tiled=legacy_pthreadpool_compute_4d_tiled) + endif() endif() list(APPEND Caffe2_DEPENDENCY_LIBS qnnpack) @@ -418,6 +439,28 @@ if(USE_PYTORCH_QNNPACK) # them into a shared library for Caffe2, so they need PIC. set_property(TARGET pytorch_qnnpack PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET cpuinfo PROPERTY POSITION_INDEPENDENT_CODE ON) + + if(PYTORCH_QNNPACK_CUSTOM_THREADPOOL) + target_compile_definitions( + pytorch_qnnpack PRIVATE + pthreadpool_t=legacy_pthreadpool_t + pthreadpool_function_1d_t=legacy_pthreadpool_function_1d_t + pthreadpool_function_1d_tiled_t=legacy_pthreadpool_function_1d_tiled_t + pthreadpool_function_2d_t=legacy_pthreadpool_function_2d_t + pthreadpool_function_2d_tiled_t=legacy_pthreadpool_function_2d_tiled_t + pthreadpool_function_3d_tiled_t=legacy_pthreadpool_function_3d_tiled_t + pthreadpool_function_4d_tiled_t=legacy_pthreadpool_function_4d_tiled_t + pthreadpool_create=legacy_pthreadpool_create + pthreadpool_destroy=legacy_pthreadpool_destroy + pthreadpool_get_threads_count=legacy_pthreadpool_get_threads_count + pthreadpool_compute_1d=legacy_pthreadpool_compute_1d + pthreadpool_parallelize_1d=legacy_pthreadpool_parallelize_1d + pthreadpool_compute_1d_tiled=legacy_pthreadpool_compute_1d_tiled + pthreadpool_compute_2d=legacy_pthreadpool_compute_2d + pthreadpool_compute_2d_tiled=legacy_pthreadpool_compute_2d_tiled + pthreadpool_compute_3d_tiled=legacy_pthreadpool_compute_3d_tiled + pthreadpool_compute_4d_tiled=legacy_pthreadpool_compute_4d_tiled) + endif() endif() list(APPEND Caffe2_DEPENDENCY_LIBS pytorch_qnnpack) diff --git a/cmake/External/nnpack.cmake b/cmake/External/nnpack.cmake index 24f54627c01..84244dc864c 100644 --- a/cmake/External/nnpack.cmake +++ b/cmake/External/nnpack.cmake @@ -76,6 +76,28 @@ if(ANDROID OR IOS OR ${CMAKE_SYSTEM_NAME} STREQUAL "Linux" OR ${CMAKE_SYSTEM_NAM set_property(TARGET nnpack PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET pthreadpool PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET cpuinfo PROPERTY POSITION_INDEPENDENT_CODE ON) + + if(NNPACK_CUSTOM_THREADPOOL) + target_compile_definitions( + nnpack PRIVATE + pthreadpool_t=legacy_pthreadpool_t + pthreadpool_function_1d_t=legacy_pthreadpool_function_1d_t + pthreadpool_function_1d_tiled_t=legacy_pthreadpool_function_1d_tiled_t + pthreadpool_function_2d_t=legacy_pthreadpool_function_2d_t + pthreadpool_function_2d_tiled_t=legacy_pthreadpool_function_2d_tiled_t + pthreadpool_function_3d_tiled_t=legacy_pthreadpool_function_3d_tiled_t + pthreadpool_function_4d_tiled_t=legacy_pthreadpool_function_4d_tiled_t + pthreadpool_create=legacy_pthreadpool_create + pthreadpool_destroy=legacy_pthreadpool_destroy + pthreadpool_get_threads_count=legacy_pthreadpool_get_threads_count + pthreadpool_compute_1d=legacy_pthreadpool_compute_1d + pthreadpool_parallelize_1d=legacy_pthreadpool_parallelize_1d + pthreadpool_compute_1d_tiled=legacy_pthreadpool_compute_1d_tiled + pthreadpool_compute_2d=legacy_pthreadpool_compute_2d + pthreadpool_compute_2d_tiled=legacy_pthreadpool_compute_2d_tiled + pthreadpool_compute_3d_tiled=legacy_pthreadpool_compute_3d_tiled + pthreadpool_compute_4d_tiled=legacy_pthreadpool_compute_4d_tiled) + endif() endif() set(NNPACK_FOUND TRUE) diff --git a/third_party/pthreadpool b/third_party/pthreadpool index d465747660e..029c8862080 160000 --- a/third_party/pthreadpool +++ b/third_party/pthreadpool @@ -1 +1 @@ -Subproject commit d465747660ecf9ebbaddf8c3db37e4a13d0c9103 +Subproject commit 029c88620802e1361ccf41d1970bd5b07fd6b7bb