From 84bd71dbd49fa99c669b3908a82d5476d4e27c4d Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Sat, 14 Mar 2020 12:48:24 -0700 Subject: [PATCH] Enable threading for XNNPACK ops. (#34547) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/34547 This enables threading by passing a threadpool to xnnpack ops. Test Plan: python test/test_xnnpack_integration.py Imported from OSS Differential Revision: D20370553 fbshipit-source-id: 4db08e73f8c69b9e722b0e11a00621c4e229a31a --- CMakeLists.txt | 2 +- aten/src/ATen/native/xnnpack/Common.h | 1 + aten/src/ATen/native/xnnpack/Convolution.cpp | 6 +++--- aten/src/ATen/native/xnnpack/Linear.cpp | 6 +++--- cmake/Dependencies.cmake | 9 +++++++++ 5 files changed, 17 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b6d7d93438..ea88454d227 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -423,7 +423,7 @@ if(USE_PYTORCH_QNNPACK) endif() if(USE_XNNPACK) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_XNNPACK") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_XNNPACK -DUSE_INTERNAL_THREADPOOL_IMPL") endif() # ---[ Whitelist file if whitelist is specified diff --git a/aten/src/ATen/native/xnnpack/Common.h b/aten/src/ATen/native/xnnpack/Common.h index 09472d31368..9484d99c1f6 100644 --- a/aten/src/ATen/native/xnnpack/Common.h +++ b/aten/src/ATen/native/xnnpack/Common.h @@ -5,6 +5,7 @@ #ifdef USE_XNNPACK #include +#include "caffe2/utils/threadpool/ThreadPoolXNNPACK.h" namespace at { namespace native { diff --git a/aten/src/ATen/native/xnnpack/Convolution.cpp b/aten/src/ATen/native/xnnpack/Convolution.cpp index a8239ea8738..82c8d9b1ad4 100644 --- a/aten/src/ATen/native/xnnpack/Convolution.cpp +++ b/aten/src/ATen/native/xnnpack/Convolution.cpp @@ -110,15 +110,15 @@ Tensor run( padded_input_nhwc.size(Layout::Activation4D::width), // input_width padded_input_nhwc.data_ptr(), // input output.data_ptr(), // output - nullptr); // threadpool + caffe2::xnnpack_threadpool()); // threadpool 
TORCH_CHECK( xnn_status_success == setup_status, "xnn_setup_convolution2d_nhwc_f32 failed!"); const xnn_status run_status = xnn_run_operator( - context.op.get(), // operator - nullptr); // threadpool + context.op.get(), // operator + caffe2::xnnpack_threadpool()); // threadpool TORCH_INTERNAL_ASSERT( xnn_status_success == run_status, diff --git a/aten/src/ATen/native/xnnpack/Linear.cpp b/aten/src/ATen/native/xnnpack/Linear.cpp index ff34c7ff897..b62746f1d43 100644 --- a/aten/src/ATen/native/xnnpack/Linear.cpp +++ b/aten/src/ATen/native/xnnpack/Linear.cpp @@ -72,15 +72,15 @@ Tensor run( Layout::ActivationND::batch(padded_input.sizes()), // Batch, padded_input.data_ptr(), // input output.data_ptr(), // output - nullptr); // threadpool + caffe2::xnnpack_threadpool()); // threadpool TORCH_CHECK( xnn_status_success == setup_status, "xnn_setup_fully_connected_nc_f32 failed!"); const xnn_status run_status = xnn_run_operator( - context.op.get(), // operator - nullptr); // threadpool + context.op.get(), // operator + caffe2::xnnpack_threadpool()); // threadpool TORCH_INTERNAL_ASSERT( xnn_status_success == run_status, diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 7d13acea9ae..320bba10d8e 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -434,6 +434,15 @@ if(USE_XNNPACK) "${CONFU_DEPENDENCIES_BINARY_DIR}/XNNPACK") set_property(TARGET XNNPACK PROPERTY POSITION_INDEPENDENT_CODE ON) + # Context: the pthreadpool_get_threads_count implementation that is built in pytorch uses the + # implementation defined in caffe2/utils/threadpool/pthreadpool_impl.cc. That implementation + # assumes that the pthreadpool* passed is of type caffe2::ThreadPool and thus does a reinterpret cast. + # This is not valid when we create the pthreadpool via caffe2::xnnpack_threadpool, which is of a type + # compatible with the new pthreadpool interface and is used in PT's XNNPACK integration. 
+ # Thus all calls to pthreadpool_get_threads_count originating from XNNPACK must be routed + # appropriately to pthreadpool_get_threads_count_xnnpack, which does not do the aforementioned + # casting to caffe2::ThreadPool. Once the threadpools are unified, we will not need this. + target_compile_definitions(XNNPACK PRIVATE -Dpthreadpool_get_threads_count=pthreadpool_get_threads_count_xnnpack) endif() include_directories(SYSTEM ${XNNPACK_INCLUDE_DIR})