From aa37e2de8f9e2991341df437dfec40259b5e4e33 Mon Sep 17 00:00:00 2001
From: Zhang Lei <zhang.huanning@hotmail.com>
Date: Mon, 11 Nov 2019 13:46:55 -0800
Subject: [PATCH] Direct use python numpy array's memory if already contiguous.
  (#2355)

* Directly use the Python numpy array's memory if it is already contiguous.
This can greatly improve performance for sessions with large inputs: for
example, with a big 1920x1080 image on fastrcnn, a 30~40% speedup can be achieved.

* Add a test case that exercises both contiguous and non-contiguous numpy arrays as inputs.
---
 .../python/onnxruntime_pybind_mlvalue.cc      | 122 +++++++++---------
 .../test/python/onnxruntime_test_python.py    |  20 +++
 2 files changed, 84 insertions(+), 58 deletions(-)
diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
index b5e9d7ad4e..491ca6e105 100644
--- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
+++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
@@ -117,66 +117,72 @@ std::unique_ptr<Tensor> CreateTensor(AllocatorPtr alloc, const std::string& name
 
     TensorShape shape(dims);
     auto element_type = NumpyToOnnxRuntimeTensorType(npy_type);
-    p_tensor = onnxruntime::make_unique<Tensor>(element_type, shape, alloc);
-    if (npy_type == NPY_UNICODE) {
-      // Copy string data which needs to be done after Tensor is allocated.
-      // Strings are Python strings or numpy.unicode string.
-      std::string* dst = p_tensor->MutableData<std::string>();
-      auto item_size = PyArray_ITEMSIZE(darray);
-      auto num_chars = item_size / PyUnicode_4BYTE_KIND;
-      char* src = static_cast<char*>(PyArray_DATA(darray));
-      const char* str;
-      Py_ssize_t size;
-      PyObject* pStr;
-      for (int i = 0; i < shape.Size(); i++, src += item_size) {
-        // Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8.
-        pStr = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, src, num_chars);
-        str = PyUnicode_AsUTF8AndSize(pStr, &size);
-        if (str == NULL) {
-          dst[i] = "";
-        } else {
-          // Size is equal to the longest string size, numpy stores
-          // strings in a single array. Those code assumes a string ends with a final 0.
-          dst[i] = str;
-        }
-        Py_XDECREF(pStr);
-      }
-    } else if (npy_type == NPY_STRING || npy_type == NPY_VOID) {
-      // Copy string data which needs to be done after Tensor is allocated.
-      // Strings are given as bytes (encoded strings).
-      // NPY_VOID does not trim final 0.
-      // NPY_STRING assumes bytes string ends with a final 0.
-      std::string* dst = p_tensor->MutableData<std::string>();
-      auto item_size = PyArray_ITEMSIZE(darray);
-      char* src = static_cast<char*>(PyArray_DATA(darray));
-      for (int i = 0; i < shape.Size(); i++, src += item_size) {
-        if (npy_type == NPY_STRING) {
-          dst[i] = src;
-        } else {
-          dst[i].resize(item_size);
-          memcpy((void*)dst[i].c_str(), src, item_size);
-        }
-      }
-    } else if (npy_type == NPY_OBJECT) {
-      // Converts object into string.
-      std::string* dst = p_tensor->MutableData<std::string>();
-      auto item_size = PyArray_ITEMSIZE(darray);
-      char* src = static_cast<char*>(PyArray_DATA(darray));
-      PyObject *item, *pStr;
-      for (int i = 0; i < shape.Size(); ++i, src += item_size) {
-        // Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8.
-        item = PyArray_GETITEM(darray, src);
-        pStr = PyObject_Str(item);
-        dst[i] = py::reinterpret_borrow<py::str>(pStr);
-        Py_XDECREF(pStr);
-      }
+    if (pyObject == darray && npy_type != NPY_UNICODE && npy_type != NPY_STRING &&
+        npy_type != NPY_VOID && npy_type != NPY_OBJECT) {
+      p_tensor = onnxruntime::make_unique<Tensor>(
+          element_type, shape, static_cast<void*>(PyArray_DATA(darray)), alloc->Info());
     } else {
-      void* buffer = p_tensor->MutableDataRaw();
-      size_t len;
-      if (!IAllocator::CalcMemSizeForArray(element_type->Size(), shape.Size(), &len)) {
-        throw std::runtime_error("length overflow");
+      p_tensor = onnxruntime::make_unique<Tensor>(element_type, shape, alloc);
+      if (npy_type == NPY_UNICODE) {
+        // Copy string data which needs to be done after Tensor is allocated.
+        // Strings are Python strings or numpy.unicode string.
+        std::string* dst = p_tensor->MutableData<std::string>();
+        auto item_size = PyArray_ITEMSIZE(darray);
+        auto num_chars = item_size / PyUnicode_4BYTE_KIND;
+        char* src = static_cast<char*>(PyArray_DATA(darray));
+        const char* str;
+        Py_ssize_t size;
+        PyObject* pStr;
+        for (int i = 0; i < shape.Size(); i++, src += item_size) {
+          // Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8.
+          pStr = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, src, num_chars);
+          str = PyUnicode_AsUTF8AndSize(pStr, &size);
+          if (str == NULL) {
+            dst[i] = "";
+          } else {
+            // Size is equal to the longest string size, numpy stores
+            // strings in a single array. Those code assumes a string ends with a final 0.
+            dst[i] = str;
+          }
+          Py_XDECREF(pStr);
+        }
+      } else if (npy_type == NPY_STRING || npy_type == NPY_VOID) {
+        // Copy string data which needs to be done after Tensor is allocated.
+        // Strings are given as bytes (encoded strings).
+        // NPY_VOID does not trim final 0.
+        // NPY_STRING assumes bytes string ends with a final 0.
+        std::string* dst = p_tensor->MutableData<std::string>();
+        auto item_size = PyArray_ITEMSIZE(darray);
+        char* src = static_cast<char*>(PyArray_DATA(darray));
+        for (int i = 0; i < shape.Size(); i++, src += item_size) {
+          if (npy_type == NPY_STRING) {
+            dst[i] = src;
+          } else {
+            dst[i].resize(item_size);
+            memcpy((void*)dst[i].c_str(), src, item_size);
+          }
+        }
+      } else if (npy_type == NPY_OBJECT) {
+        // Converts object into string.
+        std::string* dst = p_tensor->MutableData<std::string>();
+        auto item_size = PyArray_ITEMSIZE(darray);
+        char* src = static_cast<char*>(PyArray_DATA(darray));
+        PyObject *item, *pStr;
+        for (int i = 0; i < shape.Size(); ++i, src += item_size) {
+          // Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8.
+          item = PyArray_GETITEM(darray, src);
+          pStr = PyObject_Str(item);
+          dst[i] = py::reinterpret_borrow<py::str>(pStr);
+          Py_XDECREF(pStr);
+        }
+      } else {
+        void* buffer = p_tensor->MutableDataRaw();
+        size_t len;
+        if (!IAllocator::CalcMemSizeForArray(element_type->Size(), shape.Size(), &len)) {
+          throw std::runtime_error("length overflow");
+        }
+        memcpy(buffer, static_cast<void*>(PyArray_DATA(darray)), len);
       }
-      memcpy(buffer, static_cast<void*>(PyArray_DATA(darray)), len);
     }
   } catch (...) {
     if (!dref) {
diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py
index 43d8dc8e9c..6b5a808037 100644
--- a/onnxruntime/test/python/onnxruntime_test_python.py
+++ b/onnxruntime/test/python/onnxruntime_test_python.py
@@ -118,6 +118,26 @@ class TestInferenceSession(unittest.TestCase):
         np.testing.assert_allclose(
             output_expected, res[0], rtol=1e-05, atol=1e-08)
 
+    def testRunModel2Contiguous(self):
+        sess = onnxrt.InferenceSession(self.get_name("matmul_1.onnx"))
+        x = np.array([[2.0, 1.0], [4.0, 3.0], [6.0, 5.0]], dtype=np.float32)[:,[1,0]]
+        input_name = sess.get_inputs()[0].name
+        self.assertEqual(input_name, "X")
+        input_shape = sess.get_inputs()[0].shape
+        self.assertEqual(input_shape, [3, 2])
+        output_name = sess.get_outputs()[0].name
+        self.assertEqual(output_name, "Y")
+        output_shape = sess.get_outputs()[0].shape
+        self.assertEqual(output_shape, [3, 1])
+        res = sess.run([output_name], {input_name: x})
+        output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32)
+        np.testing.assert_allclose(
+            output_expected, res[0], rtol=1e-05, atol=1e-08)
+        xcontiguous = np.ascontiguousarray(x)
+        rescontiguous = sess.run([output_name], {input_name: xcontiguous})
+        np.testing.assert_allclose(
+            output_expected, rescontiguous[0], rtol=1e-05, atol=1e-08)
+
     def testRunModelMultipleThreads(self):
         so = onnxrt.SessionOptions()
         so.log_verbosity_level = 1