Directly use the Python numpy array's memory if it is already contiguous. (#2355)

* Directly use the Python numpy array's memory if it is already contiguous. This can greatly improve performance for sessions with large inputs, such as a big 1920x1080 image with fastrcnn, where a 30~40% speed-up could be achieved. * Add a test case that exercises both contiguous and non-contiguous numpy arrays as inputs.
2026-07-06 04:28:32 +00:00 · 2019-11-11 13:46:55 -08:00 · 2019-11-11 13:46:55 -08:00 · aa37e2de8f
commit aa37e2de8f
parent ed6da0d191
2 changed files with 84 additions and 58 deletions
--- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
+++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
@ -117,66 +117,72 @@ std::unique_ptr<Tensor> CreateTensor(AllocatorPtr alloc, const std::string& name

    TensorShape shape(dims);
    auto element_type = NumpyToOnnxRuntimeTensorType(npy_type);
-    p_tensor = onnxruntime::make_unique<Tensor>(element_type, shape, alloc);
-    if (npy_type == NPY_UNICODE) {
-      // Copy string data which needs to be done after Tensor is allocated.
-      // Strings are Python strings or numpy.unicode string.
-      std::string* dst = p_tensor->MutableData<std::string>();
-      auto item_size = PyArray_ITEMSIZE(darray);
-      auto num_chars = item_size / PyUnicode_4BYTE_KIND;
-      char* src = static_cast<char*>(PyArray_DATA(darray));
-      const char* str;
-      Py_ssize_t size;
-      PyObject* pStr;
-      for (int i = 0; i < shape.Size(); i++, src += item_size) {
-        // Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8.
-        pStr = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, src, num_chars);
-        str = PyUnicode_AsUTF8AndSize(pStr, &size);
-        if (str == NULL) {
-          dst[i] = "";
-        } else {
-          // Size is equal to the longest string size, numpy stores
-          // strings in a single array. Those code assumes a string ends with a final 0.
-          dst[i] = str;
-        }
-        Py_XDECREF(pStr);
-      }
-    } else if (npy_type == NPY_STRING || npy_type == NPY_VOID) {
-      // Copy string data which needs to be done after Tensor is allocated.
-      // Strings are given as bytes (encoded strings).
-      // NPY_VOID does not trim final 0.
-      // NPY_STRING assumes bytes string ends with a final 0.
-      std::string* dst = p_tensor->MutableData<std::string>();
-      auto item_size = PyArray_ITEMSIZE(darray);
-      char* src = static_cast<char*>(PyArray_DATA(darray));
-      for (int i = 0; i < shape.Size(); i++, src += item_size) {
-        if (npy_type == NPY_STRING) {
-          dst[i] = src;
-        } else {
-          dst[i].resize(item_size);
-          memcpy((void*)dst[i].c_str(), src, item_size);
-        }
-      }
-    } else if (npy_type == NPY_OBJECT) {
-      // Converts object into string.
-      std::string* dst = p_tensor->MutableData<std::string>();
-      auto item_size = PyArray_ITEMSIZE(darray);
-      char* src = static_cast<char*>(PyArray_DATA(darray));
-      PyObject *item, *pStr;
-      for (int i = 0; i < shape.Size(); ++i, src += item_size) {
-        // Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8.
-        item = PyArray_GETITEM(darray, src);
-        pStr = PyObject_Str(item);
-        dst[i] = py::reinterpret_borrow<py::str>(pStr);
-        Py_XDECREF(pStr);
-      }
+    if (pyObject == darray && npy_type != NPY_UNICODE && npy_type != NPY_STRING &&
+        npy_type != NPY_VOID && npy_type != NPY_OBJECT) {
+      p_tensor = onnxruntime::make_unique<Tensor>(
+          element_type, shape, static_cast<void*>(PyArray_DATA(darray)), alloc->Info());
    } else {
-      void* buffer = p_tensor->MutableDataRaw();
-      size_t len;
-      if (!IAllocator::CalcMemSizeForArray(element_type->Size(), shape.Size(), &len)) {
-        throw std::runtime_error("length overflow");
+      p_tensor = onnxruntime::make_unique<Tensor>(element_type, shape, alloc);
+      if (npy_type == NPY_UNICODE) {
+        // Copy string data which needs to be done after Tensor is allocated.
+        // Strings are Python strings or numpy.unicode string.
+        std::string* dst = p_tensor->MutableData<std::string>();
+        auto item_size = PyArray_ITEMSIZE(darray);
+        auto num_chars = item_size / PyUnicode_4BYTE_KIND;
+        char* src = static_cast<char*>(PyArray_DATA(darray));
+        const char* str;
+        Py_ssize_t size;
+        PyObject* pStr;
+        for (int i = 0; i < shape.Size(); i++, src += item_size) {
+          // Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8.
+          pStr = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, src, num_chars);
+          str = PyUnicode_AsUTF8AndSize(pStr, &size);
+          if (str == NULL) {
+            dst[i] = "";
+          } else {
+            // Size is equal to the longest string size, numpy stores
+            // strings in a single array. Those code assumes a string ends with a final 0.
+            dst[i] = str;
+          }
+          Py_XDECREF(pStr);
+        }
+      } else if (npy_type == NPY_STRING || npy_type == NPY_VOID) {
+        // Copy string data which needs to be done after Tensor is allocated.
+        // Strings are given as bytes (encoded strings).
+        // NPY_VOID does not trim final 0.
+        // NPY_STRING assumes bytes string ends with a final 0.
+        std::string* dst = p_tensor->MutableData<std::string>();
+        auto item_size = PyArray_ITEMSIZE(darray);
+        char* src = static_cast<char*>(PyArray_DATA(darray));
+        for (int i = 0; i < shape.Size(); i++, src += item_size) {
+          if (npy_type == NPY_STRING) {
+            dst[i] = src;
+          } else {
+            dst[i].resize(item_size);
+            memcpy((void*)dst[i].c_str(), src, item_size);
+          }
+        }
+      } else if (npy_type == NPY_OBJECT) {
+        // Converts object into string.
+        std::string* dst = p_tensor->MutableData<std::string>();
+        auto item_size = PyArray_ITEMSIZE(darray);
+        char* src = static_cast<char*>(PyArray_DATA(darray));
+        PyObject *item, *pStr;
+        for (int i = 0; i < shape.Size(); ++i, src += item_size) {
+          // Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8.
+          item = PyArray_GETITEM(darray, src);
+          pStr = PyObject_Str(item);
+          dst[i] = py::reinterpret_borrow<py::str>(pStr);
+          Py_XDECREF(pStr);
+        }
+      } else {
+        void* buffer = p_tensor->MutableDataRaw();
+        size_t len;
+        if (!IAllocator::CalcMemSizeForArray(element_type->Size(), shape.Size(), &len)) {
+          throw std::runtime_error("length overflow");
+        }
+        memcpy(buffer, static_cast<void*>(PyArray_DATA(darray)), len);
      }
-      memcpy(buffer, static_cast<void*>(PyArray_DATA(darray)), len);
    }
  } catch (...) {
    if (!dref) {
--- a/onnxruntime/test/python/onnxruntime_test_python.py
+++ b/onnxruntime/test/python/onnxruntime_test_python.py
@ -118,6 +118,26 @@ class TestInferenceSession(unittest.TestCase):
        np.testing.assert_allclose(
            output_expected, res[0], rtol=1e-05, atol=1e-08)

+    def testRunModel2Contiguous(self):
+        sess = onnxrt.InferenceSession(self.get_name("matmul_1.onnx"))
+        x = np.array([[2.0, 1.0], [4.0, 3.0], [6.0, 5.0]], dtype=np.float32)[:,[1,0]]
+        input_name = sess.get_inputs()[0].name
+        self.assertEqual(input_name, "X")
+        input_shape = sess.get_inputs()[0].shape
+        self.assertEqual(input_shape, [3, 2])
+        output_name = sess.get_outputs()[0].name
+        self.assertEqual(output_name, "Y")
+        output_shape = sess.get_outputs()[0].shape
+        self.assertEqual(output_shape, [3, 1])
+        res = sess.run([output_name], {input_name: x})
+        output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32)
+        np.testing.assert_allclose(
+            output_expected, res[0], rtol=1e-05, atol=1e-08)
+        xcontiguous = np.ascontiguousarray(x)
+        rescontiguous = sess.run([output_name], {input_name: xcontiguous})
+        np.testing.assert_allclose(
+            output_expected, rescontiguous[0], rtol=1e-05, atol=1e-08)
+
    def testRunModelMultipleThreads(self):
        so = onnxrt.SessionOptions()
        so.log_verbosity_level = 1