From aa37e2de8f9e2991341df437dfec40259b5e4e33 Mon Sep 17 00:00:00 2001 From: Zhang Lei Date: Mon, 11 Nov 2019 13:46:55 -0800 Subject: [PATCH] Direct use python numpy array's memory if already contiguous. (#2355) * Directly use the Python numpy array's memory if it is already contiguous. This can greatly improve performance for sessions with large inputs; e.g. with a large 1920x1080 image in fastrcnn, a 30~40% speed-up can be achieved. * Add a test case that exercises contiguous/non-contiguous numpy arrays as inputs. --- .../python/onnxruntime_pybind_mlvalue.cc | 122 +++++++++--------- .../test/python/onnxruntime_test_python.py | 20 +++ 2 files changed, 84 insertions(+), 58 deletions(-) diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc index b5e9d7ad4e..491ca6e105 100644 --- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc +++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc @@ -117,66 +117,72 @@ std::unique_ptr CreateTensor(AllocatorPtr alloc, const std::string& name TensorShape shape(dims); auto element_type = NumpyToOnnxRuntimeTensorType(npy_type); - p_tensor = onnxruntime::make_unique(element_type, shape, alloc); - if (npy_type == NPY_UNICODE) { - // Copy string data which needs to be done after Tensor is allocated. - // Strings are Python strings or numpy.unicode string. - std::string* dst = p_tensor->MutableData(); - auto item_size = PyArray_ITEMSIZE(darray); - auto num_chars = item_size / PyUnicode_4BYTE_KIND; - char* src = static_cast(PyArray_DATA(darray)); - const char* str; - Py_ssize_t size; - PyObject* pStr; - for (int i = 0; i < shape.Size(); i++, src += item_size) { - // Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8. - pStr = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, src, num_chars); - str = PyUnicode_AsUTF8AndSize(pStr, &size); - if (str == NULL) { - dst[i] = ""; - } else { - // Size is equal to the longest string size, numpy stores - // strings in a single array. 
Those code assumes a string ends with a final 0. - dst[i] = str; - } - Py_XDECREF(pStr); - } - } else if (npy_type == NPY_STRING || npy_type == NPY_VOID) { - // Copy string data which needs to be done after Tensor is allocated. - // Strings are given as bytes (encoded strings). - // NPY_VOID does not trim final 0. - // NPY_STRING assumes bytes string ends with a final 0. - std::string* dst = p_tensor->MutableData(); - auto item_size = PyArray_ITEMSIZE(darray); - char* src = static_cast(PyArray_DATA(darray)); - for (int i = 0; i < shape.Size(); i++, src += item_size) { - if (npy_type == NPY_STRING) { - dst[i] = src; - } else { - dst[i].resize(item_size); - memcpy((void*)dst[i].c_str(), src, item_size); - } - } - } else if (npy_type == NPY_OBJECT) { - // Converts object into string. - std::string* dst = p_tensor->MutableData(); - auto item_size = PyArray_ITEMSIZE(darray); - char* src = static_cast(PyArray_DATA(darray)); - PyObject *item, *pStr; - for (int i = 0; i < shape.Size(); ++i, src += item_size) { - // Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8. - item = PyArray_GETITEM(darray, src); - pStr = PyObject_Str(item); - dst[i] = py::reinterpret_borrow(pStr); - Py_XDECREF(pStr); - } + if (pyObject == darray && npy_type != NPY_UNICODE && npy_type != NPY_STRING && + npy_type != NPY_VOID && npy_type != NPY_OBJECT) { + p_tensor = onnxruntime::make_unique( + element_type, shape, static_cast(PyArray_DATA(darray)), alloc->Info()); } else { - void* buffer = p_tensor->MutableDataRaw(); - size_t len; - if (!IAllocator::CalcMemSizeForArray(element_type->Size(), shape.Size(), &len)) { - throw std::runtime_error("length overflow"); + p_tensor = onnxruntime::make_unique(element_type, shape, alloc); + if (npy_type == NPY_UNICODE) { + // Copy string data which needs to be done after Tensor is allocated. + // Strings are Python strings or numpy.unicode string. 
+ std::string* dst = p_tensor->MutableData(); + auto item_size = PyArray_ITEMSIZE(darray); + auto num_chars = item_size / PyUnicode_4BYTE_KIND; + char* src = static_cast(PyArray_DATA(darray)); + const char* str; + Py_ssize_t size; + PyObject* pStr; + for (int i = 0; i < shape.Size(); i++, src += item_size) { + // Python unicode strings are assumed to be UCS-4. Strings are stored as UTF-8. + pStr = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, src, num_chars); + str = PyUnicode_AsUTF8AndSize(pStr, &size); + if (str == NULL) { + dst[i] = ""; + } else { + // Size is equal to the longest string size, numpy stores + // strings in a single array. This code assumes a string ends with a final 0. + dst[i] = str; + } + Py_XDECREF(pStr); + } + } else if (npy_type == NPY_STRING || npy_type == NPY_VOID) { + // Copy string data which needs to be done after Tensor is allocated. + // Strings are given as bytes (encoded strings). + // NPY_VOID does not trim final 0. + // NPY_STRING assumes bytes string ends with a final 0. + std::string* dst = p_tensor->MutableData(); + auto item_size = PyArray_ITEMSIZE(darray); + char* src = static_cast(PyArray_DATA(darray)); + for (int i = 0; i < shape.Size(); i++, src += item_size) { + if (npy_type == NPY_STRING) { + dst[i] = src; + } else { + dst[i].resize(item_size); + memcpy((void*)dst[i].c_str(), src, item_size); + } + } + } else if (npy_type == NPY_OBJECT) { + // Converts object into string. + std::string* dst = p_tensor->MutableData(); + auto item_size = PyArray_ITEMSIZE(darray); + char* src = static_cast(PyArray_DATA(darray)); + PyObject *item, *pStr; + for (int i = 0; i < shape.Size(); ++i, src += item_size) { + // Python unicode strings are assumed to be UCS-4. Strings are stored as UTF-8. 
+ item = PyArray_GETITEM(darray, src); + pStr = PyObject_Str(item); + dst[i] = py::reinterpret_borrow(pStr); + Py_XDECREF(pStr); + } + } else { + void* buffer = p_tensor->MutableDataRaw(); + size_t len; + if (!IAllocator::CalcMemSizeForArray(element_type->Size(), shape.Size(), &len)) { + throw std::runtime_error("length overflow"); + } + memcpy(buffer, static_cast(PyArray_DATA(darray)), len); } - memcpy(buffer, static_cast(PyArray_DATA(darray)), len); } } catch (...) { if (!dref) { diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index 43d8dc8e9c..6b5a808037 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -118,6 +118,26 @@ class TestInferenceSession(unittest.TestCase): np.testing.assert_allclose( output_expected, res[0], rtol=1e-05, atol=1e-08) + def testRunModel2Contiguous(self): + sess = onnxrt.InferenceSession(self.get_name("matmul_1.onnx")) + x = np.array([[2.0, 1.0], [4.0, 3.0], [6.0, 5.0]], dtype=np.float32)[:,[1,0]] + input_name = sess.get_inputs()[0].name + self.assertEqual(input_name, "X") + input_shape = sess.get_inputs()[0].shape + self.assertEqual(input_shape, [3, 2]) + output_name = sess.get_outputs()[0].name + self.assertEqual(output_name, "Y") + output_shape = sess.get_outputs()[0].shape + self.assertEqual(output_shape, [3, 1]) + res = sess.run([output_name], {input_name: x}) + output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) + np.testing.assert_allclose( + output_expected, res[0], rtol=1e-05, atol=1e-08) + xcontiguous = np.ascontiguousarray(x) + rescontiguous = sess.run([output_name], {input_name: xcontiguous}) + np.testing.assert_allclose( + output_expected, rescontiguous[0], rtol=1e-05, atol=1e-08) + def testRunModelMultipleThreads(self): so = onnxrt.SessionOptions() so.log_verbosity_level = 1