Directly use the Python numpy array's memory if it is already contiguous. (#2355)

* Directly use the Python numpy array's memory if it is already contiguous. This
can greatly improve performance for sessions with large inputs,
such as fastrcnn on a big 1920x1080 image, where a 30~40% speedup can be achieved.

* Add a test case that exercises both contiguous and non-contiguous numpy arrays as inputs.
This commit is contained in:
Zhang Lei 2019-11-11 13:46:55 -08:00 committed by GitHub
parent ed6da0d191
commit aa37e2de8f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 84 additions and 58 deletions

View file

@ -117,66 +117,72 @@ std::unique_ptr<Tensor> CreateTensor(AllocatorPtr alloc, const std::string& name
TensorShape shape(dims);
auto element_type = NumpyToOnnxRuntimeTensorType(npy_type);
p_tensor = onnxruntime::make_unique<Tensor>(element_type, shape, alloc);
if (npy_type == NPY_UNICODE) {
// Copy string data which needs to be done after Tensor is allocated.
// Strings are Python strings or numpy.unicode string.
std::string* dst = p_tensor->MutableData<std::string>();
auto item_size = PyArray_ITEMSIZE(darray);
auto num_chars = item_size / PyUnicode_4BYTE_KIND;
char* src = static_cast<char*>(PyArray_DATA(darray));
const char* str;
Py_ssize_t size;
PyObject* pStr;
for (int i = 0; i < shape.Size(); i++, src += item_size) {
// Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8.
pStr = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, src, num_chars);
str = PyUnicode_AsUTF8AndSize(pStr, &size);
if (str == NULL) {
dst[i] = "";
} else {
// Size is equal to the longest string size, numpy stores
// strings in a single array. Those code assumes a string ends with a final 0.
dst[i] = str;
}
Py_XDECREF(pStr);
}
} else if (npy_type == NPY_STRING || npy_type == NPY_VOID) {
// Copy string data which needs to be done after Tensor is allocated.
// Strings are given as bytes (encoded strings).
// NPY_VOID does not trim final 0.
// NPY_STRING assumes bytes string ends with a final 0.
std::string* dst = p_tensor->MutableData<std::string>();
auto item_size = PyArray_ITEMSIZE(darray);
char* src = static_cast<char*>(PyArray_DATA(darray));
for (int i = 0; i < shape.Size(); i++, src += item_size) {
if (npy_type == NPY_STRING) {
dst[i] = src;
} else {
dst[i].resize(item_size);
memcpy((void*)dst[i].c_str(), src, item_size);
}
}
} else if (npy_type == NPY_OBJECT) {
// Converts object into string.
std::string* dst = p_tensor->MutableData<std::string>();
auto item_size = PyArray_ITEMSIZE(darray);
char* src = static_cast<char*>(PyArray_DATA(darray));
PyObject *item, *pStr;
for (int i = 0; i < shape.Size(); ++i, src += item_size) {
// Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8.
item = PyArray_GETITEM(darray, src);
pStr = PyObject_Str(item);
dst[i] = py::reinterpret_borrow<py::str>(pStr);
Py_XDECREF(pStr);
}
if (pyObject == darray && npy_type != NPY_UNICODE && npy_type != NPY_STRING &&
npy_type != NPY_VOID && npy_type != NPY_OBJECT) {
p_tensor = onnxruntime::make_unique<Tensor>(
element_type, shape, static_cast<void*>(PyArray_DATA(darray)), alloc->Info());
} else {
void* buffer = p_tensor->MutableDataRaw();
size_t len;
if (!IAllocator::CalcMemSizeForArray(element_type->Size(), shape.Size(), &len)) {
throw std::runtime_error("length overflow");
p_tensor = onnxruntime::make_unique<Tensor>(element_type, shape, alloc);
if (npy_type == NPY_UNICODE) {
// Copy string data which needs to be done after Tensor is allocated.
// Strings are Python strings or numpy.unicode string.
std::string* dst = p_tensor->MutableData<std::string>();
auto item_size = PyArray_ITEMSIZE(darray);
auto num_chars = item_size / PyUnicode_4BYTE_KIND;
char* src = static_cast<char*>(PyArray_DATA(darray));
const char* str;
Py_ssize_t size;
PyObject* pStr;
for (int i = 0; i < shape.Size(); i++, src += item_size) {
// Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8.
pStr = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, src, num_chars);
str = PyUnicode_AsUTF8AndSize(pStr, &size);
if (str == NULL) {
dst[i] = "";
} else {
// Size is equal to the longest string size, numpy stores
// strings in a single array. Those code assumes a string ends with a final 0.
dst[i] = str;
}
Py_XDECREF(pStr);
}
} else if (npy_type == NPY_STRING || npy_type == NPY_VOID) {
// Copy string data which needs to be done after Tensor is allocated.
// Strings are given as bytes (encoded strings).
// NPY_VOID does not trim final 0.
// NPY_STRING assumes bytes string ends with a final 0.
std::string* dst = p_tensor->MutableData<std::string>();
auto item_size = PyArray_ITEMSIZE(darray);
char* src = static_cast<char*>(PyArray_DATA(darray));
for (int i = 0; i < shape.Size(); i++, src += item_size) {
if (npy_type == NPY_STRING) {
dst[i] = src;
} else {
dst[i].resize(item_size);
memcpy((void*)dst[i].c_str(), src, item_size);
}
}
} else if (npy_type == NPY_OBJECT) {
// Converts object into string.
std::string* dst = p_tensor->MutableData<std::string>();
auto item_size = PyArray_ITEMSIZE(darray);
char* src = static_cast<char*>(PyArray_DATA(darray));
PyObject *item, *pStr;
for (int i = 0; i < shape.Size(); ++i, src += item_size) {
// Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8.
item = PyArray_GETITEM(darray, src);
pStr = PyObject_Str(item);
dst[i] = py::reinterpret_borrow<py::str>(pStr);
Py_XDECREF(pStr);
}
} else {
void* buffer = p_tensor->MutableDataRaw();
size_t len;
if (!IAllocator::CalcMemSizeForArray(element_type->Size(), shape.Size(), &len)) {
throw std::runtime_error("length overflow");
}
memcpy(buffer, static_cast<void*>(PyArray_DATA(darray)), len);
}
memcpy(buffer, static_cast<void*>(PyArray_DATA(darray)), len);
}
} catch (...) {
if (!dref) {

View file

@ -118,6 +118,26 @@ class TestInferenceSession(unittest.TestCase):
np.testing.assert_allclose(
output_expected, res[0], rtol=1e-05, atol=1e-08)
# Runs matmul_1.onnx twice on the same values -- once with the array produced
# by fancy indexing and once with an explicit C-contiguous copy of it -- and
# checks that both runs produce the expected output.
def testRunModel2Contiguous(self):
sess = onnxrt.InferenceSession(self.get_name("matmul_1.onnx"))
# Swapping the two columns via fancy indexing gives [[1, 2], [3, 4], [5, 6]].
# NOTE(review): presumably the indexed result is not C-contiguous, so this
# input exercises the non-contiguous path -- confirm via x.flags.
x = np.array([[2.0, 1.0], [4.0, 3.0], [6.0, 5.0]], dtype=np.float32)[:,[1,0]]
# Sanity-check the model's input/output names and shapes before running.
input_name = sess.get_inputs()[0].name
self.assertEqual(input_name, "X")
input_shape = sess.get_inputs()[0].shape
self.assertEqual(input_shape, [3, 2])
output_name = sess.get_outputs()[0].name
self.assertEqual(output_name, "Y")
output_shape = sess.get_outputs()[0].shape
self.assertEqual(output_shape, [3, 1])
# First run: feed the fancy-indexed array as-is.
res = sess.run([output_name], {input_name: x})
output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32)
np.testing.assert_allclose(
output_expected, res[0], rtol=1e-05, atol=1e-08)
# Second run: same values, but guaranteed C-contiguous in memory; the result
# must match the same expected output.
xcontiguous = np.ascontiguousarray(x)
rescontiguous = sess.run([output_name], {input_name: xcontiguous})
np.testing.assert_allclose(
output_expected, rescontiguous[0], rtol=1e-05, atol=1e-08)
def testRunModelMultipleThreads(self):
so = onnxrt.SessionOptions()
so.log_verbosity_level = 1