diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc index 4d656dd4a3..e536db6472 100644 --- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc +++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc @@ -118,110 +118,173 @@ const DataTypeImpl* NumpyToOnnxRuntimeTensorType(int numpy_type) { } } +TensorShape GetArrayShape(PyArrayObject* pyObject) { + int ndim = PyArray_NDIM(pyObject); + const npy_intp* npy_dims = PyArray_DIMS(pyObject); + std::vector dims(ndim); + for (int i = 0; i < ndim; ++i) { + dims[i] = npy_dims[i]; + } + TensorShape shape(std::move(dims)); + return shape; +} + +// This is a one time use, ad-hoc allocator that allows Tensors to take ownership of +// python array objects and use the underlying memory directly and +// properly deallocated them when they are done. +// +// This addresses the case when our interfaces receive python lists on the input. +// We have to convert them into new Numpy arrays which 1) needs to be properly deallocated +// because they are not owned by the calling python code (we create it inside pybind code) +// 2) we still want to avoid yet another data copy and use it directly if possible +// 3) string data types still need to be copied +// +// This is a stateful allocator. It will always return the same pre-allocated +// buffer pointer and will own references to underlying objects. +class OrtPybindSingleUseAllocator : public IAllocator { + public: + // This constructor is used when we create numpy array from python list + OrtPybindSingleUseAllocator(PyArrayObject* pyObject, const std::string& value_name, const OrtMemoryInfo& mem_info) + : pyObject_(pyObject, DecRefFn()), + pyObjectContiguous_(PyArray_GETCONTIGUOUS(pyObject), DecRefFn()), + mem_info_(mem_info) { + ORT_ENFORCE(pyObjectContiguous_ != nullptr, "The object must be a contiguous array for input :", value_name); + } + + // Constructor to use when a contiguous array had to be copied. Instead of creating yet another copy + // we are still able to use it directly for primitive types + OrtPybindSingleUseAllocator(UniqueDecRefPtr&& pyContiguous, const std::string& value_name, const OrtMemoryInfo& mem_info) + : pyObject_(nullptr, DecRefFn()), + pyObjectContiguous_(std::move(pyContiguous)), + mem_info_(mem_info){ + ORT_ENFORCE(pyObjectContiguous_ != nullptr, "Expecting a valid contiguous array:", value_name); + } + + ORT_DISALLOW_COPY_AND_ASSIGNMENT(OrtPybindSingleUseAllocator); + + // Always return pre-allocated buffer + // which actually contains the array data + void* Alloc(size_t) override { + return static_cast(PyArray_DATA(pyObjectContiguous_.get())); + } + + void Free(void*) override { + // Free when requested, do not wait for + // destruction of the allocator which may + // be non-deterministic. However, we do not anticipate + // true shared ownership of the allocator object except + // at the creation stack. + pyObjectContiguous_.reset(); + pyObject_.reset(); + } + + const OrtMemoryInfo& Info() const override { + return mem_info_; + } + + PyArrayObject* GetContiguous() const { + return pyObjectContiguous_.get(); + } + + private: + UniqueDecRefPtr pyObject_; + UniqueDecRefPtr pyObjectContiguous_; + OrtMemoryInfo mem_info_; +}; + +using OrtPybindSingleUseAllocatorPtr = std::shared_ptr; + bool PyObjectCheck_Array(PyObject* o) { return PyObject_HasAttrString(o, "__array_finalize__"); } -std::unique_ptr CreateTensor(AllocatorPtr alloc, const std::string& name_input, PyArrayObject* pyObject) { - PyArrayObject* darray = PyArray_GETCONTIGUOUS(pyObject); - if (darray == NULL) { - throw std::runtime_error(std::string("The object must be a contiguous array for input '") + name_input + std::string("'.")); - } - bool dref = false; - std::unique_ptr p_tensor; - try { - const int npy_type = PyArray_TYPE(darray); - // numpy requires long int as its dims. - int ndim = PyArray_NDIM(darray); - npy_intp* npy_dims = PyArray_DIMS(darray); - std::vector dims(ndim); - for (int i = 0; i < ndim; ++i) { - dims[i] = npy_dims[i]; - } - - TensorShape shape(dims); - auto element_type = NumpyToOnnxRuntimeTensorType(npy_type); - if (pyObject == darray && npy_type != NPY_UNICODE && npy_type != NPY_STRING && - npy_type != NPY_VOID && npy_type != NPY_OBJECT) { - p_tensor = onnxruntime::make_unique( - element_type, shape, static_cast(PyArray_DATA(darray)), alloc->Info()); - } else { - p_tensor = onnxruntime::make_unique(element_type, shape, alloc); - if (npy_type == NPY_UNICODE) { - // Copy string data which needs to be done after Tensor is allocated. - // Strings are Python strings or numpy.unicode string. - std::string* dst = p_tensor->MutableData(); - auto item_size = PyArray_ITEMSIZE(darray); - auto num_chars = item_size / PyUnicode_4BYTE_KIND; - char* src = static_cast(PyArray_DATA(darray)); - const char* str; - Py_ssize_t size; - PyObject* pStr; - for (int i = 0; i < shape.Size(); i++, src += item_size) { - // Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8. - pStr = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, src, num_chars); - str = PyUnicode_AsUTF8AndSize(pStr, &size); - if (str == NULL) { - dst[i] = ""; - } else { - // Size is equal to the longest string size, numpy stores - // strings in a single array. Those code assumes a string ends with a final 0. - dst[i] = str; - } - Py_XDECREF(pStr); - } - } else if (npy_type == NPY_STRING || npy_type == NPY_VOID) { - // Copy string data which needs to be done after Tensor is allocated. - // Strings are given as bytes (encoded strings). - // NPY_VOID does not trim final 0. - // NPY_STRING assumes bytes string ends with a final 0. - std::string* dst = p_tensor->MutableData(); - auto item_size = PyArray_ITEMSIZE(darray); - char* src = static_cast(PyArray_DATA(darray)); - for (int i = 0; i < shape.Size(); i++, src += item_size) { - if (npy_type == NPY_STRING) { - dst[i] = src; - } else { - dst[i].resize(item_size); - memcpy((void*)dst[i].c_str(), src, item_size); - } - } - } else if (npy_type == NPY_OBJECT) { - // Converts object into string. - std::string* dst = p_tensor->MutableData(); - auto item_size = PyArray_ITEMSIZE(darray); - char* src = static_cast(PyArray_DATA(darray)); - PyObject *item, *pStr; - for (int i = 0; i < shape.Size(); ++i, src += item_size) { - // Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8. - item = PyArray_GETITEM(darray, src); - pStr = PyObject_Str(item); - dst[i] = py::reinterpret_borrow(pStr); - Py_XDECREF(pStr); - } +// Expects p_tensor properly created +// Does not manage darray life-cycle +void CopyDataToTensor(PyArrayObject* darray, int npy_type, std::unique_ptr& p_tensor) { + const auto total_items = p_tensor->Shape().Size(); + if (npy_type == NPY_UNICODE) { + // Copy string data which needs to be done after Tensor is allocated. + // Strings are Python strings or numpy.unicode string. + std::string* dst = p_tensor->MutableData(); + const auto item_size = PyArray_ITEMSIZE(darray); + const auto num_chars = item_size / PyUnicode_4BYTE_KIND; + const char* src = reinterpret_cast(PyArray_DATA(darray)); + for (int i = 0; i < total_items; i++, src += item_size) { + // Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8. + PyObject* pStr = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, src, num_chars); + UniqueDecRefPtr strGuard(pStr, DecRefFn()); + const char* str = PyUnicode_AsUTF8(pStr); + if (str == NULL) { + dst[i].clear(); } else { - void* buffer = p_tensor->MutableDataRaw(); - size_t len; - if (!IAllocator::CalcMemSizeForArray(element_type->Size(), shape.Size(), &len)) { - throw std::runtime_error("length overflow"); - } - memcpy(buffer, static_cast(PyArray_DATA(darray)), len); + // Size is equal to the longest string size, numpy stores + // strings in a single array. + dst[i] = str; } } - } catch (...) { - if (!dref) { - Py_XDECREF(darray); - dref = true; + } else if (npy_type == NPY_STRING || npy_type == NPY_VOID) { + // Copy string data which needs to be done after Tensor is allocated. + // Strings are given as bytes (encoded strings). + // NPY_VOID does not trim final 0. + // NPY_STRING assumes bytes string ends with a final 0. + std::string* dst = p_tensor->MutableData(); + const auto item_size = PyArray_ITEMSIZE(darray); + const char* src = reinterpret_cast(PyArray_DATA(darray)); + for (int i = 0; i < total_items; i++, src += item_size) { + if (npy_type == NPY_STRING) { + dst[i] = src; + } else { + dst[i].assign(src, item_size); + } } - - // allocator should be able to gc the memory created by it. - // ... - - throw; + } else if (npy_type == NPY_OBJECT) { + // Converts object into string. + std::string* dst = p_tensor->MutableData(); + const auto item_size = PyArray_ITEMSIZE(darray); + const char* src = reinterpret_cast(PyArray_DATA(darray)); + for (int i = 0; i < total_items; ++i, src += item_size) { + // Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8. + PyObject* item = PyArray_GETITEM(darray, src); + PyObject* pStr = PyObject_Str(item); + UniqueDecRefPtr strGuard(pStr, DecRefFn()); + dst[i] = py::reinterpret_borrow(pStr); + } + } else { + void* buffer = p_tensor->MutableDataRaw(); + size_t len; + if (!IAllocator::CalcMemSizeForArray(p_tensor->DataType()->Size(), p_tensor->Shape().Size(), &len)) { + throw std::runtime_error("length overflow"); + } + memcpy(buffer, PyArray_DATA(darray), len); } +} - if (!dref) { - Py_XDECREF(darray); +std::unique_ptr CreateTensor(const AllocatorPtr& alloc, const std::string& name_input, PyArrayObject* pyObject) { + PyArrayObject* darray = PyArray_GETCONTIGUOUS(pyObject); + ORT_ENFORCE(darray != nullptr, "The object must be a contiguous array for input '", name_input, "'."); + + UniqueDecRefPtr darray_guard(darray, DecRefFn()); + std::unique_ptr p_tensor; + + const int npy_type = PyArray_TYPE(darray); + TensorShape shape = GetArrayShape(darray); + auto element_type = NumpyToOnnxRuntimeTensorType(npy_type); + if (npy_type != NPY_UNICODE && npy_type != NPY_STRING && + npy_type != NPY_VOID && npy_type != NPY_OBJECT) { + if (pyObject == darray) { + // Use the memory of numpy array directly. The ownership belongs to the calling + // python code. In this case, the incoming pyObject must itself be contiguous (pyObject == darray). + // darray reference will be decremented but the original array is still alive + p_tensor = onnxruntime::make_unique(element_type, shape, PyArray_DATA(darray), alloc->Info()); + } else { + // This is the case when a contiguous array is a copy. We still can use it directly with OrtPybindSingleUseAllocator + // which takes ownership of the array. + auto pybind_alloc = std::make_shared(std::move(darray_guard), name_input, alloc->Info()); + p_tensor = onnxruntime::make_unique(element_type, shape, std::move(pybind_alloc)); + } + } else { + p_tensor = onnxruntime::make_unique(element_type, shape, alloc); + CopyDataToTensor(darray, npy_type, p_tensor); } return p_tensor; @@ -280,11 +343,34 @@ void CreateSequenceOfTensors(AllocatorPtr alloc, const std::string& name_input, ml_tensor_sequence->GetDeleteFunc()); } -void CreateTensorMLValue(AllocatorPtr alloc, const std::string& name_input, PyArrayObject* pyObject, +void CreateTensorMLValue(const AllocatorPtr& alloc, const std::string& name_input, PyArrayObject* pyObject, OrtValue* p_mlvalue) { auto p_tensor = CreateTensor(alloc, name_input, pyObject); - if (!p_tensor) { - throw std::runtime_error("Got exception while creating tensor for input: " + name_input); + + auto ml_tensor = DataTypeImpl::GetType(); + p_mlvalue->Init(p_tensor.release(), + ml_tensor, + ml_tensor->GetDeleteFunc()); +} + +// This function will create a Tensor that owns the python array memory. This is done to properly +// release python arrays allocated within the pybind code. +void CreateTensorMLValueOwned(const OrtPybindSingleUseAllocatorPtr& pybind_alloc, const AllocatorPtr& alloc, OrtValue* p_mlvalue) { + auto npy_type = PyArray_TYPE(pybind_alloc->GetContiguous()); + TensorShape shape = GetArrayShape(pybind_alloc->GetContiguous()); + auto element_type = NumpyToOnnxRuntimeTensorType(npy_type); + + std::unique_ptr p_tensor; + + if (npy_type != NPY_UNICODE && npy_type != NPY_STRING && + npy_type != NPY_VOID && npy_type != NPY_OBJECT) { + // We are able to reuse the memory of the contiguous python buffer and avoid + // extra copy using OrtPybindAllocator which will take care of the memory + p_tensor = onnxruntime::make_unique(element_type, shape, pybind_alloc); + } else { + // We still need to copy elements properly from the contiguous buffer + p_tensor = onnxruntime::make_unique(element_type, shape, alloc); + CopyDataToTensor(pybind_alloc->GetContiguous(), npy_type, p_tensor); } auto ml_tensor = DataTypeImpl::GetType(); @@ -495,7 +581,7 @@ void CreateGenericIterableMLValue(PyObject* iterator, AllocatorPtr alloc, const } } -void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, AllocatorPtr alloc, const std::string& name_input, +void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, const AllocatorPtr& alloc, const std::string& name_input, py::object& value, OrtValue* p_mlvalue) { onnx::TypeProto type_proto; if (PyObjectCheck_Array(value.ptr())) { @@ -505,15 +591,14 @@ void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, Alloc } else if (PyList_Check(value.ptr()) && !CheckIfInputIsSequenceType(name_input, input_def_list, type_proto)) { // This is not a sequence tensor. This is just a regular tensor fed through as a list. - if (!type_proto.tensor_type().has_elem_type()) { - std::runtime_error("The graph is missing type information needed to construct the ORT tensor"); - } + ORT_ENFORCE(type_proto.tensor_type().has_elem_type(), "The graph is missing type information needed to construct the ORT tensor"); MLDataType dtype = OrtTypeInfo::ElementTypeFromProto( static_cast(type_proto.tensor_type().elem_type())); int numpy_dtype = OnnxRuntimeTensorToNumpyType(dtype); + // This creates a new object with its own reference count PyArrayObject* arr = reinterpret_cast( PyArray_FromAny(value.ptr(), PyArray_DescrFromType(numpy_dtype), 0, 0, 0, nullptr)); @@ -521,7 +606,10 @@ void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, Alloc throw std::runtime_error("Could not create tensor from given input list"); } - CreateTensorMLValue(alloc, name_input, arr, p_mlvalue); + // The allocator will own the array memory and will decrement the reference on Free() + // or when destroyed + auto pybind_allloc = std::make_shared(arr, name_input, alloc->Info()); + CreateTensorMLValueOwned(pybind_allloc, alloc, p_mlvalue); } else if (PyList_Check(value.ptr())) { auto* seq_tensors = reinterpret_cast(value.ptr()); CreateSequenceOfTensors(alloc, name_input, input_def_list, seq_tensors, p_mlvalue); diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.h b/onnxruntime/python/onnxruntime_pybind_mlvalue.h index 6dbd611872..fdcaeb04b6 100644 --- a/onnxruntime/python/onnxruntime_pybind_mlvalue.h +++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.h @@ -24,7 +24,17 @@ int OnnxRuntimeTensorToNumpyType(const DataTypeImpl* tensor_type); MLDataType NumpyTypeToOnnxRuntimeType(int numpy_type); -void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, AllocatorPtr alloc, const std::string& name_input, +void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, const AllocatorPtr& alloc, const std::string& name_input, py::object& value, OrtValue* p_mlvalue); + +template +struct DecRefFn { + void operator()(T* pyobject) const { + Py_XDECREF(pyobject); + } +}; + +template +using UniqueDecRefPtr = std::unique_ptr>; } // namespace python } // namespace onnxruntime diff --git a/orttraining/orttraining/python/orttraining_pybind_state.cc b/orttraining/orttraining/python/orttraining_pybind_state.cc index f841e0487f..bb726b553a 100644 --- a/orttraining/orttraining/python/orttraining_pybind_state.cc +++ b/orttraining/orttraining/python/orttraining_pybind_state.cc @@ -22,7 +22,7 @@ using namespace onnxruntime::logging; // BEGIN: forward declaration for stuff in onnxruntime_pybind_state void InitializeSession(InferenceSession* sess, const std::vector& provider_types); void GetPyObjFromTensor(const Tensor& rtensor, py::object& obj, const DataTransferManager* data_transfer_manager = nullptr); -void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, AllocatorPtr alloc, const std::string& name_input, +void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, const AllocatorPtr& alloc, const std::string& name_input, py::object& value, OrtValue* p_mlvalue); // END: forward declaration