mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-22 22:01:08 +00:00
Address memory leak and improve memory handling (#4124)
Fix memory leak when a Python list passed as a feed. Create a custom allocator that can take ownership of python arrays that are created inside pybind. Allow direct memory use if continuous array is a copy because we now can take ownership of it by the allocator.
This commit is contained in:
parent
b8db8076cb
commit
4e1dac67cd
3 changed files with 202 additions and 104 deletions
|
|
@ -118,110 +118,173 @@ const DataTypeImpl* NumpyToOnnxRuntimeTensorType(int numpy_type) {
|
|||
}
|
||||
}
|
||||
|
||||
TensorShape GetArrayShape(PyArrayObject* pyObject) {
|
||||
int ndim = PyArray_NDIM(pyObject);
|
||||
const npy_intp* npy_dims = PyArray_DIMS(pyObject);
|
||||
std::vector<int64_t> dims(ndim);
|
||||
for (int i = 0; i < ndim; ++i) {
|
||||
dims[i] = npy_dims[i];
|
||||
}
|
||||
TensorShape shape(std::move(dims));
|
||||
return shape;
|
||||
}
|
||||
|
||||
// This is a one time use, ad-hoc allocator that allows Tensors to take ownership of
|
||||
// python array objects and use the underlying memory directly and
|
||||
// properly deallocated them when they are done.
|
||||
//
|
||||
// This addresses the case when our interfaces receive python lists on the input.
|
||||
// We have to convert them into new Numpy arrays which 1) needs to be properly deallocated
|
||||
// because they are not owned by the calling python code (we create it inside pybind code)
|
||||
// 2) we still want to avoid yet another data copy and use it directly if possible
|
||||
// 3) string data types still need to be copied
|
||||
//
|
||||
// This is a stateful allocator. It will always return the same pre-allocated
|
||||
// buffer pointer and will own references to underlying objects.
|
||||
class OrtPybindSingleUseAllocator : public IAllocator {
|
||||
public:
|
||||
// This constructor is used when we create numpy array from python list
|
||||
OrtPybindSingleUseAllocator(PyArrayObject* pyObject, const std::string& value_name, const OrtMemoryInfo& mem_info)
|
||||
: pyObject_(pyObject, DecRefFn<PyArrayObject>()),
|
||||
pyObjectContiguous_(PyArray_GETCONTIGUOUS(pyObject), DecRefFn<PyArrayObject>()),
|
||||
mem_info_(mem_info) {
|
||||
ORT_ENFORCE(pyObjectContiguous_ != nullptr, "The object must be a contiguous array for input :", value_name);
|
||||
}
|
||||
|
||||
// Constructor to use when a contiguous array had to be copied. Instead of creating yet another copy
|
||||
// we are still able to use it directly for primitive types
|
||||
OrtPybindSingleUseAllocator(UniqueDecRefPtr<PyArrayObject>&& pyContiguous, const std::string& value_name, const OrtMemoryInfo& mem_info)
|
||||
: pyObject_(nullptr, DecRefFn<PyArrayObject>()),
|
||||
pyObjectContiguous_(std::move(pyContiguous)),
|
||||
mem_info_(mem_info){
|
||||
ORT_ENFORCE(pyObjectContiguous_ != nullptr, "Expecting a valid contiguous array:", value_name);
|
||||
}
|
||||
|
||||
ORT_DISALLOW_COPY_AND_ASSIGNMENT(OrtPybindSingleUseAllocator);
|
||||
|
||||
// Always return pre-allocated buffer
|
||||
// which actually contains the array data
|
||||
void* Alloc(size_t) override {
|
||||
return static_cast<void*>(PyArray_DATA(pyObjectContiguous_.get()));
|
||||
}
|
||||
|
||||
void Free(void*) override {
|
||||
// Free when requested, do not wait for
|
||||
// destruction of the allocator which may
|
||||
// be non-deterministic. However, we do not anticipate
|
||||
// true shared ownership of the allocator object except
|
||||
// at the creation stack.
|
||||
pyObjectContiguous_.reset();
|
||||
pyObject_.reset();
|
||||
}
|
||||
|
||||
const OrtMemoryInfo& Info() const override {
|
||||
return mem_info_;
|
||||
}
|
||||
|
||||
PyArrayObject* GetContiguous() const {
|
||||
return pyObjectContiguous_.get();
|
||||
}
|
||||
|
||||
private:
|
||||
UniqueDecRefPtr<PyArrayObject> pyObject_;
|
||||
UniqueDecRefPtr<PyArrayObject> pyObjectContiguous_;
|
||||
OrtMemoryInfo mem_info_;
|
||||
};
|
||||
|
||||
using OrtPybindSingleUseAllocatorPtr = std::shared_ptr<OrtPybindSingleUseAllocator>;
|
||||
|
||||
bool PyObjectCheck_Array(PyObject* o) {
|
||||
return PyObject_HasAttrString(o, "__array_finalize__");
|
||||
}
|
||||
|
||||
std::unique_ptr<Tensor> CreateTensor(AllocatorPtr alloc, const std::string& name_input, PyArrayObject* pyObject) {
|
||||
PyArrayObject* darray = PyArray_GETCONTIGUOUS(pyObject);
|
||||
if (darray == NULL) {
|
||||
throw std::runtime_error(std::string("The object must be a contiguous array for input '") + name_input + std::string("'."));
|
||||
}
|
||||
bool dref = false;
|
||||
std::unique_ptr<Tensor> p_tensor;
|
||||
try {
|
||||
const int npy_type = PyArray_TYPE(darray);
|
||||
// numpy requires long int as its dims.
|
||||
int ndim = PyArray_NDIM(darray);
|
||||
npy_intp* npy_dims = PyArray_DIMS(darray);
|
||||
std::vector<int64_t> dims(ndim);
|
||||
for (int i = 0; i < ndim; ++i) {
|
||||
dims[i] = npy_dims[i];
|
||||
}
|
||||
|
||||
TensorShape shape(dims);
|
||||
auto element_type = NumpyToOnnxRuntimeTensorType(npy_type);
|
||||
if (pyObject == darray && npy_type != NPY_UNICODE && npy_type != NPY_STRING &&
|
||||
npy_type != NPY_VOID && npy_type != NPY_OBJECT) {
|
||||
p_tensor = onnxruntime::make_unique<Tensor>(
|
||||
element_type, shape, static_cast<void*>(PyArray_DATA(darray)), alloc->Info());
|
||||
} else {
|
||||
p_tensor = onnxruntime::make_unique<Tensor>(element_type, shape, alloc);
|
||||
if (npy_type == NPY_UNICODE) {
|
||||
// Copy string data which needs to be done after Tensor is allocated.
|
||||
// Strings are Python strings or numpy.unicode string.
|
||||
std::string* dst = p_tensor->MutableData<std::string>();
|
||||
auto item_size = PyArray_ITEMSIZE(darray);
|
||||
auto num_chars = item_size / PyUnicode_4BYTE_KIND;
|
||||
char* src = static_cast<char*>(PyArray_DATA(darray));
|
||||
const char* str;
|
||||
Py_ssize_t size;
|
||||
PyObject* pStr;
|
||||
for (int i = 0; i < shape.Size(); i++, src += item_size) {
|
||||
// Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8.
|
||||
pStr = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, src, num_chars);
|
||||
str = PyUnicode_AsUTF8AndSize(pStr, &size);
|
||||
if (str == NULL) {
|
||||
dst[i] = "";
|
||||
} else {
|
||||
// Size is equal to the longest string size, numpy stores
|
||||
// strings in a single array. Those code assumes a string ends with a final 0.
|
||||
dst[i] = str;
|
||||
}
|
||||
Py_XDECREF(pStr);
|
||||
}
|
||||
} else if (npy_type == NPY_STRING || npy_type == NPY_VOID) {
|
||||
// Copy string data which needs to be done after Tensor is allocated.
|
||||
// Strings are given as bytes (encoded strings).
|
||||
// NPY_VOID does not trim final 0.
|
||||
// NPY_STRING assumes bytes string ends with a final 0.
|
||||
std::string* dst = p_tensor->MutableData<std::string>();
|
||||
auto item_size = PyArray_ITEMSIZE(darray);
|
||||
char* src = static_cast<char*>(PyArray_DATA(darray));
|
||||
for (int i = 0; i < shape.Size(); i++, src += item_size) {
|
||||
if (npy_type == NPY_STRING) {
|
||||
dst[i] = src;
|
||||
} else {
|
||||
dst[i].resize(item_size);
|
||||
memcpy((void*)dst[i].c_str(), src, item_size);
|
||||
}
|
||||
}
|
||||
} else if (npy_type == NPY_OBJECT) {
|
||||
// Converts object into string.
|
||||
std::string* dst = p_tensor->MutableData<std::string>();
|
||||
auto item_size = PyArray_ITEMSIZE(darray);
|
||||
char* src = static_cast<char*>(PyArray_DATA(darray));
|
||||
PyObject *item, *pStr;
|
||||
for (int i = 0; i < shape.Size(); ++i, src += item_size) {
|
||||
// Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8.
|
||||
item = PyArray_GETITEM(darray, src);
|
||||
pStr = PyObject_Str(item);
|
||||
dst[i] = py::reinterpret_borrow<py::str>(pStr);
|
||||
Py_XDECREF(pStr);
|
||||
}
|
||||
// Expects p_tensor properly created
|
||||
// Does not manage darray life-cycle
|
||||
void CopyDataToTensor(PyArrayObject* darray, int npy_type, std::unique_ptr<Tensor>& p_tensor) {
|
||||
const auto total_items = p_tensor->Shape().Size();
|
||||
if (npy_type == NPY_UNICODE) {
|
||||
// Copy string data which needs to be done after Tensor is allocated.
|
||||
// Strings are Python strings or numpy.unicode string.
|
||||
std::string* dst = p_tensor->MutableData<std::string>();
|
||||
const auto item_size = PyArray_ITEMSIZE(darray);
|
||||
const auto num_chars = item_size / PyUnicode_4BYTE_KIND;
|
||||
const char* src = reinterpret_cast<const char*>(PyArray_DATA(darray));
|
||||
for (int i = 0; i < total_items; i++, src += item_size) {
|
||||
// Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8.
|
||||
PyObject* pStr = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, src, num_chars);
|
||||
UniqueDecRefPtr<PyObject> strGuard(pStr, DecRefFn<PyObject>());
|
||||
const char* str = PyUnicode_AsUTF8(pStr);
|
||||
if (str == NULL) {
|
||||
dst[i].clear();
|
||||
} else {
|
||||
void* buffer = p_tensor->MutableDataRaw();
|
||||
size_t len;
|
||||
if (!IAllocator::CalcMemSizeForArray(element_type->Size(), shape.Size(), &len)) {
|
||||
throw std::runtime_error("length overflow");
|
||||
}
|
||||
memcpy(buffer, static_cast<void*>(PyArray_DATA(darray)), len);
|
||||
// Size is equal to the longest string size, numpy stores
|
||||
// strings in a single array.
|
||||
dst[i] = str;
|
||||
}
|
||||
}
|
||||
} catch (...) {
|
||||
if (!dref) {
|
||||
Py_XDECREF(darray);
|
||||
dref = true;
|
||||
} else if (npy_type == NPY_STRING || npy_type == NPY_VOID) {
|
||||
// Copy string data which needs to be done after Tensor is allocated.
|
||||
// Strings are given as bytes (encoded strings).
|
||||
// NPY_VOID does not trim final 0.
|
||||
// NPY_STRING assumes bytes string ends with a final 0.
|
||||
std::string* dst = p_tensor->MutableData<std::string>();
|
||||
const auto item_size = PyArray_ITEMSIZE(darray);
|
||||
const char* src = reinterpret_cast<const char*>(PyArray_DATA(darray));
|
||||
for (int i = 0; i < total_items; i++, src += item_size) {
|
||||
if (npy_type == NPY_STRING) {
|
||||
dst[i] = src;
|
||||
} else {
|
||||
dst[i].assign(src, item_size);
|
||||
}
|
||||
}
|
||||
|
||||
// allocator should be able to gc the memory created by it.
|
||||
// ...
|
||||
|
||||
throw;
|
||||
} else if (npy_type == NPY_OBJECT) {
|
||||
// Converts object into string.
|
||||
std::string* dst = p_tensor->MutableData<std::string>();
|
||||
const auto item_size = PyArray_ITEMSIZE(darray);
|
||||
const char* src = reinterpret_cast<const char*>(PyArray_DATA(darray));
|
||||
for (int i = 0; i < total_items; ++i, src += item_size) {
|
||||
// Python unicode strings are assumed to be USC-4. Strings are stored as UTF-8.
|
||||
PyObject* item = PyArray_GETITEM(darray, src);
|
||||
PyObject* pStr = PyObject_Str(item);
|
||||
UniqueDecRefPtr<PyObject> strGuard(pStr, DecRefFn<PyObject>());
|
||||
dst[i] = py::reinterpret_borrow<py::str>(pStr);
|
||||
}
|
||||
} else {
|
||||
void* buffer = p_tensor->MutableDataRaw();
|
||||
size_t len;
|
||||
if (!IAllocator::CalcMemSizeForArray(p_tensor->DataType()->Size(), p_tensor->Shape().Size(), &len)) {
|
||||
throw std::runtime_error("length overflow");
|
||||
}
|
||||
memcpy(buffer, PyArray_DATA(darray), len);
|
||||
}
|
||||
}
|
||||
|
||||
if (!dref) {
|
||||
Py_XDECREF(darray);
|
||||
std::unique_ptr<Tensor> CreateTensor(const AllocatorPtr& alloc, const std::string& name_input, PyArrayObject* pyObject) {
|
||||
PyArrayObject* darray = PyArray_GETCONTIGUOUS(pyObject);
|
||||
ORT_ENFORCE(darray != nullptr, "The object must be a contiguous array for input '", name_input, "'.");
|
||||
|
||||
UniqueDecRefPtr<PyArrayObject> darray_guard(darray, DecRefFn<PyArrayObject>());
|
||||
std::unique_ptr<Tensor> p_tensor;
|
||||
|
||||
const int npy_type = PyArray_TYPE(darray);
|
||||
TensorShape shape = GetArrayShape(darray);
|
||||
auto element_type = NumpyToOnnxRuntimeTensorType(npy_type);
|
||||
if (npy_type != NPY_UNICODE && npy_type != NPY_STRING &&
|
||||
npy_type != NPY_VOID && npy_type != NPY_OBJECT) {
|
||||
if (pyObject == darray) {
|
||||
// Use the memory of numpy array directly. The ownership belongs to the calling
|
||||
// python code. In this case, the incoming pyObject must itself be contiguous (pyObject == darray).
|
||||
// darray reference will be decremented but the original array is still alive
|
||||
p_tensor = onnxruntime::make_unique<Tensor>(element_type, shape, PyArray_DATA(darray), alloc->Info());
|
||||
} else {
|
||||
// This is the case when a contiguous array is a copy. We still can use it directly with OrtPybindSingleUseAllocator
|
||||
// which takes ownership of the array.
|
||||
auto pybind_alloc = std::make_shared<OrtPybindSingleUseAllocator>(std::move(darray_guard), name_input, alloc->Info());
|
||||
p_tensor = onnxruntime::make_unique<Tensor>(element_type, shape, std::move(pybind_alloc));
|
||||
}
|
||||
} else {
|
||||
p_tensor = onnxruntime::make_unique<Tensor>(element_type, shape, alloc);
|
||||
CopyDataToTensor(darray, npy_type, p_tensor);
|
||||
}
|
||||
|
||||
return p_tensor;
|
||||
|
|
@ -280,11 +343,34 @@ void CreateSequenceOfTensors(AllocatorPtr alloc, const std::string& name_input,
|
|||
ml_tensor_sequence->GetDeleteFunc());
|
||||
}
|
||||
|
||||
void CreateTensorMLValue(AllocatorPtr alloc, const std::string& name_input, PyArrayObject* pyObject,
|
||||
void CreateTensorMLValue(const AllocatorPtr& alloc, const std::string& name_input, PyArrayObject* pyObject,
|
||||
OrtValue* p_mlvalue) {
|
||||
auto p_tensor = CreateTensor(alloc, name_input, pyObject);
|
||||
if (!p_tensor) {
|
||||
throw std::runtime_error("Got exception while creating tensor for input: " + name_input);
|
||||
|
||||
auto ml_tensor = DataTypeImpl::GetType<Tensor>();
|
||||
p_mlvalue->Init(p_tensor.release(),
|
||||
ml_tensor,
|
||||
ml_tensor->GetDeleteFunc());
|
||||
}
|
||||
|
||||
// This function will create a Tensor that owns the python array memory. This is done to properly
|
||||
// release python arrays allocated within the pybind code.
|
||||
void CreateTensorMLValueOwned(const OrtPybindSingleUseAllocatorPtr& pybind_alloc, const AllocatorPtr& alloc, OrtValue* p_mlvalue) {
|
||||
auto npy_type = PyArray_TYPE(pybind_alloc->GetContiguous());
|
||||
TensorShape shape = GetArrayShape(pybind_alloc->GetContiguous());
|
||||
auto element_type = NumpyToOnnxRuntimeTensorType(npy_type);
|
||||
|
||||
std::unique_ptr<Tensor> p_tensor;
|
||||
|
||||
if (npy_type != NPY_UNICODE && npy_type != NPY_STRING &&
|
||||
npy_type != NPY_VOID && npy_type != NPY_OBJECT) {
|
||||
// We are able to reuse the memory of the contiguous python buffer and avoid
|
||||
// extra copy using OrtPybindAllocator which will take care of the memory
|
||||
p_tensor = onnxruntime::make_unique<Tensor>(element_type, shape, pybind_alloc);
|
||||
} else {
|
||||
// We still need to copy elements properly from the contiguous buffer
|
||||
p_tensor = onnxruntime::make_unique<Tensor>(element_type, shape, alloc);
|
||||
CopyDataToTensor(pybind_alloc->GetContiguous(), npy_type, p_tensor);
|
||||
}
|
||||
|
||||
auto ml_tensor = DataTypeImpl::GetType<Tensor>();
|
||||
|
|
@ -495,7 +581,7 @@ void CreateGenericIterableMLValue(PyObject* iterator, AllocatorPtr alloc, const
|
|||
}
|
||||
}
|
||||
|
||||
void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, AllocatorPtr alloc, const std::string& name_input,
|
||||
void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, const AllocatorPtr& alloc, const std::string& name_input,
|
||||
py::object& value, OrtValue* p_mlvalue) {
|
||||
onnx::TypeProto type_proto;
|
||||
if (PyObjectCheck_Array(value.ptr())) {
|
||||
|
|
@ -505,15 +591,14 @@ void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, Alloc
|
|||
} else if (PyList_Check(value.ptr()) &&
|
||||
!CheckIfInputIsSequenceType(name_input, input_def_list, type_proto)) {
|
||||
// This is not a sequence tensor. This is just a regular tensor fed through as a list.
|
||||
if (!type_proto.tensor_type().has_elem_type()) {
|
||||
std::runtime_error("The graph is missing type information needed to construct the ORT tensor");
|
||||
}
|
||||
ORT_ENFORCE(type_proto.tensor_type().has_elem_type(), "The graph is missing type information needed to construct the ORT tensor");
|
||||
|
||||
MLDataType dtype = OrtTypeInfo::ElementTypeFromProto(
|
||||
static_cast<ONNX_NAMESPACE::TensorProto_DataType>(type_proto.tensor_type().elem_type()));
|
||||
|
||||
int numpy_dtype = OnnxRuntimeTensorToNumpyType(dtype);
|
||||
|
||||
// This creates a new object with its own reference count
|
||||
PyArrayObject* arr = reinterpret_cast<PyArrayObject*>(
|
||||
PyArray_FromAny(value.ptr(), PyArray_DescrFromType(numpy_dtype), 0, 0, 0, nullptr));
|
||||
|
||||
|
|
@ -521,7 +606,10 @@ void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, Alloc
|
|||
throw std::runtime_error("Could not create tensor from given input list");
|
||||
}
|
||||
|
||||
CreateTensorMLValue(alloc, name_input, arr, p_mlvalue);
|
||||
// The allocator will own the array memory and will decrement the reference on Free()
|
||||
// or when destroyed
|
||||
auto pybind_allloc = std::make_shared<OrtPybindSingleUseAllocator>(arr, name_input, alloc->Info());
|
||||
CreateTensorMLValueOwned(pybind_allloc, alloc, p_mlvalue);
|
||||
} else if (PyList_Check(value.ptr())) {
|
||||
auto* seq_tensors = reinterpret_cast<PyObject*>(value.ptr());
|
||||
CreateSequenceOfTensors(alloc, name_input, input_def_list, seq_tensors, p_mlvalue);
|
||||
|
|
|
|||
|
|
@ -24,7 +24,17 @@ int OnnxRuntimeTensorToNumpyType(const DataTypeImpl* tensor_type);
|
|||
|
||||
MLDataType NumpyTypeToOnnxRuntimeType(int numpy_type);
|
||||
|
||||
void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, AllocatorPtr alloc, const std::string& name_input,
|
||||
void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, const AllocatorPtr& alloc, const std::string& name_input,
|
||||
py::object& value, OrtValue* p_mlvalue);
|
||||
|
||||
template <class T>
|
||||
struct DecRefFn {
|
||||
void operator()(T* pyobject) const {
|
||||
Py_XDECREF(pyobject);
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
using UniqueDecRefPtr = std::unique_ptr<T, DecRefFn<T>>;
|
||||
} // namespace python
|
||||
} // namespace onnxruntime
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ using namespace onnxruntime::logging;
|
|||
// BEGIN: forward declaration for stuff in onnxruntime_pybind_state
|
||||
void InitializeSession(InferenceSession* sess, const std::vector<std::string>& provider_types);
|
||||
void GetPyObjFromTensor(const Tensor& rtensor, py::object& obj, const DataTransferManager* data_transfer_manager = nullptr);
|
||||
void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, AllocatorPtr alloc, const std::string& name_input,
|
||||
void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, const AllocatorPtr& alloc, const std::string& name_input,
|
||||
py::object& value, OrtValue* p_mlvalue);
|
||||
// END: forward declaration
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue