Implement numpy array over CPU OrtValues on return values (#20539)

### Description
Create numpy arrays based on the native buffers of returned OrtValues.
Hold on to the OrtValue until the numpy array is garbage collected.

### Motivation and Context
This saves cpu on tensor copies and addresses customer concerns.
This commit is contained in:
Dmitri Smirnov 2024-05-08 10:56:36 -07:00 committed by GitHub
parent 156d52163d
commit 08ecf30e0b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 174 additions and 117 deletions

View file

@ -646,7 +646,7 @@ class IOBinding:
return self._iobinding.get_outputs()
def copy_outputs_to_cpu(self):
"""Copy output contents to CPU (if on another device). No-op if already on the CPU."""
"""Copy output contents to CPU."""
return self._iobinding.copy_outputs_to_cpu()
def clear_binding_inputs(self):

View file

@ -161,23 +161,27 @@ void addIoBindingMethods(pybind11::module& m) {
return io_binding->Get()->GetOutputs();
},
py::return_value_policy::reference_internal)
.def("copy_outputs_to_cpu", [](const SessionIOBinding* io_binding) -> std::vector<py::object> {
.def("copy_outputs_to_cpu", [](const SessionIOBinding* io_binding) -> py::list {
const std::vector<OrtValue>& outputs = io_binding->Get()->GetOutputs();
std::vector<py::object> rfetch;
rfetch.reserve(outputs.size());
size_t pos = 0;
const auto& dtm = io_binding->GetInferenceSession()->GetDataTransferManager();
py::list result;
for (const auto& ort_value : outputs) {
if (ort_value.IsTensor()) {
rfetch.push_back(AddTensorAsPyObj(ort_value, &dtm, nullptr));
// We make a copy of the tensor to CPU even if it is already on CPU
// as the function name implies using DataTransferManager.
py::array arr = PrimitiveTensorToNumpyFromDevice(ort_value, &dtm);
result.append(py::cast<py::object>(arr));
} else if (ort_value.IsSparseTensor()) {
rfetch.push_back(GetPyObjectFromSparseTensor(pos, ort_value, &dtm));
result.append(GetPyObjectFromSparseTensor(pos, ort_value, &dtm));
} else {
rfetch.push_back(AddNonTensorAsPyObj(ort_value, &dtm, nullptr));
result.append(AddNonTensorAsPyObj(ort_value, &dtm, nullptr));
}
++pos;
}
return rfetch;
return result;
});
}

View file

@ -16,6 +16,8 @@
#include "core/framework/ort_value.h"
#include "core/session/inference_session.h"
#include <variant>
PYBIND11_MAKE_OPAQUE(std::vector<OrtValue>);
namespace onnxruntime {
@ -40,6 +42,8 @@ MLDataType NumpyTypeToOnnxRuntimeTensorType(int numpy_type);
using MemCpyFunc = void (*)(void*, const void*, size_t);
using DataTransferAlternative = std::variant<const DataTransferManager*, MemCpyFunc>;
void CpuToCpuMemCpy(void*, const void*, size_t);
void CopyDataToTensor(const pybind11::array& py_array, int npy_type, Tensor& tensor, MemCpyFunc mem_cpy_to_device = CpuToCpuMemCpy);
@ -117,9 +121,42 @@ void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, const
const std::string& name_input, const pybind11::object& value, OrtValue* p_mlvalue,
bool accept_only_numpy_array = false, bool use_numpy_data_memory = true, MemCpyFunc mem_cpy_to_device = CpuToCpuMemCpy);
void GetPyObjFromTensor(const Tensor& rtensor, pybind11::object& obj,
const DataTransferManager* data_transfer_manager = nullptr,
const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* mem_cpy_to_host_functions = nullptr);
pybind11::object GetPyObjFromTensor(const OrtValue& rtensor,
const DataTransferManager* data_transfer_manager = nullptr,
const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* mem_cpy_to_host_functions = nullptr);
// The below two functions are used to convert OrtValue to numpy arrays
/// <summary>
/// This function operates on string tensors. Strings are always
/// copied to python and converted to UTF-16/UCS-4/32 depending on the platform.
/// This is accomplished using py::cast()
///
/// It is an error to pass a non-tensor or a non-string tensor to this function.
/// </summary>
/// <param name="tensor">Tensor that contains strings</param>
/// <returns>py::array object</returns>
pybind11::array StringTensorToNumpyArray(const Tensor& tensor);
/// <summary>
/// Creates a numpy array with shape over OrtValue memory. Numpy array
/// does not own the memory, but it holds a copy or OrtValue in a py::capsule.
/// OrtValue is destroyed when the numpy array is garbage collected.
/// This is used when the OrtValue memory is on CPU.
/// </summary>
/// <param name="ort_value">OrtValue with data</param>
/// <returns>numpy array</returns>
pybind11::array PrimitiveTensorToNumpyOverOrtValue(const OrtValue& ort_value);
/// <summary>
/// Creates a numpy array with shape with a copy of OrtValue data.
/// This function is used when the OrtValue memory is not on CPU.
/// </summary>
/// <param name="ort_value">Source memory that is not on CPU.</param>
/// <param name="data_transfer">a variant encapsulating alternatives for copying data</param>
/// <returns></returns>
pybind11::array PrimitiveTensorToNumpyFromDevice(const OrtValue& ort_value,
const DataTransferAlternative& data_transfer);
template <class T>
struct DecRefFn {

View file

@ -233,20 +233,20 @@ void addOrtValueMethods(pybind11::module& m) {
#endif
})
.def("shape", [](const OrtValue* ort_value) -> py::list {
py::list shape_arr;
#if !defined(DISABLE_SPARSE_TENSORS)
// OrtValue can only be a Tensor/SparseTensor, make this generic to handle non-Tensors
ORT_ENFORCE(ort_value->IsTensor() || ort_value->IsSparseTensor(),
"Only OrtValues that are Tensors/SpareTensors are currently supported");
const auto& dims = (ort_value->IsTensor())
? ort_value->Get<Tensor>().Shape().GetDims()
: ort_value->Get<SparseTensor>().DenseShape().GetDims();
const auto dims = (ort_value->IsTensor())
? ort_value->Get<Tensor>().Shape().GetDims()
: ort_value->Get<SparseTensor>().DenseShape().GetDims();
#else
ORT_ENFORCE(ort_value->IsTensor(), "Only OrtValues that are Tensors are supported in this build");
const auto& dims = ort_value->Get<Tensor>().Shape().GetDims();
const auto dims = ort_value->Get<Tensor>().Shape().GetDims();
#endif
py::list shape_arr;
for (auto dim : dims) {
// For sequence tensors - we would append a list of dims to the outermost list
// For now only tensors are supported in OrtValue
@ -302,18 +302,16 @@ void addOrtValueMethods(pybind11::module& m) {
.def("numpy", [](const OrtValue* ml_value) -> py::object {
ORT_ENFORCE(ml_value->IsTensor(), "Only OrtValues that are Tensors are convertible to Numpy objects");
py::object obj;
#ifdef USE_CUDA
GetPyObjFromTensor(ml_value->Get<Tensor>(), obj, nullptr, GetCudaToHostMemCpyFunction());
py::object obj = GetPyObjFromTensor(*ml_value, nullptr, GetCudaToHostMemCpyFunction());
#elif USE_ROCM
GetPyObjFromTensor(ml_value->Get<Tensor>(), obj, nullptr, GetRocmToHostMemCpyFunction());
py::object obj = GetPyObjFromTensor(*ml_value, nullptr, GetRocmToHostMemCpyFunction());
#elif USE_CANN
GetPyObjFromTensor(ml_value->Get<Tensor>(), obj, nullptr, GetCannToHostMemCpyFunction());
py::object obj = GetPyObjFromTensor(*ml_value, nullptr, GetCannToHostMemCpyFunction());
#elif USE_DML
GetPyObjFromTensor(ml_value->Get<Tensor>(), obj, nullptr, GetDmlToHostMemCpyFunction());
py::object obj = GetPyObjFromTensor(*ml_value, nullptr, GetDmlToHostMemCpyFunction());
#else
GetPyObjFromTensor(ml_value->Get<Tensor>(), obj, nullptr, nullptr);
py::object obj = GetPyObjFromTensor(*ml_value, nullptr, nullptr);
#endif
return obj;
})

View file

@ -305,18 +305,7 @@ void addSparseTensorMethods(pybind11::module& m) {
if (sparse_tensor.IsDataTypeString()) {
// Strings can not be on GPU and require conversion UTF-8 to Python UNICODE
// We need to create a copy.
const int numpy_type = OnnxRuntimeTensorToNumpyType(DataTypeImpl::GetType<std::string>());
ORT_ENFORCE(NPY_OBJECT == numpy_type, "We are expecting to map strings to NPY_OBJECT type");
const auto& values_shape = sparse_tensor.Values().Shape();
py::dtype dtype("object");
py::array result(dtype, values_shape.GetDims(), {});
auto* out_ptr = static_cast<py::object*>(
PyArray_DATA(reinterpret_cast<PyArrayObject*>(result.ptr())));
const std::string* src = sparse_tensor.Values().Data<std::string>();
for (int64_t i = 0, size = values_shape.Size(); i < size; ++i, src++) {
out_ptr[i] = py::cast(*src);
}
return result;
return StringTensorToNumpyArray(sparse_tensor.Values());
} else {
utils::MLTypeCallDispatcher<float, double, int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t>
t_disp(sparse_tensor.GetElementType());
@ -386,7 +375,7 @@ void addSparseTensorMethods(pybind11::module& m) {
})
.def("dense_shape", [](const PySparseTensor* py_tensor) -> py::list {
const SparseTensor& st = py_tensor->Instance();
const auto& dims = st.DenseShape().GetDims();
const auto dims = st.DenseShape().GetDims();
// We create a copy of dimensions, it is small
py::list py_dims;
for (auto d : dims) {

View file

@ -166,66 +166,96 @@ static py::object AddNonTensor(const OrtValue& val,
return py::cast(val.Get<T>());
}
// This function is used to return strings from a string tensor to python
// as a numpy array of strings
// Strings are always on CPU and must always be copied to python memory
py::array StringTensorToNumpyArray(const Tensor& tensor) {
// Create the result and allocate memory with the right size
py::array result(py::dtype(NPY_OBJECT), tensor.Shape().GetDims());
const auto span = tensor.DataAsSpan<std::string>();
auto* mutable_data = reinterpret_cast<py::object*>(result.mutable_data());
for (size_t i = 0, lim = span.size(); i < lim; ++i) {
mutable_data[i] = py::cast(span[i]);
}
return result;
}
pybind11::array PrimitiveTensorToNumpyOverOrtValue(const OrtValue& ort_value) {
const Tensor& tensor = ort_value.Get<Tensor>();
// The capsule destructor must be stateless
// We create a copy of OrtValue on the heap.
auto memory_release = [](void* data) {
auto* ort_value = reinterpret_cast<OrtValue*>(data);
delete ort_value;
};
const int numpy_type = OnnxRuntimeTensorToNumpyType(tensor.DataType());
auto ort_value_ptr = std::make_unique<OrtValue>(ort_value);
pybind11::capsule caps(ort_value_ptr.get(), memory_release);
ort_value_ptr.release();
// Not using array_t<T> because it may not handle MLFloat16 properly
pybind11::array result(py::dtype(numpy_type), tensor.Shape().GetDims(),
tensor.DataRaw(),
caps);
return result;
}
pybind11::array PrimitiveTensorToNumpyFromDevice(const OrtValue& ort_value, const DataTransferAlternative& dtm) {
const Tensor& tensor = ort_value.Get<Tensor>();
const int numpy_type = OnnxRuntimeTensorToNumpyType(tensor.DataType());
pybind11::array result(py::dtype(numpy_type), tensor.Shape().GetDims());
void* data = result.mutable_data();
if (std::holds_alternative<const DataTransferManager*>(dtm)) {
const DataTransferManager* data_transfer = std::get<const DataTransferManager*>(dtm);
static const OrtMemoryInfo cpu_alloc_info{onnxruntime::CPU, OrtDeviceAllocator};
const auto span = gsl::make_span<char>(reinterpret_cast<char*>(data), tensor.SizeInBytes());
ORT_THROW_IF_ERROR(CopyTensorDataToByteSpan(*data_transfer, tensor, cpu_alloc_info, span));
} else {
std::get<MemCpyFunc>(dtm)(data, tensor.DataRaw(), tensor.SizeInBytes());
}
return result;
}
// In all cases, we may not have access to a DataTransferManager, hence the user may specify functions that
// pretty much does what a DataTransferManager does - copy data from device(s) to the host
void GetPyObjFromTensor(const Tensor& rtensor, py::object& obj,
const DataTransferManager* data_transfer_manager,
const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* mem_cpy_to_host_functions) {
std::vector<npy_intp> npy_dims;
const TensorShape& shape = rtensor.Shape();
py::object GetPyObjFromTensor(const OrtValue& ort_value,
const DataTransferManager* data_transfer_manager,
const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* mem_cpy_to_host_functions) {
ORT_ENFORCE(ort_value.IsTensor(), "This function only supports tensors");
for (size_t n = 0; n < shape.NumDimensions(); ++n) {
npy_dims.push_back(shape[n]);
const auto& tensor = ort_value.Get<Tensor>();
if (tensor.IsDataTypeString()) {
ORT_ENFORCE(tensor.Location().device.Type() == OrtDevice::CPU, "Strings can only be on CPU");
// Create a numpy array of strings (python objects) by copy/converting them
py::array result = StringTensorToNumpyArray(tensor);
return py::cast<py::object>(result);
}
MLDataType dtype = rtensor.DataType();
const int numpy_type = OnnxRuntimeTensorToNumpyType(dtype);
obj = py::reinterpret_steal<py::object>(PyArray_SimpleNew(
narrow<int>(shape.NumDimensions()), npy_dims.data(), numpy_type));
const auto device_type = tensor.Location().device.Type();
// Create an numpy array on top of the OrtValue memory, no copy
if (device_type == OrtDevice::CPU) {
py::array result = PrimitiveTensorToNumpyOverOrtValue(ort_value);
return py::cast<py::object>(result);
}
void* out_ptr = static_cast<void*>(
PyArray_DATA(reinterpret_cast<PyArrayObject*>(obj.ptr())));
if (!data_transfer_manager && !mem_cpy_to_host_functions) {
throw std::runtime_error(
"GetPyObjFromTensor: Either data transfer manager or a "
"function to copy data to the host is needed to convert non-CPU tensor to numpy array");
}
if (numpy_type != NPY_OBJECT) {
// if it is not cpu tensor, need to copy to host
auto device_type = rtensor.Location().device.Type();
if (device_type != OrtDevice::CPU) {
if (!data_transfer_manager && !mem_cpy_to_host_functions)
throw std::runtime_error(
"GetPyObjFromTensor: Either data transfer manager or a "
"function to copy data to the host is needed to convert non-CPU tensor to numpy array");
static const OrtMemoryInfo cpu_alloc_info{onnxruntime::CPU, OrtDeviceAllocator};
// Prefer DataTransferManager if available
if (data_transfer_manager) {
auto span = gsl::make_span<char>(reinterpret_cast<char*>(out_ptr), dtype->Size() * shape.Size());
ORT_THROW_IF_ERROR(CopyTensorDataToByteSpan(
*data_transfer_manager, rtensor, cpu_alloc_info, span));
} else {
auto mem_cpy_to_host = mem_cpy_to_host_functions->find(device_type);
ORT_ENFORCE(mem_cpy_to_host != mem_cpy_to_host_functions->end(),
"Unable to locate a function that can copy data to the host from the device");
ORT_ENFORCE(mem_cpy_to_host->second != 0,
"No function that can copy data to the host from the device provided");
mem_cpy_to_host->second(out_ptr, rtensor.DataRaw(), dtype->Size() * shape.Size());
}
} else
memcpy(out_ptr, rtensor.DataRaw(dtype), dtype->Size() * shape.Size());
py::array result;
if (data_transfer_manager != nullptr) {
result = PrimitiveTensorToNumpyFromDevice(ort_value, data_transfer_manager);
} else {
// Handle string type.
// Copying strings to cpu from device is currently not supported
ORT_ENFORCE(rtensor.Location().device.Type() == OrtDevice::CPU,
"Copying string tensors located on another device to the host is currently not supported");
py::object* outObj = static_cast<py::object*>(out_ptr);
const std::string* src = rtensor.Data<std::string>();
for (int i = 0; i < rtensor.Shape().Size(); i++, src++) {
outObj[i] = py::cast(*src);
}
auto mem_cpy_to_host = mem_cpy_to_host_functions->find(device_type);
ORT_ENFORCE(mem_cpy_to_host != mem_cpy_to_host_functions->end(),
"Unable to locate a function that can copy data to the host from the device");
result = PrimitiveTensorToNumpyFromDevice(ort_value, mem_cpy_to_host->second);
}
return py::cast<py::object>(result);
}
const char* GetDeviceName(const OrtDevice& device) {
@ -292,10 +322,9 @@ py::object AddNonTensor<TensorSeq>(const OrtValue& val,
const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* mem_cpy_to_host_functions) {
const auto& seq_tensors = val.Get<TensorSeq>();
py::list py_list;
for (const auto& rtensor : seq_tensors) {
py::object obj;
GetPyObjFromTensor(rtensor.Get<Tensor>(), obj, data_transfer_manager, mem_cpy_to_host_functions);
py_list.append(obj);
for (const auto& ort_value : seq_tensors) {
py::object obj = GetPyObjFromTensor(ort_value, data_transfer_manager, mem_cpy_to_host_functions);
py_list.append(std::move(obj));
}
// XToolChain kills the build
// local variable 'py_list' will be copied despite being returned by name [-Werror,-Wreturn-std-move]
@ -347,10 +376,7 @@ py::object AddNonTensorAsPyObj(const OrtValue& val,
py::object AddTensorAsPyObj(const OrtValue& val, const DataTransferManager* data_transfer_manager,
const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* mem_cpy_to_host_functions) {
const Tensor& rtensor = val.Get<Tensor>();
py::object obj;
GetPyObjFromTensor(rtensor, obj, data_transfer_manager, mem_cpy_to_host_functions);
return obj;
return GetPyObjFromTensor(val, data_transfer_manager, mem_cpy_to_host_functions);
}
static std::unique_ptr<onnxruntime::IExecutionProvider> LoadExecutionProvider(
@ -1863,11 +1889,12 @@ including arg name, arg type (contains both type and shape).)pbdoc")
},
R"pbdoc(Load a model saved in ONNX or ORT format.)pbdoc")
.def("run",
[](PyInferenceSession* sess, std::vector<std::string> output_names,
std::map<std::string, py::object> pyfeeds, RunOptions* run_options = nullptr)
-> std::vector<py::object> {
[](PyInferenceSession* sess, const std::vector<std::string>& output_names,
const std::map<std::string, const py::object>& pyfeeds, RunOptions* run_options = nullptr)
-> py::list {
NameMLValMap feeds;
for (auto feed : pyfeeds) {
feeds.reserve(pyfeeds.size());
for (const auto& feed : pyfeeds) {
// No need to process 'None's sent in by the user
// to feed Optional inputs in the graph.
// We just won't include anything in the feed and ORT
@ -1885,6 +1912,7 @@ including arg name, arg type (contains both type and shape).)pbdoc")
}
std::vector<OrtValue> fetches;
fetches.reserve(output_names.size());
common::Status status;
{
@ -1897,29 +1925,28 @@ including arg name, arg type (contains both type and shape).)pbdoc")
}
}
std::vector<py::object> rfetch;
rfetch.reserve(fetches.size());
py::list result;
size_t pos = 0;
for (auto fet : fetches) {
for (const auto& fet : fetches) {
if (fet.IsAllocated()) {
if (fet.IsTensor()) {
rfetch.push_back(AddTensorAsPyObj(fet, nullptr, nullptr));
result.append(AddTensorAsPyObj(fet, nullptr, nullptr));
} else if (fet.IsSparseTensor()) {
rfetch.push_back(GetPyObjectFromSparseTensor(pos, fet, nullptr));
result.append(GetPyObjectFromSparseTensor(pos, fet, nullptr));
} else {
rfetch.push_back(AddNonTensorAsPyObj(fet, nullptr, nullptr));
result.append(AddNonTensorAsPyObj(fet, nullptr, nullptr));
}
} else { // Send back None because the corresponding OrtValue was empty
rfetch.push_back(py::none());
result.append(py::none());
}
++pos;
}
return rfetch;
return result;
})
.def("run_async",
[](PyInferenceSession* sess,
std::vector<std::string> output_names,
std::map<std::string, py::object> pyfeeds,
const std::vector<std::string>& output_names,
const std::map<std::string, py::object>& pyfeeds,
PyCallback callback, py::object user_data = {},
RunOptions* run_options = nullptr)
-> void {
@ -1928,7 +1955,7 @@ including arg name, arg type (contains both type and shape).)pbdoc")
async_resource->user_data = user_data;
// prepare feeds
async_resource->ReserveFeeds(pyfeeds.size());
for (auto feed : pyfeeds) {
for (const auto& feed : pyfeeds) {
if (!feed.second.is(py::none())) {
OrtValue ml_value;
auto px = sess->GetSessionHandle()->GetModelInputs();
@ -1945,7 +1972,7 @@ including arg name, arg type (contains both type and shape).)pbdoc")
}
// prepare fetches
async_resource->ReserveFetches(output_names.size());
for (auto& output_name : output_names) {
for (const auto& output_name : output_names) {
async_resource->fetch_names.push_back(output_name);
async_resource->fetch_names_raw.push_back(async_resource->fetch_names.back().c_str());
async_resource->fetches_raw.push_back({});
@ -1968,15 +1995,17 @@ including arg name, arg type (contains both type and shape).)pbdoc")
/// a Tensor, SparseTensor or a TensorSequence.
.def("run_with_ort_values", [](PyInferenceSession* sess, const py::dict& feeds, const std::vector<std::string>& output_names, RunOptions* run_options = nullptr) -> std::vector<OrtValue> {
NameMLValMap ort_feeds;
ort_feeds.reserve(feeds.size());
// item is always a copy since dict returns a value and not a ref
// and Apple XToolChain barks
for (const auto item : feeds) {
for (const auto& item : feeds) {
auto name = item.first.cast<std::string>();
const OrtValue* ort_value = item.second.cast<const OrtValue*>();
ort_feeds.emplace(name, *ort_value);
}
std::vector<OrtValue> fetches;
fetches.reserve(output_names.size());
{
// release GIL to allow multiple python threads to invoke Run() in parallel.
py::gil_scoped_release release;
@ -2057,8 +2086,9 @@ including arg name, arg type (contains both type and shape).)pbdoc")
})
.def("get_tuning_results", [](PyInferenceSession* sess) -> py::list {
#if !defined(ORT_MINIMAL_BUILD)
auto results = sess->GetSessionHandle()->GetTuningResults();
py::list ret;
for (const auto& trs : sess->GetSessionHandle()->GetTuningResults()) {
for (const auto& trs : results) {
py::dict py_trs;
py_trs["ep"] = trs.ep;
py_trs["results"] = trs.results;

View file

@ -232,10 +232,9 @@ std::unordered_map<std::string, std::unordered_map<std::string, py::object>> Con
py_tensor_state[layer1_item.first] = {};
for (const auto& layer2_item : layer1_item.second) {
assert(layer2_item.second.IsTensor());
py::object obj;
const Tensor& rtensor = layer2_item.second.Get<Tensor>();
GetPyObjFromTensor(rtensor, obj, &data_transfer_manager);
py_tensor_state[layer1_item.first].insert({layer2_item.first, obj});
py::array arr = PrimitiveTensorToNumpyFromDevice(layer2_item.second,
&data_transfer_manager);
py_tensor_state[layer1_item.first].insert({layer2_item.first, py::cast<py::object>(arr)});
}
}
return py_tensor_state;