onnxruntime/onnxruntime/python/onnxruntime_pybind_mlvalue.h
Patrice Vignola 49512e558a
[DML EP] Add I/O binding and If operator (#16859)
Being able to leverage I/O binding for DML and registering `If` for the
DML EP allows us to avoid copying the past/present key/values back and
forth between the CPU and the GPU after every token.

This gives us a 25% performance increase for Dolly V2 with 128 tokens on
an RTX 4090.
2023-07-31 19:45:59 -07:00

129 lines
4.3 KiB
C++

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include "onnxruntime_pybind.h" // must use this for the include of <pybind11/pybind11.h>
#include <pybind11/iostream.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include "core/common/logging/logging.h"
#include "core/common/logging/sinks/clog_sink.h"
#include "core/common/logging/sinks/cerr_sink.h"
#include "core/session/environment.h"
#include "core/framework/ort_value.h"
#include "core/session/inference_session.h"
PYBIND11_MAKE_OPAQUE(std::vector<OrtValue>);
namespace onnxruntime {
namespace python {
extern const char* PYTHON_ORTVALUE_OBJECT_NAME;
extern const char* PYTHON_ORTVALUE_NATIVE_OBJECT_ATTR;
bool IsNumericNumpyArray(const pybind11::object& py_object);
bool IsNumpyArray(pybind11::object& obj);
int GetNumpyArrayType(const pybind11::object& obj);
bool IsNumericDType(const pybind11::dtype& dtype);
TensorShape GetShape(const pybind11::array& arr);
int OnnxRuntimeTensorToNumpyType(const DataTypeImpl* tensor_type);
MLDataType NumpyTypeToOnnxRuntimeTensorType(int numpy_type);
using MemCpyFunc = void (*)(void*, const void*, size_t);
void CpuToCpuMemCpy(void*, const void*, size_t);
void CopyDataToTensor(const pybind11::array& py_array, int npy_type, Tensor& tensor, MemCpyFunc mem_cpy_to_device = CpuToCpuMemCpy);
pybind11::object AddTensorAsPyObj(const OrtValue& val, const DataTransferManager* data_transfer_manager,
const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* mem_cpy_to_host_functions);
pybind11::object GetPyObjectFromSparseTensor(size_t pos, const OrtValue& ort_value, const DataTransferManager* data_transfer_manager);
pybind11::object AddNonTensorAsPyObj(const OrtValue& val,
const DataTransferManager* data_transfer_manager,
const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* mem_cpy_to_host_functions);
OrtMemoryInfo GetMemoryInfoPerDeviceType(const OrtDevice& ort_device);
int32_t GetTensorProtoType(const OrtValue& ort_value);
#ifdef USE_CUDA
void CpuToCudaMemCpy(void* dst, const void* src, size_t num_bytes);
void CudaToCpuMemCpy(void* dst, const void* src, size_t num_bytes);
const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* GetCudaToHostMemCpyFunction();
bool IsCudaDeviceIdValid(const onnxruntime::logging::Logger& logger, int id);
AllocatorPtr GetCudaAllocator(OrtDevice::DeviceId id);
std::unique_ptr<IDataTransfer> GetGPUDataTransfer();
#endif
#ifdef USE_DML
AllocatorPtr GetDmlAllocator(OrtDevice::DeviceId id);
#endif
#ifdef USE_CANN
void CpuToCannMemCpy(void* dst, const void* src, size_t num_bytes);
void CannToCpuMemCpy(void* dst, const void* src, size_t num_bytes);
const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* GetCannToHostMemCpyFunction();
bool IsCannDeviceIdValid(const onnxruntime::logging::Logger& logger, int id);
AllocatorPtr GetCannAllocator(OrtDevice::DeviceId id);
#endif
#ifdef USE_ROCM
bool IsRocmDeviceIdValid(const onnxruntime::logging::Logger& logger, int id);
AllocatorPtr GetRocmAllocator(OrtDevice::DeviceId id);
void CpuToRocmMemCpy(void* dst, const void* src, size_t num_bytes);
void RocmToCpuMemCpy(void* dst, const void* src, size_t num_bytes);
const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* GetRocmToHostMemCpyFunction();
#endif
void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, const AllocatorPtr& alloc,
const std::string& name_input, const pybind11::object& value, OrtValue* p_mlvalue,
bool accept_only_numpy_array = false, bool use_numpy_data_memory = true, MemCpyFunc mem_cpy_to_device = CpuToCpuMemCpy);
void GetPyObjFromTensor(const Tensor& rtensor, pybind11::object& obj,
const DataTransferManager* data_transfer_manager = nullptr,
const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* mem_cpy_to_host_functions = nullptr);
template <class T>
struct DecRefFn {
void operator()(T* pyobject) const {
Py_XDECREF(pyobject);
}
};
template <class T>
using UniqueDecRefPtr = std::unique_ptr<T, DecRefFn<T>>;
} // namespace python
} // namespace onnxruntime