onnxruntime/onnxruntime/test/shared_lib/custom_op_utils.h
Tang, Cheng a81faee41e
Multi-stream execution support (#13495)
**Description**: This PR includes the following work:
1. Provide stream and related synchronization abstractions in
onnxruntime.
2. Enhance onnxruntime's execution planner / executor / memory arena to
support executing multiple streams in parallel.
3. Deprecate the parallel executor for CPU.
4. Deprecate the Fence mechanism.
5. Update the CUDA / TensorRT EP to support the stream mechanism and
support running different requests in different CUDA streams.

**Motivation and Context**
- Why is this change required? 
Currently, the execution plan is just a linear list of those primitives,
and ORT executes them step by step. For any given graph, ORT will
serialize it to a fixed execution order. This sequential execution
design simplifies most scenarios, but it has the following limitations:
1. It is difficult to enable inter-node parallelization; we have a
half-baked parallel executor, but it is very difficult to make it work
with GPU.
2. The fence mechanism can work in the single GPU stream + CPU thread
case, but when extended to multiple streams, it is difficult to manage
the cross-GPU-stream synchronization.
3. Our CUDA EP relies on the BFCArena to make memory management work
with the GPU async kernels, but the current BFCArena is not aware of
streams, so it doesn't behave correctly when run with multiple
streams.

This PR enhances our existing execution plan and executor to support
multi-stream execution. We use a unified algorithm to manage both
single-stream and multi-stream scenarios.
This PR mainly focuses on the infrastructure support for multi-stream
execution; that is, given a valid stream assignment, onnxruntime
can execute it correctly. How to generate a good stream assignment for a
given model will be addressed in a future PR.

Co-authored-by: Cheng Tang <chenta@microsoft.com@orttrainingdev9.d32nl1ml4oruzj4qz3bqlggovf.px.internal.cloudapp.net>
Co-authored-by: Cheng Tang <chenta@microsoft.com>
Co-authored-by: RandySheriffH <48490400+RandySheriffH@users.noreply.github.com>
Co-authored-by: Randy Shuai <rashuai@microsoft.com>
Co-authored-by: cao lei <jslhcl@gmail.com>
Co-authored-by: Lei Cao <leca@microsoft.com>
2022-12-15 07:39:29 -08:00

265 lines
9.2 KiB
C++

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/session/onnxruntime_cxx_api.h"
#include <vector>
#ifdef USE_CUDA
#include <cuda_runtime.h>
#endif
// Plain aggregate bundling a name, a list of int64 dimensions and a list of
// float values.
// NOTE(review): presumably describes one model input tensor (name, shape,
// flattened data) for the tests -- confirm against the callers.
struct Input {
  // Non-owning C string; stays null until a caller sets it.
  const char* name{nullptr};
  // Dimension sizes.
  std::vector<int64_t> dims;
  // Element values.
  std::vector<float> values;
};
// Kernel type paired with MyCustomOp. It keeps a reference to the OrtApi it
// was constructed with; Compute() is only declared here and is defined
// elsewhere.
struct MyCustomKernel {
  // The kernel info argument is accepted but not used.
  MyCustomKernel(const OrtApi& ort_api, const OrtKernelInfo* /*info*/) : ort_(ort_api) {}

  void Compute(OrtKernelContext* context);

 private:
  // Stored by reference, so the referenced OrtApi must outlive this kernel.
  const OrtApi& ort_;
};
// Kernel type paired with MyCustomOpSecondInputOnCpu. It remembers an opaque
// compute-stream pointer handed in at construction; Compute() is only
// declared here and is defined elsewhere.
struct MyCustomKernelSecondInputOnCpu {
  // The kernel info argument is accepted but not used.
  MyCustomKernelSecondInputOnCpu(const OrtKernelInfo* /*info*/, void* compute_stream)
      : compute_stream_(compute_stream) {}

  void Compute(OrtKernelContext* context);

 private:
  // Opaque, non-owning stream handle.
  // NOTE(review): presumably a cudaStream_t when built with USE_CUDA -- confirm.
  void* compute_stream_;
};
// Custom op named "Foo": two float inputs, one float output, backed by
// MyCustomKernel. The execution provider it reports is the string supplied
// at construction.
struct MyCustomOp : Ort::CustomOpBase<MyCustomOp, MyCustomKernel> {
  explicit MyCustomOp(const char* provider) : provider_(provider) {}

  // Returns a heap-allocated MyCustomKernel as an opaque pointer.
  // NOTE(review): ownership is presumably taken over by the CustomOpBase
  // machinery -- confirm.
  void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const {
    return new MyCustomKernel(api, info);
  }

  const char* GetName() const { return "Foo"; }

  const char* GetExecutionProviderType() const { return provider_; }

  size_t GetInputTypeCount() const { return 2; }

  // Both inputs must be of float type, whatever the index.
  ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  }

  size_t GetOutputTypeCount() const { return 1; }

  ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  }

 private:
  // The only constructor always overwrites this default.
  const char* provider_{"CPUExecutionProvider"};
};
// Custom op named "Foo": two float inputs, one float output, backed by
// MyCustomKernelSecondInputOnCpu. Input 1 is requested with the
// OrtMemTypeCPUInput memory type; every other input uses OrtMemTypeDefault.
struct MyCustomOpSecondInputOnCpu : Ort::CustomOpBase<MyCustomOpSecondInputOnCpu, MyCustomKernelSecondInputOnCpu> {
  explicit MyCustomOpSecondInputOnCpu(const char* provider, void* compute_stream)
      : provider_(provider), compute_stream_(compute_stream) {}

  // Returns a heap-allocated kernel that receives the stored stream pointer.
  void* CreateKernel(const OrtApi& /* api */, const OrtKernelInfo* info) const {
    return new MyCustomKernelSecondInputOnCpu(info, compute_stream_);
  }

  const char* GetName() const { return "Foo"; }

  const char* GetExecutionProviderType() const { return provider_; }

  size_t GetInputTypeCount() const { return 2; }

  // Both inputs must be of float type, whatever the index.
  ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  }

  // Only the second input (index 1) is marked as a CPU input.
  OrtMemType GetInputMemoryType(size_t i) const {
    return i == 1 ? OrtMemTypeCPUInput : OrtMemTypeDefault;
  }

  size_t GetOutputTypeCount() const { return 1; }

  ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  }

 private:
  // The only constructor always overwrites this default.
  const char* provider_{"CUDAExecutionProvider"};
  // Opaque, non-owning stream handle forwarded to every kernel created.
  void* compute_stream_;
};
// Kernel type paired with MyCustomOpMultipleDynamicInputs. It keeps a
// reference to the OrtApi it was constructed with; Compute() is only declared
// here and is defined elsewhere.
struct MyCustomKernelMultipleDynamicInputs {
  // The kernel info argument is accepted but not used.
  MyCustomKernelMultipleDynamicInputs(const OrtApi& ort_api, const OrtKernelInfo* /*info*/)
      : ort_(ort_api) {}

  void Compute(OrtKernelContext* context);

 private:
  // Stored by reference, so the referenced OrtApi must outlive this kernel.
  const OrtApi& ort_;
};
// Custom op named "Foo": two inputs whose element type is reported as
// UNDEFINED, and one float output. Backed by
// MyCustomKernelMultipleDynamicInputs.
struct MyCustomOpMultipleDynamicInputs : Ort::CustomOpBase<MyCustomOpMultipleDynamicInputs, MyCustomKernelMultipleDynamicInputs> {
  explicit MyCustomOpMultipleDynamicInputs(const char* provider) : provider_(provider) {}

  // Returns a heap-allocated kernel as an opaque pointer.
  void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const {
    return new MyCustomKernelMultipleDynamicInputs(api, info);
  }

  const char* GetName() const { return "Foo"; }

  const char* GetExecutionProviderType() const { return provider_; }

  size_t GetInputTypeCount() const { return 2; }

  // Both inputs are dynamically typed: each may be of any type, and the two
  // need not share the same type.
  ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED;
  }

  size_t GetOutputTypeCount() const { return 1; }

  ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  }

 private:
  // Non-owning execution provider name supplied at construction.
  const char* provider_;
};
// Stateless kernel type paired with MyCustomOpWithOptionalInput. Compute() is
// only declared here and is defined elsewhere.
struct MyCustomKernelWithOptionalInput {
  // `explicit` prevents a bare `const OrtKernelInfo*` from silently converting
  // into a kernel object. The kernel info argument itself is not used.
  explicit MyCustomKernelWithOptionalInput(const OrtKernelInfo* /*info*/) {}

  void Compute(OrtKernelContext* context);
};
// Custom op named "FooBar": three float inputs, of which the second
// (index 1) is optional, and one required float output. Backed by
// MyCustomKernelWithOptionalInput.
struct MyCustomOpWithOptionalInput : Ort::CustomOpBase<MyCustomOpWithOptionalInput, MyCustomKernelWithOptionalInput> {
  explicit MyCustomOpWithOptionalInput(const char* provider) : provider_(provider) {}

  // Returns a heap-allocated kernel as an opaque pointer.
  void* CreateKernel(const OrtApi& /* api */, const OrtKernelInfo* info) const {
    return new MyCustomKernelWithOptionalInput(info);
  }

  const char* GetName() const { return "FooBar"; }

  const char* GetExecutionProviderType() const { return provider_; }

  size_t GetInputTypeCount() const { return 3; }

  ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  }

  // Only the second input (index == 1) is optional; all others are required.
  OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(size_t index) const {
    return index == 1 ? OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_OPTIONAL
                      : OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_REQUIRED;
  }

  size_t GetOutputTypeCount() const { return 1; }

  ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  }

  // Every output is required.
  OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(size_t /*index*/) const {
    return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_REQUIRED;
  }

 private:
  // Non-owning execution provider name supplied at construction.
  const char* provider_;
};
// Kernel type paired with MyCustomOpWithAttributes. At construction it reads
// five node attributes ("int_attr", "float_attr", "ints_attr", "floats_attr"
// and "string_attr") through Ort::ConstKernelInfo and caches them in members.
// Compute() is only declared here and is defined elsewhere.
struct MyCustomKernelWithAttributes {
  // `explicit` prevents a bare `const OrtKernelInfo*` from silently converting
  // into a kernel object.
  // NOTE(review): the attribute getters presumably throw when an attribute is
  // missing -- confirm against the Ort C++ API.
  explicit MyCustomKernelWithAttributes(const OrtKernelInfo* kernel_info) {
    Ort::ConstKernelInfo info{kernel_info};
    int_attr_ = info.GetAttribute<int64_t>("int_attr");
    float_attr_ = info.GetAttribute<float>("float_attr");
    ints_attr_ = info.GetAttributes<int64_t>("ints_attr");
    floats_attr_ = info.GetAttributes<float>("floats_attr");
    string_arr_ = info.GetAttribute<std::string>("string_attr");
  }

  void Compute(OrtKernelContext* context);

 private:
  // Scalars get in-class defaults so they are never indeterminate, even if an
  // attribute lookup in the constructor fails part-way through.
  int64_t int_attr_{0};
  float float_attr_{0.0f};
  std::vector<int64_t> ints_attr_;
  std::vector<float> floats_attr_;
  // Holds the value of "string_attr". The member name is kept as-is because
  // the out-of-line Compute() may refer to it.
  std::string string_arr_;
};
// Custom op named "FooBar_Attr": one float input, one float output, backed by
// MyCustomKernelWithAttributes.
struct MyCustomOpWithAttributes : Ort::CustomOpBase<MyCustomOpWithAttributes, MyCustomKernelWithAttributes> {
  explicit MyCustomOpWithAttributes(const char* provider) : provider_(provider) {}

  // Returns a heap-allocated kernel as an opaque pointer; the OrtApi argument
  // is ignored.
  void* CreateKernel(const OrtApi&, const OrtKernelInfo* info) const {
    return new MyCustomKernelWithAttributes(info);
  }

  const char* GetName() const { return "FooBar_Attr"; }

  const char* GetExecutionProviderType() const { return provider_; }

  size_t GetInputTypeCount() const { return 1; }

  ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  }

  size_t GetOutputTypeCount() const { return 1; }

  ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  }

 private:
  // Non-owning execution provider name supplied at construction.
  const char* provider_;
};
// Slice array of floats or doubles between [from, to) and save to output.
// Stateless kernel type paired with SliceCustomOp; Compute() is only declared
// here and is defined elsewhere.
struct SliceCustomOpKernel {
  // `explicit` prevents a bare `const OrtKernelInfo*` from silently converting
  // into a kernel object. The kernel info argument itself is not used.
  explicit SliceCustomOpKernel(const OrtKernelInfo* /*info*/) {}

  void Compute(OrtKernelContext* context);
};
// Custom op named "Slice": three inputs (a dynamically typed array followed
// by two int64 values) and one dynamically typed output. Backed by
// SliceCustomOpKernel.
struct SliceCustomOp : Ort::CustomOpBase<SliceCustomOp, SliceCustomOpKernel> {
  explicit SliceCustomOp(const char* provider) : provider_(provider) {}

  // Returns a heap-allocated kernel as an opaque pointer; the OrtApi argument
  // is ignored.
  void* CreateKernel(const OrtApi&, const OrtKernelInfo* info) const {
    return new SliceCustomOpKernel(info);
  }

  const char* GetName() const { return "Slice"; }

  const char* GetExecutionProviderType() const { return provider_; }

  size_t GetInputTypeCount() const { return 3; }

  // Index 0 is the input array of float or double, reported as UNDEFINED.
  // Index 1 ("slice from") and index 2 ("slice to") are int64; any other
  // index also yields INT64.
  ONNXTensorElementDataType GetInputType(size_t index) const {
    return index == 0 ? ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED
                      : ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
  }

  size_t GetOutputTypeCount() const { return 1; }

  // The output element type is reported as UNDEFINED.
  ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED;
  }

 private:
  // Non-owning execution provider name supplied at construction.
  const char* provider_;
};
// Kernel type paired with StandaloneCustomOp. It holds an Ort::KernelInfo and
// three Ort::Op handles (add, topk, gru), all starting out as null handles.
// The constructor, destructor, Compute() and the private helpers are only
// declared here and are defined elsewhere.
// NOTE(review): presumably the handles are populated by the constructor and
// the Init* helpers, and the helpers are driven from Compute() -- confirm in
// the implementation file.
struct StandaloneCustomKernel {
  // `explicit` prevents a bare `const OrtKernelInfo*` from silently converting
  // into a kernel object.
  explicit StandaloneCustomKernel(const OrtKernelInfo* info);
  ~StandaloneCustomKernel();

  void Compute(OrtKernelContext* context);

 private:
  void InitTopK();
  void InvokeTopK(OrtKernelContext* context);
  void InitGru();
  void InvokeGru(OrtKernelContext* context);
  void InitInvokeConv(OrtKernelContext* context);  // create Conv and invoke in Compute(...)

  // Declaration order is preserved on purpose: it fixes the order in which
  // these members are constructed and destroyed.
  Ort::KernelInfo info_copy_{nullptr};
  Ort::Op op_add_{nullptr};
  Ort::Op op_topk_{nullptr};
  Ort::Op op_gru_{nullptr};
};
// Custom op named "Foo": two float inputs, one float output, backed by
// StandaloneCustomKernel.
struct StandaloneCustomOp : Ort::CustomOpBase<StandaloneCustomOp, StandaloneCustomKernel> {
  explicit StandaloneCustomOp(const char* provider) : provider_(provider) {}

  // Returns a heap-allocated kernel as an opaque pointer; the OrtApi argument
  // is ignored.
  void* CreateKernel(const OrtApi&, const OrtKernelInfo* info) const {
    return new StandaloneCustomKernel(info);
  }

  const char* GetName() const { return "Foo"; }

  const char* GetExecutionProviderType() const { return provider_; }

  size_t GetInputTypeCount() const { return 2; }

  // Every input is float, whatever the index.
  ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  }

  size_t GetOutputTypeCount() const { return 1; }

  ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  }

 private:
  // Non-owning execution provider name supplied at construction.
  const char* provider_;
};