mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-27 22:45:57 +00:00
Address performance regression with const tensors shared across DML partitions
This commit is contained in:
parent
7aed386a2b
commit
1bf441415a
3 changed files with 21 additions and 7 deletions
|
|
@ -35,7 +35,6 @@ using SupportedTypeList = boost::mp11::mp_list<MLFloat16, float, double, int32_t
|
|||
// TODO(pengwa): we can gradually increase this threshold if we see more benefits (memory saving
|
||||
// or more CSE optimizations triggered). Should be careful to cover test cases that assume initializer
|
||||
// name did not change after transformation then.
|
||||
static constexpr int64_t TENSOR_ELEM_COUNT_THRESHOLD = 8;
|
||||
static constexpr char SHARED_INITIALIZER_PREFIX[] = "ortshared_";
|
||||
|
||||
bool IsAllowedToShare(const ONNX_NAMESPACE::TensorShapeProto* input_shape,
|
||||
|
|
@ -52,12 +51,12 @@ bool IsAllowedToShare(const ONNX_NAMESPACE::TensorShapeProto* input_shape,
|
|||
|
||||
int64_t dim_value = dim.dim_value();
|
||||
num_elements *= dim_value;
|
||||
if (num_elements > TENSOR_ELEM_COUNT_THRESHOLD) {
|
||||
if (num_elements > ConstantSharing::TENSOR_ELEM_COUNT_THRESHOLD) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (num_elements > 0 && num_elements <= TENSOR_ELEM_COUNT_THRESHOLD) {
|
||||
if (num_elements > 0 && num_elements <= ConstantSharing::TENSOR_ELEM_COUNT_THRESHOLD) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -29,6 +29,8 @@ class ConstantSharing : public GraphTransformer {
|
|||
excluded_initializers_(excluded_initializers) {
|
||||
}
|
||||
|
||||
static constexpr int64_t TENSOR_ELEM_COUNT_THRESHOLD = 8;
|
||||
|
||||
private:
|
||||
Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override;
|
||||
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@
|
|||
#include "GraphPartitioner.h"
|
||||
#include "core/framework/kernel_type_str_resolver.h"
|
||||
#include "core/framework/kernel_lookup.h"
|
||||
#include "core/optimizer/constant_sharing.h"
|
||||
#include "FusedGraphKernel.h"
|
||||
#include "MLOperatorAuthorImpl.h"
|
||||
#include "DmlGraphFusionHelper.h"
|
||||
|
|
@ -87,11 +88,23 @@ namespace Dml
|
|||
assert(iter != initializerPartitionMap.end());
|
||||
if (iter->second.size() > 1)
|
||||
{
|
||||
if (requiredInitializerMap.find(input) != requiredInitializerMap.end())
|
||||
// By including non-transferrable tensors in isInitializerTransferable, it causes DML to upload and preprocess them
|
||||
// to duplicate locations rather than treating them as being non-constant, which is helpful for optimization.
|
||||
// The size threshold for this should be no smaller than that used to combine initializers in the constant
|
||||
// sharing transform to prevent that transform from hurting performance.
|
||||
// If the kernel relies on this input to be initialized, it should also be small enough to copy cheaply.
|
||||
const uint64_t maximumElementsForDuplicationTensor = 64;
|
||||
static_assert(maximumElementsForDuplicationTensor >= onnxruntime::ConstantSharing::TENSOR_ELEM_COUNT_THRESHOLD);
|
||||
|
||||
uint64_t totalElementCount = 1;
|
||||
for (int i = 0; i < tensor->dims().size(); ++i)
|
||||
{
|
||||
totalElementCount *= tensor->dims()[i];
|
||||
}
|
||||
|
||||
if (totalElementCount <= maximumElementsForDuplicationTensor ||
|
||||
requiredInitializerMap.find(input) != requiredInitializerMap.end())
|
||||
{
|
||||
// The kernel relies on this input to be initialized, and it should be small enough to copy
|
||||
// cheaply. FusedGraphKernel only handles constant CPU inputs through transferred initializers,
|
||||
// rather than ORT, to avoid mismatches in policy or implementation causing failures.
|
||||
isInitializerTransferable[input] = {tensor, false};
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue