Address performance regression with const tensors shared across DML partitions

This commit is contained in:
jeffbloo 2023-05-24 18:09:28 -07:00
parent 7aed386a2b
commit 1bf441415a
3 changed files with 21 additions and 7 deletions

View file

@ -35,7 +35,6 @@ using SupportedTypeList = boost::mp11::mp_list<MLFloat16, float, double, int32_t
// TODO(pengwa): we can gradually increase this threshold if we see more benefits (memory saving
// or more CSE optimizations triggered). Should be careful to cover test cases that assume initializer
// name did not change after transformation then.
static constexpr int64_t TENSOR_ELEM_COUNT_THRESHOLD = 8;
static constexpr char SHARED_INITIALIZER_PREFIX[] = "ortshared_";
bool IsAllowedToShare(const ONNX_NAMESPACE::TensorShapeProto* input_shape,
@ -52,12 +51,12 @@ bool IsAllowedToShare(const ONNX_NAMESPACE::TensorShapeProto* input_shape,
int64_t dim_value = dim.dim_value();
num_elements *= dim_value;
if (num_elements > TENSOR_ELEM_COUNT_THRESHOLD) {
if (num_elements > ConstantSharing::TENSOR_ELEM_COUNT_THRESHOLD) {
return false;
}
}
if (num_elements > 0 && num_elements <= TENSOR_ELEM_COUNT_THRESHOLD) {
if (num_elements > 0 && num_elements <= ConstantSharing::TENSOR_ELEM_COUNT_THRESHOLD) {
return true;
}

View file

@ -29,6 +29,8 @@ class ConstantSharing : public GraphTransformer {
excluded_initializers_(excluded_initializers) {
}
static constexpr int64_t TENSOR_ELEM_COUNT_THRESHOLD = 8;
private:
Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override;

View file

@ -7,6 +7,7 @@
#include "GraphPartitioner.h"
#include "core/framework/kernel_type_str_resolver.h"
#include "core/framework/kernel_lookup.h"
#include "core/optimizer/constant_sharing.h"
#include "FusedGraphKernel.h"
#include "MLOperatorAuthorImpl.h"
#include "DmlGraphFusionHelper.h"
@ -87,11 +88,23 @@ namespace Dml
assert(iter != initializerPartitionMap.end());
if (iter->second.size() > 1)
{
if (requiredInitializerMap.find(input) != requiredInitializerMap.end())
// By including non-transferrable tensors in isInitializerTransferable, it causes DML to upload and preprocess them
// to duplicate locations rather than treating them as being non-constant, which is helpful for optimization.
// The size threshold for this should be no smaller than that used to combine initializers in the constant
// sharing transform to prevent that transform from hurting performance.
// If the kernel relies on this input to be initialized, it should also be small enough to copy cheaply.
const uint64_t maximumElementsForDuplicationTensor = 64;
static_assert(maximumElementsForDuplicationTensor >= onnxruntime::ConstantSharing::TENSOR_ELEM_COUNT_THRESHOLD);
uint64_t totalElementCount = 1;
for (int i = 0; i < tensor->dims().size(); ++i)
{
totalElementCount *= tensor->dims()[i];
}
if (totalElementCount <= maximumElementsForDuplicationTensor ||
requiredInitializerMap.find(input) != requiredInitializerMap.end())
{
// The kernel relies on this input to be initialized, and it should be small enough to copy
// cheaply. FusedGraphKernel only handles constant CPU inputs through transferred initializers,
// rather than ORT, to avoid mismatches in policy or implementation causing failures.
isInitializerTransferable[input] = {tensor, false};
}