[CoreML] more performace flag (#22975)

### Description
refactor unsquzee's implementation
add more flags to boost peformance.
add profile flag


### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->

---------

Co-authored-by: jicwen <jicwen@YiMacBook-Pro.local>
Co-authored-by: wejoncy <wejoncy@.com>
Co-authored-by: Scott McKay <skottmckay@gmail.com>
This commit is contained in:
wejoncy 2024-12-10 09:35:05 +08:00 committed by GitHub
parent 8f3384b4c1
commit e12421be30
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 173 additions and 36 deletions

View file

@ -47,8 +47,20 @@ enum COREMLFlags {
// and SessionOptionsAppendExecutionProvider (C API). For the old API, use COREMLFlags instead.
static const char* const kCoremlProviderOption_MLComputeUnits = "MLComputeUnits";
static const char* const kCoremlProviderOption_ModelFormat = "ModelFormat";
// same as COREML_FLAG_ONLY_ALLOW_STATIC_INPUT_SHAPES
static const char* const kCoremlProviderOption_RequireStaticInputShapes = "RequireStaticInputShapes";
static const char* const kCoremlProviderOption_EnableOnSubgraphs = "EnableOnSubgraphs";
// provided by https://developer.apple.com/documentation/coreml/mloptimizationhints-swift.struct/specializationstrategy-swift.property
// Core ML segments the models compute graph and specializes each segment for the target compute device.
// This process can affect the model loading time and the prediction latency.
// Use this option to tailor the specialization strategy for your model.
static const char* const kCoremlProviderOption_SpecializationStrategy = "SpecializationStrategy";
// Profile the Core ML MLComputePlan.
// This logs the hardware each operator is dispatched to and the estimated execution time.
// Intended for developer usage but provide useful diagnostic information if performance is not as expected.
static const char* const kCoremlProviderOption_ProfileComputePlan = "ProfileComputePlan";
// please refer to https://developer.apple.com/documentation/coreml/mlmodelconfiguration/allowlowprecisionaccumulationongpu
static const char* const kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU = "AllowLowPrecisionAccumulationOnGPU";
#ifdef __cplusplus
extern "C" {

View file

@ -151,7 +151,7 @@ bool BatchNormalizationOpBuilder::IsOpSupportedImpl(const Node& node, const OpBu
return false;
}
#if defined(TARGET_OS_IOS) && defined(TARGET_CPU_X86_64)
#if defined(TARGET_OS_IOS) && defined(TARGET_CPU_X86_64) && TARGET_OS_IOS && TARGET_CPU_X86_64
// To Pass IOS pipeline https://dev.azure.com/onnxruntime/onnxruntime/_build?definitionId=134&_a=summary
auto input_dtype = input_defs[0]->TypeAsProto()->tensor_type().elem_type();
if (input_dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 && input_params.coreml_version < 7) {

View file

@ -133,9 +133,8 @@ bool ReductionOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInpu
return false;
}
#if defined(TARGET_OS_IOS) && defined(TARGET_CPU_X86_64)
// to pass https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId=1563483&view=logs&j=f7cc61a9-cc70-56e7-b06c-4668ca17e426
// ReductionOpTest.ReduceSum_half_bert
#if defined(TARGET_OS_IOS) && defined(TARGET_CPU_X86_64) && TARGET_OS_IOS && TARGET_CPU_X86_64
// skip ReductionOpTest.ReduceSum_half_bert because reduce_sum will output all zeros
int32_t input_type;
GetType(*input_defs[0], input_type, logger);
if (node.OpType() == "ReduceSum" && input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) {

View file

@ -13,6 +13,10 @@
#include "core/optimizer/initializer.h"
#include "core/providers/cpu/tensor/unsqueeze.h"
#ifdef __APPLE__
#include <TargetConditionals.h>
#endif
namespace onnxruntime {
namespace coreml {
@ -54,32 +58,50 @@ void SqueezeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const
}
}
#if defined(COREML_ENABLE_MLPROGRAM)
void HandleX86ArchUnsqueezeScalarInput(ModelBuilder& model_builder,
const Node& node, const logging::Logger& logger) {
const auto& input_defs(node.InputDefs());
TensorShapeVector axes;
GetAxes(model_builder, node, axes);
std::vector<int64_t> input_shape;
GetShape(*input_defs[0], input_shape, logger);
auto op = model_builder.CreateOperation(node, "reshape");
AddOperationInput(*op, "x", input_defs[0]->Name());
TensorShapeVector output_shape = UnsqueezeBase::ComputeOutputShape(TensorShape(input_shape), axes);
AddOperationInput(*op, "shape", model_builder.AddConstant(op->type(), "shape", AsSpan(output_shape)));
AddOperationOutput(*op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(op));
}
#endif
Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
const Node& node,
[[maybe_unused]] const logging::Logger& logger) const {
std::unique_ptr<COREML_SPEC::NeuralNetworkLayer> layer = model_builder.CreateNNLayer(node);
const auto& input_defs(node.InputDefs());
auto* coreml_squeeze = layer->mutable_squeeze();
TensorShapeVector axes;
GetAxes(model_builder, node, axes);
std::vector<int64_t> input_shape;
GetShape(*input_defs[0], input_shape, logger);
#if defined(COREML_ENABLE_MLPROGRAM)
const auto& input_defs(node.InputDefs());
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
std::string_view coreml_op_type = node.OpType() == "Squeeze" ? "squeeze" : "reshape";
#if defined(TARGET_CPU_X86_64) && TARGET_CPU_X86_64
// expand_dims has limited requirements for static shape, however, X86_64 has a bug that it can't handle scalar input
if (node.OpType() == "Unsqueeze" && input_defs[0]->Shape()->dim_size() < 2) {
HandleX86ArchUnsqueezeScalarInput(model_builder, node, logger);
return Status::OK();
}
#endif
std::string_view coreml_op_type = node.OpType() == "Squeeze" ? "squeeze" : "expand_dims";
std::unique_ptr<Operation> op = model_builder.CreateOperation(node, coreml_op_type);
AddOperationInput(*op, "x", input_defs[0]->Name());
if (coreml_op_type == "squeeze") {
if (!axes.empty()) {
// coreml squeeze op does support negative axes
AddOperationInput(*op, "axes", model_builder.AddConstant(op->type(), "axes", AsSpan(axes)));
}
} else {
TensorShapeVector output_shape = UnsqueezeBase::ComputeOutputShape(TensorShape(input_shape), axes);
AddOperationInput(*op, "shape", model_builder.AddConstant(op->type(), "shape", AsSpan(output_shape)));
if (!axes.empty()) {
// coreml supports negative axes
AddOperationInput(*op, "axes", model_builder.AddConstant(op->type(), "axes", AsSpan(axes)));
}
AddOperationOutput(*op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(op));

View file

@ -408,7 +408,7 @@ ModelBuilder::ModelBuilder(const GraphViewer& graph_viewer, const logging::Logge
: graph_viewer_(graph_viewer),
logger_(logger),
coreml_version_(coreml_version),
coreml_compute_unit_(coreml_options.ComputeUnits()),
coreml_options_(coreml_options),
create_ml_program_(coreml_options.CreateMLProgram()),
model_output_path_(GetModelOutputPath(create_ml_program_)),
onnx_input_names_(std::move(onnx_input_names)),
@ -989,7 +989,7 @@ Status ModelBuilder::LoadModel(std::unique_ptr<Model>& model) {
get_sanitized_io_info(std::move(input_output_info_)),
std::move(scalar_outputs_),
std::move(int64_outputs_),
logger_, coreml_compute_unit_);
logger_, coreml_options_);
} else
#endif
{
@ -999,7 +999,7 @@ Status ModelBuilder::LoadModel(std::unique_ptr<Model>& model) {
std::move(input_output_info_),
std::move(scalar_outputs_),
std::move(int64_outputs_),
logger_, coreml_compute_unit_);
logger_, coreml_options_);
}
return model->LoadModel(); // load using CoreML API, including compilation

View file

@ -7,6 +7,7 @@
#include "core/graph/graph_viewer.h"
#include "core/providers/coreml/builders/coreml_spec.h"
#include "core/providers/coreml/model/model.h"
#include "core/providers/coreml/coreml_options.h"
#if defined(COREML_ENABLE_MLPROGRAM)
// coremltools classes
@ -22,8 +23,6 @@ class StorageWriter;
#endif
namespace onnxruntime {
class CoreMLOptions;
namespace coreml {
class IOpBuilder;
@ -218,7 +217,7 @@ class ModelBuilder {
const GraphViewer& graph_viewer_;
const logging::Logger& logger_;
const int32_t coreml_version_;
const uint32_t coreml_compute_unit_;
CoreMLOptions coreml_options_;
const bool create_ml_program_; // ML Program (CoreML5, iOS 15+, macOS 12+) or NeuralNetwork (old)
const std::string model_output_path_; // create_ml_program_ ? dir for mlpackage : filename for mlmodel

View file

@ -63,11 +63,14 @@ void CoreMLOptions::ValidateAndParseProviderOption(const ProviderOptions& option
{"MLProgram", COREML_FLAG_CREATE_MLPROGRAM},
{"NeuralNetwork", COREML_FLAG_USE_NONE},
};
std::unordered_set<std::string> valid_options = {
const std::unordered_set<std::string_view> valid_options = {
kCoremlProviderOption_MLComputeUnits,
kCoremlProviderOption_ModelFormat,
kCoremlProviderOption_RequireStaticInputShapes,
kCoremlProviderOption_EnableOnSubgraphs,
kCoremlProviderOption_SpecializationStrategy,
kCoremlProviderOption_ProfileComputePlan,
kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU,
};
// Validate the options
for (const auto& option : options) {
@ -90,6 +93,16 @@ void CoreMLOptions::ValidateAndParseProviderOption(const ProviderOptions& option
require_static_shape_ = option.second == "1";
} else if (kCoremlProviderOption_EnableOnSubgraphs == option.first) {
enable_on_subgraph_ = option.second == "1";
} else if (kCoremlProviderOption_SpecializationStrategy == option.first) {
if (option.second != "Default" && option.second != "FastPrediction") {
ORT_THROW("Invalid value for option ", option.first, ": ", option.second,
". Valid values are Default and FastPrediction.");
}
strategy_ = option.second;
} else if (kCoremlProviderOption_ProfileComputePlan == option.first) {
profile_compute_plan_ = option.second == "1";
} else if (kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU == option.first) {
allow_low_precision_accumulation_on_gpu_ = option.second == "1";
}
}
}

View file

@ -14,6 +14,9 @@ class CoreMLOptions {
bool create_mlprogram_{false};
bool enable_on_subgraph_{false};
uint32_t compute_units_{0};
std::string strategy_;
bool profile_compute_plan_{false};
bool allow_low_precision_accumulation_on_gpu_{false};
public:
explicit CoreMLOptions(uint32_t coreml_flags);
@ -25,6 +28,9 @@ class CoreMLOptions {
bool CreateMLProgram() const { return create_mlprogram_; }
bool EnableOnSubgraph() const { return enable_on_subgraph_; }
uint32_t ComputeUnits(uint32_t specific_flag = 0xffffffff) const { return compute_units_ & specific_flag; }
bool AllowLowPrecisionAccumulationOnGPU() const { return allow_low_precision_accumulation_on_gpu_; }
bool UseStrategy(std::string_view strategy) const { return strategy_ == strategy; }
bool ProfileComputePlan() const { return profile_compute_plan_ && create_mlprogram_; }
private:
void ValidateAndParseProviderOption(const ProviderOptions& options);

View file

@ -18,6 +18,7 @@
#endif
namespace onnxruntime {
class CoreMLOptions;
namespace coreml {
class Execution;
@ -53,7 +54,7 @@ class Model {
std::unordered_map<std::string, OnnxTensorInfo>&& input_output_info,
std::unordered_set<std::string>&& scalar_outputs,
std::unordered_set<std::string>&& int64_outputs,
const logging::Logger& logger, uint32_t coreml_compute_unit);
const logging::Logger& logger, const CoreMLOptions& coreml_options);
~Model();
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Model);

View file

@ -25,6 +25,7 @@
#include "core/providers/coreml/model/host_utils.h"
#include "core/providers/coreml/model/objc_str_utils.h"
#include "core/providers/coreml/shape_utils.h"
#include "core/providers/coreml/coreml_options.h"
// force the linker to create a dependency on the CoreML framework so that in MAUI usage we don't need
// to manually do this
@ -300,6 +301,53 @@ Status GetMLMultiArrayCopyInfo(const MLMultiArray* _Nonnull array,
return Status::OK();
}
// since __clang_major__ >= 15, MLComputePlan is introduced in <CoreML/CoreML.h>
// We are actually ensure the MacOS/IOS version and Xcode version is greater than `macOS 14.4, iOS 17.4`.
// The macro API_AVAILABLE should also be fine.
// Otherwise, the compiler will complain `MLComputePlan` is not defined.
// we define __clang_analyzer__ here is for bypass static analysis
void ProfileComputePlan(NSURL* compileUrl, MLModelConfiguration* config) {
#if defined(__APPLE__) && defined(__clang__) && __clang_major__ >= 15 && !defined(__clang_analyzer__)
if (@available(macOS 14.4, iOS 17.4, *)) {
[MLComputePlan loadContentsOfURL:compileUrl
configuration:config
completionHandler:^(MLComputePlan* _Nullable computePlan, NSError* _Nullable error) {
if (!computePlan) {
NSLog(@"Error loading compute plan: %@", error);
// Handle error.
return;
}
MLModelStructureProgram* program = computePlan.modelStructure.program;
if (!program) {
NSLog(@"Error loading program from compute plan., this is not a mlprogram model");
return;
}
MLModelStructureProgramFunction* mainFunction = program.functions[@"main"];
if (!mainFunction) {
NSLog(@"Error loading main function from program");
return;
}
NSArray<MLModelStructureProgramOperation*>* operations = mainFunction.block.operations;
NSLog(@"Number of operations, 'const' node is included. : %lu", operations.count);
for (MLModelStructureProgramOperation* operation in operations) {
// Get the compute device usage for the operation.
MLComputePlanDeviceUsage* computeDeviceUsage = [computePlan computeDeviceUsageForMLProgramOperation:operation];
id<MLComputeDeviceProtocol> preferredDevice = computeDeviceUsage.preferredComputeDevice;
// Get the estimated cost of executing the operation.
MLComputePlanCost* estimatedCost = [computePlan estimatedCostOfMLProgramOperation:operation];
if (![operation.operatorName isEqualToString:@"const"]) {
NSLog(@"Operation: %@, Device Usage: %@, Estimated Cost: %f", operation.operatorName, preferredDevice, estimatedCost.weight);
}
}
}];
} else {
NSLog(@"iOS 17.4+/macOS 14.4+ or later is required to use the compute plan API");
}
#endif
}
// Internal Execution class
// This class is part of the model class and handles the calls into CoreML. Specifically, it performs
// 1. Compile the model by given path for execution
@ -307,7 +355,7 @@ Status GetMLMultiArrayCopyInfo(const MLMultiArray* _Nonnull array,
// 3. The compiled model will be removed in dealloc or removed using cleanup function
class Execution {
public:
Execution(const std::string& path, const logging::Logger& logger, uint32_t coreml_flags);
Execution(const std::string& path, const logging::Logger& logger, const CoreMLOptions& coreml_options);
~Execution();
Status LoadModel();
@ -320,13 +368,13 @@ class Execution {
NSString* coreml_model_path_{nil};
NSString* compiled_model_path_{nil};
const logging::Logger& logger_;
uint32_t coreml_compute_unit_{0};
CoreMLOptions coreml_options_;
MLModel* model_{nil};
};
Execution::Execution(const std::string& path, const logging::Logger& logger, uint32_t coreml_compute_unit)
Execution::Execution(const std::string& path, const logging::Logger& logger, const CoreMLOptions& coreml_options)
: logger_(logger),
coreml_compute_unit_(coreml_compute_unit) {
coreml_options_(coreml_options) {
@autoreleasepool {
coreml_model_path_ = util::Utf8StringToNSString(path.c_str());
}
@ -395,17 +443,41 @@ Status Execution::LoadModel() {
compiled_model_path_ = [compileUrl path];
MLModelConfiguration* config = [[MLModelConfiguration alloc] init];
if (coreml_compute_unit_ & COREML_FLAG_USE_CPU_ONLY) {
uint32_t coreml_compute_unit = coreml_options_.ComputeUnits();
if (coreml_compute_unit & COREML_FLAG_USE_CPU_ONLY) {
config.computeUnits = MLComputeUnitsCPUOnly;
} else if (coreml_compute_unit_ & COREML_FLAG_USE_CPU_AND_GPU) {
} else if (coreml_compute_unit & COREML_FLAG_USE_CPU_AND_GPU) {
config.computeUnits = MLComputeUnitsCPUAndGPU;
} else if (coreml_compute_unit_ & COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE) {
} else if (coreml_compute_unit & COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE) {
config.computeUnits = MLComputeUnitsCPUAndNeuralEngine; // Apple Neural Engine
} else {
config.computeUnits = MLComputeUnitsAll;
}
if (coreml_options_.AllowLowPrecisionAccumulationOnGPU()) {
config.allowLowPrecisionAccumulationOnGPU = YES;
}
// Set the specialization strategy to FastPrediction for macOS 10.15+
// since __clang_major__ >= 15, optimizationHints is introduced in <CoreML/CoreML.h>
// Same as above comments for why we are checking __clang_major__.
// we define __clang_analyzer__ here is for bypass static analysis
#if defined(__APPLE__) && defined(__clang__) && __clang_major__ >= 15 && !defined(__clang_analyzer__)
if (HAS_COREML8_OR_LATER) {
MLOptimizationHints* optimizationHints = [[MLOptimizationHints alloc] init];
if (coreml_options_.UseStrategy("FastPrediction")) {
optimizationHints.specializationStrategy = MLSpecializationStrategyFastPrediction;
config.optimizationHints = optimizationHints;
} else if (coreml_options_.UseStrategy("Default")) {
optimizationHints.specializationStrategy = MLSpecializationStrategyDefault;
config.optimizationHints = optimizationHints;
}
}
#endif
if (coreml_options_.ProfileComputePlan()) {
ProfileComputePlan(compileUrl, config);
}
model_ = [MLModel modelWithContentsOfURL:compileUrl configuration:config error:&error];
if (error != nil || model_ == nil) {
@ -524,8 +596,8 @@ Model::Model(const std::string& path,
std::unordered_set<std::string>&& scalar_outputs,
std::unordered_set<std::string>&& int64_outputs,
const logging::Logger& logger,
uint32_t coreml_flags)
: execution_(std::make_unique<Execution>(path, logger, coreml_flags)),
const CoreMLOptions& coreml_options)
: execution_(std::make_unique<Execution>(path, logger, coreml_options)),
model_input_names_(std::move(model_input_names)),
model_output_names_(std::move(model_output_names)),
input_output_info_(std::move(input_output_info)),

View file

@ -4,6 +4,7 @@
#include "core/providers/coreml/model/model.h"
namespace onnxruntime {
class CoreMLOptions;
namespace coreml {
class Execution {};
@ -15,7 +16,7 @@ Model::Model(const std::string& /*path*/,
std::unordered_set<std::string>&& scalar_outputs,
std::unordered_set<std::string>&& int64_outputs,
const logging::Logger& /*logger*/,
uint32_t /*coreml_flags*/)
const CoreMLOptions& /*coreml_flags*/)
: execution_(std::make_unique<Execution>()),
model_input_names_(std::move(model_input_names)),
model_output_names_(std::move(model_output_names)),

View file

@ -135,6 +135,9 @@ namespace perftest {
"\t [CoreML only] [MLComputeUnits]:[CPUAndNeuralEngine CPUAndGPU ALL CPUOnly] Specify to limit the backend device used to run the model.\n"
"\t [CoreML only] [AllowStaticInputShapes]:[0 1].\n"
"\t [CoreML only] [EnableOnSubgraphs]:[0 1].\n"
"\t [CoreML only] [SpecializationStrategy]:[Default FastPrediction].\n"
"\t [CoreML only] [ProfileComputePlan]:[0 1].\n"
"\t [CoreML only] [AllowLowPrecisionAccumulationOnGPU]:[0 1].\n"
"\t [Example] [For CoreML EP] -e coreml -i \"ModelFormat|MLProgram MLComputeUnits|CPUAndGPU\"\n"
"\n"
"\t [SNPE only] [runtime]: SNPE runtime, options: 'CPU', 'GPU', 'GPU_FLOAT16', 'DSP', 'AIP_FIXED_TF'. \n"

View file

@ -346,7 +346,10 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)");
static const std::unordered_set<std::string> available_keys = {kCoremlProviderOption_MLComputeUnits,
kCoremlProviderOption_ModelFormat,
kCoremlProviderOption_RequireStaticInputShapes,
kCoremlProviderOption_EnableOnSubgraphs};
kCoremlProviderOption_EnableOnSubgraphs,
kCoremlProviderOption_SpecializationStrategy,
kCoremlProviderOption_ProfileComputePlan,
kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU};
ParseSessionConfigs(ov_string, provider_options, available_keys);
std::unordered_map<std::string, std::string> available_options = {
@ -364,6 +367,12 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)");
(provider_option.second == "1" || provider_option.second == "0")) {
} else if (provider_option.first == kCoremlProviderOption_EnableOnSubgraphs &&
(provider_option.second == "0" || provider_option.second == "1")) {
} else if (provider_option.first == kCoremlProviderOption_SpecializationStrategy &&
(provider_option.second == "Default" || provider_option.second == "FastPrediction")) {
} else if (provider_option.first == kCoremlProviderOption_ProfileComputePlan &&
(provider_option.second == "0" || provider_option.second == "1")) {
} else if (provider_option.first == kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU &&
(provider_option.second == "0" || provider_option.second == "1")) {
} else {
ORT_THROW("Invalid value for option ", provider_option.first, ": ", provider_option.second);
}