Label encoder opset4 (#17977)

### Description
<!-- Describe your changes. -->
Implements LabelEncoder as per `ai.onnx.ml` opset 4 for the upcoming
ONNX 1.15 release. ~~This currently depends on a new ONNX release
candidate and so is marked as draft in the meantime.~~


### Motivation and Context
Closes https://github.com/microsoft/onnxruntime/issues/17602
This commit is contained in:
Aditya Goel 2024-01-12 20:43:44 +00:00 committed by GitHub
parent 55b046e97e
commit dcd6d4cad6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 1586 additions and 812 deletions

View file

@ -425,7 +425,8 @@ Do not modify directly.*
|DictVectorizer|*in* X:**T1**<br> *out* Y:**T2**|1+|**T1** = map(int64,tensor(double)), map(int64,tensor(float)), map(int64,tensor(string)), map(string,tensor(double)), map(string,tensor(float)), map(string,tensor(int64))<br/> **T2** = tensor(double), tensor(float), tensor(int64), tensor(string)|
|FeatureVectorizer|*in* X:**T1**<br> *out* Y:**tensor(float)**|1+|**T1** = tensor(double), tensor(float), tensor(int32), tensor(int64)|
|Imputer|*in* X:**T**<br> *out* Y:**T**|1+|**T** = tensor(float), tensor(int64)|
|LabelEncoder|*in* X:**T1**<br> *out* Y:**T2**|2+|**T1** = tensor(float), tensor(int64), tensor(string)<br/> **T2** = tensor(float), tensor(int64), tensor(string)|
|LabelEncoder|*in* X:**T1**<br> *out* Y:**T2**|4+|**T1** = tensor(double), tensor(float), tensor(int64), tensor(string)<br/> **T2** = tensor(double), tensor(float), tensor(int16), tensor(int64), tensor(string)|
|||[2, 3]|**T1** = tensor(float), tensor(int64), tensor(string)<br/> **T2** = tensor(float), tensor(int64), tensor(string)|
|||1|**T1** = tensor(int64), tensor(string)<br/> **T2** = tensor(int64), tensor(string)|
|LinearClassifier|*in* X:**T1**<br> *out* Y:**T2**<br> *out* Z:**tensor(float)**|1+|**T1** = tensor(double), tensor(float), tensor(int32), tensor(int64)<br/> **T2** = tensor(int64), tensor(string)|
|LinearRegressor|*in* X:**T**<br> *out* Y:**tensor(float)**|1+|**T** = tensor(float)|

File diff suppressed because it is too large Load diff

View file

@ -10,14 +10,12 @@ namespace onnxruntime {
namespace ml {
ONNX_CPU_OPERATOR_VERSIONED_ML_KERNEL(
LabelEncoder,
1, 1,
KernelDefBuilder().TypeConstraint("T1",
std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::string>(),
DataTypeImpl::GetTensorType<int64_t>()})
.TypeConstraint("T2",
std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::string>(),
DataTypeImpl::GetTensorType<int64_t>()})
LabelEncoder, 1, 1,
KernelDefBuilder()
.TypeConstraint("T1", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::string>(),
DataTypeImpl::GetTensorType<int64_t>()})
.TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::string>(),
DataTypeImpl::GetTensorType<int64_t>()})
.SinceVersion(1, 2),
LabelEncoder);
@ -39,12 +37,11 @@ Status LabelEncoder::Compute(OpKernelContext* context) const {
// map isn't going to change so get end() once instead of calling inside the for_each loop
const auto map_end = string_to_int_map_.end();
std::for_each(input.begin(), input.end(),
[&out, &map_end, this](const std::string& value) {
auto map_to = string_to_int_map_.find(value);
*out = map_to == map_end ? default_int_ : map_to->second;
++out;
});
std::for_each(input.begin(), input.end(), [&out, &map_end, this](const std::string& value) {
auto map_to = string_to_int_map_.find(value);
*out = map_to == map_end ? default_int_ : map_to->second;
++out;
});
} else {
if (!Y.IsDataTypeString())
return Status(ONNXRUNTIME, FAIL, "Input of tensor(int64) must have output of tensor(string)");
@ -55,169 +52,346 @@ Status LabelEncoder::Compute(OpKernelContext* context) const {
const auto map_end = int_to_string_map_.end();
std::for_each(input.begin(), input.end(),
[&out, &map_end, this](const int64_t& value) {
auto map_to = int_to_string_map_.find(value);
*out = map_to == map_end ? default_string_ : map_to->second;
++out;
});
std::for_each(input.begin(), input.end(), [&out, &map_end, this](const int64_t& value) {
auto map_to = int_to_string_map_.find(value);
*out = map_to == map_end ? default_string_ : map_to->second;
++out;
});
}
return Status::OK();
}
ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
LabelEncoder,
2,
float_string,
KernelDefBuilder().TypeConstraint("T1",
std::vector<MLDataType>{DataTypeImpl::GetTensorType<float>()})
.TypeConstraint("T2",
std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::string>()}),
ONNX_CPU_OPERATOR_VERSIONED_TYPED_ML_KERNEL(
LabelEncoder, 2, 3, float_string,
KernelDefBuilder()
.TypeConstraint("T1", std::vector<MLDataType>{DataTypeImpl::GetTensorType<float>()})
.TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::string>()}),
LabelEncoder_2<float, std::string>);
template <>
void LabelEncoder_2<float, std::string>::InitializeSomeFields(const OpKernelInfo& info) {
_key_field_name = "keys_floats";
_value_field_name = "values_strings";
info.GetAttrOrDefault<std::string>("default_string", &_default_value, std::string("_Unused"));
};
key_field_name_ = "keys_floats";
value_field_name_ = "values_strings";
info.GetAttrOrDefault<std::string>("default_string", &default_value_, std::string("_Unused"));
}
ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
LabelEncoder,
2,
string_float,
KernelDefBuilder().TypeConstraint("T1",
std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::string>()})
.TypeConstraint("T2",
std::vector<MLDataType>{DataTypeImpl::GetTensorType<float>()}),
ONNX_CPU_OPERATOR_VERSIONED_TYPED_ML_KERNEL(
LabelEncoder, 2, 3, string_float,
KernelDefBuilder()
.TypeConstraint("T1", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::string>()})
.TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<float>()}),
LabelEncoder_2<std::string, float>);
template <>
void LabelEncoder_2<std::string, float>::InitializeSomeFields(const OpKernelInfo& info) {
_key_field_name = "keys_strings";
_value_field_name = "values_floats";
info.GetAttrOrDefault<float>("default_float", &_default_value, -0.0f);
};
key_field_name_ = "keys_strings";
value_field_name_ = "values_floats";
info.GetAttrOrDefault<float>("default_float", &default_value_, -0.0f);
}
ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
LabelEncoder,
2,
int64_float,
KernelDefBuilder().TypeConstraint("T1",
std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::int64_t>()})
.TypeConstraint("T2",
std::vector<MLDataType>{DataTypeImpl::GetTensorType<float>()}),
ONNX_CPU_OPERATOR_VERSIONED_TYPED_ML_KERNEL(
LabelEncoder, 2, 3, int64_float,
KernelDefBuilder()
.TypeConstraint("T1", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::int64_t>()})
.TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<float>()}),
LabelEncoder_2<std::int64_t, float>);
template <>
void LabelEncoder_2<std::int64_t, float>::InitializeSomeFields(const OpKernelInfo& info) {
_key_field_name = "keys_int64s";
_value_field_name = "values_floats";
info.GetAttrOrDefault<float>("default_float", &_default_value, -0.0f);
};
key_field_name_ = "keys_int64s";
value_field_name_ = "values_floats";
info.GetAttrOrDefault<float>("default_float", &default_value_, -0.0f);
}
ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
LabelEncoder,
2,
float_int64,
KernelDefBuilder().TypeConstraint("T1",
std::vector<MLDataType>{DataTypeImpl::GetTensorType<float>()})
.TypeConstraint("T2",
std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::int64_t>()}),
ONNX_CPU_OPERATOR_VERSIONED_TYPED_ML_KERNEL(
LabelEncoder, 2, 3, float_int64,
KernelDefBuilder()
.TypeConstraint("T1", std::vector<MLDataType>{DataTypeImpl::GetTensorType<float>()})
.TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::int64_t>()}),
LabelEncoder_2<float, std::int64_t>);
template <>
void LabelEncoder_2<float, std::int64_t>::InitializeSomeFields(const OpKernelInfo& info) {
_key_field_name = "keys_floats";
_value_field_name = "values_int64s";
info.GetAttrOrDefault<std::int64_t>("default_int64", &_default_value, (std::int64_t)-1);
};
key_field_name_ = "keys_floats";
value_field_name_ = "values_int64s";
info.GetAttrOrDefault<std::int64_t>("default_int64", &default_value_, (std::int64_t)-1);
}
ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
LabelEncoder,
2,
string_string,
KernelDefBuilder().TypeConstraint("T1",
std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::string>()})
.TypeConstraint("T2",
std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::string>()}),
ONNX_CPU_OPERATOR_VERSIONED_TYPED_ML_KERNEL(
LabelEncoder, 2, 3, string_string,
KernelDefBuilder()
.TypeConstraint("T1", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::string>()})
.TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::string>()}),
LabelEncoder_2<std::string, std::string>)
template <>
void LabelEncoder_2<std::string, std::string>::InitializeSomeFields(const OpKernelInfo& info) {
_key_field_name = "keys_strings";
_value_field_name = "values_strings";
info.GetAttrOrDefault<std::string>("default_string", &_default_value, std::string("_Unused"));
};
key_field_name_ = "keys_strings";
value_field_name_ = "values_strings";
info.GetAttrOrDefault<std::string>("default_string", &default_value_, std::string("_Unused"));
}
ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
LabelEncoder,
2,
float_float,
KernelDefBuilder().TypeConstraint("T1",
std::vector<MLDataType>{DataTypeImpl::GetTensorType<float>()})
.TypeConstraint("T2",
std::vector<MLDataType>{DataTypeImpl::GetTensorType<float>()}),
ONNX_CPU_OPERATOR_VERSIONED_TYPED_ML_KERNEL(
LabelEncoder, 2, 3, float_float,
KernelDefBuilder()
.TypeConstraint("T1", std::vector<MLDataType>{DataTypeImpl::GetTensorType<float>()})
.TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<float>()}),
LabelEncoder_2<float, float>)
template <>
void LabelEncoder_2<float, float>::InitializeSomeFields(const OpKernelInfo& info) {
_key_field_name = "keys_floats";
_value_field_name = "values_floats";
info.GetAttrOrDefault<float>("default_float", &_default_value, -0.0f);
};
key_field_name_ = "keys_floats";
value_field_name_ = "values_floats";
info.GetAttrOrDefault<float>("default_float", &default_value_, -0.0f);
}
ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
LabelEncoder,
2,
int64_string,
KernelDefBuilder().TypeConstraint("T1",
std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::int64_t>()})
.TypeConstraint("T2",
std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::string>()}),
ONNX_CPU_OPERATOR_VERSIONED_TYPED_ML_KERNEL(
LabelEncoder, 2, 3, int64_string,
KernelDefBuilder()
.TypeConstraint("T1", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::int64_t>()})
.TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::string>()}),
LabelEncoder_2<std::int64_t, std::string>)
template <>
void LabelEncoder_2<std::int64_t, std::string>::InitializeSomeFields(const OpKernelInfo& info) {
_key_field_name = "keys_int64s";
_value_field_name = "values_strings";
info.GetAttrOrDefault<std::string>("default_string", &_default_value, std::string("_Unused"));
};
key_field_name_ = "keys_int64s";
value_field_name_ = "values_strings";
info.GetAttrOrDefault<std::string>("default_string", &default_value_, std::string("_Unused"));
}
ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
LabelEncoder,
2,
string_int64,
KernelDefBuilder().TypeConstraint("T1",
std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::string>()})
.TypeConstraint("T2",
std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::int64_t>()}),
ONNX_CPU_OPERATOR_VERSIONED_TYPED_ML_KERNEL(
LabelEncoder, 2, 3, string_int64,
KernelDefBuilder()
.TypeConstraint("T1", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::string>()})
.TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::int64_t>()}),
LabelEncoder_2<std::string, std::int64_t>)
template <>
void LabelEncoder_2<std::string, std::int64_t>::InitializeSomeFields(const OpKernelInfo& info) {
_key_field_name = "keys_strings";
_value_field_name = "values_int64s";
info.GetAttrOrDefault<std::int64_t>("default_int64", &_default_value, (std::int64_t)-1);
};
key_field_name_ = "keys_strings";
value_field_name_ = "values_int64s";
info.GetAttrOrDefault<std::int64_t>("default_int64", &default_value_, static_cast<std::int64_t>(-1));
}
ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
LabelEncoder,
2,
int64_int64,
KernelDefBuilder().TypeConstraint("T1",
std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::int64_t>()})
.TypeConstraint("T2",
std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::int64_t>()}),
ONNX_CPU_OPERATOR_VERSIONED_TYPED_ML_KERNEL(
LabelEncoder, 2, 3, int64_int64,
KernelDefBuilder()
.TypeConstraint("T1", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::int64_t>()})
.TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::int64_t>()}),
LabelEncoder_2<std::int64_t, std::int64_t>)
template <>
void LabelEncoder_2<std::int64_t, std::int64_t>::InitializeSomeFields(const OpKernelInfo& info) {
_key_field_name = "keys_int64s";
_value_field_name = "values_int64s";
info.GetAttrOrDefault<std::int64_t>("default_int64", &_default_value, (std::int64_t)-1);
};
key_field_name_ = "keys_int64s";
value_field_name_ = "values_int64s";
info.GetAttrOrDefault<std::int64_t>("default_int64", &default_value_, static_cast<std::int64_t>(-1));
}
// Opset-4 LabelEncoder registration: int64 input (T1) mapped to int64 output (T2).
ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
    LabelEncoder,
    4,
    int64_int64,
    KernelDefBuilder()
        .TypeConstraint("T1", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::int64_t>()})
        .TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::int64_t>()}),
    LabelEncoder_4<std::int64_t, std::int64_t>)

// Selects the list attributes that hold keys/values for the int64 -> int64 kernel
// and resolves the value emitted for unmapped inputs (fallback: -1).
template <>
void LabelEncoder_4<std::int64_t, std::int64_t>::InitializeAttrFields(const OpKernelInfo& kernel_info) {
  default_value_ = GetDefault(kernel_info, "default_int64", int64_t{-1});
  key_field_name_ = "keys_int64s";
  value_field_name_ = "values_int64s";
}
// Opset-4 LabelEncoder registration: int64 input (T1) mapped to string output (T2).
ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
    LabelEncoder,
    4,
    int64_string,
    KernelDefBuilder()
        .TypeConstraint("T1", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::int64_t>()})
        .TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::string>()}),
    LabelEncoder_4<std::int64_t, std::string>)

// Selects the list attributes that hold keys/values for the int64 -> string kernel
// and resolves the value emitted for unmapped inputs (fallback: "_Unused").
template <>
void LabelEncoder_4<std::int64_t, std::string>::InitializeAttrFields(const OpKernelInfo& kernel_info) {
  default_value_ = GetDefault(kernel_info, "default_string", std::string("_Unused"));
  key_field_name_ = "keys_int64s";
  value_field_name_ = "values_strings";
}
// Opset-4 LabelEncoder registration: int64 input (T1) mapped to float output (T2).
ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
    LabelEncoder, 4, int64_float,
    KernelDefBuilder()
        .TypeConstraint("T1", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::int64_t>()})
        .TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<float>()}),
    LabelEncoder_4<std::int64_t, float>)

// Selects the list attributes that hold keys/values for the int64 -> float kernel
// and resolves the value emitted for unmapped inputs.
template <>
void LabelEncoder_4<std::int64_t, float>::InitializeAttrFields(const OpKernelInfo& kernel_info) {
  key_field_name_ = "keys_int64s";
  value_field_name_ = "values_floats";
  // Fallback is negative zero, matching the float -> float opset-4 kernel and the
  // opset-2 float kernels in this file (previously this kernel alone used +0.f).
  default_value_ = GetDefault(kernel_info, "default_float", -0.f);
}
// Opset-4 LabelEncoder registration: float input (T1) mapped to float output (T2).
ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
    LabelEncoder,
    4,
    float_float,
    KernelDefBuilder()
        .TypeConstraint("T1", std::vector<MLDataType>{DataTypeImpl::GetTensorType<float>()})
        .TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<float>()}),
    LabelEncoder_4<float, float>)

// Selects the list attributes that hold keys/values for the float -> float kernel
// and resolves the value emitted for unmapped inputs (fallback: negative zero).
template <>
void LabelEncoder_4<float, float>::InitializeAttrFields(const OpKernelInfo& kernel_info) {
  default_value_ = GetDefault(kernel_info, "default_float", -0.0f);
  key_field_name_ = "keys_floats";
  value_field_name_ = "values_floats";
}
// Opset-4 LabelEncoder registration: float input (T1) mapped to string output (T2).
ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
    LabelEncoder,
    4,
    float_string,
    KernelDefBuilder()
        .TypeConstraint("T1", std::vector<MLDataType>{DataTypeImpl::GetTensorType<float>()})
        .TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::string>()}),
    LabelEncoder_4<float, std::string>)

// Selects the list attributes that hold keys/values for the float -> string kernel
// and resolves the value emitted for unmapped inputs (fallback: "_Unused").
template <>
void LabelEncoder_4<float, std::string>::InitializeAttrFields(const OpKernelInfo& kernel_info) {
  default_value_ = GetDefault(kernel_info, "default_string", std::string("_Unused"));
  key_field_name_ = "keys_floats";
  value_field_name_ = "values_strings";
}
// Opset-4 LabelEncoder registration: float input (T1) mapped to int64 output (T2).
ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
    LabelEncoder,
    4,
    float_int64,
    KernelDefBuilder()
        .TypeConstraint("T1", std::vector<MLDataType>{DataTypeImpl::GetTensorType<float>()})
        .TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::int64_t>()}),
    LabelEncoder_4<float, std::int64_t>)

// Selects the list attributes that hold keys/values for the float -> int64 kernel
// and resolves the value emitted for unmapped inputs (fallback: -1).
template <>
void LabelEncoder_4<float, std::int64_t>::InitializeAttrFields(const OpKernelInfo& kernel_info) {
  default_value_ = GetDefault(kernel_info, "default_int64", int64_t{-1});
  key_field_name_ = "keys_floats";
  value_field_name_ = "values_int64s";
}
// Opset-4 LabelEncoder registration: string input (T1) mapped to int64 output (T2).
ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
    LabelEncoder,
    4,
    string_int64,
    KernelDefBuilder()
        .TypeConstraint("T1", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::string>()})
        .TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::int64_t>()}),
    LabelEncoder_4<std::string, std::int64_t>)

// Selects the list attributes that hold keys/values for the string -> int64 kernel
// and resolves the value emitted for unmapped inputs (fallback: -1).
template <>
void LabelEncoder_4<std::string, std::int64_t>::InitializeAttrFields(const OpKernelInfo& kernel_info) {
  default_value_ = GetDefault(kernel_info, "default_int64", int64_t{-1});
  key_field_name_ = "keys_strings";
  value_field_name_ = "values_int64s";
}
// Opset-4 LabelEncoder registration: string input (T1) mapped to float output (T2).
ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
    LabelEncoder, 4, string_float,
    KernelDefBuilder()
        .TypeConstraint("T1", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::string>()})
        .TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<float>()}),
    LabelEncoder_4<std::string, float>)

// Selects the list attributes that hold keys/values for the string -> float kernel
// and resolves the value emitted for unmapped inputs.
template <>
void LabelEncoder_4<std::string, float>::InitializeAttrFields(const OpKernelInfo& kernel_info) {
  key_field_name_ = "keys_strings";
  value_field_name_ = "values_floats";
  // Fallback is negative zero, matching the float -> float opset-4 kernel and the
  // opset-2 float kernels in this file (previously this kernel alone used +0.f).
  default_value_ = GetDefault(kernel_info, "default_float", -0.f);
}
// Opset-4 LabelEncoder registration: string input (T1) mapped to string output (T2).
ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
    LabelEncoder,
    4,
    string_string,
    KernelDefBuilder()
        .TypeConstraint("T1", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::string>()})
        .TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::string>()}),
    LabelEncoder_4<std::string, std::string>)

// Selects the list attributes that hold keys/values for the string -> string kernel
// and resolves the value emitted for unmapped inputs (fallback: "_Unused").
template <>
void LabelEncoder_4<std::string, std::string>::InitializeAttrFields(const OpKernelInfo& kernel_info) {
  default_value_ = GetDefault(kernel_info, "default_string", std::string("_Unused"));
  key_field_name_ = "keys_strings";
  value_field_name_ = "values_strings";
}
// Opset-4 LabelEncoder registration: string input (T1) mapped to int16 output (T2).
ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
    LabelEncoder,
    4,
    string_int16,
    KernelDefBuilder()
        .TypeConstraint("T1", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::string>()})
        .TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::int16_t>()}),
    LabelEncoder_4<std::string, std::int16_t>)

// Attribute wiring for the string -> int16 kernel.
// value_field_name_ is intentionally left empty: GetAttribute only tries a list
// attribute for string/float/int64, so int16 values are read from "values_tensor".
// Likewise GetDefault receives an empty attribute name, so for int16 only
// "default_tensor" is consulted before falling back to -1.
template <>
void LabelEncoder_4<std::string, std::int16_t>::InitializeAttrFields(const OpKernelInfo& kernel_info) {
  const std::int16_t fallback = -1;
  default_value_ = GetDefault(kernel_info, "", fallback);
  key_field_name_ = "keys_strings";
}
// Opset-4 LabelEncoder registration: double input (T1) mapped to double output (T2).
ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
    LabelEncoder,
    4,
    double_double,
    KernelDefBuilder()
        .TypeConstraint("T1", std::vector<MLDataType>{DataTypeImpl::GetTensorType<double>()})
        .TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<double>()}),
    LabelEncoder_4<double, double>)

// Attribute wiring for the double -> double kernel.
// Both field names stay empty: GetAttribute never tries a list attribute for
// double, so keys and values are read from "keys_tensor" / "values_tensor".
// For a double result GetDefault only consults "default_tensor"; the
// "default_float" name is used solely in its error message. Fallback: -0.0.
template <>
void LabelEncoder_4<double, double>::InitializeAttrFields(const OpKernelInfo& kernel_info) {
  const double fallback = -0.0;
  default_value_ = GetDefault(kernel_info, "default_float", fallback);
}
// Opset-4 LabelEncoder registration: double input (T1) mapped to string output (T2).
ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
    LabelEncoder,
    4,
    double_string,
    KernelDefBuilder()
        .TypeConstraint("T1", std::vector<MLDataType>{DataTypeImpl::GetTensorType<double>()})
        .TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::string>()}),
    LabelEncoder_4<double, std::string>)

// Attribute wiring for the double -> string kernel.
// key_field_name_ stays empty: GetAttribute never tries a list attribute for
// double, so keys are read from "keys_tensor". Fallback default: "_Unused".
template <>
void LabelEncoder_4<double, std::string>::InitializeAttrFields(const OpKernelInfo& kernel_info) {
  default_value_ = GetDefault(kernel_info, "default_string", std::string("_Unused"));
  value_field_name_ = "values_strings";
}
// Opset-4 LabelEncoder registration: string input (T1) mapped to double output (T2).
ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
    LabelEncoder,
    4,
    string_double,
    KernelDefBuilder()
        .TypeConstraint("T1", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::string>()})
        .TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<double>()}),
    LabelEncoder_4<std::string, double>)

// Attribute wiring for the string -> double kernel.
// value_field_name_ stays empty: GetAttribute never tries a list attribute for
// double, so values are read from "values_tensor". For a double result
// GetDefault only consults "default_tensor"; "default_float" is used solely in
// its error message. Fallback: -0.0.
template <>
void LabelEncoder_4<std::string, double>::InitializeAttrFields(const OpKernelInfo& kernel_info) {
  const double fallback = -0.0;
  default_value_ = GetDefault(kernel_info, "default_float", fallback);
  key_field_name_ = "keys_strings";
}
// Opset-4 LabelEncoder registration: double input (T1) mapped to int64 output (T2).
ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
    LabelEncoder,
    4,
    double_int64,
    KernelDefBuilder()
        .TypeConstraint("T1", std::vector<MLDataType>{DataTypeImpl::GetTensorType<double>()})
        .TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::int64_t>()}),
    LabelEncoder_4<double, std::int64_t>)

// Attribute wiring for the double -> int64 kernel.
// key_field_name_ stays empty: GetAttribute never tries a list attribute for
// double, so keys are read from "keys_tensor". Fallback default: -1.
template <>
void LabelEncoder_4<double, std::int64_t>::InitializeAttrFields(const OpKernelInfo& kernel_info) {
  default_value_ = GetDefault(kernel_info, "default_int64", int64_t{-1});
  value_field_name_ = "values_int64s";
}
// Opset-4 LabelEncoder registration: int64 input (T1) mapped to double output (T2).
ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
    LabelEncoder,
    4,
    int64_double,
    KernelDefBuilder()
        .TypeConstraint("T1", std::vector<MLDataType>{DataTypeImpl::GetTensorType<std::int64_t>()})
        .TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<double>()}),
    LabelEncoder_4<std::int64_t, double>)

// Attribute wiring for the int64 -> double kernel.
// value_field_name_ stays empty: GetAttribute never tries a list attribute for
// double, so values are read from "values_tensor". For a double result
// GetDefault only consults "default_tensor"; "default_float" is used solely in
// its error message. Fallback: -0.0.
template <>
void LabelEncoder_4<std::int64_t, double>::InitializeAttrFields(const OpKernelInfo& kernel_info) {
  const double fallback = -0.0;
  default_value_ = GetDefault(kernel_info, "default_float", fallback);
  key_field_name_ = "keys_int64s";
}
} // namespace ml
} // namespace onnxruntime

View file

@ -6,6 +6,8 @@
#include "core/common/common.h"
#include "core/framework/op_kernel.h"
#include "core/providers/cpu/ml/ml_common.h"
#include "core/framework/tensorprotoutils.h"
#include "core/common/safeint.h"
namespace onnxruntime {
namespace ml {
@ -53,57 +55,182 @@ class LabelEncoder_2 final : public OpKernel {
std::vector<TKey> keys;
std::vector<TValue> values;
ORT_THROW_IF_ERROR(info.GetAttrs<TKey>(_key_field_name, keys));
ORT_THROW_IF_ERROR(info.GetAttrs<TValue>(_value_field_name, values));
ORT_THROW_IF_ERROR(info.GetAttrs<TKey>(key_field_name_, keys));
ORT_THROW_IF_ERROR(info.GetAttrs<TValue>(value_field_name_, values));
auto num_keys = keys.size();
auto num_values = values.size();
ORT_ENFORCE(num_keys == num_values,
"The ", _key_field_name, " and ", _value_field_name, " attribtues in LabelEncoder ",
"(name: ", info.node().Name(), ") must have the same length. ",
"However, the number of key is ", num_keys, " and the number of ",
"values is ", num_values, ".");
_map.reserve(num_keys);
for (size_t i = 0; i < num_keys; ++i)
_map.emplace(keys[i], values[i]);
ORT_ENFORCE(num_keys == num_values, "The ", key_field_name_, " and ", value_field_name_,
" attributes in LabelEncoder ", "(name: ", info.node().Name(), ") must have the same length. ",
"However, the number of key is ", num_keys, " and the number of ", "values is ", num_values, ".");
map_.reserve(num_keys);
for (size_t i = 0; i < num_keys; ++i) map_.emplace(keys[i], values[i]);
}
Status Compute(OpKernelContext* context) const override {
const auto* tensor_pointer = context->Input<Tensor>(0);
if (tensor_pointer == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch");
const Tensor& X = *tensor_pointer;
const TensorShape& shape = X.Shape();
Tensor& Y = *context->Output(0, shape);
const auto* X = context->Input<Tensor>(0);
const TensorShape& shape = X->Shape();
auto* Y = context->Output(0, shape);
auto input = X.template DataAsSpan<TKey>();
auto output = Y.template MutableDataAsSpan<TValue>();
for (int64_t i = 0; i < shape.Size(); ++i) {
const auto found = _map.find(input[onnxruntime::narrow<size_t>(i)]);
if (found == _map.end())
output[onnxruntime::narrow<size_t>(i)] = _default_value;
else
output[onnxruntime::narrow<size_t>(i)] = found->second;
auto input = X->template DataAsSpan<TKey>();
auto output = Y->template MutableDataAsSpan<TValue>();
auto input_iter = input.begin();
auto output_iter = output.begin();
while (input_iter != input.end()) {
const auto found = map_.find(*input_iter);
*output_iter = found == map_.end() ? default_value_ : found->second;
++output_iter;
++input_iter;
}
return Status::OK();
}
private:
// Specialize this method to set attribute names. For example, if keys' type
// is 64-bit integer, _key_field_name should be "keys_int64s". Field names
// is 64-bit integer, key_field_name_ should be "keys_int64s". Field names
// for other types can be found in ONNX spec.
void InitializeSomeFields(const OpKernelInfo& info);
// A collection of key-value pairs. Each (a_key, a_value) pair
// means that the "a_key" in the input would be mapped to "a_value".
// If _map doesn't contain "a_key", we use _default_value as its output.
InlinedHashMap<TKey, TValue> _map;
TValue _default_value;
// If map_ doesn't contain "a_key", we use default_value_ as its output.
InlinedHashMap<TKey, TValue> map_;
TValue default_value_;
// ONNX attribute name to load keys.
std::string _key_field_name;
std::string key_field_name_;
// ONNX attribute name to load values.
std::string _value_field_name;
std::string value_field_name_;
};
// Loads a LabelEncoder key or value list from the node's attributes.
//
// For element types that have a list-attribute form (std::string, float,
// int64_t) the list attribute `name` is tried first. If that lookup fails, or
// T is any other type, the data is read from the tensor attribute
// `tensor_name` instead.
//
// @param info         Kernel info giving access to the node attributes.
// @param name         List attribute name (e.g. "keys_int64s"); may be empty
//                     when no list attribute applies to T.
// @param tensor_name  Tensor attribute name (e.g. "keys_tensor").
// @return             The attribute contents as a flat vector.
// Fails via ORT_ENFORCE when neither attribute is available or the tensor
// cannot be unpacked as T.
template <typename T>
std::vector<T> GetAttribute(const OpKernelInfo& info, const std::string& name, const std::string& tensor_name) {
  if constexpr (std::is_same_v<T, std::string> || std::is_same_v<T, float> || std::is_same_v<T, int64_t>) {
    std::vector<T> attrs;
    if (info.GetAttrs<T>(name, attrs).IsOK()) {
      return attrs;
    }
  }
  ONNX_NAMESPACE::TensorProto attr_tensor_proto;
  auto result = info.GetAttr(tensor_name, &attr_tensor_proto);
  if (name.empty()) {
    ORT_ENFORCE(result.IsOK(), "LabelEncoder is missing attribute ", tensor_name);
  } else {
    ORT_ENFORCE(result.IsOK(), "LabelEncoder is missing attribute ", tensor_name, " or ", name);
  }
  // Element count is the product of all dims; SafeInt guards against overflow
  // and against a negative product when converting to size_t.
  SafeInt<int64_t> element_count(1);
  for (auto dim : attr_tensor_proto.dims()) {
    element_count *= dim;
  }
  const SafeInt<size_t> tensor_size(element_count);
  std::vector<T> out(tensor_size);
  result = utils::UnpackTensor<T>(attr_tensor_proto, Path(), out.data(), tensor_size);
  // Report the tensor attribute that was actually being unpacked; `name` is the
  // list attribute (possibly empty) and was not involved in this step.
  ORT_ENFORCE(result.IsOK(), "LabelEncoder could not unpack tensor attribute ", tensor_name);
  return out;
}
template <typename T>
T GetDefault(const OpKernelInfo& info, const std::string& attr_name, const T& backup) {
ONNX_NAMESPACE::TensorProto attr_tensor_proto;
auto result = info.GetAttr("default_tensor", &attr_tensor_proto);
if (result.IsOK() && utils::HasDataType(attr_tensor_proto)) {
T default_value;
result = utils::UnpackTensor<T>(attr_tensor_proto, Path(), &default_value, 1);
ORT_ENFORCE(result.IsOK(), "LabelEncoder could not unpack default tensor ", attr_name);
return default_value;
} else if constexpr (std::is_same_v<T, std::string> || std::is_same_v<T, float> || std::is_same_v<T, int64_t>) {
T default_value;
result = info.GetAttr<T>(attr_name, &default_value);
if (result.IsOK()) {
return default_value;
}
}
return backup;
}
// Hash-map building blocks used by LabelEncoder_4.
//
// We don't make use of InlinedHashMap since we make use of a custom hash and equality function.
// Introducing new template parameters in inlined_containers_fwd.h creates compilation errors
// (see https://github.com/microsoft/onnxruntime/pull/17977#discussion_r1446510961).
//
// The three aliases below pick the underlying implementation at compile time:
// Abseil types unless DISABLE_ABSEIL is defined, standard-library types otherwise.
#ifndef DISABLE_ABSEIL
// Baseline hasher for T (wrapped by NaNHash).
template <typename T>
using HashFunc = absl::container_internal::hash_default_hash<T>;
// Baseline equality predicate for T (wrapped by NaNEqual).
template <typename T>
using EqualFunc = absl::container_internal::hash_default_eq<T>;
// Map type parameterized on a caller-supplied hasher and equality predicate.
template <typename K, typename V, typename Hash, typename Equal>
using HashMap = absl::flat_hash_map<K, V, Hash, Equal>;
#else
// Baseline hasher for T (wrapped by NaNHash).
template <typename T>
using HashFunc = std::hash<T>;
// Baseline equality predicate for T (wrapped by NaNEqual).
template <typename T>
using EqualFunc = std::equal_to<T>;
// Map type parameterized on a caller-supplied hasher and equality predicate.
template <typename K, typename V, typename Hash, typename Equal>
using HashMap = std::unordered_map<K, V, Hash, Equal>;
#endif // DISABLE_ABSEIL
// Hasher that sends every floating-point NaN to the same bucket (hash 0) so
// that, together with NaNEqual, a NaN key can be found again in the map.
// All other values, and all non-floating-point types, use HashFunc<T>.
template <typename T>
struct NaNHash {
  size_t operator()(const T& value) const {
    bool is_nan = false;
    if constexpr (std::is_floating_point_v<T>) {
      is_nan = std::isnan(value);
    }
    return is_nan ? size_t{0} : HashFunc<T>{}(value);
  }
};
// Equality predicate that treats two floating-point NaNs as equal (plain
// operator== would not), so a NaN key in the map matches a NaN input.
// Every other comparison is delegated to EqualFunc<T>.
template <typename T>
struct NaNEqual {
  bool operator()(const T& lhs, const T& rhs) const {
    if constexpr (std::is_floating_point_v<T>) {
      const bool both_nan = std::isnan(lhs) && std::isnan(rhs);
      return both_nan || EqualFunc<T>{}(lhs, rhs);
    } else {
      return EqualFunc<T>{}(lhs, rhs);
    }
  }
};
// CPU kernel for the ai.onnx.ml LabelEncoder operator, opset 4.
//
// Maps every element of the input tensor (element type TKey) to an output
// element (type TValue) through a lookup table built once from the node's
// attributes. Inputs without an entry in the table produce default_value_.
// Floating-point NaN keys are matched via NaNHash / NaNEqual.
//
// Each supported (TKey, TValue) pair provides an explicit specialization of
// InitializeAttrFields that selects the attribute names and default value.
template <typename TKey, typename TValue>
class LabelEncoder_4 final : public OpKernel {
 public:
  // Builds the lookup table from the key/value attributes.
  // Fails via ORT_ENFORCE if the attributes are missing or differ in length.
  explicit LabelEncoder_4(const OpKernelInfo& kernel_info) : OpKernel(kernel_info) {
    // Sets key_field_name_, value_field_name_ and default_value_ for this type pair.
    InitializeAttrFields(kernel_info);
    auto keys = GetAttribute<TKey>(kernel_info, key_field_name_, "keys_tensor");
    auto values = GetAttribute<TValue>(kernel_info, value_field_name_, "values_tensor");
    ORT_ENFORCE(keys.size() == values.size(), "Keys and values must have the same length.");
    // Size the table up front to avoid rehashing while inserting.
    map_.reserve(keys.size());
    for (size_t i = 0; i < keys.size(); ++i) {
      // emplace keeps the first value seen for a repeated key.
      map_.emplace(keys[i], values[i]);
    }
  }

  // Writes, for each input element, its mapped value (or default_value_ when
  // the element is not a key) to the output, which has the input's shape.
  Status Compute(OpKernelContext* context) const override {
    const auto* X = context->Input<Tensor>(0);
    const TensorShape& shape = X->Shape();
    auto* Y = context->Output(0, shape);
    auto input = X->template DataAsSpan<TKey>();
    auto output = Y->template MutableDataAsSpan<TValue>();
    auto input_iter = input.begin();
    auto output_iter = output.begin();
    while (input_iter != input.end()) {
      const auto found = map_.find(*input_iter);
      *output_iter = found == map_.end() ? default_value_ : found->second;
      ++output_iter;
      ++input_iter;
    }
    return Status::OK();
  }

 private:
  // Specialized per (TKey, TValue) pair; see the kernel registrations.
  void InitializeAttrFields(const OpKernelInfo& kernel_info);

  // Lookup table from input value to output value.
  HashMap<TKey, TValue, NaNHash<TKey>, NaNEqual<TKey>> map_;
  // Output for inputs that are not present in map_.
  TValue default_value_;
  // Name of the list attribute holding the keys; empty when only "keys_tensor" applies.
  std::string key_field_name_;
  // Name of the list attribute holding the values; empty when only "values_tensor" applies.
  std::string value_field_name_;
};
} // namespace ml
} // namespace onnxruntime

View file

@ -8,7 +8,8 @@ namespace onnxruntime {
namespace test {
template <typename TInput, typename TOutput>
static void RunTest(const std::vector<int64_t>& dims, const std::vector<TInput>& input, const std::vector<TOutput>& output) {
static void RunTest(const std::vector<int64_t>& dims, const std::vector<TInput>& input,
const std::vector<TOutput>& output) {
OpTester test("LabelEncoder", 1, onnxruntime::kMLDomain);
static const std::vector<std::string> labels = {"Beer", "Wine", "Tequila"};
@ -231,5 +232,284 @@ TEST(LabelEncoder, FloatToFloatOpset2) {
test.Run();
}
// Opset 4, int64 -> int64: keys/values come from list attributes and the
// default (42) from "default_tensor"; input 5 has no mapping and must yield 42.
TEST(LabelEncoder, Int64toInt64Opset4) {
  std::vector<std::int64_t> dims{1, 5};
  std::vector<int64_t> input{1, 2, 3, 4, 5};
  std::vector<int64_t> output{12, 13, 14, 15, 42};

  std::vector<int64_t> key_data{1, 2, 3, 4};
  std::vector<int64_t> value_data{12, 13, 14, 15};

  // Single-element INT64 tensor carrying the default value.
  ONNX_NAMESPACE::TensorProto fallback_proto;
  fallback_proto.set_name("default_tensor");
  fallback_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
  fallback_proto.add_dims(1);
  fallback_proto.add_int64_data(42);

  OpTester test("LabelEncoder", 4, onnxruntime::kMLDomain);
  test.AddAttribute("keys_int64s", key_data);
  test.AddAttribute("values_int64s", value_data);
  test.AddAttribute("default_tensor", fallback_proto);

  test.AddInput<int64_t>("X", dims, input);
  test.AddOutput<int64_t>("Y", dims, output);
  test.Run();
}
// Opset 4, string -> int16: keys come from a list attribute, while values and
// the default (42) are supplied as INT16 tensors; "d" and "g" have no mapping.
TEST(LabelEncoder, StringtoInt16Opset4) {
  std::vector<std::int64_t> dims{1, 5};
  const std::vector<std::string> input{"a", "b", "d", "c", "g"};
  const std::vector<int16_t> output{0, 1, 42, 2, 42};

  const std::vector<std::string> key_data{"a", "b", "c"};
  const std::vector<int16_t> value_data{0, 1, 2};

  // INT16 tensor payloads are stored in the proto's int32_data field.
  ONNX_NAMESPACE::TensorProto mapped_values_proto;
  mapped_values_proto.set_name("values_tensor");
  mapped_values_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT16);
  mapped_values_proto.add_dims(value_data.size());
  for (const int16_t mapped : value_data) {
    mapped_values_proto.add_int32_data(mapped);
  }

  ONNX_NAMESPACE::TensorProto fallback_proto;
  fallback_proto.set_name("default_tensor");
  fallback_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT16);
  fallback_proto.add_dims(1);
  fallback_proto.add_int32_data(42);

  OpTester test("LabelEncoder", 4, onnxruntime::kMLDomain);
  test.AddAttribute("keys_strings", key_data);
  test.AddAttribute("values_tensor", mapped_values_proto);
  test.AddAttribute("default_tensor", fallback_proto);

  test.AddInput<std::string>("X", dims, input);
  test.AddOutput<int16_t>("Y", dims, output);
  test.Run();
}
// Opset 4, int64 -> string, with keys, values and fallback all supplied as
// tensor attributes. Key 3 is absent from the mapping, so input 3 is expected
// to produce the default string "_Unused".
TEST(LabelEncoder, Int64toStringOpset4) {
  const std::vector<int64_t> shape{1, 5};
  const std::vector<int64_t> keys{1, 2, 4, 5};
  const std::vector<std::string> mapped{"Hello", "world", "onnxruntime", "!"};
  const std::vector<int64_t> x{1, 2, 3, 4, 5};
  const std::vector<std::string> expected_y{"Hello", "world", "_Unused", "onnxruntime", "!"};

  OpTester test("LabelEncoder", 4, onnxruntime::kMLDomain);

  // 1-D INT64 tensor of lookup keys.
  ONNX_NAMESPACE::TensorProto keys_tensor;
  keys_tensor.set_name("keys_tensor");
  keys_tensor.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
  keys_tensor.add_dims(static_cast<int64_t>(keys.size()));
  for (auto it = keys.begin(); it != keys.end(); ++it) {
    keys_tensor.add_int64_data(*it);
  }
  test.AddAttribute("keys_tensor", keys_tensor);

  // 1-D STRING tensor of mapped values, positionally aligned with the keys.
  ONNX_NAMESPACE::TensorProto values_tensor;
  values_tensor.set_name("values_tensor");
  values_tensor.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_STRING);
  values_tensor.add_dims(static_cast<int64_t>(mapped.size()));
  for (auto it = mapped.begin(); it != mapped.end(); ++it) {
    values_tensor.add_string_data(*it);
  }
  test.AddAttribute("values_tensor", values_tensor);

  // One-element STRING tensor carrying the value expected for unmapped inputs.
  ONNX_NAMESPACE::TensorProto fallback;
  fallback.set_name("default_tensor");
  fallback.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_STRING);
  fallback.add_dims(1);
  fallback.add_string_data("_Unused");
  test.AddAttribute("default_tensor", fallback);

  test.AddInput<int64_t>("X", shape, x);
  test.AddOutput<std::string>("Y", shape, expected_y);
  test.Run();
}
// Opset 4, string -> float, with keys, values and fallback all supplied as
// tensor attributes. "Random" is not a key, so it is expected to produce the
// default value, which this test sets to -0.0f.
TEST(LabelEncoder, StringToFloatOpset4) {
  const std::vector<int64_t> shape{1, 5};
  const std::vector<std::string> keys{"Hello", "world", "onnxruntime", "!"};
  const std::vector<float> mapped{3.14f, 2.0f, 2.718f, 5.0f};
  const std::vector<std::string> x{"Hello", "world", "Random", "onnxruntime", "!"};
  const std::vector<float> expected_y{3.14f, 2.0f, -0.0f, 2.718f, 5.0f};

  OpTester test("LabelEncoder", 4, onnxruntime::kMLDomain);

  // 1-D STRING tensor of lookup keys.
  ONNX_NAMESPACE::TensorProto keys_tensor;
  keys_tensor.set_name("keys_tensor");
  keys_tensor.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_STRING);
  keys_tensor.add_dims(static_cast<int64_t>(keys.size()));
  for (size_t i = 0; i < keys.size(); ++i) {
    keys_tensor.add_string_data(keys[i]);
  }
  test.AddAttribute("keys_tensor", keys_tensor);

  // 1-D FLOAT tensor of mapped values, positionally aligned with the keys.
  ONNX_NAMESPACE::TensorProto values_tensor;
  values_tensor.set_name("values_tensor");
  values_tensor.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
  values_tensor.add_dims(static_cast<int64_t>(mapped.size()));
  for (size_t i = 0; i < mapped.size(); ++i) {
    values_tensor.add_float_data(mapped[i]);
  }
  test.AddAttribute("values_tensor", values_tensor);

  // One-element FLOAT tensor carrying the value expected for unmapped inputs.
  ONNX_NAMESPACE::TensorProto fallback;
  fallback.set_name("default_tensor");
  fallback.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
  fallback.add_dims(1);
  fallback.add_float_data(-0.0f);
  test.AddAttribute("default_tensor", fallback);

  test.AddInput<std::string>("X", shape, x);
  test.AddOutput<float>("Y", shape, expected_y);
  test.Run();
}
// Opset 4, string -> double, with keys, values and fallback all supplied as
// tensor attributes. "Random" is not a key, so it is expected to produce the
// default value, which this test sets to -0.0.
TEST(LabelEncoder, StringToDoubleOpset4) {
  const std::vector<int64_t> shape{1, 5};
  const std::vector<std::string> keys{"Hello", "world", "onnxruntime", "!"};
  const std::vector<double> mapped{0.1, 1.1231e30, 2.718, 5.0};
  const std::vector<std::string> x{"Hello", "world", "Random", "onnxruntime", "!"};
  const std::vector<double> expected_y{0.1, 1.1231e30, -0.0, 2.718, 5.0};

  OpTester test("LabelEncoder", 4, onnxruntime::kMLDomain);

  // 1-D STRING tensor of lookup keys.
  ONNX_NAMESPACE::TensorProto keys_tensor;
  keys_tensor.set_name("keys_tensor");
  keys_tensor.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_STRING);
  keys_tensor.add_dims(static_cast<int64_t>(keys.size()));
  for (auto it = keys.begin(); it != keys.end(); ++it) {
    keys_tensor.add_string_data(*it);
  }
  test.AddAttribute("keys_tensor", keys_tensor);

  // 1-D DOUBLE tensor of mapped values, positionally aligned with the keys.
  ONNX_NAMESPACE::TensorProto values_tensor;
  values_tensor.set_name("values_tensor");
  values_tensor.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_DOUBLE);
  values_tensor.add_dims(static_cast<int64_t>(mapped.size()));
  for (auto it = mapped.begin(); it != mapped.end(); ++it) {
    values_tensor.add_double_data(*it);
  }
  test.AddAttribute("values_tensor", values_tensor);

  // One-element DOUBLE tensor carrying the value expected for unmapped inputs.
  ONNX_NAMESPACE::TensorProto fallback;
  fallback.set_name("default_tensor");
  fallback.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_DOUBLE);
  fallback.add_dims(1);
  fallback.add_double_data(-0.0);
  test.AddAttribute("default_tensor", fallback);

  test.AddInput<std::string>("X", shape, x);
  test.AddOutput<double>("Y", shape, expected_y);
  test.Run();
}
// Opset 4, int64 -> int64 where keys, values and the fallback are all supplied
// as tensor attributes (keys_tensor / values_tensor / default_tensor).
// Input 5 is not among the keys, so the expected output for it is the default 42.
TEST(LabelEncoder, TensorBasedAttributesOpset4) {
  const std::vector<int64_t> shape{1, 5};
  const std::vector<int64_t> keys{1, 2, 3, 4};
  const std::vector<int64_t> mapped{12, 13, 14, 15};
  const std::vector<int64_t> x{1, 2, 3, 4, 5};
  const std::vector<int64_t> expected_y{12, 13, 14, 15, 42};

  OpTester test("LabelEncoder", 4, onnxruntime::kMLDomain);

  // 1-D INT64 tensor of lookup keys.
  ONNX_NAMESPACE::TensorProto keys_tensor;
  keys_tensor.set_name("keys_tensor");
  keys_tensor.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
  keys_tensor.add_dims(static_cast<int64_t>(keys.size()));
  for (size_t i = 0; i < keys.size(); ++i) {
    keys_tensor.add_int64_data(keys[i]);
  }
  test.AddAttribute("keys_tensor", keys_tensor);

  // 1-D INT64 tensor of mapped values, positionally aligned with the keys.
  ONNX_NAMESPACE::TensorProto values_tensor;
  values_tensor.set_name("values_tensor");
  values_tensor.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
  values_tensor.add_dims(static_cast<int64_t>(mapped.size()));
  for (size_t i = 0; i < mapped.size(); ++i) {
    values_tensor.add_int64_data(mapped[i]);
  }
  test.AddAttribute("values_tensor", values_tensor);

  // One-element INT64 tensor carrying the value expected for unmapped inputs.
  ONNX_NAMESPACE::TensorProto fallback;
  fallback.set_name("default_tensor");
  fallback.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
  fallback.add_dims(1);
  fallback.add_int64_data(42);
  test.AddAttribute("default_tensor", fallback);

  test.AddInput<int64_t>("X", shape, x);
  test.AddOutput<int64_t>("Y", shape, expected_y);
  test.Run();
}
// Opset 4, float -> string with a NaN key. The key NaN is created with tag "3"
// while the input NaNs use tags "1" and "2"; all of them are expected to map to
// the same value "ONNX". The unmapped input -1 is expected to produce the
// default "onnxruntime".
TEST(LabelEncoder, NaNsMappedTogetherOpset4) {
  const std::vector<int64_t> shape{1, 6};
  const std::vector<float> keys{3.14f, 2.718f, 5.0f, std::nanf("3")};
  const std::vector<std::string> mapped{"a", "b", "c", "ONNX"};
  const std::vector<float> x{3.14f, std::nanf("1"), 2.718f, std::nanf("2"), 5.f, -1.f};
  const std::vector<std::string> expected_y{"a", "ONNX", "b", "ONNX", "c", "onnxruntime"};

  OpTester test("LabelEncoder", 4, onnxruntime::kMLDomain);
  test.AddAttribute("keys_floats", keys);
  test.AddAttribute("values_strings", mapped);

  // One-element STRING tensor carrying the value expected for unmapped inputs.
  ONNX_NAMESPACE::TensorProto fallback;
  fallback.set_name("default_tensor");
  fallback.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_STRING);
  fallback.add_dims(1);
  fallback.add_string_data("onnxruntime");
  test.AddAttribute("default_tensor", fallback);

  test.AddInput<float>("X", shape, x);
  test.AddOutput<std::string>("Y", shape, expected_y);
  test.Run();
}
// Opset 4, double -> string with a NaN key supplied through keys_tensor. The
// key NaN is created with tag "3" while the input NaNs use tags "1" and "2";
// all of them are expected to map to "ONNX". The unmapped input -1 is expected
// to produce the default "onnxruntime".
TEST(LabelEncoder, DoubleNaNsMappedTogetherOpset4) {
  const std::vector<int64_t> shape{1, 6};
  const std::vector<double> keys{3.14, 2.718, 5.0, std::nan("3")};
  const std::vector<std::string> mapped{"a", "b", "c", "ONNX"};
  const std::vector<double> x{3.14, std::nan("1"), 2.718, std::nan("2"), 5.0, -1.0};
  const std::vector<std::string> expected_y{"a", "ONNX", "b", "ONNX", "c", "onnxruntime"};

  OpTester test("LabelEncoder", 4, onnxruntime::kMLDomain);

  // 1-D DOUBLE tensor of lookup keys (the last element is a NaN).
  ONNX_NAMESPACE::TensorProto keys_tensor;
  keys_tensor.set_name("keys_tensor");
  keys_tensor.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_DOUBLE);
  keys_tensor.add_dims(static_cast<int64_t>(keys.size()));
  for (size_t i = 0; i < keys.size(); ++i) {
    keys_tensor.add_double_data(keys[i]);
  }
  test.AddAttribute("keys_tensor", keys_tensor);
  test.AddAttribute("values_strings", mapped);

  // One-element STRING tensor carrying the value expected for unmapped inputs.
  ONNX_NAMESPACE::TensorProto fallback;
  fallback.set_name("default_tensor");
  fallback.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_STRING);
  fallback.add_dims(1);
  fallback.add_string_data("onnxruntime");
  test.AddAttribute("default_tensor", fallback);

  test.AddInput<double>("X", shape, x);
  test.AddOutput<std::string>("Y", shape, expected_y);
  test.Run();
}
} // namespace test
} // namespace onnxruntime

View file

@ -235,10 +235,6 @@
"^test_resize_upsample_sizes_nearest_not_larger_cuda",
"^test_resize_upsample_sizes_nearest_round_prefer_ceil_asymmetric_cuda",
// onnx 1.15 (opset 20) new and updated op tests
"^test_ai_onnx_ml_label_encoder_string_int",
"^test_ai_onnx_ml_label_encoder_string_int_no_default",
"^test_ai_onnx_ml_label_encoder_tensor_mapping",
"^test_ai_onnx_ml_label_encoder_tensor_value_only_mapping",
"^test_image_decoder_decode_bmp_rgb",
"^test_image_decoder_decode_jpeg2k_rgb",
"^test_image_decoder_decode_jpeg_bgr",