mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-04 23:59:56 +00:00
Add timeseries imputer transformer featurizer kernel (#2813)
Make kernels non-template. Add input constraint for learnt data. Fix up tests. Add two more featurizers along with tests (min_max_scalar_transformer, robust_scalar_transformer); tests fail. Fix the tests' serialized stream by prepending version bytes. Add imputation_marker_transformer and its test. Fix up float/double type designations. Add label_encoder_transformer along with a test; the string_throw case is broken at the moment. Fix the labelencodertransfomer_test.cc string_throw case. Rename maxabsscalertransformer_test.cc. Add MissingDummiesTransformer along with its test. Update manifest. Add TimeSeriesImputerTransformer definition, implementation and tests.
This commit is contained in:
parent
48e042868f
commit
afa48b7e13
6 changed files with 622 additions and 7 deletions
|
|
@ -450,7 +450,7 @@
|
|||
{
|
||||
"component": {
|
||||
"git": {
|
||||
"commitHash": "a11f5002af58a03d5902b13ef65c84cedb499024",
|
||||
"commitHash": "573070aeeb77e267da2579ac1d75d92c688bbe97",
|
||||
"repositoryUrl": "https://github.com/microsoft/FeaturizersLibrary.git"
|
||||
},
|
||||
"type": "git"
|
||||
|
|
|
|||
3
cmake/external/featurizers.cmake
vendored
3
cmake/external/featurizers.cmake
vendored
|
|
@ -3,7 +3,7 @@
|
|||
# This source code should not depend on the onnxruntime and may be built independently
|
||||
|
||||
set(featurizers_URL "https://github.com/microsoft/FeaturizersLibrary.git")
|
||||
set(featurizers_TAG "a11f5002af58a03d5902b13ef65c84cedb499024")
|
||||
set(featurizers_TAG "573070aeeb77e267da2579ac1d75d92c688bbe97")
|
||||
|
||||
set(featurizers_pref FeaturizersLibrary)
|
||||
set(featurizers_ROOT ${PROJECT_SOURCE_DIR}/external/${featurizers_pref})
|
||||
|
|
@ -24,6 +24,7 @@ if (WIN32)
|
|||
BINARY_DIR ${featurizers_BINARY_DIR}
|
||||
CMAKE_ARGS -Dfeaturizers_MSVC_STATIC_RUNTIME=${onnxruntime_MSVC_STATIC_RUNTIME}
|
||||
INSTALL_COMMAND ""
|
||||
|
||||
)
|
||||
else()
|
||||
ExternalProject_Add(featurizers_lib
|
||||
|
|
|
|||
|
|
@ -41,6 +41,7 @@ static void RegisterMinMaxScalarFeaturizerVer1();
|
|||
static void RegisterMissingDummiesFeaturizerVer1();
|
||||
static void RegisterRobustScalarFeaturizerVer1();
|
||||
static void RegisterStringFeaturizerVer1();
|
||||
static void RegisterTimeSeriesImputerFeaturizerVer1();
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// ----------------------------------------------------------------------
|
||||
|
|
@ -55,6 +56,7 @@ void RegisterMSFeaturizersSchemas() {
|
|||
RegisterMissingDummiesFeaturizerVer1();
|
||||
RegisterRobustScalarFeaturizerVer1();
|
||||
RegisterStringFeaturizerVer1();
|
||||
RegisterTimeSeriesImputerFeaturizerVer1();
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
|
|
@ -212,7 +214,7 @@ void RegisterDateTimeFeaturizerVer1() {
|
|||
case 0:
|
||||
propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_INT32, output);
|
||||
break;
|
||||
case 1: // fall through
|
||||
case 1: // fall through
|
||||
case 2:
|
||||
case 3:
|
||||
case 4:
|
||||
|
|
@ -223,11 +225,11 @@ void RegisterDateTimeFeaturizerVer1() {
|
|||
case 9:
|
||||
propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT8, output);
|
||||
break;
|
||||
case 10: // fall through
|
||||
case 10: // fall through
|
||||
case 11:
|
||||
propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT16, output);
|
||||
break;
|
||||
case 12: // fall through
|
||||
case 12: // fall through
|
||||
case 13:
|
||||
case 14:
|
||||
propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_UINT8, output);
|
||||
|
|
@ -595,7 +597,6 @@ void RegisterRobustScalarFeaturizerVer1() {
|
|||
input_elem_type == ONNX_NAMESPACE::TensorProto_DataType_UINT32 ||
|
||||
input_elem_type == ONNX_NAMESPACE::TensorProto_DataType_UINT64 ||
|
||||
input_elem_type == ONNX_NAMESPACE::TensorProto_DataType_DOUBLE) {
|
||||
ctx.getOutputType(0)->mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_DOUBLE);
|
||||
propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_DOUBLE, 0);
|
||||
} else {
|
||||
fail_type_inference("input 1 is expected to have a accepted type");
|
||||
|
|
@ -648,7 +649,178 @@ void RegisterStringFeaturizerVer1() {
|
|||
.TypeAndShapeInferenceFunction(
|
||||
[](ONNX_NAMESPACE::InferenceContext& ctx) {
|
||||
propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_STRING, 0);
|
||||
propagateShapeFromInputToOutput(ctx, 1, 0);
|
||||
if (hasInputShape(ctx, 1)) {
|
||||
propagateShapeFromInputToOutput(ctx, 1, 0);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
void RegisterTimeSeriesImputerFeaturizerVer1() {
|
||||
static const char* doc = R"DOC(
|
||||
Imputes rows and column values such that the generated output does not contain any
|
||||
time gaps per grain (based on the time gaps encountered during training) and that
|
||||
all missing column values are populated according to a strategy (forward fill,
|
||||
backward fill, mode, etc.).
|
||||
|
||||
This Featurizer is unique in that it will produce 0:N rows per invocation, depending upon the
|
||||
input data.
|
||||
|
||||
C++-style pseudo signature:
|
||||
template <typename... GrainColValueTs, typename... DataColValueTs>
|
||||
std::vector<
|
||||
std::tuple<
|
||||
bool, // true if the row was added
|
||||
std::chrono::system_clock::time_point,
|
||||
std::tuple<GrainColValueTs...>,
|
||||
std::tuple<DataColValueTs...>
|
||||
>
|
||||
> execute(
|
||||
std::chrono::system_clock::time_point const &value,
|
||||
std::tuple<GrainColValueTs...> const &grain,
|
||||
std::tuple<DataColValueTs...> const &colData
|
||||
);
|
||||
|
||||
Examples:
|
||||
During training, the time period was found to be 1 day...
|
||||
|
||||
Input:
|
||||
+------+-------+------------------+-------------------+
|
||||
| time | grain | forward fill col | backward fill col |
|
||||
+======+=======+==================+===================+
|
||||
| 1 | A | 10 | None |
|
||||
+------+-------+------------------+-------------------+
|
||||
| 2 | A | None | 200 |
|
||||
+------+-------+------------------+-------------------+
|
||||
| 1 | B | -10 | -100 |
|
||||
+------+-------+------------------+-------------------+
|
||||
| 4 | A | 40 | 400 |
|
||||
+------+-------+------------------+-------------------+
|
||||
| 6 | A | 60 | 600 |
|
||||
+------+-------+------------------+-------------------+
|
||||
| 3 | B | -30 | -300 |
|
||||
+------+-------+------------------+-------------------+
|
||||
|
||||
Output:
|
||||
+-------+------+-------+------------------+-------------------+
|
||||
| Added | time | grain | forward fill col | backward fill col |
|
||||
+=======+======+=======+==================+===================+
|
||||
| false | 1 | A | 10 | 200 (from 2) |
|
||||
+-------+------+-------+------------------+-------------------+
|
||||
| false | 2 | A | 10 (from 1) | 200 |
|
||||
+-------+------+-------+------------------+-------------------+
|
||||
| true | 3 | A | 10 (from 2) | 400 (from 4) |
|
||||
+-------+------+-------+------------------+-------------------+
|
||||
| false | 4 | A | 40 | 400 |
|
||||
+-------+------+-------+------------------+-------------------+
|
||||
| true | 5 | A | 40 (from 4) | 600 (from 6) |
|
||||
+-------+------+-------+------------------+-------------------+
|
||||
| false | 6 | A | 60 | 600 |
|
||||
+-------+------+-------+------------------+-------------------+
|
||||
| false | 1 | B | -10 | -100 |
|
||||
+-------+------+-------+------------------+-------------------+
|
||||
| true | 2 | B | -10 (from 1) | -300 (from 3) |
|
||||
+-------+------+-------+------------------+-------------------+
|
||||
| false | 3 | B | -30 | -300 |
|
||||
+-------+------+-------+------------------+-------------------+
|
||||
)DOC";
|
||||
|
||||
MS_FEATURIZERS_OPERATOR_SCHEMA(TimeSeriesImputerTransformer)
|
||||
.SinceVersion(1)
|
||||
.SetDomain(kMSFeaturizersDomain)
|
||||
.SetDoc(doc)
|
||||
.Input(
|
||||
0,
|
||||
"State",
|
||||
"State generated during training that is used for prediction",
|
||||
"T0")
|
||||
.Input(
|
||||
1,
|
||||
"Times",
|
||||
"Tensor of timestamps in seconds since epoch [R] where R is a number of rows.",
|
||||
"T1")
|
||||
.Input(
|
||||
2,
|
||||
"Keys",
|
||||
"Composite keys tensor of shape [R][K]. R is the same as Input(1)",
|
||||
"T2")
|
||||
.Input(
|
||||
3,
|
||||
"Data",
|
||||
"It is a data tensor of shape [R][C] where R - rows and C - columns. R must be the same with Input(1)",
|
||||
"T2")
|
||||
.Output(
|
||||
0,
|
||||
"Added",
|
||||
"Tensor of boolean with a shape of [IR]. Contains a boolean for each row in the result where true represents added row.",
|
||||
"T3")
|
||||
.Output(
|
||||
1,
|
||||
"ImputedTimes",
|
||||
"This is a tensor of timestamps in seconds since epoch of shape [IR], where IR is the number of output rows.",
|
||||
"T1")
|
||||
.Output(
|
||||
2,
|
||||
"ImputedKeys",
|
||||
"Contains keys along with the imputed keys. Tensor of shape [IR][K].",
|
||||
"T2")
|
||||
.Output(
|
||||
3,
|
||||
"ImputedData",
|
||||
"Tensor of shape [IR][C] where IR is the number of rows in the output."
|
||||
"C is the number of columns.",
|
||||
"T2")
|
||||
.TypeConstraint(
|
||||
"T0",
|
||||
{"tensor(uint8)"},
|
||||
"No information is available")
|
||||
.TypeConstraint(
|
||||
"T1",
|
||||
{"tensor(int64)"},
|
||||
"Represents number of seconds since epoch")
|
||||
.TypeConstraint(
|
||||
"T2",
|
||||
{"tensor(string)"},
|
||||
"Output data")
|
||||
.TypeConstraint(
|
||||
"T3",
|
||||
{"tensor(bool)"},
|
||||
"Boolean Tensor")
|
||||
.TypeAndShapeInferenceFunction(
|
||||
[](ONNX_NAMESPACE::InferenceContext& ctx) {
|
||||
propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_BOOL, 0);
|
||||
propagateElemTypeFromDtypeToOutput(ctx, ONNX_NAMESPACE::TensorProto_DataType_INT64, 1);
|
||||
// Number of output rows is not known
|
||||
ONNX_NAMESPACE::TensorShapeProto shape_0_1;
|
||||
shape_0_1.add_dim();
|
||||
ONNX_NAMESPACE::updateOutputShape(ctx, 0, shape_0_1);
|
||||
ONNX_NAMESPACE::updateOutputShape(ctx, 1, shape_0_1);
|
||||
|
||||
// Keys
|
||||
propagateElemTypeFromInputToOutput(ctx, 2, 2);
|
||||
// Keys shape
|
||||
if (hasInputShape(ctx, 2)) {
|
||||
const auto& input2_shape = getInputShape(ctx, 2);
|
||||
if (input2_shape.dim_size() != 2) {
|
||||
fail_shape_inference("Expecting keys to have 2 dimensions");
|
||||
}
|
||||
ONNX_NAMESPACE::TensorShapeProto shape;
|
||||
shape.add_dim();
|
||||
*shape.add_dim() = input2_shape.dim(1);
|
||||
ONNX_NAMESPACE::updateOutputShape(ctx, 2, shape);
|
||||
}
|
||||
|
||||
// Data shape
|
||||
propagateElemTypeFromInputToOutput(ctx, 3, 3);
|
||||
if (hasInputShape(ctx, 3)) {
|
||||
const auto& input3_shape = getInputShape(ctx, 3);
|
||||
if (input3_shape.dim_size() != 2) {
|
||||
fail_shape_inference("Expecting data to have 2 dimensions");
|
||||
}
|
||||
ONNX_NAMESPACE::TensorShapeProto shape;
|
||||
shape.add_dim();
|
||||
*shape.add_dim() = input3_shape.dim(1);
|
||||
ONNX_NAMESPACE::updateOutputShape(ctx, 3, shape);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,244 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "core/common/common.h"
|
||||
#include "core/framework/data_types.h"
|
||||
#include "core/framework/op_kernel.h"
|
||||
|
||||
#include <cstdlib>
|
||||
#include <limits>
|
||||
|
||||
#include "Featurizers/TimeSeriesImputerFeaturizer.h"
|
||||
#include "Archive.h"
|
||||
|
||||
namespace ft = Microsoft::Featurizer::Featurizers;
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace featurizers {
|
||||
|
||||
namespace timeseries_imputer_details {
|
||||
|
||||
// Builds a system_clock time point from a count of seconds, using
// system_clock::from_time_t for the conversion.
inline std::chrono::system_clock::time_point ToTimePoint(int64_t secs) {
  using std::chrono::system_clock;
  const system_clock::time_point converted = system_clock::from_time_t(secs);
  return converted;
}
|
||||
|
||||
// Returns the number of whole seconds between the clock's epoch and `tp`.
// Any sub-second part of `tp` is discarded by the cast to seconds.
inline int64_t ToSecs(const std::chrono::system_clock::time_point& tp) {
  const auto whole_seconds = std::chrono::time_point_cast<std::chrono::seconds>(tp);
  return whole_seconds.time_since_epoch().count();
}
|
||||
|
||||
// Functor that renders a value as std::string.
// Generic case: delegates to std::to_string.
template <typename T>
struct ToString {
  std::string operator()(T val) const {
    std::string text = std::to_string(val);
    return text;
  }
};

// std::string case: no conversion is needed, so the argument itself is handed back
// by reference (the result is only valid while the argument is alive).
template <>
struct ToString<std::string> {
  const std::string& operator()(const std::string& val) const { return val; }
};
|
||||
|
||||
template <typename T>
|
||||
struct ToStringOptional {
|
||||
nonstd::optional<std::string> operator()(T val) const {
|
||||
nonstd::optional<std::string> result;
|
||||
if (std::isnan(val)) {
|
||||
return result;
|
||||
}
|
||||
result = std::to_string(val);
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct ToStringOptional<std::string> {
|
||||
nonstd::optional<std::string> operator()(std::string val) const {
|
||||
return (val.empty()) ? nonstd::optional<std::string>() : nonstd::optional<std::string>(std::move(val));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct FromString;
|
||||
|
||||
template <>
|
||||
struct FromString<std::string> {
|
||||
const std::string& operator()(const std::string& val) const {
|
||||
return val;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct FromString<float> {
|
||||
float operator()(const std::string& val) const {
|
||||
char* str_end = nullptr;
|
||||
const char* str = val.c_str();
|
||||
float result = std::strtof(str, &str_end);
|
||||
if (str == str_end) {
|
||||
ORT_THROW("Resulting key string is not convertible to float: ", val);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct FromString<double> {
|
||||
double operator()(const std::string& val) const {
|
||||
const char* str = val.c_str();
|
||||
char* str_end = nullptr;
|
||||
double result = std::strtod(str, &str_end);
|
||||
if (str == str_end) {
|
||||
ORT_THROW("Resulting key string is not convertible to double: ", val);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
};
|
||||
template <typename T>
|
||||
struct FromStringOptional {
|
||||
T operator()(const nonstd::optional<std::string>& val) const {
|
||||
if (val.has_value()) {
|
||||
return FromString<T>()(*val);
|
||||
}
|
||||
return std::numeric_limits<T>::quiet_NaN();
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct FromStringOptional<std::string> {
|
||||
std::string operator()(const nonstd::optional<std::string>& val) const {
|
||||
if (val.has_value()) {
|
||||
return *val;
|
||||
}
|
||||
return std::string();
|
||||
}
|
||||
};
|
||||
} // namespace timeseries_imputer_details
|
||||
|
||||
template <typename T>
|
||||
struct TimeSeriesImputerTransformerImpl {
|
||||
void operator()(OpKernelContext* ctx, int64_t rows) {
|
||||
const auto& state = *ctx->Input<Tensor>(0);
|
||||
const uint8_t* const state_data = state.template Data<uint8_t>();
|
||||
|
||||
const auto& times = *ctx->Input<Tensor>(1);
|
||||
const auto& keys = *ctx->Input<Tensor>(2);
|
||||
const auto& data = *ctx->Input<Tensor>(3);
|
||||
|
||||
const int64_t keys_per_row = keys.Shape()[1];
|
||||
const int64_t columns = data.Shape()[1];
|
||||
|
||||
using namespace timeseries_imputer_details;
|
||||
|
||||
using OutputType = std::tuple<bool, std::chrono::system_clock::time_point,
|
||||
std::vector<std::string>, std::vector<nonstd::optional<std::string>>>;
|
||||
std::vector<OutputType> output_rows;
|
||||
std::function<void(OutputType)> callback_fn;
|
||||
callback_fn = [&output_rows](OutputType value) -> void {
|
||||
output_rows.emplace_back(std::move(value));
|
||||
};
|
||||
|
||||
Microsoft::Featurizer::Archive archive(state_data, state.Shape().Size());
|
||||
ft::Components::TimeSeriesImputerEstimator::Transformer transformer(archive);
|
||||
|
||||
const int64_t* times_data = times.template Data<int64_t>();
|
||||
const T* const keys_data = keys.template Data<T>();
|
||||
const T* const data_data = data.template Data<T>();
|
||||
|
||||
// for each row get timestamp, get all keys, get all data and feed it
|
||||
for (int64_t row = 0; row < rows; ++row) {
|
||||
const T* const key_row_data = keys_data + (row * keys_per_row);
|
||||
const T* const keys_row_end = key_row_data + keys_per_row;
|
||||
std::vector<std::string> str_keys;
|
||||
std::transform(key_row_data, keys_row_end, std::back_inserter(str_keys),
|
||||
ToString<T>());
|
||||
|
||||
std::vector<nonstd::optional<std::string>> str_data;
|
||||
const T* const data_row = data_data + (row * columns);
|
||||
const T* const data_row_end = data_row + columns;
|
||||
std::transform(data_row, data_row_end, std::back_inserter(str_data),
|
||||
ToStringOptional<T>());
|
||||
|
||||
auto tuple_row = std::make_tuple(ToTimePoint(*times_data), std::move(str_keys), std::move(str_data));
|
||||
|
||||
transformer.execute(tuple_row, callback_fn);
|
||||
++times_data;
|
||||
}
|
||||
|
||||
transformer.flush(callback_fn);
|
||||
|
||||
// Compute output shapes now
|
||||
// Number of outputs is the number of rows,
|
||||
int64_t output_rows_num = static_cast<int64_t>(output_rows.size());
|
||||
TensorShape rows_shape({output_rows_num});
|
||||
TensorShape keys_shape({output_rows_num, keys_per_row});
|
||||
TensorShape data_shape({output_rows_num, columns});
|
||||
|
||||
auto* added_output = ctx->Output(0, rows_shape)->template MutableData<bool>();
|
||||
auto* time_output = ctx->Output(1, rows_shape)->template MutableData<int64_t>();
|
||||
auto* keys_output = ctx->Output(2, keys_shape)->template MutableData<T>();
|
||||
auto* data_output = ctx->Output(3, data_shape)->template MutableData<T>();
|
||||
|
||||
for (const auto& out : output_rows) {
|
||||
*added_output++ = std::get<0>(out);
|
||||
*time_output++ = ToSecs(std::get<1>(out));
|
||||
const auto& imputed_keys = std::get<2>(out);
|
||||
ORT_ENFORCE(static_cast<int64_t>(imputed_keys.size()) == keys_per_row,
|
||||
"resulting number of keys: ", imputed_keys.size(), " expected: ", keys_per_row);
|
||||
const auto& imputed_data = std::get<3>(out);
|
||||
ORT_ENFORCE(static_cast<int64_t>(imputed_data.size()) == columns,
|
||||
"resulting number of columns: ", imputed_data.size(), " expected: ", columns);
|
||||
keys_output = std::transform(imputed_keys.cbegin(), imputed_keys.cend(), keys_output,
|
||||
FromString<T>());
|
||||
data_output = std::transform(imputed_data.cbegin(), imputed_data.cend(), data_output,
|
||||
FromStringOptional<T>());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class TimeSeriesImputerTransformer final : public OpKernel {
|
||||
public:
|
||||
explicit TimeSeriesImputerTransformer(const OpKernelInfo& info) : OpKernel(info) {
|
||||
}
|
||||
|
||||
static Status CheckBatches(int64_t rows, const TensorShape& shape) {
|
||||
if (shape.NumDimensions() == 2) {
|
||||
ORT_RETURN_IF_NOT(rows == shape[0], "Number of rows does not match");
|
||||
} else {
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Expect shape of [R][C]");
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Compute(OpKernelContext* ctx) const override {
|
||||
const auto& times = *ctx->Input<Tensor>(1);
|
||||
const auto& times_shape = times.Shape();
|
||||
ORT_RETURN_IF_NOT(times_shape.NumDimensions() == 1, "Times must have shape [B][R] or [R]");
|
||||
int64_t rows = times_shape[0];
|
||||
|
||||
const auto& keys = *ctx->Input<Tensor>(2);
|
||||
ORT_RETURN_IF_ERROR(CheckBatches(rows, keys.Shape()));
|
||||
const auto& data = *ctx->Input<Tensor>(3);
|
||||
ORT_RETURN_IF_ERROR(CheckBatches(rows, data.Shape()));
|
||||
|
||||
auto data_type = data.GetElementType();
|
||||
ORT_RETURN_IF_NOT(keys.GetElementType() == data_type, "Keys and data must have the same datatype");
|
||||
|
||||
TimeSeriesImputerTransformerImpl<std::string>()(ctx, rows);
|
||||
return Status::OK();
|
||||
}
|
||||
};
|
||||
|
||||
// Kernel registration for TimeSeriesImputerTransformer: MS featurizers domain,
// version 1, CPU execution provider. Constrains T0 to uint8 tensors, T1 to int64
// tensors and T2 to string tensors.
ONNX_OPERATOR_KERNEL_EX(
    TimeSeriesImputerTransformer,
    kMSFeaturizersDomain,
    1,
    kCpuExecutionProvider,
    KernelDefBuilder()
        .TypeConstraint("T0", DataTypeImpl::GetTensorType<uint8_t>())
        .TypeConstraint("T1", DataTypeImpl::GetTensorType<int64_t>())
        .TypeConstraint("T2", DataTypeImpl::GetTensorType<std::string>()),
    TimeSeriesImputerTransformer);
|
||||
} // namespace featurizers
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -19,6 +19,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomai
|
|||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, MissingDummiesTransformer);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, RobustScalarTransformer);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, StringTransformer);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, TimeSeriesImputerTransformer);
|
||||
|
||||
Status RegisterCpuMSFeaturizersKernels(KernelRegistry& kernel_registry) {
|
||||
static const BuildKernelCreateInfoFn function_table[] = {
|
||||
|
|
@ -31,6 +32,7 @@ Status RegisterCpuMSFeaturizersKernels(KernelRegistry& kernel_registry) {
|
|||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, MissingDummiesTransformer)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, RobustScalarTransformer)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, StringTransformer)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSFeaturizersDomain, 1, TimeSeriesImputerTransformer)>,
|
||||
};
|
||||
|
||||
for (auto& function_table_entry : function_table) {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,196 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "test/providers/provider_test_utils.h"
|
||||
|
||||
#include "Featurizers/TimeSeriesImputerFeaturizer.h"
|
||||
#include "Featurizers/TestHelpers.h"
|
||||
#include "Archive.h"
|
||||
|
||||
namespace NS = Microsoft::Featurizer;
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace test {
|
||||
|
||||
// Returns `tp` shifted by `unitsToAdd` days (24 * 60 minutes each). The third,
// unnamed parameter is not used.
inline std::chrono::system_clock::time_point GetTimePoint(std::chrono::system_clock::time_point tp, int unitsToAdd, std::string = "days") {
  constexpr int kMinutesPerDay = 60 * 24;
  const std::chrono::minutes offset(unitsToAdd * kMinutesPerDay);
  return tp + offset;
}
|
||||
|
||||
// Returns the number of whole seconds between the clock's epoch and `tp`.
inline int64_t GetTimeSecs(std::chrono::system_clock::time_point tp) {
  const auto since_epoch = std::chrono::duration_cast<std::chrono::seconds>(tp.time_since_epoch());
  return since_epoch.count();
}
|
||||
|
||||
using InputType = std::tuple<
|
||||
std::chrono::system_clock::time_point,
|
||||
std::vector<std::string>,
|
||||
std::vector<nonstd::optional<std::string>>>;
|
||||
|
||||
using TransformedType = std::vector<
|
||||
std::tuple<
|
||||
bool,
|
||||
std::chrono::system_clock::time_point,
|
||||
std::vector<std::string>,
|
||||
std::vector<nonstd::optional<std::string>>>>;
|
||||
|
||||
std::vector<uint8_t> GetStream(const std::vector<std::vector<InputType>>& trainingBatches,
|
||||
const std::vector<NS::TypeId>& colsToImputeDataTypes,
|
||||
bool supressError, NS::Featurizers::Components::TimeSeriesImputeStrategy tsImputeStrategy) {
|
||||
using TSImputerEstimator = NS::Featurizers::TimeSeriesImputerEstimator;
|
||||
|
||||
NS::AnnotationMapsPtr const pAllColumnAnnotations(NS::CreateTestAnnotationMapsPtr(1));
|
||||
TSImputerEstimator estimator(pAllColumnAnnotations, colsToImputeDataTypes, supressError, tsImputeStrategy);
|
||||
|
||||
NS::TestHelpers::Train<TSImputerEstimator, InputType>(estimator, trainingBatches);
|
||||
TSImputerEstimator::TransformerUniquePtr pTransformer(estimator.create_transformer());
|
||||
|
||||
NS::Archive ar;
|
||||
pTransformer->save(ar);
|
||||
return ar.commit();
|
||||
}
|
||||
|
||||
// Adds the four operator inputs to `test`:
//   State - serialized transformer obtained by training on `trainingBatches`,
//   Times - one timestamp in seconds per inference row,
//   Keys  - the key strings of all inference rows, flattened row by row,
//   Data  - the data values of all inference rows, flattened row by row, with
//           missing values written as empty strings.
// Fails the current test if there are no inference rows or if the flattened key/data
// counts are not a multiple of the row count.
static void AddInputs(OpTester& test, const std::vector<std::vector<InputType>>& trainingBatches,
                      const std::vector<InputType>& inferenceBatches, const std::vector<NS::TypeId>& colsToImputeDataTypes,
                      bool supressError, NS::Featurizers::Components::TimeSeriesImputeStrategy tsImputeStrategy) {
  // The row count is used as a divisor below; with no rows that would be a
  // modulo/division by zero, so reject it up front.
  ASSERT_FALSE(inferenceBatches.empty());

  auto stream = GetStream(
      trainingBatches,
      colsToImputeDataTypes,
      supressError,
      tsImputeStrategy);

  auto dim = static_cast<int64_t>(stream.size());
  test.AddInput<uint8_t>("State", {dim}, stream);

  std::vector<int64_t> times;
  std::vector<std::string> keys;
  std::vector<std::string> data;
  times.reserve(inferenceBatches.size());

  using namespace std::chrono;
  for (const auto& infb : inferenceBatches) {
    times.push_back(time_point_cast<seconds>(std::get<0>(infb)).time_since_epoch().count());
    keys.insert(keys.end(), std::get<1>(infb).cbegin(), std::get<1>(infb).cend());
    // A missing value is passed to the operator as an empty string.
    std::transform(std::get<2>(infb).cbegin(), std::get<2>(infb).cend(), std::back_inserter(data),
                   [](const nonstd::optional<std::string>& opt) -> std::string {
                     if (opt.has_value()) return *opt;
                     return std::string();
                   });
  }

  // Should have equal amount of keys per row
  ASSERT_TRUE(keys.size() % times.size() == 0);
  ASSERT_TRUE(data.size() % times.size() == 0);
  test.AddInput<int64_t>("Times", {static_cast<int64_t>(times.size())}, times);
  test.AddInput<std::string>("Keys", {static_cast<int64_t>(times.size()), static_cast<int64_t>(keys.size() / times.size())}, keys);
  test.AddInput<std::string>("Data", {static_cast<int64_t>(times.size()), static_cast<int64_t>(data.size() / times.size())}, data);
}
|
||||
|
||||
// Adds the four expected operator outputs to `test`:
//   Added        - one bool per expected row,
//   ImputedTimes - `times` converted to seconds,
//   ImputedKeys  - `keys`, flattened row by row,
//   ImputedData  - `data`, flattened row by row.
// Fails the current test if `times` is empty, if `added` and `times` differ in
// length, or if the flattened key/data counts are not a multiple of the row count.
void AddOutputs(OpTester& test, const std::initializer_list<bool>& added, const std::initializer_list<std::chrono::system_clock::time_point>& times,
                const std::vector<std::string>& keys, const std::vector<std::string>& data) {
  // The row count is used as a divisor below; guard against modulo/division by zero.
  ASSERT_TRUE(times.size() != 0);
  // Every expected row needs exactly one "added" flag.
  ASSERT_TRUE(added.size() == times.size());
  ASSERT_TRUE(keys.size() % times.size() == 0);
  ASSERT_TRUE(data.size() % times.size() == 0);

  std::vector<int64_t> times_int64;
  times_int64.reserve(times.size());
  std::transform(times.begin(), times.end(), std::back_inserter(times_int64), GetTimeSecs);

  test.AddOutput<bool>("Added", {static_cast<int64_t>(added.size())}, added);
  test.AddOutput<int64_t>("ImputedTimes", {static_cast<int64_t>(times.size())}, times_int64);
  test.AddOutput<std::string>("ImputedKeys", {static_cast<int64_t>(times.size()), static_cast<int64_t>(keys.size() / times.size())}, keys);
  test.AddOutput<std::string>("ImputedData", {static_cast<int64_t>(times.size()), static_cast<int64_t>(data.size() / times.size())}, data);
}
|
||||
|
||||
// One key ("a"), three consecutive days, same rows used for training and inference.
// Expects no added rows and the missing values replaced by the previous row's value
// (Forward strategy).
TEST(FeaturizersTests, RowImputation_1_grain_no_gaps) {
  using OptionalStr = nonstd::optional<std::string>;
  using Keys = std::vector<std::string>;
  using Values = std::vector<OptionalStr>;

  const std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
  auto tp_0 = GetTimePoint(now, 0);
  auto tp_1 = GetTimePoint(now, 1);
  auto tp_2 = GetTimePoint(now, 2);

  auto tuple_1 = std::make_tuple(tp_0, Keys{"a"}, Values{"14.5", "18"});
  auto tuple_2 = std::make_tuple(tp_1, Keys{"a"}, Values{OptionalStr{}, "12"});
  auto tuple_3 = std::make_tuple(tp_2, Keys{"a"}, Values{"15.0", OptionalStr{}});

  std::vector<InputType> inferenceBatches = {tuple_1, tuple_2, tuple_3};

  OpTester test("TimeSeriesImputerTransformer", 1, onnxruntime::kMSFeaturizersDomain);

  AddInputs(test, {inferenceBatches}, inferenceBatches,
            {NS::TypeId::Float64, NS::TypeId::Float64}, false, NS::Featurizers::Components::TimeSeriesImputeStrategy::Forward);
  AddOutputs(test, {false, false, false}, {tp_0, tp_1, tp_2},
             {"a", "a", "a"}, {"14.5", "18", "14.5", "12", "15.0", "12"});

  test.Run();
}
|
||||
|
||||
// One key ("a"). Training rows are on days 0 and 1; inference rows are on days 0 and 3.
// Expects rows for days 1 and 2 to be added, with values carried forward (Forward strategy).
TEST(FeaturizersTests, RowImputation_1_grain_2_gaps) {
  using OptionalStr = nonstd::optional<std::string>;
  using Keys = std::vector<std::string>;
  using Values = std::vector<OptionalStr>;

  const std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
  auto tp_0 = GetTimePoint(now, 0);
  auto tp_1 = GetTimePoint(now, 1);
  auto tp_2 = GetTimePoint(now, 2);
  auto tp_3 = GetTimePoint(now, 3);

  auto tuple_0 = std::make_tuple(tp_0, Keys{"a"}, Values{"14.5", "18"});
  auto tuple_1 = std::make_tuple(tp_1, Keys{"a"}, Values{OptionalStr{}, "12"});
  auto tuple_3 = std::make_tuple(tp_3, Keys{"a"}, Values{OptionalStr{}, "15.0"});

  OpTester test("TimeSeriesImputerTransformer", 1, onnxruntime::kMSFeaturizersDomain);
  AddInputs(test, {{tuple_0, tuple_1}}, {tuple_0, tuple_3},
            {NS::TypeId::Float64, NS::TypeId::Float64}, false, NS::Featurizers::Components::TimeSeriesImputeStrategy::Forward);

  AddOutputs(test, {false, true, true, false}, {tp_0, tp_1, tp_2, tp_3},
             {"a", "a", "a", "a"}, {"14.5", "18", "14.5", "18", "14.5", "18", "14.5", "15.0"});
  test.Run();
}
|
||||
|
||||
// Two keys ("a" on days 0-1, "b" on days 5-6) with their rows interleaved.
// Inference uses the same timestamps as training (different values for "b").
// Expects no added rows and the missing values carried forward within each key.
TEST(FeaturizersTests, RowImputation_2_grains_no_gaps_input_interleaved) {
  using OptionalStr = nonstd::optional<std::string>;
  using Keys = std::vector<std::string>;
  using Values = std::vector<OptionalStr>;

  const std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
  auto tp_0 = GetTimePoint(now, 0);
  auto tp_1 = GetTimePoint(now, 1);
  auto tp_5 = GetTimePoint(now, 5);
  auto tp_6 = GetTimePoint(now, 6);

  // Training rows.
  auto tuple_0 = std::make_tuple(tp_0, Keys{"a"}, Values{"14.5", "18"});
  auto tuple_5 = std::make_tuple(tp_5, Keys{"b"}, Values{"14.5", "18"});
  auto tuple_1 = std::make_tuple(tp_1, Keys{"a"}, Values{OptionalStr{}, "12"});
  auto tuple_6 = std::make_tuple(tp_6, Keys{"b"}, Values{OptionalStr{}, "12"});
  // Inference-only rows for key "b" (same days as tuple_5 / tuple_6).
  auto tuple_5_inf = std::make_tuple(tp_5, Keys{"b"}, Values{"114.5", "118"});
  auto tuple_6_inf = std::make_tuple(tp_6, Keys{"b"}, Values{OptionalStr{}, "112"});

  OpTester test("TimeSeriesImputerTransformer", 1, onnxruntime::kMSFeaturizersDomain);
  AddInputs(test, {{tuple_0, tuple_5, tuple_1, tuple_6}}, {tuple_0, tuple_5_inf, tuple_1, tuple_6_inf},
            {NS::TypeId::Float64, NS::TypeId::Float64}, false, NS::Featurizers::Components::TimeSeriesImputeStrategy::Forward);

  AddOutputs(test, {false, false, false, false}, {tp_0, tp_5, tp_1, tp_6},
             {"a", "b", "a", "b"}, {"14.5", "18", "114.5", "118", "14.5", "12", "114.5", "112"});
  test.Run();
}
|
||||
|
||||
// Two keys with interleaved rows. Training covers "a" on days 0-1 and "b" on days 5-6;
// inference supplies "a" on days 0 and 2 and "b" on days 5 and 7.
// Expects one added row per key (day 1 for "a", day 6 for "b"), values carried forward.
TEST(FeaturizersTests, RowImputation_2_grains_1_gap_input_interleaved) {
  using OptionalStr = nonstd::optional<std::string>;
  using Keys = std::vector<std::string>;
  using Values = std::vector<OptionalStr>;

  const std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
  auto tp_0 = GetTimePoint(now, 0);
  auto tp_1 = GetTimePoint(now, 1);
  auto tp_2 = GetTimePoint(now, 2);
  auto tp_5 = GetTimePoint(now, 5);
  auto tp_6 = GetTimePoint(now, 6);
  auto tp_7 = GetTimePoint(now, 7);

  // Training rows.
  auto tuple_0 = std::make_tuple(tp_0, Keys{"a"}, Values{"14.5", "18"});
  auto tuple_5 = std::make_tuple(tp_5, Keys{"b"}, Values{"14.5", "18"});
  auto tuple_1 = std::make_tuple(tp_1, Keys{"a"}, Values{OptionalStr{}, "12"});
  auto tuple_6 = std::make_tuple(tp_6, Keys{"b"}, Values{OptionalStr{}, "12"});
  // Inference-only rows.
  auto tuple_5_inf = std::make_tuple(tp_5, Keys{"b"}, Values{"114.5", "118"});
  auto tuple_2 = std::make_tuple(tp_2, Keys{"a"}, Values{OptionalStr{}, "12"});
  auto tuple_7 = std::make_tuple(tp_7, Keys{"b"}, Values{OptionalStr{}, "112"});

  OpTester test("TimeSeriesImputerTransformer", 1, onnxruntime::kMSFeaturizersDomain);
  AddInputs(test, {{tuple_0, tuple_5, tuple_1, tuple_6}}, {tuple_0, tuple_5_inf, tuple_2, tuple_7},
            {NS::TypeId::Float64, NS::TypeId::Float64}, false, NS::Featurizers::Components::TimeSeriesImputeStrategy::Forward);

  AddOutputs(test, {false, false, true, false, true, false}, {tp_0, tp_5, tp_1, tp_2, tp_6, tp_7},
             {"a", "b", "a", "a", "b", "b"}, {"14.5", "18", "114.5", "118", "14.5", "18", "14.5", "12", "114.5", "118", "114.5", "112"});

  test.Run();
}
|
||||
|
||||
} // namespace test
|
||||
} // namespace onnxruntime
|
||||
Loading…
Reference in a new issue